From cda2eaa35948893d70145490d5d6ded546fc3bc6 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 10 Nov 2017 16:40:24 +1100
Subject: [PATCH 001/676] KVM: PPC: Book3S HV: Avoid shifts by negative amounts

The kvmppc_hpte_page_shifts function decodes the actual and base page
sizes for a HPTE, returning -1 if it doesn't recognize the page size
encoding.  This then gets used as a shift amount in various places,
which is undefined behaviour.  This was reported by Coverity.

In fact this should never occur, since we should only get HPTEs in the
HPT which have a recognized page size encoding.  The only place where
this might not be true is in the call to kvmppc_actual_pgsz() near the
beginning of kvmppc_do_h_enter(), where we are validating the HPTE
value passed in from the guest.

So to fix this and eliminate the undefined behaviour, we make
kvmppc_hpte_page_shifts return 0 for unrecognized page size encodings,
and make kvmppc_actual_pgsz() detect that case and return 0 for the
page size, which will then cause kvmppc_do_h_enter() to return an
error and refuse to insert any HPTE with an unrecognized page size
encoding.

To ensure that we don't get undefined behaviour in compute_tlbie_rb(),
we take the 4k page size path for any unrecognized page size encoding.
This should never be hit in practice because it is only used on HPTE
values which have previously been checked for having a recognized
page size encoding.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/kvm_book3s_64.h | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 735cfa35298ac..998f7b7aaa9e5 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -122,13 +122,13 @@ static inline int kvmppc_hpte_page_shifts(unsigned long h, unsigned long l)
 	lphi = (l >> 16) & 0xf;
 	switch ((l >> 12) & 0xf) {
 	case 0:
-		return !lphi ? 24 : -1;		/* 16MB */
+		return !lphi ? 24 : 0;		/* 16MB */
 		break;
 	case 1:
 		return 16;			/* 64kB */
 		break;
 	case 3:
-		return !lphi ? 34 : -1;		/* 16GB */
+		return !lphi ? 34 : 0;		/* 16GB */
 		break;
 	case 7:
 		return (16 << 8) + 12;		/* 64kB in 4kB */
@@ -140,7 +140,7 @@ static inline int kvmppc_hpte_page_shifts(unsigned long h, unsigned long l)
 			return (24 << 8) + 12;	/* 16MB in 4kB */
 		break;
 	}
-	return -1;
+	return 0;
 }
 
 static inline int kvmppc_hpte_base_page_shift(unsigned long h, unsigned long l)
@@ -159,7 +159,11 @@ static inline int kvmppc_hpte_actual_page_shift(unsigned long h, unsigned long l
 
 static inline unsigned long kvmppc_actual_pgsz(unsigned long v, unsigned long r)
 {
-	return 1ul << kvmppc_hpte_actual_page_shift(v, r);
+	int shift = kvmppc_hpte_actual_page_shift(v, r);
+
+	if (shift)
+		return 1ul << shift;
+	return 0;
 }
 
 static inline int kvmppc_pgsize_lp_encoding(int base_shift, int actual_shift)
@@ -232,7 +236,7 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 		va_low ^= v >> (SID_SHIFT_1T - 16);
 	va_low &= 0x7ff;
 
-	if (b_pgshift == 12) {
+	if (b_pgshift <= 12) {
 		if (a_pgshift > 12) {
 			sllp = (a_pgshift == 16) ? 5 : 4;
 			rb |= sllp << 5;	/*  AP field */

From 117647ff936e2d9684cc881d87c0291f46669c20 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 10 Nov 2017 16:43:35 +1100
Subject: [PATCH 002/676] KVM: PPC: Book3S HV: Fix typo in
 kvmppc_hv_get_dirty_log_radix()

This fixes a typo where the intent was to assign to 'j' in order to
skip some number of bits in the dirty bitmap for a guest.  The effect
of the typo is benign since it means we just iterate through all the
bits rather than skipping bits which we know will be zero.  This issue
was found by Coverity.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 58618f644c56b..0c854816e653e 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -573,7 +573,7 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
 		j = i + 1;
 		if (npages) {
 			set_dirty_bits(map, i, npages);
-			i = j + npages;
+			j = i + npages;
 		}
 	}
 	return 0;

From 4fcf361dbdbdb43038bb173e2391c4073e713745 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 20 Nov 2017 14:17:53 +1100
Subject: [PATCH 003/676] KVM: PPC: Book3S HV: Remove useless statement

This removes a statement that has no effect.  It should have been
removed in commit 898b25b202f3 ("KVM: PPC: Book3S HV: Simplify dynamic
micro-threading code", 2017-06-22) along with the loop over the
piggy-backed virtual cores.

This issue was reported by Coverity.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_hv.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 2d46037ce9366..597498d6db2e2 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2831,7 +2831,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 		 */
 		if (!thr0_done)
 			kvmppc_start_thread(NULL, pvc);
-		thr += pvc->num_threads;
 	}
 
 	/*

From c0093f1a38a0fd6c32a2269f0533bb13fb95143d Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 20 Nov 2017 16:12:25 +1100
Subject: [PATCH 004/676] KVM: PPC: Book3S HV: Fix conditions for starting vcpu

This corrects the test that determines whether a vcpu that has just
become able to run in the guest (e.g. it has just finished handling
a hypercall or hypervisor page fault) and whose virtual core is
already running somewhere as a "piggybacked" vcore can start
immediately or not.  (A piggybacked vcore is one which is executing
along with another vcore as a result of dynamic micro-threading.)

Previously the test tried to lock the piggybacked vcore using
spin_trylock, which would always fail because the vcore was already
locked, and so the vcpu would have to wait until its vcore exited
the guest before it could enter.

In fact the vcpu can enter if its vcore is in VCORE_PIGGYBACK state
and not already exiting (or exited) the guest, so the test in
VCORE_PIGGYBACK state is basically the same as for VCORE_RUNNING
state.

Coverity detected this as a double unlock issue, which it isn't
because the spin_trylock would always fail.  This will fix the
apparent double unlock as well.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_hv.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 597498d6db2e2..c4f0bebfc5ba4 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3175,17 +3175,8 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	 * this thread straight away and have it join in.
 	 */
 	if (!signal_pending(current)) {
-		if (vc->vcore_state == VCORE_PIGGYBACK) {
-			if (spin_trylock(&vc->lock)) {
-				if (vc->vcore_state == VCORE_RUNNING &&
-				    !VCORE_IS_EXITING(vc)) {
-					kvmppc_create_dtl_entry(vcpu, vc);
-					kvmppc_start_thread(vcpu, vc);
-					trace_kvm_guest_enter(vcpu);
-				}
-				spin_unlock(&vc->lock);
-			}
-		} else if (vc->vcore_state == VCORE_RUNNING &&
+		if ((vc->vcore_state == VCORE_PIGGYBACK ||
+		     vc->vcore_state == VCORE_RUNNING) &&
 			   !VCORE_IS_EXITING(vc)) {
 			kvmppc_create_dtl_entry(vcpu, vc);
 			kvmppc_start_thread(vcpu, vc);

From 9aa6825bbb7526a7fdec137b7cc3b042581cd2fc Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 20 Nov 2017 19:56:27 +1100
Subject: [PATCH 005/676] KVM: PPC: Book3S: Eliminate some unnecessary checks

In an excess of caution, commit 6f63e81bda98 ("KVM: PPC: Book3S: Add
MMIO emulation for FP and VSX instructions", 2017-02-21) included
checks for the case that vcpu->arch.mmio_vsx_copy_nums is less than
zero, even though its type is u8.  This causes a Coverity warning,
so we remove the check for < 0.  We also adjust the associated
comment to be more accurate ("4 or less" rather than "less than 4").

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/powerpc.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 6b6c53c42ac94..c2c7ef3305532 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1101,11 +1101,9 @@ int kvmppc_handle_vsx_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 {
 	enum emulation_result emulated = EMULATE_DONE;
 
-	/* Currently, mmio_vsx_copy_nums only allowed to be less than 4 */
-	if ( (vcpu->arch.mmio_vsx_copy_nums > 4) ||
-		(vcpu->arch.mmio_vsx_copy_nums < 0) ) {
+	/* Currently, mmio_vsx_copy_nums only allowed to be 4 or less */
+	if (vcpu->arch.mmio_vsx_copy_nums > 4)
 		return EMULATE_FAIL;
-	}
 
 	while (vcpu->arch.mmio_vsx_copy_nums) {
 		emulated = __kvmppc_handle_load(run, vcpu, rt, bytes,
@@ -1247,11 +1245,9 @@ int kvmppc_handle_vsx_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	vcpu->arch.io_gpr = rs;
 
-	/* Currently, mmio_vsx_copy_nums only allowed to be less than 4 */
-	if ( (vcpu->arch.mmio_vsx_copy_nums > 4) ||
-		(vcpu->arch.mmio_vsx_copy_nums < 0) ) {
+	/* Currently, mmio_vsx_copy_nums only allowed to be 4 or less */
+	if (vcpu->arch.mmio_vsx_copy_nums > 4)
 		return EMULATE_FAIL;
-	}
 
 	while (vcpu->arch.mmio_vsx_copy_nums) {
 		if (kvmppc_get_vsr_data(vcpu, rs, &val) == -1)

From b38defdb44fb0377b38896e38ac1fc8482e68f76 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:23 -0600
Subject: [PATCH 006/676] Documentation/virtual/kvm: Add AMD Secure Encrypted
 Virtualization (SEV)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Create a Documentation entry to describe the AMD Secure Encrypted
Virtualization (SEV) feature.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: kvm@vger.kernel.org
Cc: x86@kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 Documentation/virtual/kvm/00-INDEX            |  3 ++
 .../virtual/kvm/amd-memory-encryption.rst     | 45 +++++++++++++++++++
 2 files changed, 48 insertions(+)
 create mode 100644 Documentation/virtual/kvm/amd-memory-encryption.rst

diff --git a/Documentation/virtual/kvm/00-INDEX b/Documentation/virtual/kvm/00-INDEX
index 69fe1a8b7ad16..3da73aabff5ac 100644
--- a/Documentation/virtual/kvm/00-INDEX
+++ b/Documentation/virtual/kvm/00-INDEX
@@ -26,3 +26,6 @@ s390-diag.txt
 	- Diagnose hypercall description (for IBM S/390)
 timekeeping.txt
 	- timekeeping virtualization for x86-based architectures.
+amd-memory-encryption.txt
+	- notes on AMD Secure Encrypted Virtualization feature and SEV firmware
+	  command description
diff --git a/Documentation/virtual/kvm/amd-memory-encryption.rst b/Documentation/virtual/kvm/amd-memory-encryption.rst
new file mode 100644
index 0000000000000..a8ef21e737db6
--- /dev/null
+++ b/Documentation/virtual/kvm/amd-memory-encryption.rst
@@ -0,0 +1,45 @@
+======================================
+Secure Encrypted Virtualization (SEV)
+======================================
+
+Overview
+========
+
+Secure Encrypted Virtualization (SEV) is a feature found on AMD processors.
+
+SEV is an extension to the AMD-V architecture which supports running
+virtual machines (VMs) under the control of a hypervisor. When enabled,
+the memory contents of a VM will be transparently encrypted with a key
+unique to that VM.
+
+The hypervisor can determine the SEV support through the CPUID
+instruction. The CPUID function 0x8000001f reports information related
+to SEV::
+
+	0x8000001f[eax]:
+			Bit[1] 	indicates support for SEV
+	    ...
+		  [ecx]:
+			Bits[31:0]  Number of encrypted guests supported simultaneously
+
+If support for SEV is present, MSR 0xc001_0010 (MSR_K8_SYSCFG) and MSR 0xc001_0015
+(MSR_K7_HWCR) can be used to determine if it can be enabled::
+
+	0xc001_0010:
+		Bit[23]	   1 = memory encryption can be enabled
+			   0 = memory encryption can not be enabled
+
+	0xc001_0015:
+		Bit[0]	   1 = memory encryption can be enabled
+			   0 = memory encryption can not be enabled
+
+When SEV support is available, it can be enabled in a specific VM by
+setting the SEV bit before executing VMRUN.::
+
+	VMCB[0x90]:
+		Bit[1]	    1 = SEV is enabled
+			    0 = SEV is disabled
+
+SEV hardware uses ASIDs to associate a memory encryption key with a VM.
+Hence, the ASID for the SEV-enabled guests must be from 1 to a maximum value
+defined in the CPUID 0x8000001f[ecx] field.

From 18c71ce9c8822d48d2b4c50242051535d46082ac Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 4 Dec 2017 10:57:23 -0600
Subject: [PATCH 007/676] x86/CPU/AMD: Add the Secure Encrypted Virtualization
 CPU feature
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update the CPU features to include identifying and reporting on the
Secure Encrypted Virtualization (SEV) feature.  SEV is identified by
CPUID 0x8000001f, but requires BIOS support to enable it (set bit 23 of
MSR_K8_SYSCFG and set bit 0 of MSR_K7_HWCR).  Only show the SEV feature
as available if reported by CPUID and enabled by BIOS.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: kvm@vger.kernel.org
Cc: x86@kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/cpufeatures.h |  1 +
 arch/x86/include/asm/msr-index.h   |  2 +
 arch/x86/kernel/cpu/amd.c          | 66 +++++++++++++++++++++---------
 arch/x86/kernel/cpu/scattered.c    |  1 +
 4 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index c0b0e9e8aa66e..19b955adacff2 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -201,6 +201,7 @@
 #define X86_FEATURE_HW_PSTATE		( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK	( 7*32+ 9) /* AMD ProcFeedbackInterface */
 #define X86_FEATURE_SME			( 7*32+10) /* AMD Secure Memory Encryption */
+#define X86_FEATURE_SEV			( 7*32+11) /* AMD Secure Encrypted Virtualization */
 
 #define X86_FEATURE_INTEL_PPIN		( 7*32+14) /* Intel Processor Inventory Number */
 #define X86_FEATURE_INTEL_PT		( 7*32+15) /* Intel Processor Trace */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 34c4922bbc3fb..507d3e30f7fe8 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -382,6 +382,8 @@
 #define MSR_K7_PERFCTR3			0xc0010007
 #define MSR_K7_CLK_CTL			0xc001001b
 #define MSR_K7_HWCR			0xc0010015
+#define MSR_K7_HWCR_SMMLOCK_BIT		0
+#define MSR_K7_HWCR_SMMLOCK		BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT)
 #define MSR_K7_FID_VID_CTL		0xc0010041
 #define MSR_K7_FID_VID_STATUS		0xc0010042
 
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index d58184b7cd443..c1234aa0550ce 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -556,6 +556,51 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 	}
 }
 
+static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
+{
+	u64 msr;
+
+	/*
+	 * BIOS support is required for SME and SEV.
+	 *   For SME: If BIOS has enabled SME then adjust x86_phys_bits by
+	 *	      the SME physical address space reduction value.
+	 *	      If BIOS has not enabled SME then don't advertise the
+	 *	      SME feature (set in scattered.c).
+	 *   For SEV: If BIOS has not enabled SEV then don't advertise the
+	 *            SEV feature (set in scattered.c).
+	 *
+	 *   In all cases, since support for SME and SEV requires long mode,
+	 *   don't advertise the feature under CONFIG_X86_32.
+	 */
+	if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) {
+		/* Check if memory encryption is enabled */
+		rdmsrl(MSR_K8_SYSCFG, msr);
+		if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+			goto clear_all;
+
+		/*
+		 * Always adjust physical address bits. Even though this
+		 * will be a value above 32-bits this is still done for
+		 * CONFIG_X86_32 so that accurate values are reported.
+		 */
+		c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;
+
+		if (IS_ENABLED(CONFIG_X86_32))
+			goto clear_all;
+
+		rdmsrl(MSR_K7_HWCR, msr);
+		if (!(msr & MSR_K7_HWCR_SMMLOCK))
+			goto clear_sev;
+
+		return;
+
+clear_all:
+		clear_cpu_cap(c, X86_FEATURE_SME);
+clear_sev:
+		clear_cpu_cap(c, X86_FEATURE_SEV);
+	}
+}
+
 static void early_init_amd(struct cpuinfo_x86 *c)
 {
 	u32 dummy;
@@ -627,26 +672,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)
 	if (cpu_has_amd_erratum(c, amd_erratum_400))
 		set_cpu_bug(c, X86_BUG_AMD_E400);
 
-	/*
-	 * BIOS support is required for SME. If BIOS has enabled SME then
-	 * adjust x86_phys_bits by the SME physical address space reduction
-	 * value. If BIOS has not enabled SME then don't advertise the
-	 * feature (set in scattered.c). Also, since the SME support requires
-	 * long mode, don't advertise the feature under CONFIG_X86_32.
-	 */
-	if (cpu_has(c, X86_FEATURE_SME)) {
-		u64 msr;
-
-		/* Check if SME is enabled */
-		rdmsrl(MSR_K8_SYSCFG, msr);
-		if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) {
-			c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;
-			if (IS_ENABLED(CONFIG_X86_32))
-				clear_cpu_cap(c, X86_FEATURE_SME);
-		} else {
-			clear_cpu_cap(c, X86_FEATURE_SME);
-		}
-	}
+	early_detect_mem_encrypt(c);
 }
 
 static void init_amd_k8(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 05459ad3db46e..63a78d5fe505b 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -32,6 +32,7 @@ static const struct cpuid_bit cpuid_bits[] = {
 	{ X86_FEATURE_CPB,		CPUID_EDX,  9, 0x80000007, 0 },
 	{ X86_FEATURE_PROC_FEEDBACK,    CPUID_EDX, 11, 0x80000007, 0 },
 	{ X86_FEATURE_SME,		CPUID_EAX,  0, 0x8000001f, 0 },
+	{ X86_FEATURE_SEV,		CPUID_EAX,  1, 0x8000001f, 0 },
 	{ 0, 0, 0, 0, 0 }
 };
 

From cea3a19b007a690a40fac373f22bf34ea69761a2 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 4 Dec 2017 10:57:24 -0600
Subject: [PATCH 008/676] kvm: svm: prepare for new bit definition in
 nested_ctl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently the nested_ctl variable in the vmcb_control_area structure is
used to indicate nested paging support. The nested paging support field
is actually defined as bit 0 of the field. In order to support a new
feature flag the usage of the nested_ctl and nested paging support must
be converted to operate on a single bit.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/svm.h | 2 ++
 arch/x86/kvm/svm.c         | 7 ++++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 78dd9df881577..c936c9824405c 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -146,6 +146,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 #define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
 #define SVM_VM_CR_SVM_DIS_MASK  0x0010ULL
 
+#define SVM_NESTED_CTL_NP_ENABLE	BIT(0)
+
 struct __attribute__ ((__packed__)) vmcb_seg {
 	u16 selector;
 	u16 attrib;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 59e13a79c2e3e..1c8158a6ee06b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1293,7 +1293,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 
 	if (npt_enabled) {
 		/* Setup VMCB for Nested Paging */
-		control->nested_ctl = 1;
+		control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
 		clr_intercept(svm, INTERCEPT_INVLPG);
 		clr_exception_intercept(svm, PF_VECTOR);
 		clr_cr_intercept(svm, INTERCEPT_CR3_READ);
@@ -2918,7 +2918,8 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
 	if (vmcb->control.asid == 0)
 		return false;
 
-	if (vmcb->control.nested_ctl && !npt_enabled)
+	if ((vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
+	    !npt_enabled)
 		return false;
 
 	return true;
@@ -2932,7 +2933,7 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 	else
 		svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
 
-	if (nested_vmcb->control.nested_ctl) {
+	if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) {
 		kvm_mmu_unload(&svm->vcpu);
 		svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
 		nested_svm_init_mmu_context(&svm->vcpu);

From ba7c3398dc7eedbbe9d19a17ce3ab98349079fc7 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Mon, 4 Dec 2017 10:57:24 -0600
Subject: [PATCH 009/676] kvm: svm: Add SEV feature definitions to KVM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Define the SEV enable bit for the VMCB control structure. The hypervisor
will use this bit to enable SEV in the guest.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/svm.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index c936c9824405c..0487ac0548704 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -147,6 +147,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 #define SVM_VM_CR_SVM_DIS_MASK  0x0010ULL
 
 #define SVM_NESTED_CTL_NP_ENABLE	BIT(0)
+#define SVM_NESTED_CTL_SEV_ENABLE	BIT(1)
 
 struct __attribute__ ((__packed__)) vmcb_seg {
 	u16 selector;

From 4faefff3241e02dd66bfb77bd7ca40f3aa08f62e Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:25 -0600
Subject: [PATCH 010/676] KVM: SVM: Prepare to reserve asid for SEV guest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, ASID allocation start at 1. Add a svm_vcpu_data.min_asid
which allows supplying a dynamic start ASID.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kvm/svm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1c8158a6ee06b..e403cf1f6ba3d 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -525,6 +525,7 @@ struct svm_cpu_data {
 	u64 asid_generation;
 	u32 max_asid;
 	u32 next_asid;
+	u32 min_asid;
 	struct kvm_ldttss_desc *tss_desc;
 
 	struct page *save_area;
@@ -782,6 +783,7 @@ static int svm_hardware_enable(void)
 	sd->asid_generation = 1;
 	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
 	sd->next_asid = sd->max_asid + 1;
+	sd->min_asid = 1;
 
 	gdt = get_current_gdt_rw();
 	sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
@@ -2088,7 +2090,7 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
 {
 	if (sd->next_asid > sd->max_asid) {
 		++sd->asid_generation;
-		sd->next_asid = 1;
+		sd->next_asid = sd->min_asid;
 		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
 	}
 

From 8765d75329a386dd7742f94a1ea5fdcdea8d93d0 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:25 -0600
Subject: [PATCH 011/676] KVM: X86: Extend CPUID range to include new leaf
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This CPUID leaf provides the memory encryption support information on
AMD Platform. Its complete description is available in APM volume 2,
Section 15.34

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
---
 arch/x86/kvm/cpuid.c | 2 +-
 arch/x86/kvm/svm.c   | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0099e10eb0452..c6473ca825cdd 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -604,7 +604,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		entry->edx = 0;
 		break;
 	case 0x80000000:
-		entry->eax = min(entry->eax, 0x8000001a);
+		entry->eax = min(entry->eax, 0x8000001f);
 		break;
 	case 0x80000001:
 		entry->edx &= kvm_cpuid_8000_0001_edx_x86_features;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e403cf1f6ba3d..7e302a25bc455 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5170,6 +5170,12 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 			entry->edx |= SVM_FEATURE_NPT;
 
 		break;
+	case 0x8000001F:
+		/* Support memory encryption cpuid if host supports it */
+		if (boot_cpu_has(X86_FEATURE_SEV))
+			cpuid(0x8000001f, &entry->eax, &entry->ebx,
+				&entry->ecx, &entry->edx);
+
 	}
 }
 

From 5acc5c063196b4a531a761a954023c1848ec832b Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:26 -0600
Subject: [PATCH 012/676] KVM: Introduce KVM_MEMORY_ENCRYPT_OP ioctl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If the hardware supports memory encryption then the
KVM_MEMORY_ENCRYPT_OP ioctl can be used by qemu to issue a platform
specific memory encryption commands.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 Documentation/virtual/kvm/api.txt | 16 ++++++++++++++++
 arch/x86/include/asm/kvm_host.h   |  2 ++
 arch/x86/kvm/x86.c                |  6 ++++++
 include/uapi/linux/kvm.h          |  2 ++
 4 files changed, 26 insertions(+)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index f670e4b9e7f33..c8755be35543a 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3394,6 +3394,22 @@ invalid, if invalid pages are written to (e.g. after the end of memory)
 or if no page table is present for the addresses (e.g. when using
 hugepages).
 
+4.109 KVM_MEMORY_ENCRYPT_OP
+
+Capability: basic
+Architectures: x86
+Type: system
+Parameters: an opaque platform specific structure (in/out)
+Returns: 0 on success; -1 on error
+
+If the platform supports creating encrypted VMs then this ioctl can be used
+for issuing platform-specific memory encryption commands to manage those
+encrypted VMs.
+
+Currently, this ioctl is used for issuing Secure Encrypted Virtualization
+(SEV) commands on AMD Processors. The SEV commands are defined in
+Documentation/virtual/kvm/amd-memory-encryption.txt.
+
 5. The kvm_run structure
 ------------------------
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1bfb99770c341..c87e214d55dfd 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1066,6 +1066,8 @@ struct kvm_x86_ops {
 	int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
 	int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase);
 	int (*enable_smi_window)(struct kvm_vcpu *vcpu);
+
+	int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
 };
 
 struct kvm_arch_async_pf {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 34c85aa2e2d1d..7bbed0c0ba799 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4281,6 +4281,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
 		break;
 	}
+	case KVM_MEMORY_ENCRYPT_OP: {
+		r = -ENOTTY;
+		if (kvm_x86_ops->mem_enc_op)
+			r = kvm_x86_ops->mem_enc_op(kvm, argp);
+		break;
+	}
 	default:
 		r = -ENOTTY;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 282d7613fce87..addd0cf4445fa 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1358,6 +1358,8 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_S390_CMMA_MIGRATION */
 #define KVM_S390_GET_CMMA_BITS      _IOWR(KVMIO, 0xb8, struct kvm_s390_cmma_log)
 #define KVM_S390_SET_CMMA_BITS      _IOW(KVMIO, 0xb9, struct kvm_s390_cmma_log)
+/* Memory Encryption Commands */
+#define KVM_MEMORY_ENCRYPT_OP      _IOWR(KVMIO, 0xba, unsigned long)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)

From 69eaedee411c1fc1cf123520897a96b7cf04d8a0 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:26 -0600
Subject: [PATCH 013/676] KVM: Introduce KVM_MEMORY_ENCRYPT_{UN,}REG_REGION
 ioctl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If hardware supports memory encryption then KVM_MEMORY_ENCRYPT_REG_REGION
and KVM_MEMORY_ENCRYPT_UNREG_REGION ioctl's can be used by userspace to
register/unregister the guest memory regions which may contain the encrypted
data (e.g guest RAM, PCI BAR, SMRAM etc).

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Improvements-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 Documentation/virtual/kvm/api.txt | 34 +++++++++++++++++++++++++++++++
 arch/x86/include/asm/kvm_host.h   |  2 ++
 arch/x86/kvm/x86.c                | 24 ++++++++++++++++++++++
 include/uapi/linux/kvm.h          |  8 ++++++++
 4 files changed, 68 insertions(+)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index c8755be35543a..c2ced6a44bbb4 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3410,6 +3410,40 @@ Currently, this ioctl is used for issuing Secure Encrypted Virtualization
 (SEV) commands on AMD Processors. The SEV commands are defined in
 Documentation/virtual/kvm/amd-memory-encryption.txt.
 
+4.110 KVM_MEMORY_ENCRYPT_REG_REGION
+
+Capability: basic
+Architectures: x86
+Type: system
+Parameters: struct kvm_enc_region (in)
+Returns: 0 on success; -1 on error
+
+This ioctl can be used to register a guest memory region which may
+contain encrypted data (e.g. guest RAM, SMRAM etc).
+
+It is used in the SEV-enabled guest. When encryption is enabled, a guest
+memory region may contain encrypted data. The SEV memory encryption
+engine uses a tweak such that two identical plaintext pages, each at
+different locations will have differing ciphertexts. So swapping or
+moving ciphertext of those pages will not result in plaintext being
+swapped. So relocating (or migrating) physical backing pages for the SEV
+guest will require some additional steps.
+
+Note: The current SEV key management spec does not provide commands to
+swap or migrate (move) ciphertext pages. Hence, for now we pin the guest
+memory region registered with the ioctl.
+
+4.111 KVM_MEMORY_ENCRYPT_UNREG_REGION
+
+Capability: basic
+Architectures: x86
+Type: system
+Parameters: struct kvm_enc_region (in)
+Returns: 0 on success; -1 on error
+
+This ioctl can be used to unregister the guest memory region registered
+with KVM_MEMORY_ENCRYPT_REG_REGION ioctl above.
+
 5. The kvm_run structure
 ------------------------
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c87e214d55dfd..58b7cc30466b4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1068,6 +1068,8 @@ struct kvm_x86_ops {
 	int (*enable_smi_window)(struct kvm_vcpu *vcpu);
 
 	int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
+	int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+	int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
 };
 
 struct kvm_arch_async_pf {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7bbed0c0ba799..926f55cecf2e3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4287,6 +4287,30 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			r = kvm_x86_ops->mem_enc_op(kvm, argp);
 		break;
 	}
+	case KVM_MEMORY_ENCRYPT_REG_REGION: {
+		struct kvm_enc_region region;
+
+		r = -EFAULT;
+		if (copy_from_user(&region, argp, sizeof(region)))
+			goto out;
+
+		r = -ENOTTY;
+		if (kvm_x86_ops->mem_enc_reg_region)
+			r = kvm_x86_ops->mem_enc_reg_region(kvm, &region);
+		break;
+	}
+	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
+		struct kvm_enc_region region;
+
+		r = -EFAULT;
+		if (copy_from_user(&region, argp, sizeof(region)))
+			goto out;
+
+		r = -ENOTTY;
+		if (kvm_x86_ops->mem_enc_unreg_region)
+			r = kvm_x86_ops->mem_enc_unreg_region(kvm, &region);
+		break;
+	}
 	default:
 		r = -ENOTTY;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index addd0cf4445fa..c8c65190907d6 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1361,6 +1361,14 @@ struct kvm_s390_ucas_mapping {
 /* Memory Encryption Commands */
 #define KVM_MEMORY_ENCRYPT_OP      _IOWR(KVMIO, 0xba, unsigned long)
 
+struct kvm_enc_region {
+	__u64 addr;
+	__u64 size;
+};
+
+#define KVM_MEMORY_ENCRYPT_REG_REGION    _IOR(KVMIO, 0xbb, struct kvm_enc_region)
+#define KVM_MEMORY_ENCRYPT_UNREG_REGION  _IOR(KVMIO, 0xbc, struct kvm_enc_region)
+
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
 #define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)

From 016db9c5c39fc7f1c766daf2083211318ef93d38 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Mon, 4 Dec 2017 10:57:26 -0600
Subject: [PATCH 014/676] crypto: ccp: Build the AMD secure processor driver
 only with AMD CPU support

This is AMD-specific hardware so present it in Kconfig only when AMD
CPU support is enabled or on ARM64 where it is also used.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Gary R Hook <gary.hook@amd.com>
Cc: Brijesh Singh <brijesh.singh@amd.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Gary Hook <gary.hook@amd.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: linux-crypto@vger.kernel.org
---
 drivers/crypto/ccp/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/crypto/ccp/Kconfig b/drivers/crypto/ccp/Kconfig
index 6d626606b9c51..9c84f98389319 100644
--- a/drivers/crypto/ccp/Kconfig
+++ b/drivers/crypto/ccp/Kconfig
@@ -1,5 +1,6 @@
 config CRYPTO_DEV_CCP_DD
 	tristate "Secure Processor device driver"
+	depends on CPU_SUP_AMD || ARM64
 	default m
 	help
 	  Provides AMD Secure Processor device driver.

From 1d57b17c60ff245b8e50813bf0fd24143ecf26d4 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:27 -0600
Subject: [PATCH 015/676] crypto: ccp: Define SEV userspace ioctl and command
 id
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a include file which defines the ioctl and command id used for
issuing SEV platform management specific commands.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Gary Hook <gary.hook@amd.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: linux-crypto@vger.kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Improvements-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
Acked-by: Gary R Hook <gary.hook@amd.com>
---
 include/uapi/linux/psp-sev.h | 142 +++++++++++++++++++++++++++++++++++
 1 file changed, 142 insertions(+)
 create mode 100644 include/uapi/linux/psp-sev.h

diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h
new file mode 100644
index 0000000000000..3d77fe91239a8
--- /dev/null
+++ b/include/uapi/linux/psp-sev.h
@@ -0,0 +1,142 @@
+/*
+ * Userspace interface for AMD Secure Encrypted Virtualization (SEV)
+ * platform management commands.
+ *
+ * Copyright (C) 2016-2017 Advanced Micro Devices, Inc.
+ *
+ * Author: Brijesh Singh <brijesh.singh@amd.com>
+ *
+ * SEV spec 0.14 is available at:
+ * http://support.amd.com/TechDocs/55766_SEV-KM%20API_Specification.pdf
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __PSP_SEV_USER_H__
+#define __PSP_SEV_USER_H__
+
+#include <linux/types.h>
+
+/**
+ * SEV platform commands
+ */
+enum {
+	SEV_FACTORY_RESET = 0,
+	SEV_PLATFORM_STATUS,
+	SEV_PEK_GEN,
+	SEV_PEK_CSR,
+	SEV_PDH_GEN,
+	SEV_PDH_CERT_EXPORT,
+	SEV_PEK_CERT_IMPORT,
+
+	SEV_MAX,
+};
+
+/**
+ * SEV Firmware status code
+ */
+typedef enum {
+	SEV_RET_SUCCESS = 0,
+	SEV_RET_INVALID_PLATFORM_STATE,
+	SEV_RET_INVALID_GUEST_STATE,
+	SEV_RET_INAVLID_CONFIG,
+	SEV_RET_INVALID_len,
+	SEV_RET_ALREADY_OWNED,
+	SEV_RET_INVALID_CERTIFICATE,
+	SEV_RET_POLICY_FAILURE,
+	SEV_RET_INACTIVE,
+	SEV_RET_INVALID_ADDRESS,
+	SEV_RET_BAD_SIGNATURE,
+	SEV_RET_BAD_MEASUREMENT,
+	SEV_RET_ASID_OWNED,
+	SEV_RET_INVALID_ASID,
+	SEV_RET_WBINVD_REQUIRED,
+	SEV_RET_DFFLUSH_REQUIRED,
+	SEV_RET_INVALID_GUEST,
+	SEV_RET_INVALID_COMMAND,
+	SEV_RET_ACTIVE,
+	SEV_RET_HWSEV_RET_PLATFORM,
+	SEV_RET_HWSEV_RET_UNSAFE,
+	SEV_RET_UNSUPPORTED,
+	SEV_RET_MAX,
+} sev_ret_code;
+
+/**
+ * struct sev_user_data_status - PLATFORM_STATUS command parameters
+ *
+ * @major: major API version
+ * @minor: minor API version
+ * @state: platform state
+ * @flags: platform config flags
+ * @build: firmware build id for API version
+ * @guest_count: number of active guests
+ */
+struct sev_user_data_status {
+	__u8 api_major;				/* Out */
+	__u8 api_minor;				/* Out */
+	__u8 state;				/* Out */
+	__u32 flags;				/* Out */
+	__u8 build;				/* Out */
+	__u32 guest_count;			/* Out */
+} __packed;
+
+/**
+ * struct sev_user_data_pek_csr - PEK_CSR command parameters
+ *
+ * @address: PEK certificate chain
+ * @length: length of certificate
+ */
+struct sev_user_data_pek_csr {
+	__u64 address;				/* In */
+	__u32 length;				/* In/Out */
+} __packed;
+
+/**
+ * struct sev_user_data_cert_import - PEK_CERT_IMPORT command parameters
+ *
+ * @pek_address: PEK certificate chain
+ * @pek_len: length of PEK certificate
+ * @oca_address: OCA certificate chain
+ * @oca_len: length of OCA certificate
+ */
+struct sev_user_data_pek_cert_import {
+	__u64 pek_cert_address;			/* In */
+	__u32 pek_cert_len;			/* In */
+	__u64 oca_cert_address;			/* In */
+	__u32 oca_cert_len;			/* In */
+} __packed;
+
+/**
+ * struct sev_user_data_pdh_cert_export - PDH_CERT_EXPORT command parameters
+ *
+ * @pdh_address: PDH certificate address
+ * @pdh_len: length of PDH certificate
+ * @cert_chain_address: PDH certificate chain
+ * @cert_chain_len: length of PDH certificate chain
+ */
+struct sev_user_data_pdh_cert_export {
+	__u64 pdh_cert_address;			/* In */
+	__u32 pdh_cert_len;			/* In/Out */
+	__u64 cert_chain_address;		/* In */
+	__u32 cert_chain_len;			/* In/Out */
+} __packed;
+
+/**
+ * struct sev_issue_cmd - SEV ioctl parameters
+ *
+ * @cmd: SEV commands to execute
+ * @opaque: pointer to the command structure
+ * @error: SEV FW return code on failure
+ */
+struct sev_issue_cmd {
+	__u32 cmd;				/* In */
+	__u64 data;				/* In */
+	__u32 error;				/* Out */
+} __packed;
+
+#define SEV_IOC_TYPE		'S'
+#define SEV_ISSUE_CMD	_IOWR(SEV_IOC_TYPE, 0x0, struct sev_issue_cmd)
+
+#endif /* __PSP_USER_SEV_H */

From 592d5e74ddb28f66c5f4acffcd36156b1621a7c4 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:27 -0600
Subject: [PATCH 016/676] crypto: ccp: Define SEV key management command id
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Define Secure Encrypted Virtualization (SEV) key management command id
and structure. The command definition is available in SEV KM spec
0.14 (http://support.amd.com/TechDocs/55766_SEV-KM API_Specification.pdf)

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Gary Hook <gary.hook@amd.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: linux-crypto@vger.kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Improvements-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
Acked-by: Gary R Hook <gary.hook@amd.com>
---
 include/linux/psp-sev.h | 465 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 465 insertions(+)
 create mode 100644 include/linux/psp-sev.h

diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
new file mode 100644
index 0000000000000..4a150d17d537e
--- /dev/null
+++ b/include/linux/psp-sev.h
@@ -0,0 +1,465 @@
+/*
+ * AMD Secure Encrypted Virtualization (SEV) driver interface
+ *
+ * Copyright (C) 2016-2017 Advanced Micro Devices, Inc.
+ *
+ * Author: Brijesh Singh <brijesh.singh@amd.com>
+ *
+ * SEV spec 0.14 is available at:
+ * http://support.amd.com/TechDocs/55766_SEV-KM API_Specification.pdf
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __PSP_SEV_H__
+#define __PSP_SEV_H__
+
+#include <uapi/linux/psp-sev.h>
+
+#ifdef CONFIG_X86
+#include <linux/mem_encrypt.h>
+
+#define __psp_pa(x)	__sme_pa(x)
+#else
+#define __psp_pa(x)	__pa(x)
+#endif
+
+#define SEV_FW_BLOB_MAX_SIZE	0x4000	/* 16KB */
+
+/**
+ * SEV platform state
+ */
+enum sev_state {
+	SEV_STATE_UNINIT		= 0x0,
+	SEV_STATE_INIT			= 0x1,
+	SEV_STATE_WORKING		= 0x2,
+
+	SEV_STATE_MAX
+};
+
+/**
+ * SEV platform and guest management commands
+ */
+enum sev_cmd {
+	/* platform commands */
+	SEV_CMD_INIT			= 0x001,
+	SEV_CMD_SHUTDOWN		= 0x002,
+	SEV_CMD_FACTORY_RESET		= 0x003,
+	SEV_CMD_PLATFORM_STATUS		= 0x004,
+	SEV_CMD_PEK_GEN			= 0x005,
+	SEV_CMD_PEK_CSR			= 0x006,
+	SEV_CMD_PEK_CERT_IMPORT		= 0x007,
+	SEV_CMD_PDH_CERT_EXPORT		= 0x008,
+	SEV_CMD_PDH_GEN			= 0x009,
+	SEV_CMD_DF_FLUSH		= 0x00A,
+
+	/* Guest commands */
+	SEV_CMD_DECOMMISSION		= 0x020,
+	SEV_CMD_ACTIVATE		= 0x021,
+	SEV_CMD_DEACTIVATE		= 0x022,
+	SEV_CMD_GUEST_STATUS		= 0x023,
+
+	/* Guest launch commands */
+	SEV_CMD_LAUNCH_START		= 0x030,
+	SEV_CMD_LAUNCH_UPDATE_DATA	= 0x031,
+	SEV_CMD_LAUNCH_UPDATE_VMSA	= 0x032,
+	SEV_CMD_LAUNCH_MEASURE		= 0x033,
+	SEV_CMD_LAUNCH_UPDATE_SECRET	= 0x034,
+	SEV_CMD_LAUNCH_FINISH		= 0x035,
+
+	/* Guest migration commands (outgoing) */
+	SEV_CMD_SEND_START		= 0x040,
+	SEV_CMD_SEND_UPDATE_DATA	= 0x041,
+	SEV_CMD_SEND_UPDATE_VMSA	= 0x042,
+	SEV_CMD_SEND_FINISH		= 0x043,
+
+	/* Guest migration commands (incoming) */
+	SEV_CMD_RECEIVE_START		= 0x050,
+	SEV_CMD_RECEIVE_UPDATE_DATA	= 0x051,
+	SEV_CMD_RECEIVE_UPDATE_VMSA	= 0x052,
+	SEV_CMD_RECEIVE_FINISH		= 0x053,
+
+	/* Guest debug commands */
+	SEV_CMD_DBG_DECRYPT		= 0x060,
+	SEV_CMD_DBG_ENCRYPT		= 0x061,
+
+	SEV_CMD_MAX,
+};
+
+/**
+ * struct sev_data_init - INIT command parameters
+ *
+ * @flags: processing flags
+ * @tmr_address: system physical address used for SEV-ES
+ * @tmr_len: len of tmr_address
+ */
+struct sev_data_init {
+	u32 flags;			/* In */
+	u32 reserved;			/* In */
+	u64 tmr_address;		/* In */
+	u32 tmr_len;			/* In */
+} __packed;
+
+/**
+ * struct sev_data_pek_csr - PEK_CSR command parameters
+ *
+ * @address: PEK certificate chain
+ * @len: len of certificate
+ */
+struct sev_data_pek_csr {
+	u64 address;				/* In */
+	u32 len;				/* In/Out */
+} __packed;
+
+/**
+ * struct sev_data_cert_import - PEK_CERT_IMPORT command parameters
+ *
+ * @pek_address: PEK certificate chain
+ * @pek_len: len of PEK certificate
+ * @oca_address: OCA certificate chain
+ * @oca_len: len of OCA certificate
+ */
+struct sev_data_pek_cert_import {
+	u64 pek_cert_address;			/* In */
+	u32 pek_cert_len;			/* In */
+	u32 reserved;				/* In */
+	u64 oca_cert_address;			/* In */
+	u32 oca_cert_len;			/* In */
+} __packed;
+
+/**
+ * struct sev_data_pdh_cert_export - PDH_CERT_EXPORT command parameters
+ *
+ * @pdh_address: PDH certificate address
+ * @pdh_len: len of PDH certificate
+ * @cert_chain_address: PDH certificate chain
+ * @cert_chain_len: len of PDH certificate chain
+ */
+struct sev_data_pdh_cert_export {
+	u64 pdh_cert_address;			/* In */
+	u32 pdh_cert_len;			/* In/Out */
+	u32 reserved;				/* In */
+	u64 cert_chain_address;			/* In */
+	u32 cert_chain_len;			/* In/Out */
+} __packed;
+
+/**
+ * struct sev_data_decommission - DECOMMISSION command parameters
+ *
+ * @handle: handle of the VM to decommission
+ */
+struct sev_data_decommission {
+	u32 handle;				/* In */
+} __packed;
+
+/**
+ * struct sev_data_activate - ACTIVATE command parameters
+ *
+ * @handle: handle of the VM to activate
+ * @asid: asid assigned to the VM
+ */
+struct sev_data_activate {
+	u32 handle;				/* In */
+	u32 asid;				/* In */
+} __packed;
+
+/**
+ * struct sev_data_deactivate - DEACTIVATE command parameters
+ *
+ * @handle: handle of the VM to deactivate
+ */
+struct sev_data_deactivate {
+	u32 handle;				/* In */
+} __packed;
+
+/**
+ * struct sev_data_guest_status - SEV GUEST_STATUS command parameters
+ *
+ * @handle: handle of the VM to retrieve status
+ * @policy: policy information for the VM
+ * @asid: current ASID of the VM
+ * @state: current state of the VM
+ */
+struct sev_data_guest_status {
+	u32 handle;				/* In */
+	u32 policy;				/* Out */
+	u32 asid;				/* Out */
+	u8 state;				/* Out */
+} __packed;
+
+/**
+ * struct sev_data_launch_start - LAUNCH_START command parameters
+ *
+ * @handle: handle assigned to the VM
+ * @policy: guest launch policy
+ * @dh_cert_address: physical address of DH certificate blob
+ * @dh_cert_len: len of DH certificate blob
+ * @session_address: physical address of session parameters
+ * @session_len: len of session parameters
+ */
+struct sev_data_launch_start {
+	u32 handle;				/* In/Out */
+	u32 policy;				/* In */
+	u64 dh_cert_address;			/* In */
+	u32 dh_cert_len;			/* In */
+	u32 reserved;				/* In */
+	u64 session_address;			/* In */
+	u32 session_len;			/* In */
+} __packed;
+
+/**
+ * struct sev_data_launch_update_data - LAUNCH_UPDATE_DATA command parameter
+ *
+ * @handle: handle of the VM to update
+ * @len: len of memory to be encrypted
+ * @address: physical address of memory region to encrypt
+ */
+struct sev_data_launch_update_data {
+	u32 handle;				/* In */
+	u32 reserved;
+	u64 address;				/* In */
+	u32 len;				/* In */
+} __packed;
+
+/**
+ * struct sev_data_launch_update_vmsa - LAUNCH_UPDATE_VMSA command
+ *
+ * @handle: handle of the VM
+ * @address: physical address of memory region to encrypt
+ * @len: len of memory region to encrypt
+ */
+struct sev_data_launch_update_vmsa {
+	u32 handle;				/* In */
+	u32 reserved;
+	u64 address;				/* In */
+	u32 len;				/* In */
+} __packed;
+
+/**
+ * struct sev_data_launch_measure - LAUNCH_MEASURE command parameters
+ *
+ * @handle: handle of the VM to process
+ * @address: physical address containing the measurement blob
+ * @len: len of measurement blob
+ */
+struct sev_data_launch_measure {
+	u32 handle;				/* In */
+	u32 reserved;
+	u64 address;				/* In */
+	u32 len;				/* In/Out */
+} __packed;
+
+/**
+ * struct sev_data_launch_secret - LAUNCH_SECRET command parameters
+ *
+ * @handle: handle of the VM to process
+ * @hdr_address: physical address containing the packet header
+ * @hdr_len: len of packet header
+ * @guest_address: system physical address of guest memory region
+ * @guest_len: len of guest_paddr
+ * @trans_address: physical address of transport memory buffer
+ * @trans_len: len of transport memory buffer
+ */
+struct sev_data_launch_secret {
+	u32 handle;				/* In */
+	u32 reserved1;
+	u64 hdr_address;			/* In */
+	u32 hdr_len;				/* In */
+	u32 reserved2;
+	u64 guest_address;			/* In */
+	u32 guest_len;				/* In */
+	u32 reserved3;
+	u64 trans_address;			/* In */
+	u32 trans_len;				/* In */
+} __packed;
+
+/**
+ * struct sev_data_launch_finish - LAUNCH_FINISH command parameters
+ *
+ * @handle: handle of the VM to process
+ */
+struct sev_data_launch_finish {
+	u32 handle;				/* In */
+} __packed;
+
+/**
+ * struct sev_data_send_start - SEND_START command parameters
+ *
+ * @handle: handle of the VM to process
+ * @policy: policy information for the VM
+ * @pdh_cert_address: physical address containing PDH certificate
+ * @pdh_cert_len: len of PDH certificate
+ * @plat_certs_address: physical address containing platform certificate
+ * @plat_certs_len: len of platform certificate
+ * @amd_certs_address: physical address containing AMD certificate
+ * @amd_certs_len: len of AMD certificate
+ * @session_address: physical address containing Session data
+ * @session_len: len of session data
+ */
+struct sev_data_send_start {
+	u32 handle;				/* In */
+	u32 policy;				/* Out */
+	u64 pdh_cert_address;			/* In */
+	u32 pdh_cert_len;			/* In */
+	u32 reserved1;
+	u64 plat_cert_address;			/* In */
+	u32 plat_cert_len;			/* In */
+	u32 reserved2;
+	u64 amd_cert_address;			/* In */
+	u32 amd_cert_len;			/* In */
+	u32 reserved3;
+	u64 session_address;			/* In */
+	u32 session_len;			/* In/Out */
+} __packed;
+
+/**
+ * struct sev_data_send_update - SEND_UPDATE_DATA command
+ *
+ * @handle: handle of the VM to process
+ * @hdr_address: physical address containing packet header
+ * @hdr_len: len of packet header
+ * @guest_address: physical address of guest memory region to send
+ * @guest_len: len of guest memory region to send
+ * @trans_address: physical address of host memory region
+ * @trans_len: len of host memory region
+ */
+struct sev_data_send_update_data {
+	u32 handle;				/* In */
+	u32 reserved1;
+	u64 hdr_address;			/* In */
+	u32 hdr_len;				/* In/Out */
+	u32 reserved2;
+	u64 guest_address;			/* In */
+	u32 guest_len;				/* In */
+	u32 reserved3;
+	u64 trans_address;			/* In */
+	u32 trans_len;				/* In */
+} __packed;
+
+/**
+ * struct sev_data_send_update - SEND_UPDATE_VMSA command
+ *
+ * @handle: handle of the VM to process
+ * @hdr_address: physical address containing packet header
+ * @hdr_len: len of packet header
+ * @guest_address: physical address of guest memory region to send
+ * @guest_len: len of guest memory region to send
+ * @trans_address: physical address of host memory region
+ * @trans_len: len of host memory region
+ */
+struct sev_data_send_update_vmsa {
+	u32 handle;				/* In */
+	u64 hdr_address;			/* In */
+	u32 hdr_len;				/* In/Out */
+	u32 reserved2;
+	u64 guest_address;			/* In */
+	u32 guest_len;				/* In */
+	u32 reserved3;
+	u64 trans_address;			/* In */
+	u32 trans_len;				/* In */
+} __packed;
+
+/**
+ * struct sev_data_send_finish - SEND_FINISH command parameters
+ *
+ * @handle: handle of the VM to process
+ */
+struct sev_data_send_finish {
+	u32 handle;				/* In */
+} __packed;
+
+/**
+ * struct sev_data_receive_start - RECEIVE_START command parameters
+ *
+ * @handle: handle of the VM to perform receive operation
+ * @pdh_cert_address: system physical address containing PDH certificate blob
+ * @pdh_cert_len: len of PDH certificate blob
+ * @session_address: system physical address containing session blob
+ * @session_len: len of session blob
+ */
+struct sev_data_receive_start {
+	u32 handle;				/* In/Out */
+	u32 policy;				/* In */
+	u64 pdh_cert_address;			/* In */
+	u32 pdh_cert_len;			/* In */
+	u32 reserved1;
+	u64 session_address;			/* In */
+	u32 session_len;			/* In */
+} __packed;
+
+/**
+ * struct sev_data_receive_update_data - RECEIVE_UPDATE_DATA command parameters
+ *
+ * @handle: handle of the VM to update
+ * @hdr_address: physical address containing packet header blob
+ * @hdr_len: len of packet header
+ * @guest_address: system physical address of guest memory region
+ * @guest_len: len of guest memory region
+ * @trans_address: system physical address of transport buffer
+ * @trans_len: len of transport buffer
+ */
+struct sev_data_receive_update_data {
+	u32 handle;				/* In */
+	u32 reserved1;
+	u64 hdr_address;			/* In */
+	u32 hdr_len;				/* In */
+	u32 reserved2;
+	u64 guest_address;			/* In */
+	u32 guest_len;				/* In */
+	u32 reserved3;
+	u64 trans_address;			/* In */
+	u32 trans_len;				/* In */
+} __packed;
+
+/**
+ * struct sev_data_receive_update_vmsa - RECEIVE_UPDATE_VMSA command parameters
+ *
+ * @handle: handle of the VM to update
+ * @hdr_address: physical address containing packet header blob
+ * @hdr_len: len of packet header
+ * @guest_address: system physical address of guest memory region
+ * @guest_len: len of guest memory region
+ * @trans_address: system physical address of transport buffer
+ * @trans_len: len of transport buffer
+ */
+struct sev_data_receive_update_vmsa {
+	u32 handle;				/* In */
+	u32 reserved1;
+	u64 hdr_address;			/* In */
+	u32 hdr_len;				/* In */
+	u32 reserved2;
+	u64 guest_address;			/* In */
+	u32 guest_len;				/* In */
+	u32 reserved3;
+	u64 trans_address;			/* In */
+	u32 trans_len;				/* In */
+} __packed;
+
+/**
+ * struct sev_data_receive_finish - RECEIVE_FINISH command parameters
+ *
+ * @handle: handle of the VM to finish
+ */
+struct sev_data_receive_finish {
+	u32 handle;				/* In */
+} __packed;
+
+/**
+ * struct sev_data_dbg - DBG_ENCRYPT/DBG_DECRYPT command parameters
+ *
+ * @handle: handle of the VM to perform debug operation
+ * @src_addr: source address of data to operate on
+ * @dst_addr: destination address of data to operate on
+ * @len: len of data to operate on
+ */
+struct sev_data_dbg {
+	u32 handle;				/* In */
+	u32 reserved;
+	u64 src_addr;				/* In */
+	u64 dst_addr;				/* In */
+	u32 len;				/* In */
+} __packed;
+
+#endif	/* __PSP_SEV_H__ */

From 2a6170dfe755b167ca8d6bba2e73695f08b37c54 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:28 -0600
Subject: [PATCH 017/676] crypto: ccp: Add Platform Security Processor (PSP)
 device support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Platform Security Processor (PSP) is part of the AMD Secure
Processor (AMD-SP) functionality. The PSP is a dedicated processor
that provides support for key management commands in Secure Encrypted
Virtualization (SEV) mode, along with software-based Trusted Execution
Environment (TEE) to enable third-party trusted applications.

Note that the key management functionality provided by the SEV firmware
can be used outside of the kvm-amd driver hence it doesn't need to
depend on CONFIG_KVM_AMD.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Gary Hook <gary.hook@amd.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: linux-crypto@vger.kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Improvements-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 drivers/crypto/ccp/Kconfig   |  11 ++++
 drivers/crypto/ccp/Makefile  |   1 +
 drivers/crypto/ccp/psp-dev.c | 105 +++++++++++++++++++++++++++++++++++
 drivers/crypto/ccp/psp-dev.h |  59 ++++++++++++++++++++
 drivers/crypto/ccp/sp-dev.c  |  26 +++++++++
 drivers/crypto/ccp/sp-dev.h  |  24 +++++++-
 drivers/crypto/ccp/sp-pci.c  |  52 +++++++++++++++++
 7 files changed, 277 insertions(+), 1 deletion(-)
 create mode 100644 drivers/crypto/ccp/psp-dev.c
 create mode 100644 drivers/crypto/ccp/psp-dev.h

diff --git a/drivers/crypto/ccp/Kconfig b/drivers/crypto/ccp/Kconfig
index 9c84f98389319..b9dfae47aefd8 100644
--- a/drivers/crypto/ccp/Kconfig
+++ b/drivers/crypto/ccp/Kconfig
@@ -33,3 +33,14 @@ config CRYPTO_DEV_CCP_CRYPTO
 	  Support for using the cryptographic API with the AMD Cryptographic
 	  Coprocessor. This module supports offload of SHA and AES algorithms.
 	  If you choose 'M' here, this module will be called ccp_crypto.
+
+config CRYPTO_DEV_SP_PSP
+	bool "Platform Security Processor (PSP) device"
+	default y
+	depends on CRYPTO_DEV_CCP_DD && X86_64
+	help
+	 Provide support for the AMD Platform Security Processor (PSP).
+	 The PSP is a dedicated processor that provides support for key
+	 management commands in Secure Encrypted Virtualization (SEV) mode,
+	 along with software-based Trusted Execution Environment (TEE) to
+	 enable third-party trusted applications.
diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile
index c4ce726b931e3..51d1c0cf66c73 100644
--- a/drivers/crypto/ccp/Makefile
+++ b/drivers/crypto/ccp/Makefile
@@ -8,6 +8,7 @@ ccp-$(CONFIG_CRYPTO_DEV_SP_CCP) += ccp-dev.o \
 	    ccp-dmaengine.o \
 	    ccp-debugfs.o
 ccp-$(CONFIG_PCI) += sp-pci.o
+ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += psp-dev.o
 
 obj-$(CONFIG_CRYPTO_DEV_CCP_CRYPTO) += ccp-crypto.o
 ccp-crypto-objs := ccp-crypto-main.o \
diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c
new file mode 100644
index 0000000000000..b5789f8785603
--- /dev/null
+++ b/drivers/crypto/ccp/psp-dev.c
@@ -0,0 +1,105 @@
+/*
+ * AMD Platform Security Processor (PSP) interface
+ *
+ * Copyright (C) 2016-2017 Advanced Micro Devices, Inc.
+ *
+ * Author: Brijesh Singh <brijesh.singh@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/spinlock_types.h>
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/delay.h>
+#include <linux/hw_random.h>
+#include <linux/ccp.h>
+
+#include "sp-dev.h"
+#include "psp-dev.h"
+
+static struct psp_device *psp_alloc_struct(struct sp_device *sp)
+{
+	struct device *dev = sp->dev;
+	struct psp_device *psp;
+
+	psp = devm_kzalloc(dev, sizeof(*psp), GFP_KERNEL);
+	if (!psp)
+		return NULL;
+
+	psp->dev = dev;
+	psp->sp = sp;
+
+	snprintf(psp->name, sizeof(psp->name), "psp-%u", sp->ord);
+
+	return psp;
+}
+
+static irqreturn_t psp_irq_handler(int irq, void *data)
+{
+	return IRQ_HANDLED;
+}
+
+int psp_dev_init(struct sp_device *sp)
+{
+	struct device *dev = sp->dev;
+	struct psp_device *psp;
+	int ret;
+
+	ret = -ENOMEM;
+	psp = psp_alloc_struct(sp);
+	if (!psp)
+		goto e_err;
+
+	sp->psp_data = psp;
+
+	psp->vdata = (struct psp_vdata *)sp->dev_vdata->psp_vdata;
+	if (!psp->vdata) {
+		ret = -ENODEV;
+		dev_err(dev, "missing driver data\n");
+		goto e_err;
+	}
+
+	psp->io_regs = sp->io_map + psp->vdata->offset;
+
+	/* Disable and clear interrupts until ready */
+	iowrite32(0, psp->io_regs + PSP_P2CMSG_INTEN);
+	iowrite32(-1, psp->io_regs + PSP_P2CMSG_INTSTS);
+
+	/* Request an irq */
+	ret = sp_request_psp_irq(psp->sp, psp_irq_handler, psp->name, psp);
+	if (ret) {
+		dev_err(dev, "psp: unable to allocate an IRQ\n");
+		goto e_err;
+	}
+
+	if (sp->set_psp_master_device)
+		sp->set_psp_master_device(sp);
+
+	/* Enable interrupt */
+	iowrite32(-1, psp->io_regs + PSP_P2CMSG_INTEN);
+
+	return 0;
+
+e_err:
+	sp->psp_data = NULL;
+
+	dev_notice(dev, "psp initialization failed\n");
+
+	return ret;
+}
+
+void psp_dev_destroy(struct sp_device *sp)
+{
+	struct psp_device *psp = sp->psp_data;
+
+	sp_free_psp_irq(sp, psp);
+}
diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h
new file mode 100644
index 0000000000000..55b7808367c3a
--- /dev/null
+++ b/drivers/crypto/ccp/psp-dev.h
@@ -0,0 +1,59 @@
+/*
+ * AMD Platform Security Processor (PSP) interface driver
+ *
+ * Copyright (C) 2017 Advanced Micro Devices, Inc.
+ *
+ * Author: Brijesh Singh <brijesh.singh@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __PSP_DEV_H__
+#define __PSP_DEV_H__
+
+#include <linux/device.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/wait.h>
+#include <linux/dmapool.h>
+#include <linux/hw_random.h>
+#include <linux/bitops.h>
+#include <linux/interrupt.h>
+#include <linux/irqreturn.h>
+#include <linux/dmaengine.h>
+
+#include "sp-dev.h"
+
+#define PSP_P2CMSG_INTEN		0x0110
+#define PSP_P2CMSG_INTSTS		0x0114
+
+#define PSP_C2PMSG_ATTR_0		0x0118
+#define PSP_C2PMSG_ATTR_1		0x011c
+#define PSP_C2PMSG_ATTR_2		0x0120
+#define PSP_C2PMSG_ATTR_3		0x0124
+#define PSP_P2CMSG_ATTR_0		0x0128
+
+#define PSP_CMDRESP_CMD_SHIFT		16
+#define PSP_CMDRESP_IOC			BIT(0)
+#define PSP_CMDRESP_RESP		BIT(31)
+#define PSP_CMDRESP_ERR_MASK		0xffff
+
+#define MAX_PSP_NAME_LEN		16
+
+struct psp_device {
+	struct list_head entry;
+
+	struct psp_vdata *vdata;
+	char name[MAX_PSP_NAME_LEN];
+
+	struct device *dev;
+	struct sp_device *sp;
+
+	void __iomem *io_regs;
+};
+
+#endif /* __PSP_DEV_H */
diff --git a/drivers/crypto/ccp/sp-dev.c b/drivers/crypto/ccp/sp-dev.c
index bef387c8abfd7..cf101c039c8ff 100644
--- a/drivers/crypto/ccp/sp-dev.c
+++ b/drivers/crypto/ccp/sp-dev.c
@@ -198,6 +198,8 @@ int sp_init(struct sp_device *sp)
 	if (sp->dev_vdata->ccp_vdata)
 		ccp_dev_init(sp);
 
+	if (sp->dev_vdata->psp_vdata)
+		psp_dev_init(sp);
 	return 0;
 }
 
@@ -206,6 +208,9 @@ void sp_destroy(struct sp_device *sp)
 	if (sp->dev_vdata->ccp_vdata)
 		ccp_dev_destroy(sp);
 
+	if (sp->dev_vdata->psp_vdata)
+		psp_dev_destroy(sp);
+
 	sp_del_device(sp);
 }
 
@@ -237,6 +242,27 @@ int sp_resume(struct sp_device *sp)
 }
 #endif
 
+struct sp_device *sp_get_psp_master_device(void)
+{
+	struct sp_device *i, *ret = NULL;
+	unsigned long flags;
+
+	write_lock_irqsave(&sp_unit_lock, flags);
+	if (list_empty(&sp_units))
+		goto unlock;
+
+	list_for_each_entry(i, &sp_units, entry) {
+		if (i->psp_data)
+			break;
+	}
+
+	if (i->get_psp_master_device)
+		ret = i->get_psp_master_device();
+unlock:
+	write_unlock_irqrestore(&sp_unit_lock, flags);
+	return ret;
+}
+
 static int __init sp_mod_init(void)
 {
 #ifdef CONFIG_X86
diff --git a/drivers/crypto/ccp/sp-dev.h b/drivers/crypto/ccp/sp-dev.h
index 5ab486ade1ad9..909cf3e436b42 100644
--- a/drivers/crypto/ccp/sp-dev.h
+++ b/drivers/crypto/ccp/sp-dev.h
@@ -42,12 +42,17 @@ struct ccp_vdata {
 	const unsigned int offset;
 	const unsigned int rsamax;
 };
+
+struct psp_vdata {
+	const unsigned int offset;
+};
+
 /* Structure to hold SP device data */
 struct sp_dev_vdata {
 	const unsigned int bar;
 
 	const struct ccp_vdata *ccp_vdata;
-	void *psp_vdata;
+	const struct psp_vdata *psp_vdata;
 };
 
 struct sp_device {
@@ -68,6 +73,10 @@ struct sp_device {
 	/* DMA caching attribute support */
 	unsigned int axcache;
 
+	/* get and set master device */
+	struct sp_device*(*get_psp_master_device)(void);
+	void (*set_psp_master_device)(struct sp_device *);
+
 	bool irq_registered;
 	bool use_tasklet;
 
@@ -103,6 +112,7 @@ void sp_free_ccp_irq(struct sp_device *sp, void *data);
 int sp_request_psp_irq(struct sp_device *sp, irq_handler_t handler,
 		       const char *name, void *data);
 void sp_free_psp_irq(struct sp_device *sp, void *data);
+struct sp_device *sp_get_psp_master_device(void);
 
 #ifdef CONFIG_CRYPTO_DEV_SP_CCP
 
@@ -130,4 +140,16 @@ static inline int ccp_dev_resume(struct sp_device *sp)
 }
 #endif	/* CONFIG_CRYPTO_DEV_SP_CCP */
 
+#ifdef CONFIG_CRYPTO_DEV_SP_PSP
+
+int psp_dev_init(struct sp_device *sp);
+void psp_dev_destroy(struct sp_device *sp);
+
+#else /* !CONFIG_CRYPTO_DEV_SP_PSP */
+
+static inline int psp_dev_init(struct sp_device *sp) { return 0; }
+static inline void psp_dev_destroy(struct sp_device *sp) { }
+
+#endif /* CONFIG_CRYPTO_DEV_SP_PSP */
+
 #endif
diff --git a/drivers/crypto/ccp/sp-pci.c b/drivers/crypto/ccp/sp-pci.c
index 9859aa683a283..f5f43c50698ac 100644
--- a/drivers/crypto/ccp/sp-pci.c
+++ b/drivers/crypto/ccp/sp-pci.c
@@ -25,6 +25,7 @@
 #include <linux/ccp.h>
 
 #include "ccp-dev.h"
+#include "psp-dev.h"
 
 #define MSIX_VECTORS			2
 
@@ -32,6 +33,7 @@ struct sp_pci {
 	int msix_count;
 	struct msix_entry msix_entry[MSIX_VECTORS];
 };
+static struct sp_device *sp_dev_master;
 
 static int sp_get_msix_irqs(struct sp_device *sp)
 {
@@ -108,6 +110,45 @@ static void sp_free_irqs(struct sp_device *sp)
 	sp->psp_irq = 0;
 }
 
+static bool sp_pci_is_master(struct sp_device *sp)
+{
+	struct device *dev_cur, *dev_new;
+	struct pci_dev *pdev_cur, *pdev_new;
+
+	dev_new = sp->dev;
+	dev_cur = sp_dev_master->dev;
+
+	pdev_new = to_pci_dev(dev_new);
+	pdev_cur = to_pci_dev(dev_cur);
+
+	if (pdev_new->bus->number < pdev_cur->bus->number)
+		return true;
+
+	if (PCI_SLOT(pdev_new->devfn) < PCI_SLOT(pdev_cur->devfn))
+		return true;
+
+	if (PCI_FUNC(pdev_new->devfn) < PCI_FUNC(pdev_cur->devfn))
+		return true;
+
+	return false;
+}
+
+static void psp_set_master(struct sp_device *sp)
+{
+	if (!sp_dev_master) {
+		sp_dev_master = sp;
+		return;
+	}
+
+	if (sp_pci_is_master(sp))
+		sp_dev_master = sp;
+}
+
+static struct sp_device *psp_get_master(void)
+{
+	return sp_dev_master;
+}
+
 static int sp_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct sp_device *sp;
@@ -166,6 +207,8 @@ static int sp_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto e_err;
 
 	pci_set_master(pdev);
+	sp->set_psp_master_device = psp_set_master;
+	sp->get_psp_master_device = psp_get_master;
 
 	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(48));
 	if (ret) {
@@ -225,6 +268,12 @@ static int sp_pci_resume(struct pci_dev *pdev)
 }
 #endif
 
+#ifdef CONFIG_CRYPTO_DEV_SP_PSP
+static const struct psp_vdata psp_entry = {
+	.offset = 0x10500,
+};
+#endif
+
 static const struct sp_dev_vdata dev_vdata[] = {
 	{
 		.bar = 2,
@@ -236,6 +285,9 @@ static const struct sp_dev_vdata dev_vdata[] = {
 		.bar = 2,
 #ifdef CONFIG_CRYPTO_DEV_SP_CCP
 		.ccp_vdata = &ccpv5a,
+#endif
+#ifdef CONFIG_CRYPTO_DEV_SP_PSP
+		.psp_vdata = &psp_entry
 #endif
 	},
 	{

From 200664d5237f3f8cd2a2f9f5c5dea08502336bd1 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:28 -0600
Subject: [PATCH 018/676] crypto: ccp: Add Secure Encrypted Virtualization
 (SEV) command support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

AMD's new Secure Encrypted Virtualization (SEV) feature allows the
memory contents of virtual machines to be transparently encrypted with a
key unique to the VM. The programming and management of the encryption
keys are handled by the AMD Secure Processor (AMD-SP) which exposes the
commands for these tasks. The complete spec is available at:

http://support.amd.com/TechDocs/55766_SEV-KM%20API_Specification.pdf

Extend the AMD-SP driver to provide the following support:

 - an in-kernel API to communicate with the SEV firmware. The API can be
   used by the hypervisor to create encryption context for a SEV guest.

 - a userspace IOCTL to manage the platform certificates.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Gary Hook <gary.hook@amd.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: linux-crypto@vger.kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Improvements-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
---
 drivers/crypto/ccp/psp-dev.c | 344 +++++++++++++++++++++++++++++++++++
 drivers/crypto/ccp/psp-dev.h |  24 +++
 drivers/crypto/ccp/sp-dev.c  |   9 +
 drivers/crypto/ccp/sp-dev.h  |   4 +
 include/linux/psp-sev.h      | 137 ++++++++++++++
 5 files changed, 518 insertions(+)

diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c
index b5789f8785603..9915a6c604a36 100644
--- a/drivers/crypto/ccp/psp-dev.c
+++ b/drivers/crypto/ccp/psp-dev.c
@@ -26,6 +26,12 @@
 #include "sp-dev.h"
 #include "psp-dev.h"
 
+#define DEVICE_NAME	"sev"
+
+static DEFINE_MUTEX(sev_cmd_mutex);
+static struct sev_misc_dev *misc_dev;
+static struct psp_device *psp_master;
+
 static struct psp_device *psp_alloc_struct(struct sp_device *sp)
 {
 	struct device *dev = sp->dev;
@@ -45,9 +51,285 @@ static struct psp_device *psp_alloc_struct(struct sp_device *sp)
 
 static irqreturn_t psp_irq_handler(int irq, void *data)
 {
+	struct psp_device *psp = data;
+	unsigned int status;
+	int reg;
+
+	/* Read the interrupt status: */
+	status = ioread32(psp->io_regs + PSP_P2CMSG_INTSTS);
+
+	/* Check if it is command completion: */
+	if (!(status & BIT(PSP_CMD_COMPLETE_REG)))
+		goto done;
+
+	/* Check if it is SEV command completion: */
+	reg = ioread32(psp->io_regs + PSP_CMDRESP);
+	if (reg & PSP_CMDRESP_RESP) {
+		psp->sev_int_rcvd = 1;
+		wake_up(&psp->sev_int_queue);
+	}
+
+done:
+	/* Clear the interrupt status by writing the same value we read. */
+	iowrite32(status, psp->io_regs + PSP_P2CMSG_INTSTS);
+
 	return IRQ_HANDLED;
 }
 
+static void sev_wait_cmd_ioc(struct psp_device *psp, unsigned int *reg)
+{
+	psp->sev_int_rcvd = 0;
+
+	wait_event(psp->sev_int_queue, psp->sev_int_rcvd);
+	*reg = ioread32(psp->io_regs + PSP_CMDRESP);
+}
+
+static int sev_cmd_buffer_len(int cmd)
+{
+	switch (cmd) {
+	case SEV_CMD_INIT:			return sizeof(struct sev_data_init);
+	case SEV_CMD_PLATFORM_STATUS:		return sizeof(struct sev_user_data_status);
+	case SEV_CMD_PEK_CSR:			return sizeof(struct sev_data_pek_csr);
+	case SEV_CMD_PEK_CERT_IMPORT:		return sizeof(struct sev_data_pek_cert_import);
+	case SEV_CMD_PDH_CERT_EXPORT:		return sizeof(struct sev_data_pdh_cert_export);
+	case SEV_CMD_LAUNCH_START:		return sizeof(struct sev_data_launch_start);
+	case SEV_CMD_LAUNCH_UPDATE_DATA:	return sizeof(struct sev_data_launch_update_data);
+	case SEV_CMD_LAUNCH_UPDATE_VMSA:	return sizeof(struct sev_data_launch_update_vmsa);
+	case SEV_CMD_LAUNCH_FINISH:		return sizeof(struct sev_data_launch_finish);
+	case SEV_CMD_LAUNCH_MEASURE:		return sizeof(struct sev_data_launch_measure);
+	case SEV_CMD_ACTIVATE:			return sizeof(struct sev_data_activate);
+	case SEV_CMD_DEACTIVATE:		return sizeof(struct sev_data_deactivate);
+	case SEV_CMD_DECOMMISSION:		return sizeof(struct sev_data_decommission);
+	case SEV_CMD_GUEST_STATUS:		return sizeof(struct sev_data_guest_status);
+	case SEV_CMD_DBG_DECRYPT:		return sizeof(struct sev_data_dbg);
+	case SEV_CMD_DBG_ENCRYPT:		return sizeof(struct sev_data_dbg);
+	case SEV_CMD_SEND_START:		return sizeof(struct sev_data_send_start);
+	case SEV_CMD_SEND_UPDATE_DATA:		return sizeof(struct sev_data_send_update_data);
+	case SEV_CMD_SEND_UPDATE_VMSA:		return sizeof(struct sev_data_send_update_vmsa);
+	case SEV_CMD_SEND_FINISH:		return sizeof(struct sev_data_send_finish);
+	case SEV_CMD_RECEIVE_START:		return sizeof(struct sev_data_receive_start);
+	case SEV_CMD_RECEIVE_FINISH:		return sizeof(struct sev_data_receive_finish);
+	case SEV_CMD_RECEIVE_UPDATE_DATA:	return sizeof(struct sev_data_receive_update_data);
+	case SEV_CMD_RECEIVE_UPDATE_VMSA:	return sizeof(struct sev_data_receive_update_vmsa);
+	case SEV_CMD_LAUNCH_UPDATE_SECRET:	return sizeof(struct sev_data_launch_secret);
+	default:				return 0;
+	}
+
+	return 0;
+}
+
+static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
+{
+	struct psp_device *psp = psp_master;
+	unsigned int phys_lsb, phys_msb;
+	unsigned int reg, ret = 0;
+
+	if (!psp)
+		return -ENODEV;
+
+	/* Get the physical address of the command buffer */
+	phys_lsb = data ? lower_32_bits(__psp_pa(data)) : 0;
+	phys_msb = data ? upper_32_bits(__psp_pa(data)) : 0;
+
+	dev_dbg(psp->dev, "sev command id %#x buffer 0x%08x%08x\n",
+		cmd, phys_msb, phys_lsb);
+
+	print_hex_dump_debug("(in):  ", DUMP_PREFIX_OFFSET, 16, 2, data,
+			     sev_cmd_buffer_len(cmd), false);
+
+	iowrite32(phys_lsb, psp->io_regs + PSP_CMDBUFF_ADDR_LO);
+	iowrite32(phys_msb, psp->io_regs + PSP_CMDBUFF_ADDR_HI);
+
+	reg = cmd;
+	reg <<= PSP_CMDRESP_CMD_SHIFT;
+	reg |= PSP_CMDRESP_IOC;
+	iowrite32(reg, psp->io_regs + PSP_CMDRESP);
+
+	/* wait for command completion */
+	sev_wait_cmd_ioc(psp, &reg);
+
+	if (psp_ret)
+		*psp_ret = reg & PSP_CMDRESP_ERR_MASK;
+
+	if (reg & PSP_CMDRESP_ERR_MASK) {
+		dev_dbg(psp->dev, "sev command %#x failed (%#010x)\n",
+			cmd, reg & PSP_CMDRESP_ERR_MASK);
+		ret = -EIO;
+	}
+
+	print_hex_dump_debug("(out): ", DUMP_PREFIX_OFFSET, 16, 2, data,
+			     sev_cmd_buffer_len(cmd), false);
+
+	return ret;
+}
+
+static int sev_do_cmd(int cmd, void *data, int *psp_ret)
+{
+	int rc;
+
+	mutex_lock(&sev_cmd_mutex);
+	rc = __sev_do_cmd_locked(cmd, data, psp_ret);
+	mutex_unlock(&sev_cmd_mutex);
+
+	return rc;
+}
+
+static int __sev_platform_init_locked(int *error)
+{
+	struct psp_device *psp = psp_master;
+	int rc = 0;
+
+	if (!psp)
+		return -ENODEV;
+
+	if (psp->sev_state == SEV_STATE_INIT)
+		return 0;
+
+	rc = __sev_do_cmd_locked(SEV_CMD_INIT, &psp->init_cmd_buf, error);
+	if (rc)
+		return rc;
+
+	psp->sev_state = SEV_STATE_INIT;
+	dev_dbg(psp->dev, "SEV firmware initialized\n");
+
+	return rc;
+}
+
+int sev_platform_init(int *error)
+{
+	int rc;
+
+	mutex_lock(&sev_cmd_mutex);
+	rc = __sev_platform_init_locked(error);
+	mutex_unlock(&sev_cmd_mutex);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(sev_platform_init);
+
+static int __sev_platform_shutdown_locked(int *error)
+{
+	int ret;
+
+	ret = __sev_do_cmd_locked(SEV_CMD_SHUTDOWN, 0, error);
+	if (ret)
+		return ret;
+
+	psp_master->sev_state = SEV_STATE_UNINIT;
+	dev_dbg(psp_master->dev, "SEV firmware shutdown\n");
+
+	return ret;
+}
+
+static int sev_platform_shutdown(int *error)
+{
+	int rc;
+
+	mutex_lock(&sev_cmd_mutex);
+	rc = __sev_platform_shutdown_locked(NULL);
+	mutex_unlock(&sev_cmd_mutex);
+
+	return rc;
+}
+
+static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg)
+{
+	return -ENOTTY;
+}
+
+static const struct file_operations sev_fops = {
+	.owner	= THIS_MODULE,
+	.unlocked_ioctl = sev_ioctl,
+};
+
+int sev_platform_status(struct sev_user_data_status *data, int *error)
+{
+	return sev_do_cmd(SEV_CMD_PLATFORM_STATUS, data, error);
+}
+EXPORT_SYMBOL_GPL(sev_platform_status);
+
+int sev_guest_deactivate(struct sev_data_deactivate *data, int *error)
+{
+	return sev_do_cmd(SEV_CMD_DEACTIVATE, data, error);
+}
+EXPORT_SYMBOL_GPL(sev_guest_deactivate);
+
+int sev_guest_activate(struct sev_data_activate *data, int *error)
+{
+	return sev_do_cmd(SEV_CMD_ACTIVATE, data, error);
+}
+EXPORT_SYMBOL_GPL(sev_guest_activate);
+
+int sev_guest_decommission(struct sev_data_decommission *data, int *error)
+{
+	return sev_do_cmd(SEV_CMD_DECOMMISSION, data, error);
+}
+EXPORT_SYMBOL_GPL(sev_guest_decommission);
+
+int sev_guest_df_flush(int *error)
+{
+	return sev_do_cmd(SEV_CMD_DF_FLUSH, 0, error);
+}
+EXPORT_SYMBOL_GPL(sev_guest_df_flush);
+
+static void sev_exit(struct kref *ref)
+{
+	struct sev_misc_dev *misc_dev = container_of(ref, struct sev_misc_dev, refcount);
+
+	misc_deregister(&misc_dev->misc);
+}
+
+static int sev_misc_init(struct psp_device *psp)
+{
+	struct device *dev = psp->dev;
+	int ret;
+
+	/*
+	 * SEV feature support can be detected on multiple devices but the SEV
+	 * FW commands must be issued on the master. During probe, we do not
+	 * know the master hence we create /dev/sev on the first device probe.
+	 * sev_do_cmd() finds the right master device to which to issue the
+	 * command to the firmware.
+	 */
+	if (!misc_dev) {
+		struct miscdevice *misc;
+
+		misc_dev = devm_kzalloc(dev, sizeof(*misc_dev), GFP_KERNEL);
+		if (!misc_dev)
+			return -ENOMEM;
+
+		misc = &misc_dev->misc;
+		misc->minor = MISC_DYNAMIC_MINOR;
+		misc->name = DEVICE_NAME;
+		misc->fops = &sev_fops;
+
+		ret = misc_register(misc);
+		if (ret)
+			return ret;
+
+		kref_init(&misc_dev->refcount);
+	} else {
+		kref_get(&misc_dev->refcount);
+	}
+
+	init_waitqueue_head(&psp->sev_int_queue);
+	psp->sev_misc = misc_dev;
+	dev_dbg(dev, "registered SEV device\n");
+
+	return 0;
+}
+
+static int sev_init(struct psp_device *psp)
+{
+	/* Check if device supports SEV feature */
+	if (!(ioread32(psp->io_regs + PSP_FEATURE_REG) & 1)) {
+		dev_dbg(psp->dev, "device does not support SEV\n");
+		return 1;
+	}
+
+	return sev_misc_init(psp);
+}
+
 int psp_dev_init(struct sp_device *sp)
 {
 	struct device *dev = sp->dev;
@@ -81,6 +363,10 @@ int psp_dev_init(struct sp_device *sp)
 		goto e_err;
 	}
 
+	ret = sev_init(psp);
+	if (ret)
+		goto e_irq;
+
 	if (sp->set_psp_master_device)
 		sp->set_psp_master_device(sp);
 
@@ -89,6 +375,8 @@ int psp_dev_init(struct sp_device *sp)
 
 	return 0;
 
+e_irq:
+	sp_free_psp_irq(psp->sp, psp);
 e_err:
 	sp->psp_data = NULL;
 
@@ -101,5 +389,61 @@ void psp_dev_destroy(struct sp_device *sp)
 {
 	struct psp_device *psp = sp->psp_data;
 
+	if (psp->sev_misc)
+		kref_put(&misc_dev->refcount, sev_exit);
+
 	sp_free_psp_irq(sp, psp);
 }
+
+int sev_issue_cmd_external_user(struct file *filep, unsigned int cmd,
+				void *data, int *error)
+{
+	if (!filep || filep->f_op != &sev_fops)
+		return -EBADF;
+
+	return  sev_do_cmd(cmd, data, error);
+}
+EXPORT_SYMBOL_GPL(sev_issue_cmd_external_user);
+
+void psp_pci_init(void)
+{
+	struct sev_user_data_status *status;
+	struct sp_device *sp;
+	int error, rc;
+
+	sp = sp_get_psp_master_device();
+	if (!sp)
+		return;
+
+	psp_master = sp->psp_data;
+
+	/* Initialize the platform */
+	rc = sev_platform_init(&error);
+	if (rc) {
+		dev_err(sp->dev, "SEV: failed to INIT error %#x\n", error);
+		goto err;
+	}
+
+	/* Display SEV firmware version */
+	status = &psp_master->status_cmd_buf;
+	rc = sev_platform_status(status, &error);
+	if (rc) {
+		dev_err(sp->dev, "SEV: failed to get status error %#x\n", error);
+		goto err;
+	}
+
+	dev_info(sp->dev, "SEV API:%d.%d build:%d\n", status->api_major,
+		 status->api_minor, status->build);
+	return;
+
+err:
+	psp_master = NULL;
+}
+
+void psp_pci_exit(void)
+{
+	if (!psp_master)
+		return;
+
+	sev_platform_shutdown(NULL);
+}
diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h
index 55b7808367c3a..c81f0b11287ae 100644
--- a/drivers/crypto/ccp/psp-dev.h
+++ b/drivers/crypto/ccp/psp-dev.h
@@ -25,9 +25,21 @@
 #include <linux/interrupt.h>
 #include <linux/irqreturn.h>
 #include <linux/dmaengine.h>
+#include <linux/psp-sev.h>
+#include <linux/miscdevice.h>
 
 #include "sp-dev.h"
 
+#define PSP_C2PMSG(_num)		((_num) << 2)
+#define PSP_CMDRESP			PSP_C2PMSG(32)
+#define PSP_CMDBUFF_ADDR_LO		PSP_C2PMSG(56)
+#define PSP_CMDBUFF_ADDR_HI             PSP_C2PMSG(57)
+#define PSP_FEATURE_REG			PSP_C2PMSG(63)
+
+#define PSP_P2CMSG(_num)		((_num) << 2)
+#define PSP_CMD_COMPLETE_REG		1
+#define PSP_CMD_COMPLETE		PSP_P2CMSG(PSP_CMD_COMPLETE_REG)
+
 #define PSP_P2CMSG_INTEN		0x0110
 #define PSP_P2CMSG_INTSTS		0x0114
 
@@ -44,6 +56,11 @@
 
 #define MAX_PSP_NAME_LEN		16
 
+struct sev_misc_dev {
+	struct kref refcount;
+	struct miscdevice misc;
+};
+
 struct psp_device {
 	struct list_head entry;
 
@@ -54,6 +71,13 @@ struct psp_device {
 	struct sp_device *sp;
 
 	void __iomem *io_regs;
+
+	int sev_state;
+	unsigned int sev_int_rcvd;
+	wait_queue_head_t sev_int_queue;
+	struct sev_misc_dev *sev_misc;
+	struct sev_user_data_status status_cmd_buf;
+	struct sev_data_init init_cmd_buf;
 };
 
 #endif /* __PSP_DEV_H */
diff --git a/drivers/crypto/ccp/sp-dev.c b/drivers/crypto/ccp/sp-dev.c
index cf101c039c8ff..eb0da65727204 100644
--- a/drivers/crypto/ccp/sp-dev.c
+++ b/drivers/crypto/ccp/sp-dev.c
@@ -272,6 +272,10 @@ static int __init sp_mod_init(void)
 	if (ret)
 		return ret;
 
+#ifdef CONFIG_CRYPTO_DEV_SP_PSP
+	psp_pci_init();
+#endif
+
 	return 0;
 #endif
 
@@ -291,6 +295,11 @@ static int __init sp_mod_init(void)
 static void __exit sp_mod_exit(void)
 {
 #ifdef CONFIG_X86
+
+#ifdef CONFIG_CRYPTO_DEV_SP_PSP
+	psp_pci_exit();
+#endif
+
 	sp_pci_exit();
 #endif
 
diff --git a/drivers/crypto/ccp/sp-dev.h b/drivers/crypto/ccp/sp-dev.h
index 909cf3e436b42..acb197b66ced9 100644
--- a/drivers/crypto/ccp/sp-dev.h
+++ b/drivers/crypto/ccp/sp-dev.h
@@ -143,12 +143,16 @@ static inline int ccp_dev_resume(struct sp_device *sp)
 #ifdef CONFIG_CRYPTO_DEV_SP_PSP
 
 int psp_dev_init(struct sp_device *sp);
+void psp_pci_init(void);
 void psp_dev_destroy(struct sp_device *sp);
+void psp_pci_exit(void);
 
 #else /* !CONFIG_CRYPTO_DEV_SP_PSP */
 
 static inline int psp_dev_init(struct sp_device *sp) { return 0; }
+static inline void psp_pci_init(void) { }
 static inline void psp_dev_destroy(struct sp_device *sp) { }
+static inline void psp_pci_exit(void) { }
 
 #endif /* CONFIG_CRYPTO_DEV_SP_PSP */
 
diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
index 4a150d17d537e..0b6dd306d88bf 100644
--- a/include/linux/psp-sev.h
+++ b/include/linux/psp-sev.h
@@ -462,4 +462,141 @@ struct sev_data_dbg {
 	u32 len;				/* In */
 } __packed;
 
+#ifdef CONFIG_CRYPTO_DEV_SP_PSP
+
+/**
+ * sev_platform_init - perform SEV INIT command
+ *
+ * @error: SEV command return code
+ *
+ * Returns:
+ * 0 if the SEV successfully processed the command
+ * -%ENODEV    if the SEV device is not available
+ * -%ENOTSUPP  if the SEV does not support SEV
+ * -%ETIMEDOUT if the SEV command timed out
+ * -%EIO       if the SEV returned a non-zero return code
+ */
+int sev_platform_init(int *error);
+
+/**
+ * sev_platform_status - perform SEV PLATFORM_STATUS command
+ *
+ * @status: sev_user_data_status structure to be processed
+ * @error: SEV command return code
+ *
+ * Returns:
+ * 0 if the SEV successfully processed the command
+ * -%ENODEV    if the SEV device is not available
+ * -%ENOTSUPP  if the SEV does not support SEV
+ * -%ETIMEDOUT if the SEV command timed out
+ * -%EIO       if the SEV returned a non-zero return code
+ */
+int sev_platform_status(struct sev_user_data_status *status, int *error);
+
+/**
+ * sev_issue_cmd_external_user - issue SEV command by other driver with a file
+ * handle.
+ *
+ * This function can be used by other drivers to issue a SEV command on
+ * behalf of userspace. The caller must pass a valid SEV file descriptor
+ * so that we know that it has access to SEV device.
+ *
+ * @filep - SEV device file pointer
+ * @cmd - command to issue
+ * @data - command buffer
+ * @error: SEV command return code
+ *
+ * Returns:
+ * 0 if the SEV successfully processed the command
+ * -%ENODEV    if the SEV device is not available
+ * -%ENOTSUPP  if the SEV does not support SEV
+ * -%ETIMEDOUT if the SEV command timed out
+ * -%EIO       if the SEV returned a non-zero return code
+ * -%EINVAL    if the SEV file descriptor is not valid
+ */
+int sev_issue_cmd_external_user(struct file *filep, unsigned int id,
+				void *data, int *error);
+
+/**
+ * sev_guest_deactivate - perform SEV DEACTIVATE command
+ *
+ * @deactivate: sev_data_deactivate structure to be processed
+ * @sev_ret: sev command return code
+ *
+ * Returns:
+ * 0 if the sev successfully processed the command
+ * -%ENODEV    if the sev device is not available
+ * -%ENOTSUPP  if the sev does not support SEV
+ * -%ETIMEDOUT if the sev command timed out
+ * -%EIO       if the sev returned a non-zero return code
+ */
+int sev_guest_deactivate(struct sev_data_deactivate *data, int *error);
+
+/**
+ * sev_guest_activate - perform SEV ACTIVATE command
+ *
+ * @activate: sev_data_activate structure to be processed
+ * @sev_ret: sev command return code
+ *
+ * Returns:
+ * 0 if the sev successfully processed the command
+ * -%ENODEV    if the sev device is not available
+ * -%ENOTSUPP  if the sev does not support SEV
+ * -%ETIMEDOUT if the sev command timed out
+ * -%EIO       if the sev returned a non-zero return code
+ */
+int sev_guest_activate(struct sev_data_activate *data, int *error);
+
+/**
+ * sev_guest_df_flush - perform SEV DF_FLUSH command
+ *
+ * @sev_ret: sev command return code
+ *
+ * Returns:
+ * 0 if the sev successfully processed the command
+ * -%ENODEV    if the sev device is not available
+ * -%ENOTSUPP  if the sev does not support SEV
+ * -%ETIMEDOUT if the sev command timed out
+ * -%EIO       if the sev returned a non-zero return code
+ */
+int sev_guest_df_flush(int *error);
+
+/**
+ * sev_guest_decommission - perform SEV DECOMMISSION command
+ *
+ * @decommission: sev_data_decommission structure to be processed
+ * @sev_ret: sev command return code
+ *
+ * Returns:
+ * 0 if the sev successfully processed the command
+ * -%ENODEV    if the sev device is not available
+ * -%ENOTSUPP  if the sev does not support SEV
+ * -%ETIMEDOUT if the sev command timed out
+ * -%EIO       if the sev returned a non-zero return code
+ */
+int sev_guest_decommission(struct sev_data_decommission *data, int *error);
+
+#else	/* !CONFIG_CRYPTO_DEV_SP_PSP */
+
+static inline int
+sev_platform_status(struct sev_user_data_status *status, int *error) { return -ENODEV; }
+
+static inline int sev_platform_init(int *error) { return -ENODEV; }
+
+static inline int
+sev_guest_deactivate(struct sev_data_deactivate *data, int *error) { return -ENODEV; }
+
+static inline int
+sev_guest_decommission(struct sev_data_decommission *data, int *error) { return -ENODEV; }
+
+static inline int
+sev_guest_activate(struct sev_data_activate *data, int *error) { return -ENODEV; }
+
+static inline int sev_guest_df_flush(int *error) { return -ENODEV; }
+
+static inline int
+sev_issue_cmd_external_user(struct file *filep, unsigned int id, void *data, int *error) { return -ENODEV; }
+
+#endif	/* CONFIG_CRYPTO_DEV_SP_PSP */
+
 #endif	/* __PSP_SEV_H__ */

From 2960f9a51556f79a1f9a70c5ed5a52476f62f1a1 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:29 -0600
Subject: [PATCH 019/676] crypto: ccp: Implement SEV_FACTORY_RESET ioctl
 command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The SEV_FACTORY_RESET command can be used by the platform owner to
reset the non-volatile SEV related data. The command is defined in
SEV spec section 5.4

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Gary Hook <gary.hook@amd.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: linux-crypto@vger.kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Improvements-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
---
 drivers/crypto/ccp/psp-dev.c | 77 +++++++++++++++++++++++++++++++++++-
 1 file changed, 76 insertions(+), 1 deletion(-)

diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c
index 9915a6c604a36..b49583a45a55e 100644
--- a/drivers/crypto/ccp/psp-dev.c
+++ b/drivers/crypto/ccp/psp-dev.c
@@ -232,9 +232,84 @@ static int sev_platform_shutdown(int *error)
 	return rc;
 }
 
+static int sev_get_platform_state(int *state, int *error)
+{
+	int rc;
+
+	rc = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS,
+				 &psp_master->status_cmd_buf, error);
+	if (rc)
+		return rc;
+
+	*state = psp_master->status_cmd_buf.state;
+	return rc;
+}
+
+static int sev_ioctl_do_reset(struct sev_issue_cmd *argp)
+{
+	int state, rc;
+
+	/*
+	 * The SEV spec requires that FACTORY_RESET must be issued in
+	 * UNINIT state. Before we go further lets check if any guest is
+	 * active.
+	 *
+	 * If FW is in WORKING state then deny the request otherwise issue
+	 * SHUTDOWN command do INIT -> UNINIT before issuing the FACTORY_RESET.
+	 *
+	 */
+	rc = sev_get_platform_state(&state, &argp->error);
+	if (rc)
+		return rc;
+
+	if (state == SEV_STATE_WORKING)
+		return -EBUSY;
+
+	if (state == SEV_STATE_INIT) {
+		rc = __sev_platform_shutdown_locked(&argp->error);
+		if (rc)
+			return rc;
+	}
+
+	return __sev_do_cmd_locked(SEV_CMD_FACTORY_RESET, 0, &argp->error);
+}
+
 static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg)
 {
-	return -ENOTTY;
+	void __user *argp = (void __user *)arg;
+	struct sev_issue_cmd input;
+	int ret = -EFAULT;
+
+	if (!psp_master)
+		return -ENODEV;
+
+	if (ioctl != SEV_ISSUE_CMD)
+		return -EINVAL;
+
+	if (copy_from_user(&input, argp, sizeof(struct sev_issue_cmd)))
+		return -EFAULT;
+
+	if (input.cmd > SEV_MAX)
+		return -EINVAL;
+
+	mutex_lock(&sev_cmd_mutex);
+
+	switch (input.cmd) {
+
+	case SEV_FACTORY_RESET:
+		ret = sev_ioctl_do_reset(&input);
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (copy_to_user(argp, &input, sizeof(struct sev_issue_cmd)))
+		ret = -EFAULT;
+out:
+	mutex_unlock(&sev_cmd_mutex);
+
+	return ret;
 }
 
 static const struct file_operations sev_fops = {

From efe1829b1a8fb7a8e69791141b8c8f6291708863 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:29 -0600
Subject: [PATCH 020/676] crypto: ccp: Implement SEV_PLATFORM_STATUS ioctl
 command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The SEV_PLATFORM_STATUS command can be used by the platform owner to
get the current status of the platform. The command is defined in
SEV spec section 5.5.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Gary Hook <gary.hook@amd.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: linux-crypto@vger.kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Improvements-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
Acked-by: Gary R Hook <gary.hook@amd.com>
---
 drivers/crypto/ccp/psp-dev.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c
index b49583a45a55e..a5072b166ab84 100644
--- a/drivers/crypto/ccp/psp-dev.c
+++ b/drivers/crypto/ccp/psp-dev.c
@@ -274,6 +274,21 @@ static int sev_ioctl_do_reset(struct sev_issue_cmd *argp)
 	return __sev_do_cmd_locked(SEV_CMD_FACTORY_RESET, 0, &argp->error);
 }
 
+static int sev_ioctl_do_platform_status(struct sev_issue_cmd *argp)
+{
+	struct sev_user_data_status *data = &psp_master->status_cmd_buf;
+	int ret;
+
+	ret = __sev_do_cmd_locked(SEV_CMD_PLATFORM_STATUS, data, &argp->error);
+	if (ret)
+		return ret;
+
+	if (copy_to_user((void __user *)argp->data, data, sizeof(*data)))
+		ret = -EFAULT;
+
+	return ret;
+}
+
 static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg)
 {
 	void __user *argp = (void __user *)arg;
@@ -299,6 +314,9 @@ static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg)
 	case SEV_FACTORY_RESET:
 		ret = sev_ioctl_do_reset(&input);
 		break;
+	case SEV_PLATFORM_STATUS:
+		ret = sev_ioctl_do_platform_status(&input);
+		break;
 	default:
 		ret = -EINVAL;
 		goto out;

From 4d84b726be834d4991b94fd51df66ec299b66d45 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:30 -0600
Subject: [PATCH 021/676] crypto: ccp: Implement SEV_PEK_GEN ioctl command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The SEV_PEK_GEN command is used to generate a new Platform Endorsement
Key (PEK). The command is defined in SEV spec section 5.6.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Gary Hook <gary.hook@amd.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: linux-crypto@vger.kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Borislav Petkov <bp@suse.de>
Improvements-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Acked-by: Gary R Hook <gary.hook@amd.com>
---
 drivers/crypto/ccp/psp-dev.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c
index a5072b166ab84..8aa8036023e0b 100644
--- a/drivers/crypto/ccp/psp-dev.c
+++ b/drivers/crypto/ccp/psp-dev.c
@@ -289,6 +289,19 @@ static int sev_ioctl_do_platform_status(struct sev_issue_cmd *argp)
 	return ret;
 }
 
+static int sev_ioctl_do_pek_pdh_gen(int cmd, struct sev_issue_cmd *argp)
+{
+	int rc;
+
+	if (psp_master->sev_state == SEV_STATE_UNINIT) {
+		rc = __sev_platform_init_locked(&argp->error);
+		if (rc)
+			return rc;
+	}
+
+	return __sev_do_cmd_locked(cmd, 0, &argp->error);
+}
+
 static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg)
 {
 	void __user *argp = (void __user *)arg;
@@ -317,6 +330,9 @@ static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg)
 	case SEV_PLATFORM_STATUS:
 		ret = sev_ioctl_do_platform_status(&input);
 		break;
+	case SEV_PEK_GEN:
+		ret = sev_ioctl_do_pek_pdh_gen(SEV_CMD_PEK_GEN, &input);
+		break;
 	default:
 		ret = -EINVAL;
 		goto out;

From 77f65327228f001bf1a43eb09dc96bbdba5b4eac Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:30 -0600
Subject: [PATCH 022/676] crypto: ccp: Implement SEV_PDH_GEN ioctl command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The SEV_PDH_GEN command is used to re-generate the Platform
Diffie-Hellman (PDH) key. The command is defined in SEV spec section
5.6.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Gary Hook <gary.hook@amd.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: linux-crypto@vger.kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
Acked-by: Gary R Hook <gary.hook@amd.com>
---
 drivers/crypto/ccp/psp-dev.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c
index 8aa8036023e0b..fd3daf0a1176e 100644
--- a/drivers/crypto/ccp/psp-dev.c
+++ b/drivers/crypto/ccp/psp-dev.c
@@ -333,6 +333,9 @@ static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg)
 	case SEV_PEK_GEN:
 		ret = sev_ioctl_do_pek_pdh_gen(SEV_CMD_PEK_GEN, &input);
 		break;
+	case SEV_PDH_GEN:
+		ret = sev_ioctl_do_pek_pdh_gen(SEV_CMD_PDH_GEN, &input);
+		break;
 	default:
 		ret = -EINVAL;
 		goto out;

From e799035609e1526761aa2f896a974b233d04d36d Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:31 -0600
Subject: [PATCH 023/676] crypto: ccp: Implement SEV_PEK_CSR ioctl command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The SEV_PEK_CSR command can be used to generate a PEK certificate
signing request. The command is defined in SEV spec section 5.7.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Gary Hook <gary.hook@amd.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: linux-crypto@vger.kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Improvements-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Acked-by: Gary R Hook <gary.hook@amd.com>
---
 drivers/crypto/ccp/psp-dev.c | 66 ++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c
index fd3daf0a1176e..c3906bbdb69be 100644
--- a/drivers/crypto/ccp/psp-dev.c
+++ b/drivers/crypto/ccp/psp-dev.c
@@ -302,6 +302,69 @@ static int sev_ioctl_do_pek_pdh_gen(int cmd, struct sev_issue_cmd *argp)
 	return __sev_do_cmd_locked(cmd, 0, &argp->error);
 }
 
+static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp)
+{
+	struct sev_user_data_pek_csr input;
+	struct sev_data_pek_csr *data;
+	void *blob = NULL;
+	int ret;
+
+	if (copy_from_user(&input, (void __user *)argp->data, sizeof(input)))
+		return -EFAULT;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	/* userspace wants to query CSR length */
+	if (!input.address || !input.length)
+		goto cmd;
+
+	/* allocate a physically contiguous buffer to store the CSR blob */
+	if (!access_ok(VERIFY_WRITE, input.address, input.length) ||
+	    input.length > SEV_FW_BLOB_MAX_SIZE) {
+		ret = -EFAULT;
+		goto e_free;
+	}
+
+	blob = kmalloc(input.length, GFP_KERNEL);
+	if (!blob) {
+		ret = -ENOMEM;
+		goto e_free;
+	}
+
+	data->address = __psp_pa(blob);
+	data->len = input.length;
+
+cmd:
+	if (psp_master->sev_state == SEV_STATE_UNINIT) {
+		ret = __sev_platform_init_locked(&argp->error);
+		if (ret)
+			goto e_free_blob;
+	}
+
+	ret = __sev_do_cmd_locked(SEV_CMD_PEK_CSR, data, &argp->error);
+
+	 /* If we query the CSR length, FW responded with expected data. */
+	input.length = data->len;
+
+	if (copy_to_user((void __user *)argp->data, &input, sizeof(input))) {
+		ret = -EFAULT;
+		goto e_free_blob;
+	}
+
+	if (blob) {
+		if (copy_to_user((void __user *)input.address, blob, input.length))
+			ret = -EFAULT;
+	}
+
+e_free_blob:
+	kfree(blob);
+e_free:
+	kfree(data);
+	return ret;
+}
+
 static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg)
 {
 	void __user *argp = (void __user *)arg;
@@ -336,6 +399,9 @@ static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg)
 	case SEV_PDH_GEN:
 		ret = sev_ioctl_do_pek_pdh_gen(SEV_CMD_PDH_GEN, &input);
 		break;
+	case SEV_PEK_CSR:
+		ret = sev_ioctl_do_pek_csr(&input);
+		break;
 	default:
 		ret = -EINVAL;
 		goto out;

From 7360e4b14350a3a461645e0413ce58fbc8785fd8 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:31 -0600
Subject: [PATCH 024/676] crypto: ccp: Implement SEV_PEK_CERT_IMPORT ioctl
 command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The SEV_PEK_CERT_IMPORT command can be used to import the signed PEK
certificate. The command is defined in SEV spec section 5.8.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Gary Hook <gary.hook@amd.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: linux-crypto@vger.kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Improvements-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Acked-by: Gary R Hook <gary.hook@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 drivers/crypto/ccp/psp-dev.c | 81 ++++++++++++++++++++++++++++++++++++
 include/linux/psp-sev.h      |  4 ++
 2 files changed, 85 insertions(+)

diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c
index c3906bbdb69be..9d1c4600db19d 100644
--- a/drivers/crypto/ccp/psp-dev.c
+++ b/drivers/crypto/ccp/psp-dev.c
@@ -365,6 +365,84 @@ static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp)
 	return ret;
 }
 
+void *psp_copy_user_blob(u64 __user uaddr, u32 len)
+{
+	void *data;
+
+	if (!uaddr || !len)
+		return ERR_PTR(-EINVAL);
+
+	/* verify that blob length does not exceed our limit */
+	if (len > SEV_FW_BLOB_MAX_SIZE)
+		return ERR_PTR(-EINVAL);
+
+	data = kmalloc(len, GFP_KERNEL);
+	if (!data)
+		return ERR_PTR(-ENOMEM);
+
+	if (copy_from_user(data, (void __user *)(uintptr_t)uaddr, len))
+		goto e_free;
+
+	return data;
+
+e_free:
+	kfree(data);
+	return ERR_PTR(-EFAULT);
+}
+EXPORT_SYMBOL_GPL(psp_copy_user_blob);
+
+static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp)
+{
+	struct sev_user_data_pek_cert_import input;
+	struct sev_data_pek_cert_import *data;
+	void *pek_blob, *oca_blob;
+	int ret;
+
+	if (copy_from_user(&input, (void __user *)argp->data, sizeof(input)))
+		return -EFAULT;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	/* copy PEK certificate blobs from userspace */
+	pek_blob = psp_copy_user_blob(input.pek_cert_address, input.pek_cert_len);
+	if (IS_ERR(pek_blob)) {
+		ret = PTR_ERR(pek_blob);
+		goto e_free;
+	}
+
+	data->pek_cert_address = __psp_pa(pek_blob);
+	data->pek_cert_len = input.pek_cert_len;
+
+	/* copy PEK certificate blobs from userspace */
+	oca_blob = psp_copy_user_blob(input.oca_cert_address, input.oca_cert_len);
+	if (IS_ERR(oca_blob)) {
+		ret = PTR_ERR(oca_blob);
+		goto e_free_pek;
+	}
+
+	data->oca_cert_address = __psp_pa(oca_blob);
+	data->oca_cert_len = input.oca_cert_len;
+
+	/* If platform is not in INIT state then transition it to INIT */
+	if (psp_master->sev_state != SEV_STATE_INIT) {
+		ret = __sev_platform_init_locked(&argp->error);
+		if (ret)
+			goto e_free_oca;
+	}
+
+	ret = __sev_do_cmd_locked(SEV_CMD_PEK_CERT_IMPORT, data, &argp->error);
+
+e_free_oca:
+	kfree(oca_blob);
+e_free_pek:
+	kfree(pek_blob);
+e_free:
+	kfree(data);
+	return ret;
+}
+
 static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg)
 {
 	void __user *argp = (void __user *)arg;
@@ -402,6 +480,9 @@ static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg)
 	case SEV_PEK_CSR:
 		ret = sev_ioctl_do_pek_csr(&input);
 		break;
+	case SEV_PEK_CERT_IMPORT:
+		ret = sev_ioctl_do_pek_import(&input);
+		break;
 	default:
 		ret = -EINVAL;
 		goto out;
diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
index 0b6dd306d88bf..93addfa340619 100644
--- a/include/linux/psp-sev.h
+++ b/include/linux/psp-sev.h
@@ -576,6 +576,8 @@ int sev_guest_df_flush(int *error);
  */
 int sev_guest_decommission(struct sev_data_decommission *data, int *error);
 
+void *psp_copy_user_blob(u64 __user uaddr, u32 len);
+
 #else	/* !CONFIG_CRYPTO_DEV_SP_PSP */
 
 static inline int
@@ -597,6 +599,8 @@ static inline int sev_guest_df_flush(int *error) { return -ENODEV; }
 static inline int
 sev_issue_cmd_external_user(struct file *filep, unsigned int id, void *data, int *error) { return -ENODEV; }
 
+static inline void *psp_copy_user_blob(u64 __user uaddr, u32 len) { return ERR_PTR(-EINVAL); }
+
 #endif	/* CONFIG_CRYPTO_DEV_SP_PSP */
 
 #endif	/* __PSP_SEV_H__ */

From 76a2b524a4b1d6dc0f2421f9854a01d55d5e5436 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:31 -0600
Subject: [PATCH 025/676] crypto: ccp: Implement SEV_PDH_CERT_EXPORT ioctl
 command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The SEV_PDH_CERT_EXPORT command can be used to export the PDH and its
certificate chain. The command is defined in SEV spec section 5.10.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Gary Hook <gary.hook@amd.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: linux-crypto@vger.kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Improvements-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Acked-by: Gary R Hook <gary.hook@amd.com>
---
 drivers/crypto/ccp/psp-dev.c | 97 ++++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)

diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c
index 9d1c4600db19d..fcfa5b1eae616 100644
--- a/drivers/crypto/ccp/psp-dev.c
+++ b/drivers/crypto/ccp/psp-dev.c
@@ -443,6 +443,100 @@ static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp)
 	return ret;
 }
 
+static int sev_ioctl_do_pdh_export(struct sev_issue_cmd *argp)
+{
+	struct sev_user_data_pdh_cert_export input;
+	void *pdh_blob = NULL, *cert_blob = NULL;
+	struct sev_data_pdh_cert_export *data;
+	int ret;
+
+	if (copy_from_user(&input, (void __user *)argp->data, sizeof(input)))
+		return -EFAULT;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	/* Userspace wants to query the certificate length. */
+	if (!input.pdh_cert_address ||
+	    !input.pdh_cert_len ||
+	    !input.cert_chain_address)
+		goto cmd;
+
+	/* Allocate a physically contiguous buffer to store the PDH blob. */
+	if ((input.pdh_cert_len > SEV_FW_BLOB_MAX_SIZE) ||
+	    !access_ok(VERIFY_WRITE, input.pdh_cert_address, input.pdh_cert_len)) {
+		ret = -EFAULT;
+		goto e_free;
+	}
+
+	/* Allocate a physically contiguous buffer to store the cert chain blob. */
+	if ((input.cert_chain_len > SEV_FW_BLOB_MAX_SIZE) ||
+	    !access_ok(VERIFY_WRITE, input.cert_chain_address, input.cert_chain_len)) {
+		ret = -EFAULT;
+		goto e_free;
+	}
+
+	pdh_blob = kmalloc(input.pdh_cert_len, GFP_KERNEL);
+	if (!pdh_blob) {
+		ret = -ENOMEM;
+		goto e_free;
+	}
+
+	data->pdh_cert_address = __psp_pa(pdh_blob);
+	data->pdh_cert_len = input.pdh_cert_len;
+
+	cert_blob = kmalloc(input.cert_chain_len, GFP_KERNEL);
+	if (!cert_blob) {
+		ret = -ENOMEM;
+		goto e_free_pdh;
+	}
+
+	data->cert_chain_address = __psp_pa(cert_blob);
+	data->cert_chain_len = input.cert_chain_len;
+
+cmd:
+	/* If platform is not in INIT state then transition it to INIT. */
+	if (psp_master->sev_state != SEV_STATE_INIT) {
+		ret = __sev_platform_init_locked(&argp->error);
+		if (ret)
+			goto e_free_cert;
+	}
+
+	ret = __sev_do_cmd_locked(SEV_CMD_PDH_CERT_EXPORT, data, &argp->error);
+
+	/* If we query the length, FW responded with expected data. */
+	input.cert_chain_len = data->cert_chain_len;
+	input.pdh_cert_len = data->pdh_cert_len;
+
+	if (copy_to_user((void __user *)argp->data, &input, sizeof(input))) {
+		ret = -EFAULT;
+		goto e_free_cert;
+	}
+
+	if (pdh_blob) {
+		if (copy_to_user((void __user *)input.pdh_cert_address,
+				 pdh_blob, input.pdh_cert_len)) {
+			ret = -EFAULT;
+			goto e_free_cert;
+		}
+	}
+
+	if (cert_blob) {
+		if (copy_to_user((void __user *)input.cert_chain_address,
+				 cert_blob, input.cert_chain_len))
+			ret = -EFAULT;
+	}
+
+e_free_cert:
+	kfree(cert_blob);
+e_free_pdh:
+	kfree(pdh_blob);
+e_free:
+	kfree(data);
+	return ret;
+}
+
 static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg)
 {
 	void __user *argp = (void __user *)arg;
@@ -483,6 +577,9 @@ static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg)
 	case SEV_PEK_CERT_IMPORT:
 		ret = sev_ioctl_do_pek_import(&input);
 		break;
+	case SEV_PDH_CERT_EXPORT:
+		ret = sev_ioctl_do_pdh_export(&input);
+		break;
 	default:
 		ret = -EINVAL;
 		goto out;

From 5dd0a57cf38eeb8b6be1d9c3df9add2f5756d974 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:32 -0600
Subject: [PATCH 026/676] KVM: X86: Add CONFIG_KVM_AMD_SEV
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The config option can be used to enable SEV support on AMD Processors.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kvm/Kconfig | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 3df51c2878442..148ea324b90c4 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -81,6 +81,16 @@ config KVM_AMD
 	  To compile this as a module, choose M here: the module
 	  will be called kvm-amd.
 
+config KVM_AMD_SEV
+	def_bool y
+	bool "AMD Secure Encrypted Virtualization (SEV) support"
+	depends on KVM_AMD && X86_64
+	select CRYPTO_DEV_CCP
+	select CRYPTO_DEV_CCP_DD
+	select CRYPTO_DEV_SP_PSP
+	---help---
+	Provides support for launching Encrypted VMs on AMD processors.
+
 config KVM_MMU_AUDIT
 	bool "Audit KVM MMU"
 	depends on KVM && TRACEPOINTS

From ed3cd233f8074015e164547abfcdd4678fd10bb4 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:32 -0600
Subject: [PATCH 027/676] KVM: SVM: Reserve ASID range for SEV guest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A SEV-enabled guest must use ASIDs from the defined subset, while non-SEV
guests can use the remaining ASID range. The range of allowed SEV guest
ASIDs is [1 - CPUID_8000_001F[ECX][31:0]].

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Improvements-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kvm/svm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7e302a25bc455..bff7ce727a782 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -319,6 +319,8 @@ enum {
 
 #define VMCB_AVIC_APIC_BAR_MASK		0xFFFFFFFFFF000ULL
 
+static unsigned int max_sev_asid;
+
 static inline void mark_all_dirty(struct vmcb *vmcb)
 {
 	vmcb->control.clean = 0;
@@ -783,7 +785,7 @@ static int svm_hardware_enable(void)
 	sd->asid_generation = 1;
 	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
 	sd->next_asid = sd->max_asid + 1;
-	sd->min_asid = 1;
+	sd->min_asid = max_sev_asid + 1;
 
 	gdt = get_current_gdt_rw();
 	sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);

From e9df09428996fcdc43e2b0db2a0e8b38198931c4 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:33 -0600
Subject: [PATCH 028/676] KVM: SVM: Add sev module_param
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The module parameter can be used to control the SEV feature support.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kvm/svm.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index bff7ce727a782..ca63642517a70 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -37,6 +37,7 @@
 #include <linux/amd-iommu.h>
 #include <linux/hashtable.h>
 #include <linux/frame.h>
+#include <linux/psp-sev.h>
 
 #include <asm/apic.h>
 #include <asm/perf_event.h>
@@ -284,6 +285,10 @@ module_param(vls, int, 0444);
 static int vgif = true;
 module_param(vgif, int, 0444);
 
+/* enable/disable SEV support */
+static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
+module_param(sev, int, 0444);
+
 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
@@ -1049,6 +1054,39 @@ static int avic_ga_log_notifier(u32 ga_tag)
 	return 0;
 }
 
+static __init int sev_hardware_setup(void)
+{
+	struct sev_user_data_status *status;
+	int rc;
+
+	/* Maximum number of encrypted guests supported simultaneously */
+	max_sev_asid = cpuid_ecx(0x8000001F);
+
+	if (!max_sev_asid)
+		return 1;
+
+	status = kmalloc(sizeof(*status), GFP_KERNEL);
+	if (!status)
+		return 1;
+
+	/*
+	 * Check SEV platform status.
+	 *
+	 * PLATFORM_STATUS can be called in any state, if we failed to query
+	 * the PLATFORM status then either PSP firmware does not support SEV
+	 * feature or SEV firmware is dead.
+	 */
+	rc = sev_platform_status(status, NULL);
+	if (rc)
+		goto err;
+
+	pr_info("SEV supported\n");
+
+err:
+	kfree(status);
+	return rc;
+}
+
 static __init int svm_hardware_setup(void)
 {
 	int cpu;
@@ -1084,6 +1122,17 @@ static __init int svm_hardware_setup(void)
 		kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
 	}
 
+	if (sev) {
+		if (boot_cpu_has(X86_FEATURE_SEV) &&
+		    IS_ENABLED(CONFIG_KVM_AMD_SEV)) {
+			r = sev_hardware_setup();
+			if (r)
+				sev = false;
+		} else {
+			sev = false;
+		}
+	}
+
 	for_each_possible_cpu(cpu) {
 		r = svm_cpu_init(cpu);
 		if (r)

From dc48bae01e5a23ae67758e8fe31cdc439202b190 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:33 -0600
Subject: [PATCH 029/676] KVM: Define SEV key management command id
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Define Secure Encrypted Virtualization (SEV) key management command id
and structure. The command definition is available in SEV KM spec
0.14 (http://support.amd.com/TechDocs/55766_SEV-KM API_Specification.pdf)
and Documentation/virtual/kvm/amd-memory-encryption.txt.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Improvements-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 .../virtual/kvm/amd-memory-encryption.rst     | 202 ++++++++++++++++++
 include/uapi/linux/kvm.h                      |  80 +++++++
 2 files changed, 282 insertions(+)

diff --git a/Documentation/virtual/kvm/amd-memory-encryption.rst b/Documentation/virtual/kvm/amd-memory-encryption.rst
index a8ef21e737db6..71d6d257074ff 100644
--- a/Documentation/virtual/kvm/amd-memory-encryption.rst
+++ b/Documentation/virtual/kvm/amd-memory-encryption.rst
@@ -43,3 +43,205 @@ setting the SEV bit before executing VMRUN.::
 SEV hardware uses ASIDs to associate a memory encryption key with a VM.
 Hence, the ASID for the SEV-enabled guests must be from 1 to a maximum value
 defined in the CPUID 0x8000001f[ecx] field.
+
+SEV Key Management
+==================
+
+The SEV guest key management is handled by a separate processor called the AMD
+Secure Processor (AMD-SP). Firmware running inside the AMD-SP provides a secure
+key management interface to perform common hypervisor activities such as
+encrypting bootstrap code, snapshot, migrating and debugging the guest. For more
+information, see the SEV Key Management spec [api-spec]_
+
+KVM implements the following commands to support common lifecycle events of SEV
+guests, such as launching, running, snapshotting, migrating and decommissioning.
+
+1. KVM_SEV_INIT
+---------------
+
+The KVM_SEV_INIT command is used by the hypervisor to initialize the SEV platform
+context. In a typical workflow, this command should be the first command issued.
+
+Returns: 0 on success, -negative on error
+
+2. KVM_SEV_LAUNCH_START
+-----------------------
+
+The KVM_SEV_LAUNCH_START command is used for creating the memory encryption
+context. To create the encryption context, user must provide a guest policy,
+the owner's public Diffie-Hellman (PDH) key and session information.
+
+Parameters: struct  kvm_sev_launch_start (in/out)
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_launch_start {
+                __u32 handle;           /* if zero then firmware creates a new handle */
+                __u32 policy;           /* guest's policy */
+
+                __u64 dh_uaddr;         /* userspace address pointing to the guest owner's PDH key */
+                __u32 dh_len;
+
+                __u64 session_addr;     /* userspace address which points to the guest session information */
+                __u32 session_len;
+        };
+
+On success, the 'handle' field contains a new handle and on error, a negative value.
+
+For more details, see SEV spec Section 6.2.
+
+3. KVM_SEV_LAUNCH_UPDATE_DATA
+-----------------------------
+
+The KVM_SEV_LAUNCH_UPDATE_DATA is used for encrypting a memory region. It also
+calculates a measurement of the memory contents. The measurement is a signature
+of the memory contents that can be sent to the guest owner as an attestation
+that the memory was encrypted correctly by the firmware.
+
+Parameters (in): struct  kvm_sev_launch_update_data
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_launch_update {
+                __u64 uaddr;    /* userspace address to be encrypted (must be 16-byte aligned) */
+                __u32 len;      /* length of the data to be encrypted (must be 16-byte aligned) */
+        };
+
+For more details, see SEV spec Section 6.3.
+
+4. KVM_SEV_LAUNCH_MEASURE
+-------------------------
+
+The KVM_SEV_LAUNCH_MEASURE command is used to retrieve the measurement of the
+data encrypted by the KVM_SEV_LAUNCH_UPDATE_DATA command. The guest owner may
+wait to provide the guest with confidential information until it can verify the
+measurement. Since the guest owner knows the initial contents of the guest at
+boot, the measurement can be verified by comparing it to what the guest owner
+expects.
+
+Parameters (in): struct  kvm_sev_launch_measure
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_launch_measure {
+                __u64 uaddr;    /* where to copy the measurement */
+                __u32 len;      /* length of measurement blob */
+        };
+
+For more details on the measurement verification flow, see SEV spec Section 6.4.
+
+5. KVM_SEV_LAUNCH_FINISH
+------------------------
+
+After completion of the launch flow, the KVM_SEV_LAUNCH_FINISH command can be
+issued to make the guest ready for the execution.
+
+Returns: 0 on success, -negative on error
+
+6. KVM_SEV_GUEST_STATUS
+-----------------------
+
+The KVM_SEV_GUEST_STATUS command is used to retrieve status information about a
+SEV-enabled guest.
+
+Parameters (out): struct kvm_sev_guest_status
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_guest_status {
+                __u32 handle;   /* guest handle */
+                __u32 policy;   /* guest policy */
+                __u8 state;     /* guest state (see enum below) */
+        };
+
+SEV guest state:
+
+::
+
+        enum {
+        SEV_STATE_INVALID = 0;
+        SEV_STATE_LAUNCHING,    /* guest is currently being launched */
+        SEV_STATE_SECRET,       /* guest is being launched and ready to accept the ciphertext data */
+        SEV_STATE_RUNNING,      /* guest is fully launched and running */
+        SEV_STATE_RECEIVING,    /* guest is being migrated in from another SEV machine */
+        SEV_STATE_SENDING       /* guest is getting migrated out to another SEV machine */
+        };
+
+7. KVM_SEV_DBG_DECRYPT
+----------------------
+
+The KVM_SEV_DEBUG_DECRYPT command can be used by the hypervisor to request the
+firmware to decrypt the data at the given memory region.
+
+Parameters (in): struct kvm_sev_dbg
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_dbg {
+                __u64 src_uaddr;        /* userspace address of data to decrypt */
+                __u64 dst_uaddr;        /* userspace address of destination */
+                __u32 len;              /* length of memory region to decrypt */
+        };
+
+The command returns an error if the guest policy does not allow debugging.
+
+8. KVM_SEV_DBG_ENCRYPT
+----------------------
+
+The KVM_SEV_DEBUG_ENCRYPT command can be used by the hypervisor to request the
+firmware to encrypt the data at the given memory region.
+
+Parameters (in): struct kvm_sev_dbg
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_dbg {
+                __u64 src_uaddr;        /* userspace address of data to encrypt */
+                __u64 dst_uaddr;        /* userspace address of destination */
+                __u32 len;              /* length of memory region to encrypt */
+        };
+
+The command returns an error if the guest policy does not allow debugging.
+
+9. KVM_SEV_LAUNCH_SECRET
+------------------------
+
+The KVM_SEV_LAUNCH_SECRET command can be used by the hypervisor to inject secret
+data after the measurement has been validated by the guest owner.
+
+Parameters (in): struct kvm_sev_launch_secret
+
+Returns: 0 on success, -negative on error
+
+::
+
+        struct kvm_sev_launch_secret {
+                __u64 hdr_uaddr;        /* userspace address containing the packet header */
+                __u32 hdr_len;
+
+                __u64 guest_uaddr;      /* the guest memory region where the secret should be injected */
+                __u32 guest_len;
+
+                __u64 trans_uaddr;      /* the hypervisor memory region which contains the secret */
+                __u32 trans_len;
+        };
+
+References
+==========
+
+.. [white-paper] http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/12/AMD_Memory_Encryption_Whitepaper_v7-Public.pdf
+.. [api-spec] http://support.amd.com/TechDocs/55766_SEV-KM%20API_Specification.pdf
+.. [amd-apm] http://support.amd.com/TechDocs/24593.pdf (section 15.34)
+.. [kvm-forum]  http://www.linux-kvm.org/images/7/74/02x08A-Thomas_Lendacky-AMDs_Virtualizatoin_Memory_Encryption_Technology.pdf
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index c8c65190907d6..571431d3384b4 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1369,6 +1369,86 @@ struct kvm_enc_region {
 #define KVM_MEMORY_ENCRYPT_REG_REGION    _IOR(KVMIO, 0xbb, struct kvm_enc_region)
 #define KVM_MEMORY_ENCRYPT_UNREG_REGION  _IOR(KVMIO, 0xbc, struct kvm_enc_region)
 
+/* Secure Encrypted Virtualization command */
+enum sev_cmd_id {
+	/* Guest initialization commands */
+	KVM_SEV_INIT = 0,
+	KVM_SEV_ES_INIT,
+	/* Guest launch commands */
+	KVM_SEV_LAUNCH_START,
+	KVM_SEV_LAUNCH_UPDATE_DATA,
+	KVM_SEV_LAUNCH_UPDATE_VMSA,
+	KVM_SEV_LAUNCH_SECRET,
+	KVM_SEV_LAUNCH_MEASURE,
+	KVM_SEV_LAUNCH_FINISH,
+	/* Guest migration commands (outgoing) */
+	KVM_SEV_SEND_START,
+	KVM_SEV_SEND_UPDATE_DATA,
+	KVM_SEV_SEND_UPDATE_VMSA,
+	KVM_SEV_SEND_FINISH,
+	/* Guest migration commands (incoming) */
+	KVM_SEV_RECEIVE_START,
+	KVM_SEV_RECEIVE_UPDATE_DATA,
+	KVM_SEV_RECEIVE_UPDATE_VMSA,
+	KVM_SEV_RECEIVE_FINISH,
+	/* Guest status and debug commands */
+	KVM_SEV_GUEST_STATUS,
+	KVM_SEV_DBG_DECRYPT,
+	KVM_SEV_DBG_ENCRYPT,
+	/* Guest certificates commands */
+	KVM_SEV_CERT_EXPORT,
+
+	KVM_SEV_NR_MAX,
+};
+
+struct kvm_sev_cmd {
+	__u32 id;
+	__u64 data;
+	__u32 error;
+	__u32 sev_fd;
+};
+
+struct kvm_sev_launch_start {
+	__u32 handle;
+	__u32 policy;
+	__u64 dh_uaddr;
+	__u32 dh_len;
+	__u64 session_uaddr;
+	__u32 session_len;
+};
+
+struct kvm_sev_launch_update_data {
+	__u64 uaddr;
+	__u32 len;
+};
+
+
+struct kvm_sev_launch_secret {
+	__u64 hdr_uaddr;
+	__u32 hdr_len;
+	__u64 guest_uaddr;
+	__u32 guest_len;
+	__u64 trans_uaddr;
+	__u32 trans_len;
+};
+
+struct kvm_sev_launch_measure {
+	__u64 uaddr;
+	__u32 len;
+};
+
+struct kvm_sev_guest_status {
+	__u32 handle;
+	__u32 policy;
+	__u32 state;
+};
+
+struct kvm_sev_dbg {
+	__u64 src_uaddr;
+	__u64 dst_uaddr;
+	__u32 len;
+};
+
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
 #define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)

From 1654efcbc431a369397a20bf85e45870d15c8689 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:34 -0600
Subject: [PATCH 030/676] KVM: SVM: Add KVM_SEV_INIT command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The command initializes the SEV platform context and allocates a new ASID
for this guest from the SEV ASID pool. The firmware must be initialized
before we issue any guest launch commands to create a new memory encryption
context.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/kvm_host.h |   7 ++
 arch/x86/kvm/svm.c              | 132 +++++++++++++++++++++++++++++++-
 2 files changed, 138 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 58b7cc30466b4..384dcfda43cc7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -747,6 +747,11 @@ enum kvm_irqchip_mode {
 	KVM_IRQCHIP_SPLIT,        /* created with KVM_CAP_SPLIT_IRQCHIP */
 };
 
+struct kvm_sev_info {
+	bool active;		/* SEV enabled guest */
+	unsigned int asid;	/* ASID used for this guest */
+};
+
 struct kvm_arch {
 	unsigned int n_used_mmu_pages;
 	unsigned int n_requested_mmu_pages;
@@ -834,6 +839,8 @@ struct kvm_arch {
 
 	bool x2apic_format;
 	bool x2apic_broadcast_quirk_disabled;
+
+	struct kvm_sev_info sev_info;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ca63642517a70..51186107eb22a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -38,6 +38,7 @@
 #include <linux/hashtable.h>
 #include <linux/frame.h>
 #include <linux/psp-sev.h>
+#include <linux/file.h>
 
 #include <asm/apic.h>
 #include <asm/perf_event.h>
@@ -325,6 +326,20 @@ enum {
 #define VMCB_AVIC_APIC_BAR_MASK		0xFFFFFFFFFF000ULL
 
 static unsigned int max_sev_asid;
+static unsigned int min_sev_asid;
+static unsigned long *sev_asid_bitmap;
+
+static inline bool svm_sev_enabled(void)
+{
+	return max_sev_asid;
+}
+
+static inline bool sev_guest(struct kvm *kvm)
+{
+	struct kvm_sev_info *sev = &kvm->arch.sev_info;
+
+	return sev->active;
+}
 
 static inline void mark_all_dirty(struct vmcb *vmcb)
 {
@@ -1065,6 +1080,15 @@ static __init int sev_hardware_setup(void)
 	if (!max_sev_asid)
 		return 1;
 
+	/* Minimum ASID value that should be used for SEV guest */
+	min_sev_asid = cpuid_edx(0x8000001F);
+
+	/* Initialize SEV ASID bitmap */
+	sev_asid_bitmap = kcalloc(BITS_TO_LONGS(max_sev_asid),
+				sizeof(unsigned long), GFP_KERNEL);
+	if (!sev_asid_bitmap)
+		return 1;
+
 	status = kmalloc(sizeof(*status), GFP_KERNEL);
 	if (!status)
 		return 1;
@@ -1194,6 +1218,9 @@ static __exit void svm_hardware_unsetup(void)
 {
 	int cpu;
 
+	if (svm_sev_enabled())
+		kfree(sev_asid_bitmap);
+
 	for_each_possible_cpu(cpu)
 		svm_cpu_uninit(cpu);
 
@@ -1384,6 +1411,9 @@ static void init_vmcb(struct vcpu_svm *svm)
 		svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
 	}
 
+	if (sev_guest(svm->vcpu.kvm))
+		svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
+
 	mark_all_dirty(svm->vmcb);
 
 	enable_gif(svm);
@@ -1466,6 +1496,29 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static void __sev_asid_free(int asid)
+{
+	int pos;
+
+	pos = asid - 1;
+	clear_bit(pos, sev_asid_bitmap);
+}
+
+static void sev_asid_free(struct kvm *kvm)
+{
+	struct kvm_sev_info *sev = &kvm->arch.sev_info;
+
+	__sev_asid_free(sev->asid);
+}
+
+static void sev_vm_destroy(struct kvm *kvm)
+{
+	if (!sev_guest(kvm))
+		return;
+
+	sev_asid_free(kvm);
+}
+
 static void avic_vm_destroy(struct kvm *kvm)
 {
 	unsigned long flags;
@@ -1484,6 +1537,12 @@ static void avic_vm_destroy(struct kvm *kvm)
 	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
 }
 
+static void svm_vm_destroy(struct kvm *kvm)
+{
+	avic_vm_destroy(kvm);
+	sev_vm_destroy(kvm);
+}
+
 static int avic_vm_init(struct kvm *kvm)
 {
 	unsigned long flags;
@@ -5556,6 +5615,75 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static int sev_asid_new(void)
+{
+	int pos;
+
+	/*
+	 * SEV-enabled guest must use asid from min_sev_asid to max_sev_asid.
+	 */
+	pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1);
+	if (pos >= max_sev_asid)
+		return -EBUSY;
+
+	set_bit(pos, sev_asid_bitmap);
+	return pos + 1;
+}
+
+static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+	struct kvm_sev_info *sev = &kvm->arch.sev_info;
+	int asid, ret;
+
+	ret = -EBUSY;
+	asid = sev_asid_new();
+	if (asid < 0)
+		return ret;
+
+	ret = sev_platform_init(&argp->error);
+	if (ret)
+		goto e_free;
+
+	sev->active = true;
+	sev->asid = asid;
+
+	return 0;
+
+e_free:
+	__sev_asid_free(asid);
+	return ret;
+}
+
+static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
+{
+	struct kvm_sev_cmd sev_cmd;
+	int r;
+
+	if (!svm_sev_enabled())
+		return -ENOTTY;
+
+	if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
+		return -EFAULT;
+
+	mutex_lock(&kvm->lock);
+
+	switch (sev_cmd.id) {
+	case KVM_SEV_INIT:
+		r = sev_guest_init(kvm, &sev_cmd);
+		break;
+	default:
+		r = -EINVAL;
+		goto out;
+	}
+
+	if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
+		r = -EFAULT;
+
+out:
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
 static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.cpu_has_kvm_support = has_svm,
 	.disabled_by_bios = is_disabled,
@@ -5572,7 +5700,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.vcpu_reset = svm_vcpu_reset,
 
 	.vm_init = avic_vm_init,
-	.vm_destroy = avic_vm_destroy,
+	.vm_destroy = svm_vm_destroy,
 
 	.prepare_guest_switch = svm_prepare_guest_switch,
 	.vcpu_load = svm_vcpu_load,
@@ -5671,6 +5799,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.pre_enter_smm = svm_pre_enter_smm,
 	.pre_leave_smm = svm_pre_leave_smm,
 	.enable_smi_window = enable_smi_window,
+
+	.mem_enc_op = svm_mem_enc_op,
 };
 
 static int __init svm_init(void)

From 70cd94e60c733e3afc18b0e6aab789c13b5571da Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:34 -0600
Subject: [PATCH 031/676] KVM: SVM: VMRUN should use associated ASID when SEV
 is enabled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SEV hardware uses ASIDs to associate a memory encryption key with a
guest VM. During guest creation, a SEV VM uses the SEV_CMD_ACTIVATE
command to bind a particular ASID to the guest. Lets make sure that the
VMCB is programmed with the bound ASID before a VMRUN.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kvm/svm.c | 58 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 56 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 51186107eb22a..cdbdc86d7aee9 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -213,6 +213,9 @@ struct vcpu_svm {
 	 */
 	struct list_head ir_list;
 	spinlock_t ir_list_lock;
+
+	/* which host CPU was used for running this vcpu */
+	unsigned int last_cpu;
 };
 
 /*
@@ -341,6 +344,13 @@ static inline bool sev_guest(struct kvm *kvm)
 	return sev->active;
 }
 
+static inline int sev_get_asid(struct kvm *kvm)
+{
+	struct kvm_sev_info *sev = &kvm->arch.sev_info;
+
+	return sev->asid;
+}
+
 static inline void mark_all_dirty(struct vmcb *vmcb)
 {
 	vmcb->control.clean = 0;
@@ -551,6 +561,9 @@ struct svm_cpu_data {
 	struct kvm_ldttss_desc *tss_desc;
 
 	struct page *save_area;
+
+	/* index = sev_asid, value = vmcb pointer */
+	struct vmcb **sev_vmcbs;
 };
 
 static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
@@ -864,6 +877,7 @@ static void svm_cpu_uninit(int cpu)
 		return;
 
 	per_cpu(svm_data, raw_smp_processor_id()) = NULL;
+	kfree(sd->sev_vmcbs);
 	__free_page(sd->save_area);
 	kfree(sd);
 }
@@ -877,11 +891,18 @@ static int svm_cpu_init(int cpu)
 	if (!sd)
 		return -ENOMEM;
 	sd->cpu = cpu;
-	sd->save_area = alloc_page(GFP_KERNEL);
 	r = -ENOMEM;
+	sd->save_area = alloc_page(GFP_KERNEL);
 	if (!sd->save_area)
 		goto err_1;
 
+	if (svm_sev_enabled()) {
+		r = -ENOMEM;
+		sd->sev_vmcbs = kmalloc((max_sev_asid + 1) * sizeof(void *), GFP_KERNEL);
+		if (!sd->sev_vmcbs)
+			goto err_1;
+	}
+
 	per_cpu(svm_data, cpu) = sd;
 
 	return 0;
@@ -1498,10 +1519,16 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
 
 static void __sev_asid_free(int asid)
 {
-	int pos;
+	struct svm_cpu_data *sd;
+	int cpu, pos;
 
 	pos = asid - 1;
 	clear_bit(pos, sev_asid_bitmap);
+
+	for_each_possible_cpu(cpu) {
+		sd = per_cpu(svm_data, cpu);
+		sd->sev_vmcbs[pos] = NULL;
+	}
 }
 
 static void sev_asid_free(struct kvm *kvm)
@@ -4466,12 +4493,39 @@ static void reload_tss(struct kvm_vcpu *vcpu)
 	load_TR_desc();
 }
 
+static void pre_sev_run(struct vcpu_svm *svm, int cpu)
+{
+	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+	int asid = sev_get_asid(svm->vcpu.kvm);
+
+	/* Assign the asid allocated with this SEV guest */
+	svm->vmcb->control.asid = asid;
+
+	/*
+	 * Flush guest TLB:
+	 *
+	 * 1) when different VMCB for the same ASID is to be run on the same host CPU.
+	 * 2) or this VMCB was executed on different host CPU in previous VMRUNs.
+	 */
+	if (sd->sev_vmcbs[asid] == svm->vmcb &&
+	    svm->last_cpu == cpu)
+		return;
+
+	svm->last_cpu = cpu;
+	sd->sev_vmcbs[asid] = svm->vmcb;
+	svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+	mark_dirty(svm->vmcb, VMCB_ASID);
+}
+
 static void pre_svm_run(struct vcpu_svm *svm)
 {
 	int cpu = raw_smp_processor_id();
 
 	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
 
+	if (sev_guest(svm->vcpu.kvm))
+		return pre_sev_run(svm, cpu);
+
 	/* FIXME: handle wraparound of asid_generation */
 	if (svm->asid_generation != sd->asid_generation)
 		new_asid(svm, sd);

From 59414c989220825f970f38dbcbf11f18e817d73c Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:35 -0600
Subject: [PATCH 032/676] KVM: SVM: Add support for KVM_SEV_LAUNCH_START
 command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The KVM_SEV_LAUNCH_START command is used to create a memory encryption
context within the SEV firmware. In order to do so, the guest owner
should provide the guest's policy, its public Diffie-Hellman (PDH) key
and session information. The command implements the LAUNCH_START flow
defined in SEV spec Section 6.2.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Improvements-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/kvm_host.h |   2 +
 arch/x86/kvm/svm.c              | 153 ++++++++++++++++++++++++++++++++
 2 files changed, 155 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 384dcfda43cc7..0ea890375532d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -750,6 +750,8 @@ enum kvm_irqchip_mode {
 struct kvm_sev_info {
 	bool active;		/* SEV enabled guest */
 	unsigned int asid;	/* ASID used for this guest */
+	unsigned int handle;	/* SEV firmware handle */
+	int fd;			/* SEV device fd */
 };
 
 struct kvm_arch {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index cdbdc86d7aee9..e5b712e551865 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1538,11 +1538,45 @@ static void sev_asid_free(struct kvm *kvm)
 	__sev_asid_free(sev->asid);
 }
 
+static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
+{
+	struct sev_data_decommission *decommission;
+	struct sev_data_deactivate *data;
+
+	if (!handle)
+		return;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return;
+
+	/* deactivate handle */
+	data->handle = handle;
+	sev_guest_deactivate(data, NULL);
+
+	wbinvd_on_all_cpus();
+	sev_guest_df_flush(NULL);
+	kfree(data);
+
+	decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
+	if (!decommission)
+		return;
+
+	/* decommission handle */
+	decommission->handle = handle;
+	sev_guest_decommission(decommission, NULL);
+
+	kfree(decommission);
+}
+
 static void sev_vm_destroy(struct kvm *kvm)
 {
+	struct kvm_sev_info *sev = &kvm->arch.sev_info;
+
 	if (!sev_guest(kvm))
 		return;
 
+	sev_unbind_asid(kvm, sev->handle);
 	sev_asid_free(kvm);
 }
 
@@ -5708,6 +5742,122 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	return ret;
 }
 
+static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
+{
+	struct sev_data_activate *data;
+	int asid = sev_get_asid(kvm);
+	int ret;
+
+	wbinvd_on_all_cpus();
+
+	ret = sev_guest_df_flush(error);
+	if (ret)
+		return ret;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	/* activate ASID on the given handle */
+	data->handle = handle;
+	data->asid   = asid;
+	ret = sev_guest_activate(data, error);
+	kfree(data);
+
+	return ret;
+}
+
+static int sev_issue_cmd(int fd, int id, void *data, int *error)
+{
+	struct fd f;
+	int ret;
+
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+
+	ret = sev_issue_cmd_external_user(f.file, id, data, error);
+
+	fdput(f);
+	return ret;
+}
+
+static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+	struct kvm_sev_info *sev = &kvm->arch.sev_info;
+	struct sev_data_launch_start *start;
+	struct kvm_sev_launch_start params;
+	void *dh_blob, *session_blob;
+	int *error = &argp->error;
+	int ret;
+
+	if (!sev_guest(kvm))
+		return -ENOTTY;
+
+	if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+		return -EFAULT;
+
+	start = kzalloc(sizeof(*start), GFP_KERNEL);
+	if (!start)
+		return -ENOMEM;
+
+	dh_blob = NULL;
+	if (params.dh_uaddr) {
+		dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
+		if (IS_ERR(dh_blob)) {
+			ret = PTR_ERR(dh_blob);
+			goto e_free;
+		}
+
+		start->dh_cert_address = __sme_set(__pa(dh_blob));
+		start->dh_cert_len = params.dh_len;
+	}
+
+	session_blob = NULL;
+	if (params.session_uaddr) {
+		session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len);
+		if (IS_ERR(session_blob)) {
+			ret = PTR_ERR(session_blob);
+			goto e_free_dh;
+		}
+
+		start->session_address = __sme_set(__pa(session_blob));
+		start->session_len = params.session_len;
+	}
+
+	start->handle = params.handle;
+	start->policy = params.policy;
+
+	/* create memory encryption context */
+	ret = sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
+	if (ret)
+		goto e_free_session;
+
+	/* Bind ASID to this guest */
+	ret = sev_bind_asid(kvm, start->handle, error);
+	if (ret)
+		goto e_free_session;
+
+	/* return handle to userspace */
+	params.handle = start->handle;
+	if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
+		sev_unbind_asid(kvm, start->handle);
+		ret = -EFAULT;
+		goto e_free_session;
+	}
+
+	sev->handle = start->handle;
+	sev->fd = argp->sev_fd;
+
+e_free_session:
+	kfree(session_blob);
+e_free_dh:
+	kfree(dh_blob);
+e_free:
+	kfree(start);
+	return ret;
+}
+
 static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -5725,6 +5875,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 	case KVM_SEV_INIT:
 		r = sev_guest_init(kvm, &sev_cmd);
 		break;
+	case KVM_SEV_LAUNCH_START:
+		r = sev_launch_start(kvm, &sev_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;

From 89c5058090528026f6542e8b12f4262e492bd3a2 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:35 -0600
Subject: [PATCH 033/676] KVM: SVM: Add support for KVM_SEV_LAUNCH_UPDATE_DATA
 command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The command is used for encrypting the guest memory region using the VM
encryption key (VEK) created during KVM_SEV_LAUNCH_START.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Improvements-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/kvm_host.h |   1 +
 arch/x86/kvm/svm.c              | 191 +++++++++++++++++++++++++++++++-
 2 files changed, 190 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0ea890375532d..a0b021f1fd050 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -752,6 +752,7 @@ struct kvm_sev_info {
 	unsigned int asid;	/* ASID used for this guest */
 	unsigned int handle;	/* SEV firmware handle */
 	int fd;			/* SEV device fd */
+	unsigned long pages_locked; /* Number of pages locked */
 };
 
 struct kvm_arch {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e5b712e551865..88951cbef3ec1 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -39,6 +39,8 @@
 #include <linux/frame.h>
 #include <linux/psp-sev.h>
 #include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
 
 #include <asm/apic.h>
 #include <asm/perf_event.h>
@@ -331,6 +333,7 @@ enum {
 static unsigned int max_sev_asid;
 static unsigned int min_sev_asid;
 static unsigned long *sev_asid_bitmap;
+#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
 
 static inline bool svm_sev_enabled(void)
 {
@@ -1569,6 +1572,83 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
 	kfree(decommission);
 }
 
+static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
+				    unsigned long ulen, unsigned long *n,
+				    int write)
+{
+	struct kvm_sev_info *sev = &kvm->arch.sev_info;
+	unsigned long npages, npinned, size;
+	unsigned long locked, lock_limit;
+	struct page **pages;
+	int first, last;
+
+	/* Calculate number of pages. */
+	first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
+	last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
+	npages = (last - first + 1);
+
+	locked = sev->pages_locked + npages;
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+		pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
+		return NULL;
+	}
+
+	/* Avoid using vmalloc for smaller buffers. */
+	size = npages * sizeof(struct page *);
+	if (size > PAGE_SIZE)
+		pages = vmalloc(size);
+	else
+		pages = kmalloc(size, GFP_KERNEL);
+
+	if (!pages)
+		return NULL;
+
+	/* Pin the user virtual address. */
+	npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
+	if (npinned != npages) {
+		pr_err("SEV: Failure locking %lu pages.\n", npages);
+		goto err;
+	}
+
+	*n = npages;
+	sev->pages_locked = locked;
+
+	return pages;
+
+err:
+	if (npinned > 0)
+		release_pages(pages, npinned);
+
+	kvfree(pages);
+	return NULL;
+}
+
+static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
+			     unsigned long npages)
+{
+	struct kvm_sev_info *sev = &kvm->arch.sev_info;
+
+	release_pages(pages, npages);
+	kvfree(pages);
+	sev->pages_locked -= npages;
+}
+
+static void sev_clflush_pages(struct page *pages[], unsigned long npages)
+{
+	uint8_t *page_virtual;
+	unsigned long i;
+
+	if (npages == 0 || pages == NULL)
+		return;
+
+	for (i = 0; i < npages; i++) {
+		page_virtual = kmap_atomic(pages[i]);
+		clflush_cache_range(page_virtual, PAGE_SIZE);
+		kunmap_atomic(page_virtual);
+	}
+}
+
 static void sev_vm_destroy(struct kvm *kvm)
 {
 	struct kvm_sev_info *sev = &kvm->arch.sev_info;
@@ -5767,7 +5847,7 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
 	return ret;
 }
 
-static int sev_issue_cmd(int fd, int id, void *data, int *error)
+static int __sev_issue_cmd(int fd, int id, void *data, int *error)
 {
 	struct fd f;
 	int ret;
@@ -5782,6 +5862,13 @@ static int sev_issue_cmd(int fd, int id, void *data, int *error)
 	return ret;
 }
 
+static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
+{
+	struct kvm_sev_info *sev = &kvm->arch.sev_info;
+
+	return __sev_issue_cmd(sev->fd, id, data, error);
+}
+
 static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
 {
 	struct kvm_sev_info *sev = &kvm->arch.sev_info;
@@ -5829,7 +5916,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	start->policy = params.policy;
 
 	/* create memory encryption context */
-	ret = sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
+	ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
 	if (ret)
 		goto e_free_session;
 
@@ -5858,6 +5945,103 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	return ret;
 }
 
+static int get_num_contig_pages(int idx, struct page **inpages,
+				unsigned long npages)
+{
+	unsigned long paddr, next_paddr;
+	int i = idx + 1, pages = 1;
+
+	/* find the number of contiguous pages starting from idx */
+	paddr = __sme_page_pa(inpages[idx]);
+	while (i < npages) {
+		next_paddr = __sme_page_pa(inpages[i++]);
+		if ((paddr + PAGE_SIZE) == next_paddr) {
+			pages++;
+			paddr = next_paddr;
+			continue;
+		}
+		break;
+	}
+
+	return pages;
+}
+
+static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+	unsigned long vaddr, vaddr_end, next_vaddr, npages, size;
+	struct kvm_sev_info *sev = &kvm->arch.sev_info;
+	struct kvm_sev_launch_update_data params;
+	struct sev_data_launch_update_data *data;
+	struct page **inpages;
+	int i, ret, pages;
+
+	if (!sev_guest(kvm))
+		return -ENOTTY;
+
+	if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+		return -EFAULT;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	vaddr = params.uaddr;
+	size = params.len;
+	vaddr_end = vaddr + size;
+
+	/* Lock the user memory. */
+	inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
+	if (!inpages) {
+		ret = -ENOMEM;
+		goto e_free;
+	}
+
+	/*
+	 * The LAUNCH_UPDATE command will perform in-place encryption of the
+	 * memory content (i.e it will write the same memory region with C=1).
+	 * It's possible that the cache may contain the data with C=0, i.e.,
+	 * unencrypted so invalidate it first.
+	 */
+	sev_clflush_pages(inpages, npages);
+
+	for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
+		int offset, len;
+
+		/*
+		 * If the user buffer is not page-aligned, calculate the offset
+		 * within the page.
+		 */
+		offset = vaddr & (PAGE_SIZE - 1);
+
+		/* Calculate the number of pages that can be encrypted in one go. */
+		pages = get_num_contig_pages(i, inpages, npages);
+
+		len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
+
+		data->handle = sev->handle;
+		data->len = len;
+		data->address = __sme_page_pa(inpages[i]) + offset;
+		ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error);
+		if (ret)
+			goto e_unpin;
+
+		size -= len;
+		next_vaddr = vaddr + len;
+	}
+
+e_unpin:
+	/* content of memory is updated, mark pages dirty */
+	for (i = 0; i < npages; i++) {
+		set_page_dirty_lock(inpages[i]);
+		mark_page_accessed(inpages[i]);
+	}
+	/* unlock the user pages */
+	sev_unpin_memory(kvm, inpages, npages);
+e_free:
+	kfree(data);
+	return ret;
+}
+
 static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -5878,6 +6062,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 	case KVM_SEV_LAUNCH_START:
 		r = sev_launch_start(kvm, &sev_cmd);
 		break;
+	case KVM_SEV_LAUNCH_UPDATE_DATA:
+		r = sev_launch_update_data(kvm, &sev_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;

From 0d0736f76347f7e14a3e5bb7a202a75a53ce1e3b Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:36 -0600
Subject: [PATCH 034/676] KVM: SVM: Add support for KVM_SEV_LAUNCH_MEASURE
 command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The command is used to retrieve the measurement of contents encrypted
through the KVM_SEV_LAUNCH_UPDATE_DATA command.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kvm/svm.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 88951cbef3ec1..74e010e6b5b91 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -6042,6 +6042,77 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	return ret;
 }
 
+static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+	struct kvm_sev_info *sev = &kvm->arch.sev_info;
+	struct sev_data_launch_measure *data;
+	struct kvm_sev_launch_measure params;
+	void *blob = NULL;
+	int ret;
+
+	if (!sev_guest(kvm))
+		return -ENOTTY;
+
+	if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+		return -EFAULT;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	/* User wants to query the blob length */
+	if (!params.len)
+		goto cmd;
+
+	if (params.uaddr) {
+		if (params.len > SEV_FW_BLOB_MAX_SIZE) {
+			ret = -EINVAL;
+			goto e_free;
+		}
+
+		if (!access_ok(VERIFY_WRITE, params.uaddr, params.len)) {
+			ret = -EFAULT;
+			goto e_free;
+		}
+
+		ret = -ENOMEM;
+		blob = kmalloc(params.len, GFP_KERNEL);
+		if (!blob)
+			goto e_free;
+
+		data->address = __psp_pa(blob);
+		data->len = params.len;
+	}
+
+cmd:
+	data->handle = sev->handle;
+	ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error);
+
+	/*
+	 * If we query the session length, FW responded with expected data.
+	 */
+	if (!params.len)
+		goto done;
+
+	if (ret)
+		goto e_free_blob;
+
+	if (blob) {
+		if (copy_to_user((void __user *)(uintptr_t)params.uaddr, blob, params.len))
+			ret = -EFAULT;
+	}
+
+done:
+	params.len = data->len;
+	if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
+		ret = -EFAULT;
+e_free_blob:
+	kfree(blob);
+e_free:
+	kfree(data);
+	return ret;
+}
+
 static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -6065,6 +6136,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 	case KVM_SEV_LAUNCH_UPDATE_DATA:
 		r = sev_launch_update_data(kvm, &sev_cmd);
 		break;
+	case KVM_SEV_LAUNCH_MEASURE:
+		r = sev_launch_measure(kvm, &sev_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;

From 5bdb0e2fa45eb007a1cc630c6b520a638e60c1f6 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:36 -0600
Subject: [PATCH 035/676] KVM: SVM: Add support for SEV LAUNCH_FINISH command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The command is used for finializing the SEV guest launch process.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kvm/svm.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 74e010e6b5b91..6a8d81311e9db 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -6113,6 +6113,26 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	return ret;
 }
 
+static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+	struct kvm_sev_info *sev = &kvm->arch.sev_info;
+	struct sev_data_launch_finish *data;
+	int ret;
+
+	if (!sev_guest(kvm))
+		return -ENOTTY;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->handle = sev->handle;
+	ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error);
+
+	kfree(data);
+	return ret;
+}
+
 static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -6139,6 +6159,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 	case KVM_SEV_LAUNCH_MEASURE:
 		r = sev_launch_measure(kvm, &sev_cmd);
 		break;
+	case KVM_SEV_LAUNCH_FINISH:
+		r = sev_launch_finish(kvm, &sev_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;

From 255d9e75e2549a5bc85c3606f5a80e8a0ef4d170 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:37 -0600
Subject: [PATCH 036/676] KVM: SVM: Add support for SEV GUEST_STATUS command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The command is used for querying the SEV guest information.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kvm/svm.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6a8d81311e9db..f5da2753790db 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -6133,6 +6133,36 @@ static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	return ret;
 }
 
+static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+	struct kvm_sev_info *sev = &kvm->arch.sev_info;
+	struct kvm_sev_guest_status params;
+	struct sev_data_guest_status *data;
+	int ret;
+
+	if (!sev_guest(kvm))
+		return -ENOTTY;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->handle = sev->handle;
+	ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error);
+	if (ret)
+		goto e_free;
+
+	params.policy = data->policy;
+	params.state = data->state;
+	params.handle = data->handle;
+
+	if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
+		ret = -EFAULT;
+e_free:
+	kfree(data);
+	return ret;
+}
+
 static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -6162,6 +6192,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 	case KVM_SEV_LAUNCH_FINISH:
 		r = sev_launch_finish(kvm, &sev_cmd);
 		break;
+	case KVM_SEV_GUEST_STATUS:
+		r = sev_guest_status(kvm, &sev_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;

From 24f41fb23a39bc2b6f190dcef35a5813a4bf183a Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:37 -0600
Subject: [PATCH 037/676] KVM: SVM: Add support for SEV DEBUG_DECRYPT command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The command is used for decrypting a guest memory region for debug
purposes.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kvm/svm.c | 152 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 152 insertions(+)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f5da2753790db..ec2b864faec2b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -6163,6 +6163,155 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
 	return ret;
 }
 
+static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
+			       unsigned long dst, int size,
+			       int *error, bool enc)
+{
+	struct kvm_sev_info *sev = &kvm->arch.sev_info;
+	struct sev_data_dbg *data;
+	int ret;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->handle = sev->handle;
+	data->dst_addr = dst;
+	data->src_addr = src;
+	data->len = size;
+
+	ret = sev_issue_cmd(kvm,
+			    enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
+			    data, error);
+	kfree(data);
+	return ret;
+}
+
+static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
+			     unsigned long dst_paddr, int sz, int *err)
+{
+	int offset;
+
+	/*
+	 * Its safe to read more than we are asked, caller should ensure that
+	 * destination has enough space.
+	 */
+	src_paddr = round_down(src_paddr, 16);
+	offset = src_paddr & 15;
+	sz = round_up(sz + offset, 16);
+
+	return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
+}
+
+static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
+				  unsigned long __user dst_uaddr,
+				  unsigned long dst_paddr,
+				  int size, int *err)
+{
+	struct page *tpage = NULL;
+	int ret, offset;
+
+	/* if inputs are not 16-byte then use intermediate buffer */
+	if (!IS_ALIGNED(dst_paddr, 16) ||
+	    !IS_ALIGNED(paddr,     16) ||
+	    !IS_ALIGNED(size,      16)) {
+		tpage = (void *)alloc_page(GFP_KERNEL);
+		if (!tpage)
+			return -ENOMEM;
+
+		dst_paddr = __sme_page_pa(tpage);
+	}
+
+	ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
+	if (ret)
+		goto e_free;
+
+	if (tpage) {
+		offset = paddr & 15;
+		if (copy_to_user((void __user *)(uintptr_t)dst_uaddr,
+				 page_address(tpage) + offset, size))
+			ret = -EFAULT;
+	}
+
+e_free:
+	if (tpage)
+		__free_page(tpage);
+
+	return ret;
+}
+
+static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
+{
+	unsigned long vaddr, vaddr_end, next_vaddr;
+	unsigned long dst_vaddr, dst_vaddr_end;
+	struct page **src_p, **dst_p;
+	struct kvm_sev_dbg debug;
+	unsigned long n;
+	int ret, size;
+
+	if (!sev_guest(kvm))
+		return -ENOTTY;
+
+	if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
+		return -EFAULT;
+
+	vaddr = debug.src_uaddr;
+	size = debug.len;
+	vaddr_end = vaddr + size;
+	dst_vaddr = debug.dst_uaddr;
+	dst_vaddr_end = dst_vaddr + size;
+
+	for (; vaddr < vaddr_end; vaddr = next_vaddr) {
+		int len, s_off, d_off;
+
+		/* lock userspace source and destination page */
+		src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
+		if (!src_p)
+			return -EFAULT;
+
+		dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1);
+		if (!dst_p) {
+			sev_unpin_memory(kvm, src_p, n);
+			return -EFAULT;
+		}
+
+		/*
+		 * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the
+		 * memory content (i.e it will write the same memory region with C=1).
+		 * It's possible that the cache may contain the data with C=0, i.e.,
+		 * unencrypted so invalidate it first.
+		 */
+		sev_clflush_pages(src_p, 1);
+		sev_clflush_pages(dst_p, 1);
+
+		/*
+		 * Since user buffer may not be page aligned, calculate the
+		 * offset within the page.
+		 */
+		s_off = vaddr & ~PAGE_MASK;
+		d_off = dst_vaddr & ~PAGE_MASK;
+		len = min_t(size_t, (PAGE_SIZE - s_off), size);
+
+		ret = __sev_dbg_decrypt_user(kvm,
+					     __sme_page_pa(src_p[0]) + s_off,
+					     dst_vaddr,
+					     __sme_page_pa(dst_p[0]) + d_off,
+					     len, &argp->error);
+
+		sev_unpin_memory(kvm, src_p, 1);
+		sev_unpin_memory(kvm, dst_p, 1);
+
+		if (ret)
+			goto err;
+
+		next_vaddr = vaddr + len;
+		dst_vaddr = dst_vaddr + len;
+		size -= len;
+	}
+err:
+	return ret;
+}
+
 static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -6195,6 +6344,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 	case KVM_SEV_GUEST_STATUS:
 		r = sev_guest_status(kvm, &sev_cmd);
 		break;
+	case KVM_SEV_DBG_DECRYPT:
+		r = sev_dbg_crypt(kvm, &sev_cmd, true);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;

From 7d1594f5d94bb9fa60dacca2390a6c58398f005a Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:37 -0600
Subject: [PATCH 038/676] KVM: SVM: Add support for SEV DEBUG_ENCRYPT command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The command copies a plaintext into guest memory and encrypts it using
the VM encryption key. The command will be used for debug purposes
(e.g setting breakpoints through gdbserver)

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
---
 arch/x86/kvm/svm.c | 98 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 93 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ec2b864faec2b..11d4860997d96 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -6240,6 +6240,83 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
 	return ret;
 }
 
+static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
+				  unsigned long __user vaddr,
+				  unsigned long dst_paddr,
+				  unsigned long __user dst_vaddr,
+				  int size, int *error)
+{
+	struct page *src_tpage = NULL;
+	struct page *dst_tpage = NULL;
+	int ret, len = size;
+
+	/* If source buffer is not aligned then use an intermediate buffer */
+	if (!IS_ALIGNED(vaddr, 16)) {
+		src_tpage = alloc_page(GFP_KERNEL);
+		if (!src_tpage)
+			return -ENOMEM;
+
+		if (copy_from_user(page_address(src_tpage),
+				(void __user *)(uintptr_t)vaddr, size)) {
+			__free_page(src_tpage);
+			return -EFAULT;
+		}
+
+		paddr = __sme_page_pa(src_tpage);
+	}
+
+	/*
+	 *  If destination buffer or length is not aligned then do read-modify-write:
+	 *   - decrypt destination in an intermediate buffer
+	 *   - copy the source buffer in an intermediate buffer
+	 *   - use the intermediate buffer as source buffer
+	 */
+	if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
+		int dst_offset;
+
+		dst_tpage = alloc_page(GFP_KERNEL);
+		if (!dst_tpage) {
+			ret = -ENOMEM;
+			goto e_free;
+		}
+
+		ret = __sev_dbg_decrypt(kvm, dst_paddr,
+					__sme_page_pa(dst_tpage), size, error);
+		if (ret)
+			goto e_free;
+
+		/*
+		 *  If source is kernel buffer then use memcpy() otherwise
+		 *  copy_from_user().
+		 */
+		dst_offset = dst_paddr & 15;
+
+		if (src_tpage)
+			memcpy(page_address(dst_tpage) + dst_offset,
+			       page_address(src_tpage), size);
+		else {
+			if (copy_from_user(page_address(dst_tpage) + dst_offset,
+					   (void __user *)(uintptr_t)vaddr, size)) {
+				ret = -EFAULT;
+				goto e_free;
+			}
+		}
+
+		paddr = __sme_page_pa(dst_tpage);
+		dst_paddr = round_down(dst_paddr, 16);
+		len = round_up(size, 16);
+	}
+
+	ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
+
+e_free:
+	if (src_tpage)
+		__free_page(src_tpage);
+	if (dst_tpage)
+		__free_page(dst_tpage);
+	return ret;
+}
+
 static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
 {
 	unsigned long vaddr, vaddr_end, next_vaddr;
@@ -6292,11 +6369,19 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
 		d_off = dst_vaddr & ~PAGE_MASK;
 		len = min_t(size_t, (PAGE_SIZE - s_off), size);
 
-		ret = __sev_dbg_decrypt_user(kvm,
-					     __sme_page_pa(src_p[0]) + s_off,
-					     dst_vaddr,
-					     __sme_page_pa(dst_p[0]) + d_off,
-					     len, &argp->error);
+		if (dec)
+			ret = __sev_dbg_decrypt_user(kvm,
+						     __sme_page_pa(src_p[0]) + s_off,
+						     dst_vaddr,
+						     __sme_page_pa(dst_p[0]) + d_off,
+						     len, &argp->error);
+		else
+			ret = __sev_dbg_encrypt_user(kvm,
+						     __sme_page_pa(src_p[0]) + s_off,
+						     vaddr,
+						     __sme_page_pa(dst_p[0]) + d_off,
+						     dst_vaddr,
+						     len, &argp->error);
 
 		sev_unpin_memory(kvm, src_p, 1);
 		sev_unpin_memory(kvm, dst_p, 1);
@@ -6347,6 +6432,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 	case KVM_SEV_DBG_DECRYPT:
 		r = sev_dbg_crypt(kvm, &sev_cmd, true);
 		break;
+	case KVM_SEV_DBG_ENCRYPT:
+		r = sev_dbg_crypt(kvm, &sev_cmd, false);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;

From 9f5b5b950aa96b3d303d132e069f93c8bc4c9b58 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:38 -0600
Subject: [PATCH 039/676] KVM: SVM: Add support for SEV LAUNCH_SECRET command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The command is used for injecting a secret into the guest memory region.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Reviewed-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kvm/svm.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 11d4860997d96..8a499425bf7ec 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -6397,6 +6397,71 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
 	return ret;
 }
 
+static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+	struct kvm_sev_info *sev = &kvm->arch.sev_info;
+	struct sev_data_launch_secret *data;
+	struct kvm_sev_launch_secret params;
+	struct page **pages;
+	void *blob, *hdr;
+	unsigned long n;
+	int ret;
+
+	if (!sev_guest(kvm))
+		return -ENOTTY;
+
+	if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+		return -EFAULT;
+
+	pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
+	if (!pages)
+		return -ENOMEM;
+
+	/*
+	 * The secret must be copied into contiguous memory region, lets verify
+	 * that userspace memory pages are contiguous before we issue command.
+	 */
+	if (get_num_contig_pages(0, pages, n) != n) {
+		ret = -EINVAL;
+		goto e_unpin_memory;
+	}
+
+	ret = -ENOMEM;
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		goto e_unpin_memory;
+
+	blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
+	if (IS_ERR(blob)) {
+		ret = PTR_ERR(blob);
+		goto e_free;
+	}
+
+	data->trans_address = __psp_pa(blob);
+	data->trans_len = params.trans_len;
+
+	hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
+	if (IS_ERR(hdr)) {
+		ret = PTR_ERR(hdr);
+		goto e_free_blob;
+	}
+	data->trans_address = __psp_pa(blob);
+	data->trans_len = params.trans_len;
+
+	data->handle = sev->handle;
+	ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
+
+	kfree(hdr);
+
+e_free_blob:
+	kfree(blob);
+e_free:
+	kfree(data);
+e_unpin_memory:
+	sev_unpin_memory(kvm, pages, n);
+	return ret;
+}
+
 static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 {
 	struct kvm_sev_cmd sev_cmd;
@@ -6435,6 +6500,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 	case KVM_SEV_DBG_ENCRYPT:
 		r = sev_dbg_crypt(kvm, &sev_cmd, false);
 		break;
+	case KVM_SEV_LAUNCH_SECRET:
+		r = sev_launch_secret(kvm, &sev_cmd);
+		break;
 	default:
 		r = -EINVAL;
 		goto out;

From 1e80fdc09d121d8327cdf62eefbb5abadddca792 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:38 -0600
Subject: [PATCH 040/676] KVM: SVM: Pin guest memory when SEV is active
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The SEV memory encryption engine uses a tweak such that two identical
plaintext pages at different location will have different ciphertext.
So swapping or moving ciphertext of two pages will not result in
plaintext being swapped. Relocating (or migrating) physical backing
pages for a SEV guest will require some additional steps. The current SEV
key management spec does not provide commands to swap or migrate (move)
ciphertext pages. For now, we pin the guest memory registered through
KVM_MEMORY_ENCRYPT_REG_REGION ioctl.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
---
 arch/x86/include/asm/kvm_host.h |   1 +
 arch/x86/kvm/svm.c              | 132 ++++++++++++++++++++++++++++++++
 2 files changed, 133 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a0b021f1fd050..262950f9f2d95 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -753,6 +753,7 @@ struct kvm_sev_info {
 	unsigned int handle;	/* SEV firmware handle */
 	int fd;			/* SEV device fd */
 	unsigned long pages_locked; /* Number of pages locked */
+	struct list_head regions_list;  /* List of registered regions */
 };
 
 struct kvm_arch {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8a499425bf7ec..bdce5b4e71ccb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -335,6 +335,14 @@ static unsigned int min_sev_asid;
 static unsigned long *sev_asid_bitmap;
 #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
 
+struct enc_region {
+	struct list_head list;
+	unsigned long npages;
+	struct page **pages;
+	unsigned long uaddr;
+	unsigned long size;
+};
+
 static inline bool svm_sev_enabled(void)
 {
 	return max_sev_asid;
@@ -1649,13 +1657,46 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages)
 	}
 }
 
+static void __unregister_enc_region_locked(struct kvm *kvm,
+					   struct enc_region *region)
+{
+	/*
+	 * The guest may change the memory encryption attribute from C=0 -> C=1
+	 * or vice versa for this memory range. Lets make sure caches are
+	 * flushed to ensure that guest data gets written into memory with
+	 * correct C-bit.
+	 */
+	sev_clflush_pages(region->pages, region->npages);
+
+	sev_unpin_memory(kvm, region->pages, region->npages);
+	list_del(&region->list);
+	kfree(region);
+}
+
 static void sev_vm_destroy(struct kvm *kvm)
 {
 	struct kvm_sev_info *sev = &kvm->arch.sev_info;
+	struct list_head *head = &sev->regions_list;
+	struct list_head *pos, *q;
 
 	if (!sev_guest(kvm))
 		return;
 
+	mutex_lock(&kvm->lock);
+
+	/*
+	 * if userspace was terminated before unregistering the memory regions
+	 * then lets unpin all the registered memory.
+	 */
+	if (!list_empty(head)) {
+		list_for_each_safe(pos, q, head) {
+			__unregister_enc_region_locked(kvm,
+				list_entry(pos, struct enc_region, list));
+		}
+	}
+
+	mutex_unlock(&kvm->lock);
+
 	sev_unbind_asid(kvm, sev->handle);
 	sev_asid_free(kvm);
 }
@@ -5814,6 +5855,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
 
 	sev->active = true;
 	sev->asid = asid;
+	INIT_LIST_HEAD(&sev->regions_list);
 
 	return 0;
 
@@ -6516,6 +6558,94 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
 	return r;
 }
 
+static int svm_register_enc_region(struct kvm *kvm,
+				   struct kvm_enc_region *range)
+{
+	struct kvm_sev_info *sev = &kvm->arch.sev_info;
+	struct enc_region *region;
+	int ret = 0;
+
+	if (!sev_guest(kvm))
+		return -ENOTTY;
+
+	region = kzalloc(sizeof(*region), GFP_KERNEL);
+	if (!region)
+		return -ENOMEM;
+
+	region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
+	if (!region->pages) {
+		ret = -ENOMEM;
+		goto e_free;
+	}
+
+	/*
+	 * The guest may change the memory encryption attribute from C=0 -> C=1
+	 * or vice versa for this memory range. Lets make sure caches are
+	 * flushed to ensure that guest data gets written into memory with
+	 * correct C-bit.
+	 */
+	sev_clflush_pages(region->pages, region->npages);
+
+	region->uaddr = range->addr;
+	region->size = range->size;
+
+	mutex_lock(&kvm->lock);
+	list_add_tail(&region->list, &sev->regions_list);
+	mutex_unlock(&kvm->lock);
+
+	return ret;
+
+e_free:
+	kfree(region);
+	return ret;
+}
+
+static struct enc_region *
+find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
+{
+	struct kvm_sev_info *sev = &kvm->arch.sev_info;
+	struct list_head *head = &sev->regions_list;
+	struct enc_region *i;
+
+	list_for_each_entry(i, head, list) {
+		if (i->uaddr == range->addr &&
+		    i->size == range->size)
+			return i;
+	}
+
+	return NULL;
+}
+
+
+static int svm_unregister_enc_region(struct kvm *kvm,
+				     struct kvm_enc_region *range)
+{
+	struct enc_region *region;
+	int ret;
+
+	mutex_lock(&kvm->lock);
+
+	if (!sev_guest(kvm)) {
+		ret = -ENOTTY;
+		goto failed;
+	}
+
+	region = find_enc_region(kvm, range);
+	if (!region) {
+		ret = -EINVAL;
+		goto failed;
+	}
+
+	__unregister_enc_region_locked(kvm, region);
+
+	mutex_unlock(&kvm->lock);
+	return 0;
+
+failed:
+	mutex_unlock(&kvm->lock);
+	return ret;
+}
+
 static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.cpu_has_kvm_support = has_svm,
 	.disabled_by_bios = is_disabled,
@@ -6633,6 +6763,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.enable_smi_window = enable_smi_window,
 
 	.mem_enc_op = svm_mem_enc_op,
+	.mem_enc_reg_region = svm_register_enc_region,
+	.mem_enc_unreg_region = svm_unregister_enc_region,
 };
 
 static int __init svm_init(void)

From 0ede79e1322479221fb0d0a7d2828d8ed033f060 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:39 -0600
Subject: [PATCH 041/676] KVM: SVM: Clear C-bit from the page fault address
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When SEV is active, on #VMEXIT the  page fault address will contain the
C-bit. We must clear the C-bit before handling the fault.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
---
 arch/x86/kvm/svm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index bdce5b4e71ccb..f350e9bac9eb0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2430,7 +2430,7 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
 
 static int pf_interception(struct vcpu_svm *svm)
 {
-	u64 fault_address = svm->vmcb->control.exit_info_2;
+	u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
 	u64 error_code = svm->vmcb->control.exit_info_1;
 
 	return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
@@ -2440,7 +2440,7 @@ static int pf_interception(struct vcpu_svm *svm)
 
 static int npf_interception(struct vcpu_svm *svm)
 {
-	u64 fault_address = svm->vmcb->control.exit_info_2;
+	u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
 	u64 error_code = svm->vmcb->control.exit_info_1;
 
 	trace_kvm_page_fault(fault_address, error_code);

From 35c6f649bbb2e3f367116307273f998f0bf3e08e Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:39 -0600
Subject: [PATCH 042/676] KVM: SVM: Do not install #UD intercept when SEV is
 enabled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On #UD, x86_emulate_instruction() fetches the data from guest memory and
decodes the instruction bytes to assist further. When SEV is enabled, the
instruction bytes will be encrypted using the guest-specific key and the
hypervisor will no longer able to fetch the instruction bytes to assist
UD handling. By not installing intercept we let the guest receive and
handle #UD.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
---
 arch/x86/kvm/svm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f350e9bac9eb0..3e848f952b4fd 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1443,8 +1443,10 @@ static void init_vmcb(struct vcpu_svm *svm)
 		svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
 	}
 
-	if (sev_guest(svm->vcpu.kvm))
+	if (sev_guest(svm->vcpu.kvm)) {
 		svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
+		clr_exception_intercept(svm, UD_VECTOR);
+	}
 
 	mark_all_dirty(svm->vmcb);
 

From 00b10fe1046c4b2232097a7ffaa9238c7e479388 Mon Sep 17 00:00:00 2001
From: Brijesh Singh <brijesh.singh@amd.com>
Date: Mon, 4 Dec 2017 10:57:40 -0600
Subject: [PATCH 043/676] KVM: X86: Restart the guest when insn_len is zero and
 SEV is enabled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On AMD platforms, under certain conditions insn_len may be zero on #NPF.
This can happen if a guest gets a page-fault on data access but the HW
table walker is not able to read the instruction page (e.g instruction
page is not present in memory).

Typically, when insn_len is zero, x86_emulate_instruction() walks the
guest page table and fetches the instruction bytes from guest memory.
When SEV is enabled, the guest memory is encrypted with guest-specific
key hence hypervisor will not able to fetch the instruction bytes.
In those cases we simply restart the guest.

I have encountered this issue when running kernbench inside the guest.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: x86@kernel.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
---
 arch/x86/kvm/mmu.c | 10 ++++++++++
 arch/x86/kvm/svm.c |  6 ++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e5e66e5c66405..d5e5dbd0e5ad9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4950,6 +4950,16 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
 	if (mmio_info_in_cache(vcpu, cr2, direct))
 		emulation_type = 0;
 emulate:
+	/*
+	 * On AMD platforms, under certain conditions insn_len may be zero on #NPF.
+	 * This can happen if a guest gets a page-fault on data access but the HW
+	 * table walker is not able to read the instruction page (e.g instruction
+	 * page is not present in memory). In those cases we simply restart the
+	 * guest.
+	 */
+	if (unlikely(insn && !insn_len))
+		return 1;
+
 	er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
 
 	switch (er) {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 3e848f952b4fd..ec5df57529952 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2436,7 +2436,8 @@ static int pf_interception(struct vcpu_svm *svm)
 	u64 error_code = svm->vmcb->control.exit_info_1;
 
 	return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
-			svm->vmcb->control.insn_bytes,
+			static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+			svm->vmcb->control.insn_bytes : NULL,
 			svm->vmcb->control.insn_len);
 }
 
@@ -2447,7 +2448,8 @@ static int npf_interception(struct vcpu_svm *svm)
 
 	trace_kvm_page_fault(fault_address, error_code);
 	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
-			svm->vmcb->control.insn_bytes,
+			static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
+			svm->vmcb->control.insn_bytes : NULL,
 			svm->vmcb->control.insn_len);
 }
 

From ae3e61e1c28338d077b704505570fa181df1e41f Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 12 Jul 2016 10:36:41 +0200
Subject: [PATCH 044/676] KVM: x86: add support for UMIP

Add the CPUID bits, make the CR4.UMIP bit not reserved anymore, and
add UMIP support for instructions that are already emulated by KVM.

Reviewed-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 +-
 arch/x86/kvm/cpuid.c            | 4 ++--
 arch/x86/kvm/emulate.c          | 8 ++++++++
 arch/x86/kvm/x86.c              | 3 +++
 4 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5167984313282..ff79134d1d71a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -86,7 +86,7 @@
 			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
 			  | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
 			  | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
-			  | X86_CR4_SMAP | X86_CR4_PKE))
+			  | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0099e10eb0452..77fb8732b47b3 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -387,8 +387,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	/* cpuid 7.0.ecx*/
 	const u32 kvm_cpuid_7_0_ecx_x86_features =
-		F(AVX512VBMI) | F(LA57) | F(PKU) |
-		0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ);
+		F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
+		F(AVX512_VPOPCNTDQ) | F(UMIP);
 
 	/* cpuid 7.0.edx*/
 	const u32 kvm_cpuid_7_0_edx_x86_features =
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index abe74f779f9d7..5edb252676288 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3720,6 +3720,10 @@ static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt,
 {
 	struct desc_ptr desc_ptr;
 
+	if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+	    ctxt->ops->cpl(ctxt) > 0)
+		return emulate_gp(ctxt, 0);
+
 	if (ctxt->mode == X86EMUL_MODE_PROT64)
 		ctxt->op_bytes = 8;
 	get(ctxt, &desc_ptr);
@@ -3779,6 +3783,10 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt)
 
 static int em_smsw(struct x86_emulate_ctxt *ctxt)
 {
+	if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+	    ctxt->ops->cpl(ctxt) > 0)
+		return emulate_gp(ctxt, 0);
+
 	if (ctxt->dst.type == OP_MEM)
 		ctxt->dst.bytes = 2;
 	ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 56d036b9ad75d..dd2e80ac49ffc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -794,6 +794,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
 		return 1;
 
+	if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
+		return 1;
+
 	if (is_long_mode(vcpu)) {
 		if (!(cr4 & X86_CR4_PAE))
 			return 1;

From dd307d017b445a3af4379c7ff548cb3da5ecde31 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 12 Jul 2016 10:35:51 +0200
Subject: [PATCH 045/676] KVM: x86: emulate sldt and str

These are needed to handle the descriptor table vmexits when emulating
UMIP.

Reviewed-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/emulate.c | 32 ++++++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5edb252676288..fa703e3baa4bb 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3633,17 +3633,27 @@ static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
-static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
+static int em_store_sreg(struct x86_emulate_ctxt *ctxt, int segment)
 {
-	if (ctxt->modrm_reg > VCPU_SREG_GS)
-		return emulate_ud(ctxt);
+	if (segment > VCPU_SREG_GS &&
+	    (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) &&
+	    ctxt->ops->cpl(ctxt) > 0)
+		return emulate_gp(ctxt, 0);
 
-	ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg);
+	ctxt->dst.val = get_segment_selector(ctxt, segment);
 	if (ctxt->dst.bytes == 4 && ctxt->dst.type == OP_MEM)
 		ctxt->dst.bytes = 2;
 	return X86EMUL_CONTINUE;
 }
 
+static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
+{
+	if (ctxt->modrm_reg > VCPU_SREG_GS)
+		return emulate_ud(ctxt);
+
+	return em_store_sreg(ctxt, ctxt->modrm_reg);
+}
+
 static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
 {
 	u16 sel = ctxt->src.val;
@@ -3659,6 +3669,11 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
 	return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
 }
 
+static int em_sldt(struct x86_emulate_ctxt *ctxt)
+{
+	return em_store_sreg(ctxt, VCPU_SREG_LDTR);
+}
+
 static int em_lldt(struct x86_emulate_ctxt *ctxt)
 {
 	u16 sel = ctxt->src.val;
@@ -3668,6 +3683,11 @@ static int em_lldt(struct x86_emulate_ctxt *ctxt)
 	return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR);
 }
 
+static int em_str(struct x86_emulate_ctxt *ctxt)
+{
+	return em_store_sreg(ctxt, VCPU_SREG_TR);
+}
+
 static int em_ltr(struct x86_emulate_ctxt *ctxt)
 {
 	u16 sel = ctxt->src.val;
@@ -4372,8 +4392,8 @@ static const struct opcode group5[] = {
 };
 
 static const struct opcode group6[] = {
-	DI(Prot | DstMem,	sldt),
-	DI(Prot | DstMem,	str),
+	II(Prot | DstMem,	   em_sldt, sldt),
+	II(Prot | DstMem,	   em_str, str),
 	II(Prot | Priv | SrcMem16, em_lldt, lldt),
 	II(Prot | Priv | SrcMem16, em_ltr, ltr),
 	N, N, N, N,

From 66336cab3531d3325ebde36a04725dddd0c42cb5 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 12 Jul 2016 10:36:41 +0200
Subject: [PATCH 046/676] KVM: x86: add support for emulating UMIP

The User-Mode Instruction Prevention feature present in recent Intel
processor prevents a group of instructions (sgdt, sidt, sldt, smsw, and
str) from being executed with CPL > 0. Otherwise, a general protection
fault is issued.

UMIP instructions in general are also able to trigger vmexits, so we can
actually emulate UMIP on older processors.  This commit sets up the
infrastructure so that kvm-intel.ko and kvm-amd.ko can set the UMIP
feature bit for CPUID even if the feature is not actually available
in hardware.

Reviewed-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 +
 arch/x86/kvm/cpuid.c            | 2 ++
 arch/x86/kvm/svm.c              | 6 ++++++
 arch/x86/kvm/vmx.c              | 6 ++++++
 4 files changed, 15 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ff79134d1d71a..515db75081d13 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1017,6 +1017,7 @@ struct kvm_x86_ops {
 	void (*handle_external_intr)(struct kvm_vcpu *vcpu);
 	bool (*mpx_supported)(void);
 	bool (*xsaves_supported)(void);
+	bool (*umip_emulated)(void);
 
 	int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 77fb8732b47b3..2b3b06458f6f6 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -327,6 +327,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
 	unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0;
 	unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
+	unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
 
 	/* cpuid 1.edx */
 	const u32 kvm_cpuid_1_edx_x86_features =
@@ -473,6 +474,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			entry->ebx |= F(TSC_ADJUST);
 			entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
 			cpuid_mask(&entry->ecx, CPUID_7_ECX);
+			entry->ecx |= f_umip;
 			/* PKU is not yet implemented for shadow paging. */
 			if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
 				entry->ecx &= ~F(PKU);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index eb714f1cdf7ee..1f3e7f210374a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5204,6 +5204,11 @@ static bool svm_xsaves_supported(void)
 	return false;
 }
 
+static bool svm_umip_emulated(void)
+{
+	return false;
+}
+
 static bool svm_has_wbinvd_exit(void)
 {
 	return true;
@@ -5597,6 +5602,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.invpcid_supported = svm_invpcid_supported,
 	.mpx_supported = svm_mpx_supported,
 	.xsaves_supported = svm_xsaves_supported,
+	.umip_emulated = svm_umip_emulated,
 
 	.set_supported_cpuid = svm_set_supported_cpuid,
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8eba631c4dbd5..b989cfe412b13 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9155,6 +9155,11 @@ static bool vmx_xsaves_supported(void)
 		SECONDARY_EXEC_XSAVES;
 }
 
+static bool vmx_umip_emulated(void)
+{
+	return false;
+}
+
 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 {
 	u32 exit_intr_info;
@@ -12170,6 +12175,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.handle_external_intr = vmx_handle_external_intr,
 	.mpx_supported = vmx_mpx_supported,
 	.xsaves_supported = vmx_xsaves_supported,
+	.umip_emulated = vmx_umip_emulated,
 
 	.check_nested_events = vmx_check_nested_events,
 

From 0367f205a3b7c0efe774634eef1f4697c79a4132 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 12 Jul 2016 10:44:55 +0200
Subject: [PATCH 047/676] KVM: vmx: add support for emulating UMIP

UMIP can be emulated almost perfectly on Intel processor by enabling
descriptor-table exits.  SMSW does not cause a vmexit and hence it
cannot be changed into a #GP fault, but all in all it's the most
"innocuous" of the unprivileged instructions that UMIP blocks.

In fact, Linux is _also_ emulating SMSW instructions on behalf of the
program that executes them, because some 16-bit programs expect to use
SMSW to detect vm86 mode, so this is an even smaller issue.

Reviewed-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b989cfe412b13..ebd50de981ee6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3662,6 +3662,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 			SECONDARY_EXEC_ENABLE_EPT |
 			SECONDARY_EXEC_UNRESTRICTED_GUEST |
 			SECONDARY_EXEC_PAUSE_LOOP_EXITING |
+			SECONDARY_EXEC_DESC |
 			SECONDARY_EXEC_RDTSCP |
 			SECONDARY_EXEC_ENABLE_INVPCID |
 			SECONDARY_EXEC_APIC_REGISTER_VIRT |
@@ -4369,6 +4370,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		(to_vmx(vcpu)->rmode.vm86_active ?
 		 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
 
+	if ((cr4 & X86_CR4_UMIP) && !boot_cpu_has(X86_FEATURE_UMIP)) {
+		vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+			      SECONDARY_EXEC_DESC);
+		hw_cr4 &= ~X86_CR4_UMIP;
+	} else
+		vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+				SECONDARY_EXEC_DESC);
+
 	if (cr4 & X86_CR4_VMXE) {
 		/*
 		 * To use VMXON (and later other VMX instructions), a guest
@@ -5308,6 +5317,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
 	struct kvm_vcpu *vcpu = &vmx->vcpu;
 
 	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
+
 	if (!cpu_need_virtualize_apic_accesses(vcpu))
 		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 	if (vmx->vpid == 0)
@@ -5326,6 +5336,11 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
 		exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
 				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+
+	/* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
+	 * in vmx_set_cr4.  */
+	exec_control &= ~SECONDARY_EXEC_DESC;
+
 	/* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
 	   (handle_vmptrld).
 	   We can NOT enable shadow_vmcs here because we don't have yet
@@ -6101,6 +6116,12 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
 		return kvm_set_cr4(vcpu, val);
 }
 
+static int handle_desc(struct kvm_vcpu *vcpu)
+{
+	WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
+	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+}
+
 static int handle_cr(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification, val;
@@ -8193,6 +8214,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_XSETBV]                  = handle_xsetbv,
 	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
 	[EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
+	[EXIT_REASON_GDTR_IDTR]		      = handle_desc,
+	[EXIT_REASON_LDTR_TR]		      = handle_desc,
 	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
 	[EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
 	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
@@ -9157,7 +9180,8 @@ static bool vmx_xsaves_supported(void)
 
 static bool vmx_umip_emulated(void)
 {
-	return false;
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_DESC;
 }
 
 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
@@ -9755,7 +9779,8 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl)
 	u32 mask =
 		SECONDARY_EXEC_SHADOW_VMCS |
 		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
-		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+		SECONDARY_EXEC_DESC;
 
 	u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
 

From fb6d4d340e0532032c808a9933eaaa7b8de435ab Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 12 Jul 2016 11:04:26 +0200
Subject: [PATCH 048/676] KVM: x86: emulate RDPID

This is encoded as F3 0F C7 /7 with a register argument.  The register
argument is the second array in the group9 GroupDual, while F3 is the
fourth element of a Prefix.

Reviewed-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/cpuid.c   |  7 ++++++-
 arch/x86/kvm/emulate.c | 22 +++++++++++++++++++++-
 arch/x86/kvm/vmx.c     | 15 +++++++++++++++
 3 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 2b3b06458f6f6..b94371146533f 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -293,13 +293,18 @@ static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
 {
 	switch (func) {
 	case 0:
-		entry->eax = 1;		/* only one leaf currently */
+		entry->eax = 7;
 		++*nent;
 		break;
 	case 1:
 		entry->ecx = F(MOVBE);
 		++*nent;
 		break;
+	case 7:
+		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+		if (index == 0)
+			entry->ecx = F(RDPID);
+		++*nent;
 	default:
 		break;
 	}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index fa703e3baa4bb..cb929d0bb1bd1 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3514,6 +3514,16 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_rdpid(struct x86_emulate_ctxt *ctxt)
+{
+	u64 tsc_aux = 0;
+
+	if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux))
+		return emulate_gp(ctxt, 0);
+	ctxt->dst.val = tsc_aux;
+	return X86EMUL_CONTINUE;
+}
+
 static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
 {
 	u64 tsc = 0;
@@ -4424,10 +4434,20 @@ static const struct opcode group8[] = {
 	F(DstMem | SrcImmByte | Lock | PageTable,	em_btc),
 };
 
+/*
+ * The "memory" destination is actually always a register, since we come
+ * from the register case of group9.
+ */
+static const struct gprefix pfx_0f_c7_7 = {
+	N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdtscp),
+};
+
+
 static const struct group_dual group9 = { {
 	N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
 }, {
-	N, N, N, N, N, N, N, N,
+	N, N, N, N, N, N, N,
+	GP(0, &pfx_0f_c7_7),
 } };
 
 static const struct opcode group11[] = {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ebd50de981ee6..8dafc5d7d4d66 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11687,6 +11687,21 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
 			       struct x86_instruction_info *info,
 			       enum x86_intercept_stage stage)
 {
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+
+	/*
+	 * RDPID causes #UD if disabled through secondary execution controls.
+	 * Because it is marked as EmulateOnUD, we need to intercept it here.
+	 */
+	if (info->intercept == x86_intercept_rdtscp &&
+	    !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
+		ctxt->exception.vector = UD_VECTOR;
+		ctxt->exception.error_code_valid = false;
+		return X86EMUL_PROPAGATE_FAULT;
+	}
+
+	/* TODO: check more intercepts... */
 	return X86EMUL_CONTINUE;
 }
 

From 858ac87fc2ff7be8fedfa8dd2ba47627068c2eae Mon Sep 17 00:00:00 2001
From: Gimcuan Hui <gimcuan@gmail.com>
Date: Fri, 3 Nov 2017 18:52:46 +0000
Subject: [PATCH 049/676] x86: kvm: mmu: make kvm_mmu_clear_all_pte_masks
 static

The kvm_mmu_clear_all_pte_masks interface is only used by kvm_mmu_module_init
locally, and does not need to be called by other module, make it static.

This patch cleans up sparse warning:
symbol 'kvm_mmu_clear_all_pte_masks' was not declared. Should it be static?

Signed-off-by: Gimcuan Hui <gimcuan@gmail.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c4deb1f34faa6..89da688784fa7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -381,7 +381,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
-void kvm_mmu_clear_all_pte_masks(void)
+static void kvm_mmu_clear_all_pte_masks(void)
 {
 	shadow_user_mask = 0;
 	shadow_accessed_mask = 0;

From 0d37d26f11bc1c0085d0c89ba603e3bb2c7f7076 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 30 Nov 2017 12:04:22 +0000
Subject: [PATCH 050/676] KVM: x86: MMU: make array audit_point_name static

The array audit_point_name is local to the source and does not need to
be in global scope, so make it static.

Cleans up sparse warning:
arch/x86/kvm/mmu_audit.c:22:12: warning: symbol 'audit_point_name' was
not declared. Should it be static?

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu_audit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index d22ddbdf5e6ed..1272861e77b9e 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,7 +19,7 @@
 
 #include <linux/ratelimit.h>
 
-char const *audit_point_name[] = {
+static char const *audit_point_name[] = {
 	"pre page fault",
 	"post page fault",
 	"pre pte write",

From 2a140f3b6e23a309453b6f68709a50ece543f0f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Wed, 29 Nov 2017 22:23:41 +0100
Subject: [PATCH 051/676] KVM: x86: prevent MWAIT in guest with buggy MONITOR
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The bug prevents MWAIT from waking up after a write to the monitored
cache line.
KVM might emulate a CPU model that shouldn't have the bug, so the guest
would not employ a workaround and possibly miss wakeups.
Better to avoid the situation.

Reviewed-by: Alexander Graf <agraf@suse.de>
Acked-by: Borislav Petkov <bp@suse.de>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index d0b95b7a90b4e..81f5f50794f66 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -281,6 +281,9 @@ static inline bool kvm_mwait_in_guest(void)
 		return false;
 	}
 
+	if (boot_cpu_has_bug(X86_BUG_MONITOR))
+		return false;
+
 	/*
 	 * Intel CPUs without CPUID5_ECX_INTERRUPT_BREAK are problematic as
 	 * they would allow guest to stop the CPU completely by disabling

From 346f48fa311ceee7478b5b810ba050a4a22e0b2e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Wed, 29 Nov 2017 22:23:42 +0100
Subject: [PATCH 052/676] KVM: x86: drop bogus MWAIT check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The check was added in some iteration while trying to fix a reported OS
X on Core 2 bug, but that bug is elsewhere.

The comment is misleading because the guest can call MWAIT with ECX = 0
even if we enforce CPUID5_ECX_INTERRUPT_BREAK;  the call would have the
exactly the same effect as if the host didn't have the feature.

A problem is that a QEMU feature exposes CPUID5_ECX_INTERRUPT_BREAK on
CPUs that do not support it.  Removing the check changes behavior on
last Pentium 4 lines (Presler, Dempsey, and Tulsa, which had VMX and
MONITOR while missing INTERRUPT_BREAK) when running a guest OS that uses
MWAIT without checking for its presence (QEMU doesn't expose MONITOR).

The only known OS that ignores the MONITOR flag is old Mac OS X and we
allowed it to bug on Core 2 (MWAIT used to throw #UD and only that OS
noticed), so we can save another 20 lines letting it bug on even older
CPUs.  Alternatively, we can return MWAIT exiting by default and let
userspace toggle it.

Reviewed-by: Alexander Graf <agraf@suse.de>
Acked-by: Borislav Petkov <bp@suse.de>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.h | 23 +----------------------
 1 file changed, 1 insertion(+), 22 deletions(-)

diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 81f5f50794f66..d15859ec5e923 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -265,8 +265,6 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 
 static inline bool kvm_mwait_in_guest(void)
 {
-	unsigned int eax, ebx, ecx, edx;
-
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT))
 		return false;
 
@@ -275,29 +273,10 @@ static inline bool kvm_mwait_in_guest(void)
 		/* All AMD CPUs have a working MWAIT implementation */
 		return true;
 	case X86_VENDOR_INTEL:
-		/* Handle Intel below */
-		break;
+		return !boot_cpu_has_bug(X86_BUG_MONITOR);
 	default:
 		return false;
 	}
-
-	if (boot_cpu_has_bug(X86_BUG_MONITOR))
-		return false;
-
-	/*
-	 * Intel CPUs without CPUID5_ECX_INTERRUPT_BREAK are problematic as
-	 * they would allow guest to stop the CPU completely by disabling
-	 * interrupts then invoking MWAIT.
-	 */
-	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
-		return false;
-
-	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
-
-	if (!(ecx & CPUID5_ECX_INTERRUPT_BREAK))
-		return false;
-
-	return true;
 }
 
 #endif

From 431f5d4443964bcc58ac2e4c6fd8ace6dcd3a8d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= <rkrcmar@redhat.com>
Date: Wed, 29 Nov 2017 22:23:43 +0100
Subject: [PATCH 053/676] KVM: x86: simplify kvm_mwait_in_guest()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If Intel/AMD implements MWAIT, we expect that it works well and only
reject known bugs;  no reason to do it the other way around for minor
vendors.  (Not that they are relevant ATM.)

This allows further simplification of kvm_mwait_in_guest().
And use boot_cpu_has() instead of "cpu_has(&boot_cpu_data," while at it.

Reviewed-by: Alexander Graf <agraf@suse.de>
Acked-by: Borislav Petkov <bp@suse.de>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.h | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index d15859ec5e923..c69f973111cb2 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -265,18 +265,8 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 
 static inline bool kvm_mwait_in_guest(void)
 {
-	if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT))
-		return false;
-
-	switch (boot_cpu_data.x86_vendor) {
-	case X86_VENDOR_AMD:
-		/* All AMD CPUs have a working MWAIT implementation */
-		return true;
-	case X86_VENDOR_INTEL:
-		return !boot_cpu_has_bug(X86_BUG_MONITOR);
-	default:
-		return false;
-	}
+	return boot_cpu_has(X86_FEATURE_MWAIT) &&
+		!boot_cpu_has_bug(X86_BUG_MONITOR);
 }
 
 #endif

From 52797bf9a875c4a30f846196386684e646e08a91 Mon Sep 17 00:00:00 2001
From: Liran Alon <liran.alon@oracle.com>
Date: Wed, 15 Nov 2017 13:43:14 +0200
Subject: [PATCH 054/676] KVM: x86: Add emulation of MSR_SMI_COUNT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This MSR returns the number of #SMIs that occurred on CPU since
boot.

It was seen to be used frequently by ESXi guest.

Patch adds a new vcpu-arch specific var called smi_count to
save the number of #SMIs which occurred on CPU since boot.
It is exposed as a read-only MSR to guest (causing #GP
on wrmsr) in RDMSR/WRMSR emulation code.
MSR_SMI_COUNT is also added to emulated_msrs[] to make sure
user-space can save/restore it for migration purposes.

Signed-off-by: Liran Alon <liran.alon@oracle.com>
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com>
Reviewed-by: Bhavesh Davda <bhavesh.davda@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/x86.c              | 11 +++++++++++
 2 files changed, 12 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 515db75081d13..340e3604dcc77 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -504,6 +504,7 @@ struct kvm_vcpu_arch {
 	int mp_state;
 	u64 ia32_misc_enable_msr;
 	u64 smbase;
+	u64 smi_count;
 	bool tpr_access_reporting;
 	u64 ia32_xss;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dd2e80ac49ffc..7ea9a57b0684a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1039,6 +1039,7 @@ static u32 emulated_msrs[] = {
 	MSR_IA32_MCG_CTL,
 	MSR_IA32_MCG_EXT_CTL,
 	MSR_IA32_SMBASE,
+	MSR_SMI_COUNT,
 	MSR_PLATFORM_INFO,
 	MSR_MISC_FEATURES_ENABLES,
 };
@@ -2231,6 +2232,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		vcpu->arch.smbase = data;
 		break;
+	case MSR_SMI_COUNT:
+		if (!msr_info->host_initiated)
+			return 1;
+		vcpu->arch.smi_count = data;
+		break;
 	case MSR_KVM_WALL_CLOCK_NEW:
 	case MSR_KVM_WALL_CLOCK:
 		vcpu->kvm->arch.wall_clock = data;
@@ -2505,6 +2511,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		msr_info->data = vcpu->arch.smbase;
 		break;
+	case MSR_SMI_COUNT:
+		msr_info->data = vcpu->arch.smi_count;
+		break;
 	case MSR_IA32_PERF_STATUS:
 		/* TSC increment by tick */
 		msr_info->data = 1000ULL;
@@ -6451,6 +6460,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 		kvm_x86_ops->queue_exception(vcpu);
 	} else if (vcpu->arch.smi_pending && !is_smm(vcpu) && kvm_x86_ops->smi_allowed(vcpu)) {
 		vcpu->arch.smi_pending = false;
+		++vcpu->arch.smi_count;
 		enter_smm(vcpu);
 	} else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
 		--vcpu->arch.nmi_pending;
@@ -7808,6 +7818,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 	vcpu->arch.hflags = 0;
 
 	vcpu->arch.smi_pending = 0;
+	vcpu->arch.smi_count = 0;
 	atomic_set(&vcpu->arch.nmi_queued, 0);
 	vcpu->arch.nmi_pending = 0;
 	vcpu->arch.nmi_injected = false;

From 78588335065967993618106c664fe62d2e271435 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 21 Nov 2017 13:40:17 +0100
Subject: [PATCH 055/676] kvm_main: Use common error handling code in
 kvm_dev_ioctl_create_vm()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a jump target so that a bit of exception handling can be better reused
at the end of this function.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 virt/kvm/kvm_main.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 210bf820385a7..bc092e3d1d731 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3186,21 +3186,18 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
 		return PTR_ERR(kvm);
 #ifdef CONFIG_KVM_MMIO
 	r = kvm_coalesced_mmio_init(kvm);
-	if (r < 0) {
-		kvm_put_kvm(kvm);
-		return r;
-	}
+	if (r < 0)
+		goto put_kvm;
 #endif
 	r = get_unused_fd_flags(O_CLOEXEC);
-	if (r < 0) {
-		kvm_put_kvm(kvm);
-		return r;
-	}
+	if (r < 0)
+		goto put_kvm;
+
 	file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
 	if (IS_ERR(file)) {
 		put_unused_fd(r);
-		kvm_put_kvm(kvm);
-		return PTR_ERR(file);
+		r = PTR_ERR(file);
+		goto put_kvm;
 	}
 
 	/*
@@ -3218,6 +3215,10 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
 
 	fd_install(r, file);
 	return r;
+
+put_kvm:
+	kvm_put_kvm(kvm);
+	return r;
 }
 
 static long kvm_dev_ioctl(struct file *filp,

From 80fef315a74d79d765dbf58d9481843a364c50d6 Mon Sep 17 00:00:00 2001
From: Yang Zhong <yang.zhong@intel.com>
Date: Wed, 22 Nov 2017 15:27:29 +0800
Subject: [PATCH 056/676] KVM: Expose new cpu features to guest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Intel IceLake cpu has added new cpu features,AVX512_VBMI2/GFNI/
VAES/VPCLMULQDQ/AVX512_VNNI/AVX512_BITALG. Those new cpu features
need expose to guest VM.

The bit definition:
CPUID.(EAX=7,ECX=0):ECX[bit 06] AVX512_VBMI2
CPUID.(EAX=7,ECX=0):ECX[bit 08] GFNI
CPUID.(EAX=7,ECX=0):ECX[bit 09] VAES
CPUID.(EAX=7,ECX=0):ECX[bit 10] VPCLMULQDQ
CPUID.(EAX=7,ECX=0):ECX[bit 11] AVX512_VNNI
CPUID.(EAX=7,ECX=0):ECX[bit 12] AVX512_BITALG

The release document ref below link:
https://software.intel.com/sites/default/files/managed/c5/15/\
architecture-instruction-set-extensions-programming-reference.pdf

The kernel dependency commit in kvm.git:
(c128dbfa0f879f8ce7b79054037889b0b2240728)

Signed-off-by: Yang Zhong <yang.zhong@intel.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/cpuid.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index b94371146533f..3db9e1cd4f63b 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -394,7 +394,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	/* cpuid 7.0.ecx*/
 	const u32 kvm_cpuid_7_0_ecx_x86_features =
 		F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
-		F(AVX512_VPOPCNTDQ) | F(UMIP);
+		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
+		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG);
 
 	/* cpuid 7.0.edx*/
 	const u32 kvm_cpuid_7_0_edx_x86_features =

From 00647b44944a2f7212ac2c3825a465153d6e438f Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Mon, 27 Nov 2017 17:22:25 -0600
Subject: [PATCH 057/676] KVM: nVMX: Eliminate vmcs02 pool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The potential performance advantages of a vmcs02 pool have never been
realized. To simplify the code, eliminate the pool. Instead, a single
vmcs02 is allocated per VCPU when the VCPU enters VMX operation.

Signed-off-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Mark Kanda <mark.kanda@oracle.com>
Reviewed-by: Ameya More <ameya.more@oracle.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 146 +++++++--------------------------------------
 1 file changed, 23 insertions(+), 123 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8dafc5d7d4d66..d1870f3c8c699 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -184,7 +184,6 @@ module_param(ple_window_max, int, S_IRUGO);
 extern const ulong vmx_return;
 
 #define NR_AUTOLOAD_MSRS 8
-#define VMCS02_POOL_SIZE 1
 
 struct vmcs {
 	u32 revision_id;
@@ -225,7 +224,7 @@ struct shared_msr_entry {
  * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
  * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
  * More than one of these structures may exist, if L1 runs multiple L2 guests.
- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
+ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
  * underlying hardware which will be used to run L2.
  * This structure is packed to ensure that its layout is identical across
  * machines (necessary for live migration).
@@ -408,13 +407,6 @@ struct __packed vmcs12 {
  */
 #define VMCS12_SIZE 0x1000
 
-/* Used to remember the last vmcs02 used for some recently used vmcs12s */
-struct vmcs02_list {
-	struct list_head list;
-	gpa_t vmptr;
-	struct loaded_vmcs vmcs02;
-};
-
 /*
  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -439,15 +431,15 @@ struct nested_vmx {
 	 */
 	bool sync_shadow_vmcs;
 
-	/* vmcs02_list cache of VMCSs recently used to run L2 guests */
-	struct list_head vmcs02_pool;
-	int vmcs02_num;
 	bool change_vmcs01_virtual_x2apic_mode;
 	/* L2 must run next, and mustn't decide to exit to L1. */
 	bool nested_run_pending;
+
+	struct loaded_vmcs vmcs02;
+
 	/*
-	 * Guest pages referred to in vmcs02 with host-physical pointers, so
-	 * we must keep them pinned while L2 runs.
+	 * Guest pages referred to in the vmcs02 with host-physical
+	 * pointers, so we must keep them pinned while L2 runs.
 	 */
 	struct page *apic_access_page;
 	struct page *virtual_apic_page;
@@ -6988,94 +6980,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
 	return handle_nop(vcpu);
 }
 
-/*
- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
- * We could reuse a single VMCS for all the L2 guests, but we also want the
- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
- * allows keeping them loaded on the processor, and in the future will allow
- * optimizations where prepare_vmcs02 doesn't need to set all the fields on
- * every entry if they never change.
- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
- *
- * The following functions allocate and free a vmcs02 in this pool.
- */
-
-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
-{
-	struct vmcs02_list *item;
-	list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
-		if (item->vmptr == vmx->nested.current_vmptr) {
-			list_move(&item->list, &vmx->nested.vmcs02_pool);
-			return &item->vmcs02;
-		}
-
-	if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
-		/* Recycle the least recently used VMCS. */
-		item = list_last_entry(&vmx->nested.vmcs02_pool,
-				       struct vmcs02_list, list);
-		item->vmptr = vmx->nested.current_vmptr;
-		list_move(&item->list, &vmx->nested.vmcs02_pool);
-		return &item->vmcs02;
-	}
-
-	/* Create a new VMCS */
-	item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
-	if (!item)
-		return NULL;
-	item->vmcs02.vmcs = alloc_vmcs();
-	item->vmcs02.shadow_vmcs = NULL;
-	if (!item->vmcs02.vmcs) {
-		kfree(item);
-		return NULL;
-	}
-	loaded_vmcs_init(&item->vmcs02);
-	item->vmptr = vmx->nested.current_vmptr;
-	list_add(&(item->list), &(vmx->nested.vmcs02_pool));
-	vmx->nested.vmcs02_num++;
-	return &item->vmcs02;
-}
-
-/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
-static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
-{
-	struct vmcs02_list *item;
-	list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
-		if (item->vmptr == vmptr) {
-			free_loaded_vmcs(&item->vmcs02);
-			list_del(&item->list);
-			kfree(item);
-			vmx->nested.vmcs02_num--;
-			return;
-		}
-}
-
-/*
- * Free all VMCSs saved for this vcpu, except the one pointed by
- * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
- * must be &vmx->vmcs01.
- */
-static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
-{
-	struct vmcs02_list *item, *n;
-
-	WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
-	list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
-		/*
-		 * Something will leak if the above WARN triggers.  Better than
-		 * a use-after-free.
-		 */
-		if (vmx->loaded_vmcs == &item->vmcs02)
-			continue;
-
-		free_loaded_vmcs(&item->vmcs02);
-		list_del(&item->list);
-		kfree(item);
-		vmx->nested.vmcs02_num--;
-	}
-}
-
 /*
  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
  * set the success or error code of an emulated VMX instruction, as specified
@@ -7257,6 +7161,12 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs *shadow_vmcs;
 
+	vmx->nested.vmcs02.vmcs = alloc_vmcs();
+	vmx->nested.vmcs02.shadow_vmcs = NULL;
+	if (!vmx->nested.vmcs02.vmcs)
+		goto out_vmcs02;
+	loaded_vmcs_init(&vmx->nested.vmcs02);
+
 	if (cpu_has_vmx_msr_bitmap()) {
 		vmx->nested.msr_bitmap =
 				(unsigned long *)__get_free_page(GFP_KERNEL);
@@ -7279,9 +7189,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 		vmx->vmcs01.shadow_vmcs = shadow_vmcs;
 	}
 
-	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
-	vmx->nested.vmcs02_num = 0;
-
 	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
 		     HRTIMER_MODE_REL_PINNED);
 	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
@@ -7296,6 +7203,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 	free_page((unsigned long)vmx->nested.msr_bitmap);
 
 out_msr_bitmap:
+	free_loaded_vmcs(&vmx->nested.vmcs02);
+
+out_vmcs02:
 	return -ENOMEM;
 }
 
@@ -7449,7 +7359,7 @@ static void free_nested(struct vcpu_vmx *vmx)
 		vmx->vmcs01.shadow_vmcs = NULL;
 	}
 	kfree(vmx->nested.cached_vmcs12);
-	/* Unpin physical memory we referred to in current vmcs02 */
+	/* Unpin physical memory we referred to in the vmcs02 */
 	if (vmx->nested.apic_access_page) {
 		kvm_release_page_dirty(vmx->nested.apic_access_page);
 		vmx->nested.apic_access_page = NULL;
@@ -7465,7 +7375,7 @@ static void free_nested(struct vcpu_vmx *vmx)
 		vmx->nested.pi_desc = NULL;
 	}
 
-	nested_free_all_saved_vmcss(vmx);
+	free_loaded_vmcs(&vmx->nested.vmcs02);
 }
 
 /* Emulate the VMXOFF instruction */
@@ -7508,8 +7418,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
 			vmptr + offsetof(struct vmcs12, launch_state),
 			&zero, sizeof(zero));
 
-	nested_free_vmcs02(vmx, vmptr);
-
 	nested_vmx_succeed(vcpu);
 	return kvm_skip_emulated_instruction(vcpu);
 }
@@ -8423,10 +8331,11 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
 
 	/*
 	 * The host physical addresses of some pages of guest memory
-	 * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
-	 * may write to these pages via their host physical address while
-	 * L2 is running, bypassing any address-translation-based dirty
-	 * tracking (e.g. EPT write protection).
+	 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
+	 * Page). The CPU may write to these pages via their host
+	 * physical address while L2 is running, bypassing any
+	 * address-translation-based dirty tracking (e.g. EPT write
+	 * protection).
 	 *
 	 * Mark them dirty on every exit from L2 to prevent them from
 	 * getting out of sync with dirty tracking.
@@ -10912,20 +10821,15 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-	struct loaded_vmcs *vmcs02;
 	u32 msr_entry_idx;
 	u32 exit_qual;
 
-	vmcs02 = nested_get_current_vmcs02(vmx);
-	if (!vmcs02)
-		return -ENOMEM;
-
 	enter_guest_mode(vcpu);
 
 	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
 		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
 
-	vmx_switch_vmcs(vcpu, vmcs02);
+	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
 	vmx_segment_cache_clear(vmx);
 
 	if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
@@ -11543,10 +11447,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	vm_exit_controls_reset_shadow(vmx);
 	vmx_segment_cache_clear(vmx);
 
-	/* if no vmcs02 cache requested, remove the one we used */
-	if (VMCS02_POOL_SIZE == 0)
-		nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
-
 	/* Update any VMCS fields that might have changed while L2 ran */
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);

From 276c796cfef5bdaf9aae055f520b8857eaa3fa19 Mon Sep 17 00:00:00 2001
From: Mark Kanda <mark.kanda@oracle.com>
Date: Mon, 27 Nov 2017 17:22:26 -0600
Subject: [PATCH 058/676] KVM: nVMX: Add a WARN for freeing a loaded VMCS02
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When attempting to free a loaded VMCS02, add a WARN and avoid
freeing it (to avoid use-after-free situations).

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Mark Kanda <mark.kanda@oracle.com>
Reviewed-by: Ameya More <ameya.more@oracle.com>
Reviewed-by: Krish Sadhukhan <krish.sadhukhan@oracle.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d1870f3c8c699..3b5f702854140 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3846,6 +3846,19 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
 	WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
 }
 
+static void vmx_nested_free_vmcs02(struct vcpu_vmx *vmx)
+{
+	struct loaded_vmcs *loaded_vmcs = &vmx->nested.vmcs02;
+
+	/*
+	 * Just leak the VMCS02 if the WARN triggers. Better than
+	 * a use-after-free.
+	 */
+	if (WARN_ON(vmx->loaded_vmcs == loaded_vmcs))
+		return;
+	free_loaded_vmcs(loaded_vmcs);
+}
+
 static void free_kvm_area(void)
 {
 	int cpu;
@@ -7203,7 +7216,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 	free_page((unsigned long)vmx->nested.msr_bitmap);
 
 out_msr_bitmap:
-	free_loaded_vmcs(&vmx->nested.vmcs02);
+	vmx_nested_free_vmcs02(vmx);
 
 out_vmcs02:
 	return -ENOMEM;
@@ -7375,7 +7388,7 @@ static void free_nested(struct vcpu_vmx *vmx)
 		vmx->nested.pi_desc = NULL;
 	}
 
-	free_loaded_vmcs(&vmx->nested.vmcs02);
+	vmx_nested_free_vmcs02(vmx);
 }
 
 /* Emulate the VMXOFF instruction */

From 74c55931c71352317ae0f5736ee9e4ca07ba4238 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Wed, 29 Nov 2017 01:31:20 -0800
Subject: [PATCH 059/676] KVM: VMX: Cache IA32_DEBUGCTL in memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MSR_IA32_DEBUGCTLMSR is zeroed on VMEXIT, so it is saved/restored each
time during world switch.  This patch caches the host IA32_DEBUGCTL MSR
and saves/restores the host IA32_DEBUGCTL msr when guest/host switches
to avoid to save/restore each time during world switch.  This saves
about 100 clock cycles per vmexit.

Suggested-by: Jim Mattson <jmattson@google.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Jim Mattson <jmattson@google.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Reviewed-by: Jim Mattson <jmattson@google.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3b5f702854140..4e7da90b44260 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -650,6 +650,8 @@ struct vcpu_vmx {
 
 	u32 host_pkru;
 
+	unsigned long host_debugctlmsr;
+
 	/*
 	 * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
 	 * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
@@ -2318,6 +2320,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	vmx_vcpu_pi_load(vcpu, cpu);
 	vmx->host_pkru = read_pkru();
+	vmx->host_debugctlmsr = get_debugctlmsr();
 }
 
 static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
@@ -9261,7 +9264,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned long debugctlmsr, cr3, cr4;
+	unsigned long cr3, cr4;
 
 	/* Record the guest's net vcpu time for enforced NMI injections. */
 	if (unlikely(!enable_vnmi &&
@@ -9314,7 +9317,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		__write_pkru(vcpu->arch.pkru);
 
 	atomic_switch_perf_msrs(vmx);
-	debugctlmsr = get_debugctlmsr();
 
 	vmx_arm_hv_timer(vcpu);
 
@@ -9425,8 +9427,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	      );
 
 	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
-	if (debugctlmsr)
-		update_debugctlmsr(debugctlmsr);
+	if (vmx->host_debugctlmsr)
+		update_debugctlmsr(vmx->host_debugctlmsr);
 
 #ifndef CONFIG_X86_64
 	/*

From 9c48d517ce6da398b8cff0603b75b366759023c4 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <kernellwp@gmail.com>
Date: Fri, 1 Dec 2017 00:15:10 -0800
Subject: [PATCH 060/676] KVM: X86: Reduce the overhead when
 lapic_timer_advance is disabled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When I run ebizzy in a 32 vCPUs guest on a 32 pCPUs Xeon box, I can observe
~8000 kvm_wait_lapic_expire CurAvg/s through kvm_stat tool even if the advance
tscdeadline hrtimer expiration is disabled. Each call to wait_lapic_expire()
will consume ~70 cycles when a timer fires since apic_timer_expire() will
set expired_tscdeadline and then wait_lapic_expire() will do some caculation
before bailing out. So total ~175us per second is lost on this 3.2Ghz machine.
This patch reduces the overhead by skipping the function wait_lapic_expire()
when lapic_timer_advance is disabled.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7ea9a57b0684a..54d66f238e203 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7018,7 +7018,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	}
 
 	trace_kvm_entry(vcpu->vcpu_id);
-	wait_lapic_expire(vcpu);
+	if (lapic_timer_advance_ns)
+		wait_lapic_expire(vcpu);
 	guest_enter_irqoff();
 
 	if (unlikely(vcpu->arch.switch_db_regs)) {

From 8eb73e2d410f00d383023fe41c0c25c6195b7389 Mon Sep 17 00:00:00 2001
From: Quan Xu <quan.xu0@gmail.com>
Date: Tue, 12 Dec 2017 16:44:21 +0800
Subject: [PATCH 061/676] KVM: VMX: drop I/O permission bitmaps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since KVM removes the only I/O port 0x80 bypass on Intel hosts,
clear CPU_BASED_USE_IO_BITMAPS and set CPU_BASED_UNCOND_IO_EXITING
bit. Then these I/O permission bitmaps are not used at all, so
drop I/O permission bitmaps.

Signed-off-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Radim KrÄmÃ¡Å™ <rkrcmar@redhat.com>
Signed-off-by: Quan Xu <quan.xu0@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4e7da90b44260..b016cf1b9f773 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -937,8 +937,6 @@ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
 static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
 
 enum {
-	VMX_IO_BITMAP_A,
-	VMX_IO_BITMAP_B,
 	VMX_MSR_BITMAP_LEGACY,
 	VMX_MSR_BITMAP_LONGMODE,
 	VMX_MSR_BITMAP_LEGACY_X2APIC_APICV,
@@ -952,8 +950,6 @@ enum {
 
 static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
 
-#define vmx_io_bitmap_a                      (vmx_bitmap[VMX_IO_BITMAP_A])
-#define vmx_io_bitmap_b                      (vmx_bitmap[VMX_IO_BITMAP_B])
 #define vmx_msr_bitmap_legacy                (vmx_bitmap[VMX_MSR_BITMAP_LEGACY])
 #define vmx_msr_bitmap_longmode              (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE])
 #define vmx_msr_bitmap_legacy_x2apic_apicv   (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV])
@@ -3627,7 +3623,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 #endif
 	      CPU_BASED_CR3_LOAD_EXITING |
 	      CPU_BASED_CR3_STORE_EXITING |
-	      CPU_BASED_USE_IO_BITMAPS |
+	      CPU_BASED_UNCOND_IO_EXITING |
 	      CPU_BASED_MOV_DR_EXITING |
 	      CPU_BASED_USE_TSC_OFFSETING |
 	      CPU_BASED_INVLPG_EXITING |
@@ -5468,10 +5464,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
 #endif
 	int i;
 
-	/* I/O */
-	vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
-	vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
-
 	if (enable_shadow_vmcs) {
 		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
 		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
@@ -6783,10 +6775,6 @@ static __init int hardware_setup(void)
 	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
 	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
 
-	memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
-
-	memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
-
 	memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
 	memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
 
@@ -10563,8 +10551,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	}
 
 	/*
-	 * Merging of IO bitmap not currently supported.
-	 * Rather, exit every time.
+	 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
+	 * for I/O port accesses.
 	 */
 	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
 	exec_control |= CPU_BASED_UNCOND_IO_EXITING;

From ec7660ccdd2b71d8c7f0243f8590253713e9b75d Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Mon, 4 Dec 2017 21:35:23 +0100
Subject: [PATCH 062/676] KVM: Take vcpu->mutex outside vcpu_load

As we're about to call vcpu_load() from architecture-specific
implementations of the KVM vcpu ioctls, but yet we access data
structures protected by the vcpu->mutex in the generic code, factor
this logic out from vcpu_load().

x86 is the only architecture which calls vcpu_load() outside of the main
vcpu ioctl function, and these calls will no longer take the vcpu mutex
following this patch.  However, with the exception of
kvm_arch_vcpu_postcreate (see below), the callers are either in the
creation or destruction path of the VCPU, which means there cannot be
any concurrent access to the data structure, because the file descriptor
is not yet accessible, or is already gone.

kvm_arch_vcpu_postcreate makes the newly created vcpu potentially
accessible by other in-kernel threads through the kvm->vcpus array, and
we therefore take the vcpu mutex in this case directly.

Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c       |  4 +---
 arch/x86/kvm/x86.c       | 20 +++++++-------------
 include/linux/kvm_host.h |  2 +-
 virt/kvm/kvm_main.c      | 17 ++++++-----------
 4 files changed, 15 insertions(+), 28 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b016cf1b9f773..ef7d13e536a4f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9496,10 +9496,8 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
 static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       int r;
 
-       r = vcpu_load(vcpu);
-       BUG_ON(r);
+       vcpu_load(vcpu);
        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
        free_nested(vmx);
        vcpu_put(vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 54d66f238e203..3f2c78f585702 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7767,16 +7767,12 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
-	int r;
-
 	kvm_vcpu_mtrr_init(vcpu);
-	r = vcpu_load(vcpu);
-	if (r)
-		return r;
+	vcpu_load(vcpu);
 	kvm_vcpu_reset(vcpu, false);
 	kvm_mmu_setup(vcpu);
 	vcpu_put(vcpu);
-	return r;
+	return 0;
 }
 
 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
@@ -7786,13 +7782,15 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 
 	kvm_hv_vcpu_postcreate(vcpu);
 
-	if (vcpu_load(vcpu))
+	if (mutex_lock_killable(&vcpu->mutex))
 		return;
+	vcpu_load(vcpu);
 	msr.data = 0x0;
 	msr.index = MSR_IA32_TSC;
 	msr.host_initiated = true;
 	kvm_write_tsc(vcpu, &msr);
 	vcpu_put(vcpu);
+	mutex_unlock(&vcpu->mutex);
 
 	if (!kvmclock_periodic_sync)
 		return;
@@ -7803,11 +7801,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
-	int r;
 	vcpu->arch.apf.msr_val = 0;
 
-	r = vcpu_load(vcpu);
-	BUG_ON(r);
+	vcpu_load(vcpu);
 	kvm_mmu_unload(vcpu);
 	vcpu_put(vcpu);
 
@@ -8179,9 +8175,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
 {
-	int r;
-	r = vcpu_load(vcpu);
-	BUG_ON(r);
+	vcpu_load(vcpu);
 	kvm_mmu_unload(vcpu);
 	vcpu_put(vcpu);
 }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6bdd4b9f66115..09de0ff3d6773 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -533,7 +533,7 @@ static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu)
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
 
-int __must_check vcpu_load(struct kvm_vcpu *vcpu);
+void vcpu_load(struct kvm_vcpu *vcpu);
 void vcpu_put(struct kvm_vcpu *vcpu);
 
 #ifdef __KVM_HAVE_IOAPIC
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index bc092e3d1d731..c4d116b336f4e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -151,17 +151,12 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
 /*
  * Switches to specified vcpu, until a matching vcpu_put()
  */
-int vcpu_load(struct kvm_vcpu *vcpu)
+void vcpu_load(struct kvm_vcpu *vcpu)
 {
-	int cpu;
-
-	if (mutex_lock_killable(&vcpu->mutex))
-		return -EINTR;
-	cpu = get_cpu();
+	int cpu = get_cpu();
 	preempt_notifier_register(&vcpu->preempt_notifier);
 	kvm_arch_vcpu_load(vcpu, cpu);
 	put_cpu();
-	return 0;
 }
 EXPORT_SYMBOL_GPL(vcpu_load);
 
@@ -171,7 +166,6 @@ void vcpu_put(struct kvm_vcpu *vcpu)
 	kvm_arch_vcpu_put(vcpu);
 	preempt_notifier_unregister(&vcpu->preempt_notifier);
 	preempt_enable();
-	mutex_unlock(&vcpu->mutex);
 }
 EXPORT_SYMBOL_GPL(vcpu_put);
 
@@ -2560,9 +2554,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
 #endif
 
 
-	r = vcpu_load(vcpu);
-	if (r)
-		return r;
+	if (mutex_lock_killable(&vcpu->mutex))
+		return -EINTR;
+	vcpu_load(vcpu);
 	switch (ioctl) {
 	case KVM_RUN: {
 		struct pid *oldpid;
@@ -2735,6 +2729,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
 	}
 out:
 	vcpu_put(vcpu);
+	mutex_unlock(&vcpu->mutex);
 	kfree(fpu);
 	kfree(kvm_sregs);
 	return r;

From 8a32dd60ec9488b73e04e5b7bc82b77a2580b1b7 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Mon, 4 Dec 2017 21:35:24 +0100
Subject: [PATCH 063/676] KVM: Prepare for moving vcpu_load/vcpu_put into arch
 specific code

In preparation for moving calls to vcpu_load() and vcpu_put() into the
architecture specific implementations of the KVM vcpu ioctls, move the
calls in the main kvm_vcpu_ioctl() dispatcher function to each case
of the ioctl select statement.  This allows us to move the vcpu_load()
and vcpu_put() calls into architecture specific implementations of vcpu
ioctls, one by one.

Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 virt/kvm/kvm_main.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c4d116b336f4e..7bbaad8717a2f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2556,13 +2556,13 @@ static long kvm_vcpu_ioctl(struct file *filp,
 
 	if (mutex_lock_killable(&vcpu->mutex))
 		return -EINTR;
-	vcpu_load(vcpu);
 	switch (ioctl) {
 	case KVM_RUN: {
 		struct pid *oldpid;
 		r = -EINVAL;
 		if (arg)
 			goto out;
+		vcpu_load(vcpu);
 		oldpid = rcu_access_pointer(vcpu->pid);
 		if (unlikely(oldpid != current->pids[PIDTYPE_PID].pid)) {
 			/* The thread running this VCPU changed. */
@@ -2574,6 +2574,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
 			put_pid(oldpid);
 		}
 		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
+		vcpu_put(vcpu);
 		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
 		break;
 	}
@@ -2584,7 +2585,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
 		if (!kvm_regs)
 			goto out;
+		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
+		vcpu_put(vcpu);
 		if (r)
 			goto out_free1;
 		r = -EFAULT;
@@ -2604,7 +2607,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
 			r = PTR_ERR(kvm_regs);
 			goto out;
 		}
+		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
+		vcpu_put(vcpu);
 		kfree(kvm_regs);
 		break;
 	}
@@ -2613,7 +2618,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		r = -ENOMEM;
 		if (!kvm_sregs)
 			goto out;
+		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
+		vcpu_put(vcpu);
 		if (r)
 			goto out;
 		r = -EFAULT;
@@ -2629,13 +2636,17 @@ static long kvm_vcpu_ioctl(struct file *filp,
 			kvm_sregs = NULL;
 			goto out;
 		}
+		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
+		vcpu_put(vcpu);
 		break;
 	}
 	case KVM_GET_MP_STATE: {
 		struct kvm_mp_state mp_state;
 
+		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
+		vcpu_put(vcpu);
 		if (r)
 			goto out;
 		r = -EFAULT;
@@ -2650,7 +2661,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		r = -EFAULT;
 		if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
 			goto out;
+		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
+		vcpu_put(vcpu);
 		break;
 	}
 	case KVM_TRANSLATE: {
@@ -2659,7 +2672,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		r = -EFAULT;
 		if (copy_from_user(&tr, argp, sizeof(tr)))
 			goto out;
+		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+		vcpu_put(vcpu);
 		if (r)
 			goto out;
 		r = -EFAULT;
@@ -2674,7 +2689,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		r = -EFAULT;
 		if (copy_from_user(&dbg, argp, sizeof(dbg)))
 			goto out;
+		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
+		vcpu_put(vcpu);
 		break;
 	}
 	case KVM_SET_SIGNAL_MASK: {
@@ -2705,7 +2722,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		r = -ENOMEM;
 		if (!fpu)
 			goto out;
+		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
+		vcpu_put(vcpu);
 		if (r)
 			goto out;
 		r = -EFAULT;
@@ -2721,14 +2740,17 @@ static long kvm_vcpu_ioctl(struct file *filp,
 			fpu = NULL;
 			goto out;
 		}
+		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
+		vcpu_put(vcpu);
 		break;
 	}
 	default:
+		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
+		vcpu_put(vcpu);
 	}
 out:
-	vcpu_put(vcpu);
 	mutex_unlock(&vcpu->mutex);
 	kfree(fpu);
 	kfree(kvm_sregs);

From accb757d798c9b4d85cfe3e5972134c586525168 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Mon, 4 Dec 2017 21:35:25 +0100
Subject: [PATCH 064/676] KVM: Move vcpu_load to arch-specific
 kvm_arch_vcpu_ioctl_run

Move vcpu_load() and vcpu_put() into the architecture specific
implementations of kvm_arch_vcpu_ioctl_run().

Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com> # s390 parts
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
[Rebased. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/mips.c       |  3 +++
 arch/powerpc/kvm/powerpc.c |  6 +++++-
 arch/s390/kvm/kvm-s390.c   | 10 ++++++++--
 arch/x86/kvm/x86.c         |  3 ++-
 virt/kvm/arm/arm.c         | 20 ++++++++++++++------
 virt/kvm/kvm_main.c        |  2 --
 6 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 75fdeaa8c62f2..ba5ecf22bb96f 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -446,6 +446,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
 	int r = -EINTR;
 
+	vcpu_load(vcpu);
+
 	kvm_sigset_activate(vcpu);
 
 	if (vcpu->mmio_needed) {
@@ -480,6 +482,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 out:
 	kvm_sigset_deactivate(vcpu);
 
+	vcpu_put(vcpu);
 	return r;
 }
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 1915e86cef6f8..4e2167a7ae19e 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1408,6 +1408,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
 	int r;
 
+	vcpu_load(vcpu);
+
 	if (vcpu->mmio_needed) {
 		vcpu->mmio_needed = 0;
 		if (!vcpu->mmio_is_write)
@@ -1422,7 +1424,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 			r = kvmppc_emulate_mmio_vsx_loadstore(vcpu, run);
 			if (r == RESUME_HOST) {
 				vcpu->mmio_needed = 1;
-				return r;
+				goto out;
 			}
 		}
 #endif
@@ -1456,6 +1458,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
 	kvm_sigset_deactivate(vcpu);
 
+out:
+	vcpu_put(vcpu);
 	return r;
 }
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ec8b68e97d3cd..7972598e60d03 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -3373,9 +3373,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (kvm_run->immediate_exit)
 		return -EINTR;
 
+	vcpu_load(vcpu);
+
 	if (guestdbg_exit_pending(vcpu)) {
 		kvm_s390_prepare_debug_exit(vcpu);
-		return 0;
+		rc = 0;
+		goto out;
 	}
 
 	kvm_sigset_activate(vcpu);
@@ -3385,7 +3388,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	} else if (is_vcpu_stopped(vcpu)) {
 		pr_err_ratelimited("can't run stopped vcpu %d\n",
 				   vcpu->vcpu_id);
-		return -EINVAL;
+		rc = -EINVAL;
+		goto out;
 	}
 
 	sync_regs(vcpu, kvm_run);
@@ -3415,6 +3419,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	kvm_sigset_deactivate(vcpu);
 
 	vcpu->stat.exit_userspace++;
+out:
+	vcpu_put(vcpu);
 	return rc;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3f2c78f585702..af9da75011bce 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7280,8 +7280,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	int r;
 
+	vcpu_load(vcpu);
 	kvm_sigset_activate(vcpu);
-
 	kvm_load_guest_fpu(vcpu);
 
 	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
@@ -7328,6 +7328,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	post_kvm_run_save(vcpu);
 	kvm_sigset_deactivate(vcpu);
 
+	vcpu_put(vcpu);
 	return r;
 }
 
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 6b60c98a6e229..a1b2e8a43ca0b 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -619,21 +619,27 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	if (unlikely(!kvm_vcpu_initialized(vcpu)))
 		return -ENOEXEC;
 
+	vcpu_load(vcpu);
+
 	ret = kvm_vcpu_first_run_init(vcpu);
 	if (ret)
-		return ret;
+		goto out;
 
 	if (run->exit_reason == KVM_EXIT_MMIO) {
 		ret = kvm_handle_mmio_return(vcpu, vcpu->run);
 		if (ret)
-			return ret;
-		if (kvm_arm_handle_step_debug(vcpu, vcpu->run))
-			return 0;
+			goto out;
+		if (kvm_arm_handle_step_debug(vcpu, vcpu->run)) {
+			ret = 0;
+			goto out;
+		}
 
 	}
 
-	if (run->immediate_exit)
-		return -EINTR;
+	if (run->immediate_exit) {
+		ret = -EINTR;
+		goto out;
+	}
 
 	kvm_sigset_activate(vcpu);
 
@@ -772,6 +778,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
 	kvm_sigset_deactivate(vcpu);
 
+out:
+	vcpu_put(vcpu);
 	return ret;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7bbaad8717a2f..0b149827570cc 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2562,7 +2562,6 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 		if (arg)
 			goto out;
-		vcpu_load(vcpu);
 		oldpid = rcu_access_pointer(vcpu->pid);
 		if (unlikely(oldpid != current->pids[PIDTYPE_PID].pid)) {
 			/* The thread running this VCPU changed. */
@@ -2574,7 +2573,6 @@ static long kvm_vcpu_ioctl(struct file *filp,
 			put_pid(oldpid);
 		}
 		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
-		vcpu_put(vcpu);
 		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
 		break;
 	}

From 1fc9b76b3dd2c57ca0fe42742043a5c3cbdc41c1 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Mon, 4 Dec 2017 21:35:26 +0100
Subject: [PATCH 065/676] KVM: Move vcpu_load to arch-specific
 kvm_arch_vcpu_ioctl_get_regs

Move vcpu_load() and vcpu_put() into the architecture specific
implementations of kvm_arch_vcpu_ioctl_get_regs().

Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/mips.c      | 3 +++
 arch/powerpc/kvm/book3s.c | 3 +++
 arch/powerpc/kvm/booke.c  | 3 +++
 arch/s390/kvm/kvm-s390.c  | 2 ++
 arch/x86/kvm/x86.c        | 3 +++
 virt/kvm/kvm_main.c       | 2 --
 6 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index ba5ecf22bb96f..6023b5f808c0a 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -1162,6 +1162,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 	int i;
 
+	vcpu_load(vcpu);
+
 	for (i = 0; i < ARRAY_SIZE(vcpu->arch.gprs); i++)
 		regs->gpr[i] = vcpu->arch.gprs[i];
 
@@ -1169,6 +1171,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	regs->lo = vcpu->arch.lo;
 	regs->pc = vcpu->arch.pc;
 
+	vcpu_put(vcpu);
 	return 0;
 }
 
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 72d977e309523..d85bfd733ccd4 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -497,6 +497,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 	int i;
 
+	vcpu_load(vcpu);
+
 	regs->pc = kvmppc_get_pc(vcpu);
 	regs->cr = kvmppc_get_cr(vcpu);
 	regs->ctr = kvmppc_get_ctr(vcpu);
@@ -518,6 +520,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 		regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
 
+	vcpu_put(vcpu);
 	return 0;
 }
 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 83b485810aea2..e0e4f04c55354 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -1431,6 +1431,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 	int i;
 
+	vcpu_load(vcpu);
+
 	regs->pc = vcpu->arch.pc;
 	regs->cr = kvmppc_get_cr(vcpu);
 	regs->ctr = vcpu->arch.ctr;
@@ -1452,6 +1454,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 		regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
 
+	vcpu_put(vcpu);
 	return 0;
 }
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 7972598e60d03..44317813f4982 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2715,7 +2715,9 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
+	vcpu_load(vcpu);
 	memcpy(&regs->gprs, &vcpu->run->s.regs.gprs, sizeof(regs->gprs));
+	vcpu_put(vcpu);
 	return 0;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index af9da75011bce..3364c6d4f7431 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7334,6 +7334,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
+	vcpu_load(vcpu);
+
 	if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
 		/*
 		 * We are here if userspace calls get_regs() in the middle of
@@ -7367,6 +7369,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	regs->rip = kvm_rip_read(vcpu);
 	regs->rflags = kvm_get_rflags(vcpu);
 
+	vcpu_put(vcpu);
 	return 0;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0b149827570cc..6dab2e6f83217 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2583,9 +2583,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
 		if (!kvm_regs)
 			goto out;
-		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
-		vcpu_put(vcpu);
 		if (r)
 			goto out_free1;
 		r = -EFAULT;

From 875656fe0c8473c544860d557ca1512753d6aeef Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Mon, 4 Dec 2017 21:35:27 +0100
Subject: [PATCH 066/676] KVM: Move vcpu_load to arch-specific
 kvm_arch_vcpu_ioctl_set_regs

Move vcpu_load() and vcpu_put() into the architecture specific
implementations of kvm_arch_vcpu_ioctl_set_regs().

Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/mips.c      | 3 +++
 arch/powerpc/kvm/book3s.c | 3 +++
 arch/powerpc/kvm/booke.c  | 3 +++
 arch/s390/kvm/kvm-s390.c  | 2 ++
 arch/x86/kvm/x86.c        | 3 +++
 virt/kvm/kvm_main.c       | 2 --
 6 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 6023b5f808c0a..dd5f463b0e726 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -1148,6 +1148,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 	int i;
 
+	vcpu_load(vcpu);
+
 	for (i = 1; i < ARRAY_SIZE(vcpu->arch.gprs); i++)
 		vcpu->arch.gprs[i] = regs->gpr[i];
 	vcpu->arch.gprs[0] = 0; /* zero is special, and cannot be set. */
@@ -1155,6 +1157,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	vcpu->arch.lo = regs->lo;
 	vcpu->arch.pc = regs->pc;
 
+	vcpu_put(vcpu);
 	return 0;
 }
 
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index d85bfd733ccd4..24bc7aabfc449 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -528,6 +528,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 	int i;
 
+	vcpu_load(vcpu);
+
 	kvmppc_set_pc(vcpu, regs->pc);
 	kvmppc_set_cr(vcpu, regs->cr);
 	kvmppc_set_ctr(vcpu, regs->ctr);
@@ -548,6 +550,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 		kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
 
+	vcpu_put(vcpu);
 	return 0;
 }
 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index e0e4f04c55354..bcbbeddc3430c 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -1462,6 +1462,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 	int i;
 
+	vcpu_load(vcpu);
+
 	vcpu->arch.pc = regs->pc;
 	kvmppc_set_cr(vcpu, regs->cr);
 	vcpu->arch.ctr = regs->ctr;
@@ -1483,6 +1485,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 		kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
 
+	vcpu_put(vcpu);
 	return 0;
 }
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 44317813f4982..ab08823a854b2 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2709,7 +2709,9 @@ static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
+	vcpu_load(vcpu);
 	memcpy(&vcpu->run->s.regs.gprs, &regs->gprs, sizeof(regs->gprs));
+	vcpu_put(vcpu);
 	return 0;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3364c6d4f7431..f82e87bb3a51e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7375,6 +7375,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 
 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
+	vcpu_load(vcpu);
+
 	vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
 	vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
 
@@ -7404,6 +7406,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
+	vcpu_put(vcpu);
 	return 0;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6dab2e6f83217..e25c1a1a41202 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2603,9 +2603,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
 			r = PTR_ERR(kvm_regs);
 			goto out;
 		}
-		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
-		vcpu_put(vcpu);
 		kfree(kvm_regs);
 		break;
 	}

From bcdec41cefbea525ad424050650acb0f2eed1378 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Mon, 4 Dec 2017 21:35:28 +0100
Subject: [PATCH 067/676] KVM: Move vcpu_load to arch-specific
 kvm_arch_vcpu_ioctl_get_sregs

Move vcpu_load() and vcpu_put() into the architecture specific
implementations of kvm_arch_vcpu_ioctl_get_sregs().

Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/powerpc/kvm/book3s.c | 8 +++++++-
 arch/powerpc/kvm/booke.c  | 9 ++++++++-
 arch/s390/kvm/kvm-s390.c  | 4 ++++
 arch/x86/kvm/x86.c        | 3 +++
 virt/kvm/kvm_main.c       | 2 --
 5 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 24bc7aabfc449..6cc2377549f78 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -484,7 +484,13 @@ void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 {
-	return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs);
+	int ret;
+
+	vcpu_load(vcpu);
+	ret = vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs);
+	vcpu_put(vcpu);
+
+	return ret;
 }
 
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index bcbbeddc3430c..f647e121070e1 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -1613,11 +1613,18 @@ int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                                   struct kvm_sregs *sregs)
 {
+	int ret;
+
+	vcpu_load(vcpu);
+
 	sregs->pvr = vcpu->arch.pvr;
 
 	get_sregs_base(vcpu, sregs);
 	get_sregs_arch206(vcpu, sregs);
-	return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs);
+	ret = vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs);
+
+	vcpu_put(vcpu);
+	return ret;
 }
 
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ab08823a854b2..87544beff5077 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2734,8 +2734,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 {
+	vcpu_load(vcpu);
+
 	memcpy(&sregs->acrs, &vcpu->run->s.regs.acrs, sizeof(sregs->acrs));
 	memcpy(&sregs->crs, &vcpu->arch.sie_block->gcr, sizeof(sregs->crs));
+
+	vcpu_put(vcpu);
 	return 0;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f82e87bb3a51e..66a8d8cd00f6a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7425,6 +7425,8 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 {
 	struct desc_ptr dt;
 
+	vcpu_load(vcpu);
+
 	kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
 	kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
 	kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -7456,6 +7458,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 		set_bit(vcpu->arch.interrupt.nr,
 			(unsigned long *)sregs->interrupt_bitmap);
 
+	vcpu_put(vcpu);
 	return 0;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e25c1a1a41202..4c2f6a4d18527 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2612,9 +2612,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		r = -ENOMEM;
 		if (!kvm_sregs)
 			goto out;
-		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
-		vcpu_put(vcpu);
 		if (r)
 			goto out;
 		r = -EFAULT;

From b4ef9d4e8cb8938e6c0aa3be672b0aeeb791ecf3 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Mon, 4 Dec 2017 21:35:29 +0100
Subject: [PATCH 068/676] KVM: Move vcpu_load to arch-specific
 kvm_arch_vcpu_ioctl_set_sregs

Move vcpu_load() and vcpu_put() into the architecture specific
implementations of kvm_arch_vcpu_ioctl_set_sregs().

Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/powerpc/kvm/book3s.c |  8 +++++++-
 arch/powerpc/kvm/booke.c  | 15 ++++++++++-----
 arch/s390/kvm/kvm-s390.c  |  4 ++++
 arch/x86/kvm/x86.c        | 12 +++++++++---
 virt/kvm/kvm_main.c       |  2 --
 5 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 6cc2377549f78..047651622cb86 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -496,7 +496,13 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 {
-	return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs);
+	int ret;
+
+	vcpu_load(vcpu);
+	ret = vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs);
+	vcpu_put(vcpu);
+
+	return ret;
 }
 
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index f647e121070e1..38939a204e262 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -1630,20 +1630,25 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
                                   struct kvm_sregs *sregs)
 {
-	int ret;
+	int ret = -EINVAL;
 
+	vcpu_load(vcpu);
 	if (vcpu->arch.pvr != sregs->pvr)
-		return -EINVAL;
+		goto out;
 
 	ret = set_sregs_base(vcpu, sregs);
 	if (ret < 0)
-		return ret;
+		goto out;
 
 	ret = set_sregs_arch206(vcpu, sregs);
 	if (ret < 0)
-		return ret;
+		goto out;
+
+	ret = vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs);
 
-	return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs);
+out:
+	vcpu_put(vcpu);
+	return ret;
 }
 
 int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 87544beff5077..d27a1dd0986b9 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2726,8 +2726,12 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 {
+	vcpu_load(vcpu);
+
 	memcpy(&vcpu->run->s.regs.acrs, &sregs->acrs, sizeof(sregs->acrs));
 	memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs));
+
+	vcpu_put(vcpu);
 	return 0;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 66a8d8cd00f6a..f01a7d73dad78 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7525,15 +7525,18 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	int mmu_reset_needed = 0;
 	int pending_vec, max_bits, idx;
 	struct desc_ptr dt;
+	int ret = -EINVAL;
+
+	vcpu_load(vcpu);
 
 	if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
 			(sregs->cr4 & X86_CR4_OSXSAVE))
-		return -EINVAL;
+		goto out;
 
 	apic_base_msr.data = sregs->apic_base;
 	apic_base_msr.host_initiated = true;
 	if (kvm_set_apic_base(vcpu, &apic_base_msr))
-		return -EINVAL;
+		goto out;
 
 	dt.size = sregs->idt.limit;
 	dt.address = sregs->idt.base;
@@ -7599,7 +7602,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
-	return 0;
+	ret = 0;
+out:
+	vcpu_put(vcpu);
+	return ret;
 }
 
 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4c2f6a4d18527..e04a216bcd149 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2628,9 +2628,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
 			kvm_sregs = NULL;
 			goto out;
 		}
-		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
-		vcpu_put(vcpu);
 		break;
 	}
 	case KVM_GET_MP_STATE: {

From fd2325612c1493c85cce89ea16b2396baca83311 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Mon, 4 Dec 2017 21:35:30 +0100
Subject: [PATCH 069/676] KVM: Move vcpu_load to arch-specific
 kvm_arch_vcpu_ioctl_get_mpstate

Move vcpu_load() and vcpu_put() into the architecture specific
implementations of kvm_arch_vcpu_ioctl_get_mpstate().

Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/s390/kvm/kvm-s390.c | 11 +++++++++--
 arch/x86/kvm/x86.c       |  3 +++
 virt/kvm/arm/arm.c       |  3 +++
 virt/kvm/kvm_main.c      |  2 --
 4 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d27a1dd0986b9..4297f5094a884 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2833,9 +2833,16 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
+	int ret;
+
+	vcpu_load(vcpu);
+
 	/* CHECK_STOP and LOAD are not supported yet */
-	return is_vcpu_stopped(vcpu) ? KVM_MP_STATE_STOPPED :
-				       KVM_MP_STATE_OPERATING;
+	ret = is_vcpu_stopped(vcpu) ? KVM_MP_STATE_STOPPED :
+				      KVM_MP_STATE_OPERATING;
+
+	vcpu_put(vcpu);
+	return ret;
 }
 
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f01a7d73dad78..80791939065fb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7465,6 +7465,8 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
+	vcpu_load(vcpu);
+
 	kvm_apic_accept_events(vcpu);
 	if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
 					vcpu->arch.pv.pv_unhalted)
@@ -7472,6 +7474,7 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 	else
 		mp_state->mp_state = vcpu->arch.mp_state;
 
+	vcpu_put(vcpu);
 	return 0;
 }
 
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index a1b2e8a43ca0b..65d50100ba3ca 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -381,11 +381,14 @@ static void vcpu_power_off(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
+	vcpu_load(vcpu);
+
 	if (vcpu->arch.power_off)
 		mp_state->mp_state = KVM_MP_STATE_STOPPED;
 	else
 		mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
 
+	vcpu_put(vcpu);
 	return 0;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e04a216bcd149..8e2f582417c38 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2634,9 +2634,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
 	case KVM_GET_MP_STATE: {
 		struct kvm_mp_state mp_state;
 
-		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
-		vcpu_put(vcpu);
 		if (r)
 			goto out;
 		r = -EFAULT;

From e83dff5edf0c3f014e4b4ac5e1c86dbe797687c7 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Mon, 4 Dec 2017 21:35:31 +0100
Subject: [PATCH 070/676] KVM: Move vcpu_load to arch-specific
 kvm_arch_vcpu_ioctl_set_mpstate

Move vcpu_load() and vcpu_put() into the architecture specific
implementations of kvm_arch_vcpu_ioctl_set_mpstate().

Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/s390/kvm/kvm-s390.c |  3 +++
 arch/x86/kvm/x86.c       | 14 +++++++++++---
 virt/kvm/arm/arm.c       |  9 +++++++--
 virt/kvm/kvm_main.c      |  2 --
 4 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 4297f5094a884..4e6304d45ca55 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2850,6 +2850,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 {
 	int rc = 0;
 
+	vcpu_load(vcpu);
+
 	/* user space knows about this interface - let it control the state */
 	vcpu->kvm->arch.user_cpu_state_ctrl = 1;
 
@@ -2867,6 +2869,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 		rc = -ENXIO;
 	}
 
+	vcpu_put(vcpu);
 	return rc;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 80791939065fb..a9e6cca1f46ec 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7481,15 +7481,19 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
+	int ret = -EINVAL;
+
+	vcpu_load(vcpu);
+
 	if (!lapic_in_kernel(vcpu) &&
 	    mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
-		return -EINVAL;
+		goto out;
 
 	/* INITs are latched while in SMM */
 	if ((is_smm(vcpu) || vcpu->arch.smi_pending) &&
 	    (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
 	     mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
-		return -EINVAL;
+		goto out;
 
 	if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
 		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
@@ -7497,7 +7501,11 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	} else
 		vcpu->arch.mp_state = mp_state->mp_state;
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
-	return 0;
+
+	ret = 0;
+out:
+	vcpu_put(vcpu);
+	return ret;
 }
 
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 65d50100ba3ca..aa6167086211d 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -395,6 +395,10 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
+	int ret = 0;
+
+	vcpu_load(vcpu);
+
 	switch (mp_state->mp_state) {
 	case KVM_MP_STATE_RUNNABLE:
 		vcpu->arch.power_off = false;
@@ -403,10 +407,11 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 		vcpu_power_off(vcpu);
 		break;
 	default:
-		return -EINVAL;
+		ret = -EINVAL;
 	}
 
-	return 0;
+	vcpu_put(vcpu);
+	return ret;
 }
 
 /**
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8e2f582417c38..4b5aeb4b8c580 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2649,9 +2649,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		r = -EFAULT;
 		if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
 			goto out;
-		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
-		vcpu_put(vcpu);
 		break;
 	}
 	case KVM_TRANSLATE: {

From 1da5b61dac98360a7e50b1565f6d499c6fc8123a Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Mon, 4 Dec 2017 21:35:32 +0100
Subject: [PATCH 071/676] KVM: Move vcpu_load to arch-specific
 kvm_arch_vcpu_ioctl_translate

Move vcpu_load() and vcpu_put() into the architecture specific
implementations of kvm_arch_vcpu_ioctl_translate().

Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/powerpc/kvm/booke.c | 2 ++
 arch/x86/kvm/x86.c       | 3 +++
 virt/kvm/kvm_main.c      | 2 --
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 38939a204e262..b5f46517c8396 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -1791,7 +1791,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 {
 	int r;
 
+	vcpu_load(vcpu);
 	r = kvmppc_core_vcpu_translate(vcpu, tr);
+	vcpu_put(vcpu);
 	return r;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a9e6cca1f46ec..080e00dff426d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7684,6 +7684,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 	gpa_t gpa;
 	int idx;
 
+	vcpu_load(vcpu);
+
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -7692,6 +7694,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 	tr->writeable = 1;
 	tr->usermode = 0;
 
+	vcpu_put(vcpu);
 	return 0;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4b5aeb4b8c580..37217ec647b41 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2658,9 +2658,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		r = -EFAULT;
 		if (copy_from_user(&tr, argp, sizeof(tr)))
 			goto out;
-		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
-		vcpu_put(vcpu);
 		if (r)
 			goto out;
 		r = -EFAULT;

From 66b5656222990f1a536f5900ccd98539f9cf231f Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Mon, 4 Dec 2017 21:35:33 +0100
Subject: [PATCH 072/676] KVM: Move vcpu_load to arch-specific
 kvm_arch_vcpu_ioctl_set_guest_debug

Move vcpu_load() and vcpu_put() into the architecture specific
implementations of kvm_arch_vcpu_ioctl_set_guest_debug().

Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/kvm/guest.c    | 15 ++++++++++++---
 arch/powerpc/kvm/book3s.c |  2 ++
 arch/powerpc/kvm/booke.c  | 19 +++++++++++++------
 arch/s390/kvm/kvm-s390.c  | 16 ++++++++++++----
 arch/x86/kvm/x86.c        |  4 +++-
 virt/kvm/kvm_main.c       |  2 --
 6 files changed, 42 insertions(+), 16 deletions(-)

diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 5c7f657dd2074..d7e3299a77346 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -361,10 +361,16 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 					struct kvm_guest_debug *dbg)
 {
+	int ret = 0;
+
+	vcpu_load(vcpu);
+
 	trace_kvm_set_guest_debug(vcpu, dbg->control);
 
-	if (dbg->control & ~KVM_GUESTDBG_VALID_MASK)
-		return -EINVAL;
+	if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	if (dbg->control & KVM_GUESTDBG_ENABLE) {
 		vcpu->guest_debug = dbg->control;
@@ -378,7 +384,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 		/* If not enabled clear all flags */
 		vcpu->guest_debug = 0;
 	}
-	return 0;
+
+out:
+	vcpu_put(vcpu);
+	return ret;
 }
 
 int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 047651622cb86..234531d1bee1e 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -755,7 +755,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 					struct kvm_guest_debug *dbg)
 {
+	vcpu_load(vcpu);
 	vcpu->guest_debug = dbg->control;
+	vcpu_put(vcpu);
 	return 0;
 }
 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index b5f46517c8396..6038e2e7aee03 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -2016,12 +2016,15 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 {
 	struct debug_reg *dbg_reg;
 	int n, b = 0, w = 0;
+	int ret = 0;
+
+	vcpu_load(vcpu);
 
 	if (!(dbg->control & KVM_GUESTDBG_ENABLE)) {
 		vcpu->arch.dbg_reg.dbcr0 = 0;
 		vcpu->guest_debug = 0;
 		kvm_guest_protect_msr(vcpu, MSR_DE, false);
-		return 0;
+		goto out;
 	}
 
 	kvm_guest_protect_msr(vcpu, MSR_DE, true);
@@ -2053,8 +2056,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 #endif
 
 	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
-		return 0;
+		goto out;
 
+	ret = -EINVAL;
 	for (n = 0; n < (KVMPPC_BOOKE_IAC_NUM + KVMPPC_BOOKE_DAC_NUM); n++) {
 		uint64_t addr = dbg->arch.bp[n].addr;
 		uint32_t type = dbg->arch.bp[n].type;
@@ -2065,21 +2069,24 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 		if (type & ~(KVMPPC_DEBUG_WATCH_READ |
 			     KVMPPC_DEBUG_WATCH_WRITE |
 			     KVMPPC_DEBUG_BREAKPOINT))
-			return -EINVAL;
+			goto out;
 
 		if (type & KVMPPC_DEBUG_BREAKPOINT) {
 			/* Setting H/W breakpoint */
 			if (kvmppc_booke_add_breakpoint(dbg_reg, addr, b++))
-				return -EINVAL;
+				goto out;
 		} else {
 			/* Setting H/W watchpoint */
 			if (kvmppc_booke_add_watchpoint(dbg_reg, addr,
 							type, w++))
-				return -EINVAL;
+				goto out;
 		}
 	}
 
-	return 0;
+	ret = 0;
+out:
+	vcpu_put(vcpu);
+	return ret;
 }
 
 void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 4e6304d45ca55..1c82dda07bc57 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2801,13 +2801,19 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 {
 	int rc = 0;
 
+	vcpu_load(vcpu);
+
 	vcpu->guest_debug = 0;
 	kvm_s390_clear_bp_data(vcpu);
 
-	if (dbg->control & ~VALID_GUESTDBG_FLAGS)
-		return -EINVAL;
-	if (!sclp.has_gpere)
-		return -EINVAL;
+	if (dbg->control & ~VALID_GUESTDBG_FLAGS) {
+		rc = -EINVAL;
+		goto out;
+	}
+	if (!sclp.has_gpere) {
+		rc = -EINVAL;
+		goto out;
+	}
 
 	if (dbg->control & KVM_GUESTDBG_ENABLE) {
 		vcpu->guest_debug = dbg->control;
@@ -2827,6 +2833,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 		atomic_andnot(CPUSTAT_P, &vcpu->arch.sie_block->cpuflags);
 	}
 
+out:
+	vcpu_put(vcpu);
 	return rc;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 080e00dff426d..9ef0b9299e93d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7625,6 +7625,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 	unsigned long rflags;
 	int i, r;
 
+	vcpu_load(vcpu);
+
 	if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
 		r = -EBUSY;
 		if (vcpu->arch.exception.pending)
@@ -7670,7 +7672,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 	r = 0;
 
 out:
-
+	vcpu_put(vcpu);
 	return r;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 37217ec647b41..b44762fcaf84d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2673,9 +2673,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		r = -EFAULT;
 		if (copy_from_user(&dbg, argp, sizeof(dbg)))
 			goto out;
-		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
-		vcpu_put(vcpu);
 		break;
 	}
 	case KVM_SET_SIGNAL_MASK: {

From 1393123e1e24aba96413d351b9546086ea07504d Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Mon, 4 Dec 2017 21:35:34 +0100
Subject: [PATCH 073/676] KVM: Move vcpu_load to arch-specific
 kvm_arch_vcpu_ioctl_get_fpu

Move vcpu_load() and vcpu_put() into the architecture specific
implementations of kvm_arch_vcpu_ioctl_get_fpu().

Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/s390/kvm/kvm-s390.c | 4 ++++
 arch/x86/kvm/x86.c       | 7 +++++--
 virt/kvm/kvm_main.c      | 2 --
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 1c82dda07bc57..39327888bb064 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2762,6 +2762,8 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
+	vcpu_load(vcpu);
+
 	/* make sure we have the latest values */
 	save_fpu_regs();
 	if (MACHINE_HAS_VX)
@@ -2770,6 +2772,8 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	else
 		memcpy(fpu->fprs, vcpu->run->s.regs.fprs, sizeof(fpu->fprs));
 	fpu->fpc = vcpu->run->s.regs.fpc;
+
+	vcpu_put(vcpu);
 	return 0;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9ef0b9299e93d..846d292ea33b3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7702,9 +7702,11 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-	struct fxregs_state *fxsave =
-			&vcpu->arch.guest_fpu.state.fxsave;
+	struct fxregs_state *fxsave;
 
+	vcpu_load(vcpu);
+
+	fxsave = &vcpu->arch.guest_fpu.state.fxsave;
 	memcpy(fpu->fpr, fxsave->st_space, 128);
 	fpu->fcw = fxsave->cwd;
 	fpu->fsw = fxsave->swd;
@@ -7714,6 +7716,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	fpu->last_dp = fxsave->rdp;
 	memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
 
+	vcpu_put(vcpu);
 	return 0;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b44762fcaf84d..5791aa687233d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2704,9 +2704,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		r = -ENOMEM;
 		if (!fpu)
 			goto out;
-		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
-		vcpu_put(vcpu);
 		if (r)
 			goto out;
 		r = -EFAULT;

From 6a96bc7fa0cdd96bac2b8298d708a94f8de6f6d4 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Mon, 4 Dec 2017 21:35:35 +0100
Subject: [PATCH 074/676] KVM: Move vcpu_load to arch-specific
 kvm_arch_vcpu_ioctl_set_fpu

Move vcpu_load() and vcpu_put() into the architecture specific
implementations of kvm_arch_vcpu_ioctl_set_fpu().

Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/s390/kvm/kvm-s390.c | 15 ++++++++++++---
 arch/x86/kvm/x86.c       |  8 ++++++--
 virt/kvm/kvm_main.c      |  2 --
 3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 39327888bb064..3bd2f83e1fa98 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2749,15 +2749,24 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-	if (test_fp_ctl(fpu->fpc))
-		return -EINVAL;
+	int ret = 0;
+
+	vcpu_load(vcpu);
+
+	if (test_fp_ctl(fpu->fpc)) {
+		ret = -EINVAL;
+		goto out;
+	}
 	vcpu->run->s.regs.fpc = fpu->fpc;
 	if (MACHINE_HAS_VX)
 		convert_fp_to_vx((__vector128 *) vcpu->run->s.regs.vrs,
 				 (freg_t *) fpu->fprs);
 	else
 		memcpy(vcpu->run->s.regs.fprs, &fpu->fprs, sizeof(fpu->fprs));
-	return 0;
+
+out:
+	vcpu_put(vcpu);
+	return ret;
 }
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 846d292ea33b3..981b61531ab74 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7722,8 +7722,11 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-	struct fxregs_state *fxsave =
-			&vcpu->arch.guest_fpu.state.fxsave;
+	struct fxregs_state *fxsave;
+
+	vcpu_load(vcpu);
+
+	fxsave = &vcpu->arch.guest_fpu.state.fxsave;
 
 	memcpy(fxsave->st_space, fpu->fpr, 128);
 	fxsave->cwd = fpu->fcw;
@@ -7734,6 +7737,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	fxsave->rdp = fpu->last_dp;
 	memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
 
+	vcpu_put(vcpu);
 	return 0;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5791aa687233d..ca0ec9fb72ce9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2720,9 +2720,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
 			fpu = NULL;
 			goto out;
 		}
-		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
-		vcpu_put(vcpu);
 		break;
 	}
 	default:

From 9b062471e52a1692c5563ba1535c84d708e2ff6f Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Mon, 4 Dec 2017 21:35:36 +0100
Subject: [PATCH 075/676] KVM: Move vcpu_load to arch-specific
 kvm_arch_vcpu_ioctl

Move the calls to vcpu_load() and vcpu_put() in to the architecture
specific implementations of kvm_arch_vcpu_ioctl() which dispatches
further architecture-specific ioctls on to other functions.

Some architectures support asynchronous vcpu ioctls which cannot call
vcpu_load() or take the vcpu->mutex, because that would prevent
concurrent execution with a running VCPU, which is the intended purpose
of these ioctls, for example because they inject interrupts.

We repeat the separate checks for these specifics in the architecture
code for MIPS, S390 and PPC, and avoid taking the vcpu->mutex and
calling vcpu_load for these ioctls.

Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/mips.c       | 49 +++++++++++++++++++-------------
 arch/powerpc/kvm/powerpc.c | 13 +++++----
 arch/s390/kvm/kvm-s390.c   | 19 +++++++------
 arch/x86/kvm/x86.c         | 22 +++++++++++----
 virt/kvm/arm/arm.c         | 58 ++++++++++++++++++++++++++------------
 virt/kvm/kvm_main.c        |  2 --
 6 files changed, 103 insertions(+), 60 deletions(-)

diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index dd5f463b0e726..9200b3def440a 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -910,56 +910,65 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl,
 	void __user *argp = (void __user *)arg;
 	long r;
 
+	if (ioctl == KVM_INTERRUPT) {
+		struct kvm_mips_interrupt irq;
+
+		if (copy_from_user(&irq, argp, sizeof(irq)))
+			return -EFAULT;
+		kvm_debug("[%d] %s: irq: %d\n", vcpu->vcpu_id, __func__,
+			  irq.irq);
+
+		return kvm_vcpu_ioctl_interrupt(vcpu, &irq);
+	}
+
+	vcpu_load(vcpu);
+
 	switch (ioctl) {
 	case KVM_SET_ONE_REG:
 	case KVM_GET_ONE_REG: {
 		struct kvm_one_reg reg;
 
+		r = -EFAULT;
 		if (copy_from_user(&reg, argp, sizeof(reg)))
-			return -EFAULT;
+			break;
 		if (ioctl == KVM_SET_ONE_REG)
-			return kvm_mips_set_reg(vcpu, &reg);
+			r = kvm_mips_set_reg(vcpu, &reg);
 		else
-			return kvm_mips_get_reg(vcpu, &reg);
+			r = kvm_mips_get_reg(vcpu, &reg);
+		break;
 	}
 	case KVM_GET_REG_LIST: {
 		struct kvm_reg_list __user *user_list = argp;
 		struct kvm_reg_list reg_list;
 		unsigned n;
 
+		r = -EFAULT;
 		if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
-			return -EFAULT;
+			break;
 		n = reg_list.n;
 		reg_list.n = kvm_mips_num_regs(vcpu);
 		if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
-			return -EFAULT;
+			break;
+		r = -E2BIG;
 		if (n < reg_list.n)
-			return -E2BIG;
-		return kvm_mips_copy_reg_indices(vcpu, user_list->reg);
-	}
-	case KVM_INTERRUPT:
-		{
-			struct kvm_mips_interrupt irq;
-
-			if (copy_from_user(&irq, argp, sizeof(irq)))
-				return -EFAULT;
-			kvm_debug("[%d] %s: irq: %d\n", vcpu->vcpu_id, __func__,
-				  irq.irq);
-
-			r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
 			break;
-		}
+		r = kvm_mips_copy_reg_indices(vcpu, user_list->reg);
+		break;
+	}
 	case KVM_ENABLE_CAP: {
 		struct kvm_enable_cap cap;
 
+		r = -EFAULT;
 		if (copy_from_user(&cap, argp, sizeof(cap)))
-			return -EFAULT;
+			break;
 		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
 		break;
 	}
 	default:
 		r = -ENOIOCTLCMD;
 	}
+
+	vcpu_put(vcpu);
 	return r;
 }
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 4e2167a7ae19e..ba8134a989c1b 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1614,16 +1614,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	void __user *argp = (void __user *)arg;
 	long r;
 
-	switch (ioctl) {
-	case KVM_INTERRUPT: {
+	if (ioctl == KVM_INTERRUPT) {
 		struct kvm_interrupt irq;
-		r = -EFAULT;
 		if (copy_from_user(&irq, argp, sizeof(irq)))
-			goto out;
-		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
-		goto out;
+			return -EFAULT;
+		return kvm_vcpu_ioctl_interrupt(vcpu, &irq);
 	}
 
+	vcpu_load(vcpu);
+
+	switch (ioctl) {
 	case KVM_ENABLE_CAP:
 	{
 		struct kvm_enable_cap cap;
@@ -1663,6 +1663,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	}
 
 out:
+	vcpu_put(vcpu);
 	return r;
 }
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 3bd2f83e1fa98..9700d71cb6911 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -3737,24 +3737,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	case KVM_S390_IRQ: {
 		struct kvm_s390_irq s390irq;
 
-		r = -EFAULT;
 		if (copy_from_user(&s390irq, argp, sizeof(s390irq)))
-			break;
-		r = kvm_s390_inject_vcpu(vcpu, &s390irq);
-		break;
+			return -EFAULT;
+		return kvm_s390_inject_vcpu(vcpu, &s390irq);
 	}
 	case KVM_S390_INTERRUPT: {
 		struct kvm_s390_interrupt s390int;
 		struct kvm_s390_irq s390irq;
 
-		r = -EFAULT;
 		if (copy_from_user(&s390int, argp, sizeof(s390int)))
-			break;
+			return -EFAULT;
 		if (s390int_to_s390irq(&s390int, &s390irq))
 			return -EINVAL;
-		r = kvm_s390_inject_vcpu(vcpu, &s390irq);
-		break;
+		return kvm_s390_inject_vcpu(vcpu, &s390irq);
 	}
+	}
+
+	vcpu_load(vcpu);
+
+	switch (ioctl) {
 	case KVM_S390_STORE_STATUS:
 		idx = srcu_read_lock(&vcpu->kvm->srcu);
 		r = kvm_s390_vcpu_store_status(vcpu, arg);
@@ -3879,6 +3880,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	default:
 		r = -ENOTTY;
 	}
+
+	vcpu_put(vcpu);
 	return r;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 981b61531ab74..fafaef072f1b5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3484,6 +3484,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		void *buffer;
 	} u;
 
+	vcpu_load(vcpu);
+
 	u.buffer = NULL;
 	switch (ioctl) {
 	case KVM_GET_LAPIC: {
@@ -3509,8 +3511,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		if (!lapic_in_kernel(vcpu))
 			goto out;
 		u.lapic = memdup_user(argp, sizeof(*u.lapic));
-		if (IS_ERR(u.lapic))
-			return PTR_ERR(u.lapic);
+		if (IS_ERR(u.lapic)) {
+			r = PTR_ERR(u.lapic);
+			goto out_nofree;
+		}
 
 		r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
 		break;
@@ -3684,8 +3688,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	}
 	case KVM_SET_XSAVE: {
 		u.xsave = memdup_user(argp, sizeof(*u.xsave));
-		if (IS_ERR(u.xsave))
-			return PTR_ERR(u.xsave);
+		if (IS_ERR(u.xsave)) {
+			r = PTR_ERR(u.xsave);
+			goto out_nofree;
+		}
 
 		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
 		break;
@@ -3707,8 +3713,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	}
 	case KVM_SET_XCRS: {
 		u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
-		if (IS_ERR(u.xcrs))
-			return PTR_ERR(u.xcrs);
+		if (IS_ERR(u.xcrs)) {
+			r = PTR_ERR(u.xcrs);
+			goto out_nofree;
+		}
 
 		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
 		break;
@@ -3752,6 +3760,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	}
 out:
 	kfree(u.buffer);
+out_nofree:
+	vcpu_put(vcpu);
 	return r;
 }
 
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index aa6167086211d..cd7d90c9f644d 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -1003,66 +1003,88 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	struct kvm_vcpu *vcpu = filp->private_data;
 	void __user *argp = (void __user *)arg;
 	struct kvm_device_attr attr;
+	long r;
+
+	vcpu_load(vcpu);
 
 	switch (ioctl) {
 	case KVM_ARM_VCPU_INIT: {
 		struct kvm_vcpu_init init;
 
+		r = -EFAULT;
 		if (copy_from_user(&init, argp, sizeof(init)))
-			return -EFAULT;
+			break;
 
-		return kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
+		r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
+		break;
 	}
 	case KVM_SET_ONE_REG:
 	case KVM_GET_ONE_REG: {
 		struct kvm_one_reg reg;
 
+		r = -ENOEXEC;
 		if (unlikely(!kvm_vcpu_initialized(vcpu)))
-			return -ENOEXEC;
+			break;
 
+		r = -EFAULT;
 		if (copy_from_user(&reg, argp, sizeof(reg)))
-			return -EFAULT;
+			break;
+
 		if (ioctl == KVM_SET_ONE_REG)
-			return kvm_arm_set_reg(vcpu, &reg);
+			r = kvm_arm_set_reg(vcpu, &reg);
 		else
-			return kvm_arm_get_reg(vcpu, &reg);
+			r = kvm_arm_get_reg(vcpu, &reg);
+		break;
 	}
 	case KVM_GET_REG_LIST: {
 		struct kvm_reg_list __user *user_list = argp;
 		struct kvm_reg_list reg_list;
 		unsigned n;
 
+		r = -ENOEXEC;
 		if (unlikely(!kvm_vcpu_initialized(vcpu)))
-			return -ENOEXEC;
+			break;
 
+		r = -EFAULT;
 		if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
-			return -EFAULT;
+			break;
 		n = reg_list.n;
 		reg_list.n = kvm_arm_num_regs(vcpu);
 		if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
-			return -EFAULT;
+			break;
+		r = -E2BIG;
 		if (n < reg_list.n)
-			return -E2BIG;
-		return kvm_arm_copy_reg_indices(vcpu, user_list->reg);
+			break;
+		r = kvm_arm_copy_reg_indices(vcpu, user_list->reg);
+		break;
 	}
 	case KVM_SET_DEVICE_ATTR: {
+		r = -EFAULT;
 		if (copy_from_user(&attr, argp, sizeof(attr)))
-			return -EFAULT;
-		return kvm_arm_vcpu_set_attr(vcpu, &attr);
+			break;
+		r = kvm_arm_vcpu_set_attr(vcpu, &attr);
+		break;
 	}
 	case KVM_GET_DEVICE_ATTR: {
+		r = -EFAULT;
 		if (copy_from_user(&attr, argp, sizeof(attr)))
-			return -EFAULT;
-		return kvm_arm_vcpu_get_attr(vcpu, &attr);
+			break;
+		r = kvm_arm_vcpu_get_attr(vcpu, &attr);
+		break;
 	}
 	case KVM_HAS_DEVICE_ATTR: {
+		r = -EFAULT;
 		if (copy_from_user(&attr, argp, sizeof(attr)))
-			return -EFAULT;
-		return kvm_arm_vcpu_has_attr(vcpu, &attr);
+			break;
+		r = kvm_arm_vcpu_has_attr(vcpu, &attr);
+		break;
 	}
 	default:
-		return -EINVAL;
+		r = -EINVAL;
 	}
+
+	vcpu_put(vcpu);
+	return r;
 }
 
 /**
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ca0ec9fb72ce9..19c184fa1839a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2724,9 +2724,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	default:
-		vcpu_load(vcpu);
 		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
-		vcpu_put(vcpu);
 	}
 out:
 	mutex_unlock(&vcpu->mutex);

From 5cb0944c0c66004c0d9006a7f0fba5782ae38f69 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 12 Dec 2017 17:41:34 +0100
Subject: [PATCH 076/676] KVM: introduce kvm_arch_vcpu_async_ioctl

After the vcpu_load/vcpu_put pushdown, the handling of asynchronous VCPU
ioctl is already much clearer in that it is obvious that they bypass
vcpu_load and vcpu_put.

However, it is still not perfect in that the different state of the VCPU
mutex is still hidden in the caller.  Separate those ioctls into a new
function kvm_arch_vcpu_async_ioctl that returns -ENOIOCTLCMD for more
"traditional" synchronous ioctls.

Cc: James Hogan <jhogan@kernel.org>
Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Suggested-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/kvm/Kconfig      |  1 +
 arch/mips/kvm/mips.c       | 15 ++++++++++++---
 arch/powerpc/kvm/Kconfig   |  1 +
 arch/powerpc/kvm/powerpc.c | 14 +++++++++++---
 arch/s390/kvm/Kconfig      |  1 +
 arch/s390/kvm/kvm-s390.c   | 16 ++++++++++++----
 include/linux/kvm_host.h   | 12 ++++++++++++
 virt/kvm/Kconfig           |  3 +++
 virt/kvm/kvm_main.c        | 12 +++++-------
 9 files changed, 58 insertions(+), 17 deletions(-)

diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index b17447ce88731..76b93a9c8c9b2 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -22,6 +22,7 @@ config KVM
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
 	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
+	select HAVE_KVM_VCPU_ASYNC_IOCTL
 	select KVM_MMIO
 	select MMU_NOTIFIER
 	select SRCU
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 9200b3def440a..2549fdd27ee16 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -903,12 +903,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 	return r;
 }
 
-long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl,
-			 unsigned long arg)
+long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl,
+			       unsigned long arg)
 {
 	struct kvm_vcpu *vcpu = filp->private_data;
 	void __user *argp = (void __user *)arg;
-	long r;
 
 	if (ioctl == KVM_INTERRUPT) {
 		struct kvm_mips_interrupt irq;
@@ -921,6 +920,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl,
 		return kvm_vcpu_ioctl_interrupt(vcpu, &irq);
 	}
 
+	return -ENOIOCTLCMD;
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl,
+			 unsigned long arg)
+{
+	struct kvm_vcpu *vcpu = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	long r;
+
 	vcpu_load(vcpu);
 
 	switch (ioctl) {
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index b12b8eb39c297..f884a0529dfeb 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -22,6 +22,7 @@ config KVM
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
 	select HAVE_KVM_EVENTFD
+	select HAVE_KVM_VCPU_ASYNC_IOCTL
 	select SRCU
 	select KVM_VFIO
 	select IRQ_BYPASS_MANAGER
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index ba8134a989c1b..66a310779de54 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1607,12 +1607,11 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	return -EINVAL;
 }
 
-long kvm_arch_vcpu_ioctl(struct file *filp,
-                         unsigned int ioctl, unsigned long arg)
+long kvm_arch_vcpu_async_ioctl(struct file *filp,
+			       unsigned int ioctl, unsigned long arg)
 {
 	struct kvm_vcpu *vcpu = filp->private_data;
 	void __user *argp = (void __user *)arg;
-	long r;
 
 	if (ioctl == KVM_INTERRUPT) {
 		struct kvm_interrupt irq;
@@ -1620,6 +1619,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 			return -EFAULT;
 		return kvm_vcpu_ioctl_interrupt(vcpu, &irq);
 	}
+	return -ENOIOCTLCMD;
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+                         unsigned int ioctl, unsigned long arg)
+{
+	struct kvm_vcpu *vcpu = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	long r;
 
 	vcpu_load(vcpu);
 
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index 9a4594e0a1ffe..a3dbd459cce91 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -23,6 +23,7 @@ config KVM
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
 	select HAVE_KVM_CPU_RELAX_INTERCEPT
+	select HAVE_KVM_VCPU_ASYNC_IOCTL
 	select HAVE_KVM_EVENTFD
 	select KVM_ASYNC_PF
 	select KVM_ASYNC_PF_SYNC
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 9700d71cb6911..40f0ae5a883f0 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -3725,13 +3725,11 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
 	return r;
 }
 
-long kvm_arch_vcpu_ioctl(struct file *filp,
-			 unsigned int ioctl, unsigned long arg)
+long kvm_arch_vcpu_async_ioctl(struct file *filp,
+			       unsigned int ioctl, unsigned long arg)
 {
 	struct kvm_vcpu *vcpu = filp->private_data;
 	void __user *argp = (void __user *)arg;
-	int idx;
-	long r;
 
 	switch (ioctl) {
 	case KVM_S390_IRQ: {
@@ -3752,6 +3750,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		return kvm_s390_inject_vcpu(vcpu, &s390irq);
 	}
 	}
+	return -ENOIOCTLCMD;
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+			 unsigned int ioctl, unsigned long arg)
+{
+	struct kvm_vcpu *vcpu = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	int idx;
+	long r;
 
 	vcpu_load(vcpu);
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 09de0ff3d6773..ac0062b74aed0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1260,4 +1260,16 @@ static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu)
 }
 #endif /* CONFIG_HAVE_KVM_INVALID_WAKEUPS */
 
+#ifdef CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL
+long kvm_arch_vcpu_async_ioctl(struct file *filp,
+			       unsigned int ioctl, unsigned long arg);
+#else
+static inline long kvm_arch_vcpu_async_ioctl(struct file *filp,
+					     unsigned int ioctl,
+					     unsigned long arg)
+{
+	return -ENOIOCTLCMD;
+}
+#endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */
+
 #endif
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 70691c08e1edc..cca7e065a075d 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -51,3 +51,6 @@ config KVM_COMPAT
 
 config HAVE_KVM_IRQ_BYPASS
        bool
+
+config HAVE_KVM_VCPU_ASYNC_IOCTL
+       bool
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 19c184fa1839a..b4414842b0239 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2544,15 +2544,13 @@ static long kvm_vcpu_ioctl(struct file *filp,
 	if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
 		return -EINVAL;
 
-#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
 	/*
-	 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
-	 * so vcpu_load() would break it.
+	 * Some architectures have vcpu ioctls that are asynchronous to vcpu
+	 * execution; mutex_lock() would break them.
 	 */
-	if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_S390_IRQ || ioctl == KVM_INTERRUPT)
-		return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
-#endif
-
+	r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
+	if (r != -ENOIOCTLCMD)
+		return r;
 
 	if (mutex_lock_killable(&vcpu->mutex))
 		return -EINTR;

From a376cd91606365609d8fbd57247618bd51da1fc6 Mon Sep 17 00:00:00 2001
From: Bhumika Goyal <bhumirks@gmail.com>
Date: Sat, 5 Aug 2017 13:50:42 +0530
Subject: [PATCH 077/676] platform/chrome: chromeos_laptop: make
 chromeos_laptop const

Declare chromeos_laptop structures as const as they are only used during
a copy operation. As their value is never modified during runtime, they
can be made const.

Signed-off-by: Bhumika Goyal <bhumirks@gmail.com>
Signed-off-by: Benson Leung <bleung@chromium.org>
---
 drivers/platform/chrome/chromeos_laptop.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/platform/chrome/chromeos_laptop.c b/drivers/platform/chrome/chromeos_laptop.c
index d8599736a41a2..6dec6ab133007 100644
--- a/drivers/platform/chrome/chromeos_laptop.c
+++ b/drivers/platform/chrome/chromeos_laptop.c
@@ -423,7 +423,7 @@ static int chromeos_laptop_probe(struct platform_device *pdev)
 	return ret;
 }
 
-static struct chromeos_laptop samsung_series_5_550 = {
+static const struct chromeos_laptop samsung_series_5_550 = {
 	.i2c_peripherals = {
 		/* Touchpad. */
 		{ .add = setup_cyapa_tp, I2C_ADAPTER_SMBUS },
@@ -432,14 +432,14 @@ static struct chromeos_laptop samsung_series_5_550 = {
 	},
 };
 
-static struct chromeos_laptop samsung_series_5 = {
+static const struct chromeos_laptop samsung_series_5 = {
 	.i2c_peripherals = {
 		/* Light Sensor. */
 		{ .add = setup_tsl2583_als, I2C_ADAPTER_SMBUS },
 	},
 };
 
-static struct chromeos_laptop chromebook_pixel = {
+static const struct chromeos_laptop chromebook_pixel = {
 	.i2c_peripherals = {
 		/* Touch Screen. */
 		{ .add = setup_atmel_1664s_ts, I2C_ADAPTER_PANEL },
@@ -450,14 +450,14 @@ static struct chromeos_laptop chromebook_pixel = {
 	},
 };
 
-static struct chromeos_laptop hp_chromebook_14 = {
+static const struct chromeos_laptop hp_chromebook_14 = {
 	.i2c_peripherals = {
 		/* Touchpad. */
 		{ .add = setup_cyapa_tp, I2C_ADAPTER_DESIGNWARE_0 },
 	},
 };
 
-static struct chromeos_laptop dell_chromebook_11 = {
+static const struct chromeos_laptop dell_chromebook_11 = {
 	.i2c_peripherals = {
 		/* Touchpad. */
 		{ .add = setup_cyapa_tp, I2C_ADAPTER_DESIGNWARE_0 },
@@ -466,28 +466,28 @@ static struct chromeos_laptop dell_chromebook_11 = {
 	},
 };
 
-static struct chromeos_laptop toshiba_cb35 = {
+static const struct chromeos_laptop toshiba_cb35 = {
 	.i2c_peripherals = {
 		/* Touchpad. */
 		{ .add = setup_cyapa_tp, I2C_ADAPTER_DESIGNWARE_0 },
 	},
 };
 
-static struct chromeos_laptop acer_c7_chromebook = {
+static const struct chromeos_laptop acer_c7_chromebook = {
 	.i2c_peripherals = {
 		/* Touchpad. */
 		{ .add = setup_cyapa_tp, I2C_ADAPTER_SMBUS },
 	},
 };
 
-static struct chromeos_laptop acer_ac700 = {
+static const struct chromeos_laptop acer_ac700 = {
 	.i2c_peripherals = {
 		/* Light Sensor. */
 		{ .add = setup_tsl2563_als, I2C_ADAPTER_SMBUS },
 	},
 };
 
-static struct chromeos_laptop acer_c720 = {
+static const struct chromeos_laptop acer_c720 = {
 	.i2c_peripherals = {
 		/* Touchscreen. */
 		{ .add = setup_atmel_1664s_ts, I2C_ADAPTER_DESIGNWARE_1 },
@@ -500,14 +500,14 @@ static struct chromeos_laptop acer_c720 = {
 	},
 };
 
-static struct chromeos_laptop hp_pavilion_14_chromebook = {
+static const struct chromeos_laptop hp_pavilion_14_chromebook = {
 	.i2c_peripherals = {
 		/* Touchpad. */
 		{ .add = setup_cyapa_tp, I2C_ADAPTER_SMBUS },
 	},
 };
 
-static struct chromeos_laptop cr48 = {
+static const struct chromeos_laptop cr48 = {
 	.i2c_peripherals = {
 		/* Light Sensor. */
 		{ .add = setup_tsl2563_als, I2C_ADAPTER_SMBUS },

From 50a0d71a5d20e1d3eff1d974fdc8559ad6d74892 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 4 Dec 2017 15:49:48 +0100
Subject: [PATCH 078/676] cros_ec: fix nul-termination for firmware build info

As gcc-8 reports, we zero out the wrong byte:

drivers/platform/chrome/cros_ec_sysfs.c: In function 'show_ec_version':
drivers/platform/chrome/cros_ec_sysfs.c:190:12: error: array subscript 4294967295 is above array bounds of 'uint8_t[]' [-Werror=array-bounds]

This changes the code back to what it did before changing to a
zero-length array structure.

Fixes: a841178445bb ("mfd: cros_ec: Use a zero-length array for command data")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Benson Leung <bleung@chromium.org>
---
 drivers/platform/chrome/cros_ec_sysfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/platform/chrome/cros_ec_sysfs.c b/drivers/platform/chrome/cros_ec_sysfs.c
index d6eebe8721875..da0a719d32f75 100644
--- a/drivers/platform/chrome/cros_ec_sysfs.c
+++ b/drivers/platform/chrome/cros_ec_sysfs.c
@@ -185,7 +185,7 @@ static ssize_t show_ec_version(struct device *dev,
 		count += scnprintf(buf + count, PAGE_SIZE - count,
 				   "Build info:    EC error %d\n", msg->result);
 	else {
-		msg->data[sizeof(msg->data) - 1] = '\0';
+		msg->data[EC_HOST_PARAM_SIZE - 1] = '\0';
 		count += scnprintf(buf + count, PAGE_SIZE - count,
 				   "Build info:    %s\n", msg->data);
 	}

From d3b56c566d4ba8cae688baf3cca94425d57ea783 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 31 Oct 2017 10:27:47 +0000
Subject: [PATCH 079/676] platform/chrome: cros_ec_lpc: remove redundant
 pointer request

Pointer request is being assigned but never used, so remove it. Cleans
up the clang warning:

drivers/platform/chrome/cros_ec_lpc.c:68:2: warning: Value stored to
'request' is never read

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Benson Leung <bleung@chromium.org>
---
 drivers/platform/chrome/cros_ec_lpc.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c
index 1baf720faf690..87e9747d229af 100644
--- a/drivers/platform/chrome/cros_ec_lpc.c
+++ b/drivers/platform/chrome/cros_ec_lpc.c
@@ -54,7 +54,6 @@ static int ec_response_timed_out(void)
 static int cros_ec_pkt_xfer_lpc(struct cros_ec_device *ec,
 				struct cros_ec_command *msg)
 {
-	struct ec_host_request *request;
 	struct ec_host_response response;
 	u8 sum;
 	int ret = 0;
@@ -65,8 +64,6 @@ static int cros_ec_pkt_xfer_lpc(struct cros_ec_device *ec,
 	/* Write buffer */
 	cros_ec_lpc_write_bytes(EC_LPC_ADDR_HOST_PACKET, ret, ec->dout);
 
-	request = (struct ec_host_request *)ec->dout;
-
 	/* Here we go */
 	sum = EC_COMMAND_PROTOCOL_3;
 	cros_ec_lpc_write_bytes(EC_LPC_ADDR_HOST_CMD, 1, &sum);

From 5f454bdf63536d7d0142be3cbb9e83899a199ec8 Mon Sep 17 00:00:00 2001
From: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Date: Fri, 1 Dec 2017 14:42:21 +0100
Subject: [PATCH 080/676] platform/chrome: cros_ec_lpc: Register the driver if
 ACPI entry is missing.

Commit 12278dc7c572 ("platform/chrome: cros_ec_lpc: Add support for
GOOG004 ACPI device") added support when the firmware reports the ACPI
device, there are some firmwares though that doesn't report this device
but have it. In such cases we need to instantiate the driver explicitly
if it is not instantiated through ACPI.

Fixes: 12278dc7c572 ("platform/chrome: cros_ec_lpc: Add support for GOOG004 ACPI device")
Signed-off-by: Guenter Roeck <groeck@chromium.org>
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Signed-off-by: Thierry Escande <thierry.escande@collabora.com>
Reviewed-by: Gwendal Grignou <gwendal@chromium.org>
Signed-off-by: Benson Leung <bleung@chromium.org>
---
 drivers/platform/chrome/cros_ec_lpc.c | 34 ++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c
index 87e9747d229af..47ace99389524 100644
--- a/drivers/platform/chrome/cros_ec_lpc.c
+++ b/drivers/platform/chrome/cros_ec_lpc.c
@@ -35,6 +35,9 @@
 #define DRV_NAME "cros_ec_lpcs"
 #define ACPI_DRV_NAME "GOOG0004"
 
+/* True if ACPI device is present */
+static bool cros_ec_lpc_acpi_device_found;
+
 static int ec_response_timed_out(void)
 {
 	unsigned long one_second = jiffies + HZ;
@@ -393,9 +396,21 @@ static struct platform_driver cros_ec_lpc_driver = {
 	.remove = cros_ec_lpc_remove,
 };
 
+static struct platform_device cros_ec_lpc_device = {
+	.name = DRV_NAME
+};
+
+static acpi_status cros_ec_lpc_parse_device(acpi_handle handle, u32 level,
+					    void *context, void **retval)
+{
+	*(bool *)context = true;
+	return AE_CTRL_TERMINATE;
+}
+
 static int __init cros_ec_lpc_init(void)
 {
 	int ret;
+	acpi_status status;
 
 	if (!dmi_check_system(cros_ec_lpc_dmi_table)) {
 		pr_err(DRV_NAME ": unsupported system.\n");
@@ -412,11 +427,28 @@ static int __init cros_ec_lpc_init(void)
 		return ret;
 	}
 
-	return 0;
+	status = acpi_get_devices(ACPI_DRV_NAME, cros_ec_lpc_parse_device,
+				  &cros_ec_lpc_acpi_device_found, NULL);
+	if (ACPI_FAILURE(status))
+		pr_warn(DRV_NAME ": Looking for %s failed\n", ACPI_DRV_NAME);
+
+	if (!cros_ec_lpc_acpi_device_found) {
+		/* Register the device, and it'll get hooked up automatically */
+		ret = platform_device_register(&cros_ec_lpc_device);
+		if (ret) {
+			pr_err(DRV_NAME ": can't register device: %d\n", ret);
+			platform_driver_unregister(&cros_ec_lpc_driver);
+			cros_ec_lpc_reg_destroy();
+		}
+	}
+
+	return ret;
 }
 
 static void __exit cros_ec_lpc_exit(void)
 {
+	if (!cros_ec_lpc_acpi_device_found)
+		platform_device_unregister(&cros_ec_lpc_device);
 	platform_driver_unregister(&cros_ec_lpc_driver);
 	cros_ec_lpc_reg_destroy();
 }

From e675191729029cfb1cd624a03864f6426906a078 Mon Sep 17 00:00:00 2001
From: Thierry Escande <thierry.escande@collabora.com>
Date: Fri, 1 Dec 2017 14:42:22 +0100
Subject: [PATCH 081/676] platform/chrome: cros_ec_lpc: Add support for Google
 Glimmer

This patch adds device information to the DMI table of the cros_ec_lpc
driver for Google Glimmer devices. Since Google BIOS does not enumerate
devices in the LPC bus, the cros_ec_lpc driver checks for system
compatibility and registers the cros_ec device itself.

Signed-off-by: Thierry Escande <thierry.escande@collabora.com>
Reviewed-by: Gwendal Grignou <gwendal@chromium.org>
Signed-off-by: Benson Leung <bleung@chromium.org>
---
 drivers/platform/chrome/cros_ec_lpc.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c
index 47ace99389524..af89e82eecd23 100644
--- a/drivers/platform/chrome/cros_ec_lpc.c
+++ b/drivers/platform/chrome/cros_ec_lpc.c
@@ -362,6 +362,13 @@ static const struct dmi_system_id cros_ec_lpc_dmi_table[] __initconst = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "Peppy"),
 		},
 	},
+	{
+		/* x86-glimmer, the Lenovo Thinkpad Yoga 11e. */
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "GOOGLE"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Glimmer"),
+		},
+	},
 	{ /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(dmi, cros_ec_lpc_dmi_table);

From d48b8c58c57f6edbe2965f0a5f62c5cf9593ca96 Mon Sep 17 00:00:00 2001
From: Shawn Nematbakhsh <shawnn@chromium.org>
Date: Fri, 8 Sep 2017 13:50:11 -0700
Subject: [PATCH 082/676] platform/chrome: Use proper protocol transfer
 function

pkt_xfer should be used for protocol v3, and cmd_xfer otherwise. We had
one instance of these functions correct, but not the second, fall-back
case. We use the fall-back only when the first command returns an
IN_PROGRESS status, which is only used on some EC firmwares where we
don't want to constantly poll the bus, but instead back off and
sleep/retry for a little while.

Fixes: 2c7589af3c4d ("mfd: cros_ec: add proto v3 skeleton")
Signed-off-by: Shawn Nematbakhsh <shawnn@chromium.org>
Signed-off-by: Brian Norris <briannorris@chromium.org>
Reviewed-by: Javier Martinez Canillas <javier@osg.samsung.com>
Signed-off-by: Benson Leung <bleung@chromium.org>
---
 drivers/platform/chrome/cros_ec_proto.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c
index 8dfa7fcb12488..e7bbdf947bbcf 100644
--- a/drivers/platform/chrome/cros_ec_proto.c
+++ b/drivers/platform/chrome/cros_ec_proto.c
@@ -60,12 +60,14 @@ static int send_command(struct cros_ec_device *ec_dev,
 			struct cros_ec_command *msg)
 {
 	int ret;
+	int (*xfer_fxn)(struct cros_ec_device *ec, struct cros_ec_command *msg);
 
 	if (ec_dev->proto_version > 2)
-		ret = ec_dev->pkt_xfer(ec_dev, msg);
+		xfer_fxn = ec_dev->pkt_xfer;
 	else
-		ret = ec_dev->cmd_xfer(ec_dev, msg);
+		xfer_fxn = ec_dev->cmd_xfer;
 
+	ret = (*xfer_fxn)(ec_dev, msg);
 	if (msg->result == EC_RES_IN_PROGRESS) {
 		int i;
 		struct cros_ec_command *status_msg;
@@ -88,7 +90,7 @@ static int send_command(struct cros_ec_device *ec_dev,
 		for (i = 0; i < EC_COMMAND_RETRIES; i++) {
 			usleep_range(10000, 11000);
 
-			ret = ec_dev->cmd_xfer(ec_dev, status_msg);
+			ret = (*xfer_fxn)(ec_dev, status_msg);
 			if (ret < 0)
 				break;
 

From 0c0543a128bd1c6a4c8610d0d9d869053fa2fbf5 Mon Sep 17 00:00:00 2001
From: Andrew Jones <drjones@redhat.com>
Date: Sat, 25 Nov 2017 18:40:31 +0100
Subject: [PATCH 083/676] arm64: KVM: Hide PMU from guests when disabled

Since commit 93390c0a1b20 ("arm64: KVM: Hide unsupported AArch64 CPU
features from guests") we can hide cpu features from guests. Apply
this to a long standing issue where guests see a PMU available, but
it's not, because it was not enabled by KVM's userspace.

Signed-off-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm64/kvm/sys_regs.c | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 1830ebc227d18..503144f13af02 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -881,18 +881,25 @@ static bool access_cntp_cval(struct kvm_vcpu *vcpu,
 }
 
 /* Read a sanitised cpufeature ID register by sys_reg_desc */
-static u64 read_id_reg(struct sys_reg_desc const *r, bool raz)
+static u64 read_id_reg(struct kvm_vcpu *vcpu,
+		       struct sys_reg_desc const *r,
+		       bool raz)
 {
 	u32 id = sys_reg((u32)r->Op0, (u32)r->Op1,
 			 (u32)r->CRn, (u32)r->CRm, (u32)r->Op2);
 	u64 val = raz ? 0 : read_sanitised_ftr_reg(id);
 
-	if (id == SYS_ID_AA64PFR0_EL1) {
+	switch (id) {
+	case SYS_ID_AA64DFR0_EL1:
+		if (!kvm_arm_pmu_v3_ready(vcpu))
+			val &= ~(0xfUL << ID_AA64DFR0_PMUVER_SHIFT);
+		break;
+	case SYS_ID_AA64PFR0_EL1:
 		if (val & (0xfUL << ID_AA64PFR0_SVE_SHIFT))
 			pr_err_once("kvm [%i]: SVE unsupported for guests, suppressing\n",
 				    task_pid_nr(current));
-
 		val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT);
+		break;
 	}
 
 	return val;
@@ -908,7 +915,7 @@ static bool __access_id_reg(struct kvm_vcpu *vcpu,
 	if (p->is_write)
 		return write_to_read_only(vcpu, p, r);
 
-	p->regval = read_id_reg(r, raz);
+	p->regval = read_id_reg(vcpu, r, raz);
 	return true;
 }
 
@@ -937,17 +944,17 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg);
  * are stored, and for set_id_reg() we don't allow the effective value
  * to be changed.
  */
-static int __get_id_reg(const struct sys_reg_desc *rd, void __user *uaddr,
-			bool raz)
+static int __get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+			void __user *uaddr, bool raz)
 {
 	const u64 id = sys_reg_to_index(rd);
-	const u64 val = read_id_reg(rd, raz);
+	const u64 val = read_id_reg(vcpu, rd, raz);
 
 	return reg_to_user(uaddr, &val, id);
 }
 
-static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr,
-			bool raz)
+static int __set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+			void __user *uaddr, bool raz)
 {
 	const u64 id = sys_reg_to_index(rd);
 	int err;
@@ -958,7 +965,7 @@ static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr,
 		return err;
 
 	/* This is what we mean by invariant: you can't change it. */
-	if (val != read_id_reg(rd, raz))
+	if (val != read_id_reg(vcpu, rd, raz))
 		return -EINVAL;
 
 	return 0;
@@ -967,25 +974,25 @@ static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr,
 static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
 		      const struct kvm_one_reg *reg, void __user *uaddr)
 {
-	return __get_id_reg(rd, uaddr, false);
+	return __get_id_reg(vcpu, rd, uaddr, false);
 }
 
 static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
 		      const struct kvm_one_reg *reg, void __user *uaddr)
 {
-	return __set_id_reg(rd, uaddr, false);
+	return __set_id_reg(vcpu, rd, uaddr, false);
 }
 
 static int get_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
 			  const struct kvm_one_reg *reg, void __user *uaddr)
 {
-	return __get_id_reg(rd, uaddr, true);
+	return __get_id_reg(vcpu, rd, uaddr, true);
 }
 
 static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
 			  const struct kvm_one_reg *reg, void __user *uaddr)
 {
-	return __set_id_reg(rd, uaddr, true);
+	return __set_id_reg(vcpu, rd, uaddr, true);
 }
 
 /* sys_reg_desc initialiser for known cpufeature ID registers */

From 4404b336cf32bf709942067fe71d8b5cf70fb2b2 Mon Sep 17 00:00:00 2001
From: Vasyl Gomonovych <gomonovych@gmail.com>
Date: Tue, 28 Nov 2017 23:48:17 +0100
Subject: [PATCH 084/676] KVM: arm: Use PTR_ERR_OR_ZERO()

Fix ptr_ret.cocci warnings:
virt/kvm/arm/vgic/vgic-its.c:971:1-3: WARNING: PTR_ERR_OR_ZERO can be used

Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR

Generated by: scripts/coccinelle/api/ptr_ret.cocci

Signed-off-by: Vasyl Gomonovych <gomonovych@gmail.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 virt/kvm/arm/vgic/vgic-its.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 8e633bd9cc1e7..465095355666e 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -1034,10 +1034,8 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
 
 	device = vgic_its_alloc_device(its, device_id, itt_addr,
 				       num_eventid_bits);
-	if (IS_ERR(device))
-		return PTR_ERR(device);
 
-	return 0;
+	return PTR_ERR_OR_ZERO(device);
 }
 
 /*

From 5a24575032971c5a9a4580417a791c427ebdb8e5 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Fri, 8 Sep 2017 07:07:13 -0700
Subject: [PATCH 085/676] KVM: arm/arm64: Remove redundant preemptible checks

The __this_cpu_read() and __this_cpu_write() functions already implement
checks for the required preemption levels when using
CONFIG_DEBUG_PREEMPT which gives you nice error messages and such.
Therefore there is no need to explicitly check this using a BUG_ON() in
the code (which we don't do for other uses of per cpu variables either).

Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Reviewed-by: Andre Przywara <andre.przywara@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 virt/kvm/arm/arm.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 6b60c98a6e229..3610e132df8bd 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -71,7 +71,6 @@ static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
 
 static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
 {
-	BUG_ON(preemptible());
 	__this_cpu_write(kvm_arm_running_vcpu, vcpu);
 }
 
@@ -81,7 +80,6 @@ static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
  */
 struct kvm_vcpu *kvm_arm_get_running_vcpu(void)
 {
-	BUG_ON(preemptible());
 	return __this_cpu_read(kvm_arm_running_vcpu);
 }
 

From 6c1b7521f4a07cc63bbe2dfe290efed47cdb780a Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Thu, 14 Sep 2017 11:08:45 -0700
Subject: [PATCH 086/676] KVM: arm/arm64: Factor out functionality to get vgic
 mmio requester_vcpu

We are about to distinguish between userspace accesses and mmio traps
for a number of the mmio handlers.  When the requester vcpu is NULL, it
means we are handling a userspace access.

Factor out the functionality to get the request vcpu into its own
function, mostly so we have a common place to document the semantics of
the return value.

Also take the chance to move the functionality outside of holding a
spinlock and instead explicitly disable and enable preemption.  This
supports PREEMPT_RT kernels as well.

Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Reviewed-by: Andre Przywara <andre.przywara@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 virt/kvm/arm/vgic/vgic-mmio.c | 44 ++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index deb51ee16a3da..fdad95f62fa3b 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -122,6 +122,27 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
 	return value;
 }
 
+/*
+ * This function will return the VCPU that performed the MMIO access and
+ * trapped from within the VM, and will return NULL if this is a userspace
+ * access.
+ *
+ * We can disable preemption locally around accessing the per-CPU variable,
+ * and use the resolved vcpu pointer after enabling preemption again, because
+ * even if the current thread is migrated to another CPU, reading the per-CPU
+ * value later will give us the same value as we update the per-CPU variable
+ * in the preempt notifier handlers.
+ */
+static struct kvm_vcpu *vgic_get_mmio_requester_vcpu(void)
+{
+	struct kvm_vcpu *vcpu;
+
+	preempt_disable();
+	vcpu = kvm_arm_get_running_vcpu();
+	preempt_enable();
+	return vcpu;
+}
+
 void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
 			      gpa_t addr, unsigned int len,
 			      unsigned long val)
@@ -184,24 +205,10 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
 static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
 				    bool new_active_state)
 {
-	struct kvm_vcpu *requester_vcpu;
 	unsigned long flags;
-	spin_lock_irqsave(&irq->irq_lock, flags);
+	struct kvm_vcpu *requester_vcpu = vgic_get_mmio_requester_vcpu();
 
-	/*
-	 * The vcpu parameter here can mean multiple things depending on how
-	 * this function is called; when handling a trap from the kernel it
-	 * depends on the GIC version, and these functions are also called as
-	 * part of save/restore from userspace.
-	 *
-	 * Therefore, we have to figure out the requester in a reliable way.
-	 *
-	 * When accessing VGIC state from user space, the requester_vcpu is
-	 * NULL, which is fine, because we guarantee that no VCPUs are running
-	 * when accessing VGIC state from user space so irq->vcpu->cpu is
-	 * always -1.
-	 */
-	requester_vcpu = kvm_arm_get_running_vcpu();
+	spin_lock_irqsave(&irq->irq_lock, flags);
 
 	/*
 	 * If this virtual IRQ was written into a list register, we
@@ -213,6 +220,11 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
 	 * vgic_change_active_prepare)  and still has to sync back this IRQ,
 	 * so we release and re-acquire the spin_lock to let the other thread
 	 * sync back the IRQ.
+	 *
+	 * When accessing VGIC state from user space, requester_vcpu is
+	 * NULL, which is fine, because we guarantee that no VCPUs are running
+	 * when accessing VGIC state from user space so irq->vcpu->cpu is
+	 * always -1.
 	 */
 	while (irq->vcpu && /* IRQ may have state in an LR somewhere */
 	       irq->vcpu != requester_vcpu && /* Current thread is not the VCPU thread */

From 70450a9fbe0658e864f882d4351e5bae018b2647 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Mon, 4 Sep 2017 11:56:37 +0200
Subject: [PATCH 087/676] KVM: arm/arm64: Don't cache the timer IRQ level

The timer logic was designed after a strict idea of modeling an
interrupt line level in software, meaning that only transitions in the
level need to be reported to the VGIC.  This works well for the timer,
because the arch timer code is in complete control of the device and can
track the transitions of the line.

However, as we are about to support using the HW bit in the VGIC not
just for the timer, but also for VFIO which cannot track transitions of
the interrupt line, we have to decide on an interface between the GIC
and other subsystems for level triggered mapped interrupts, which both
the timer and VFIO can use.

VFIO only sees an asserting transition of the physical interrupt line,
and tells the VGIC when that happens.  That means that part of the
interrupt flow is offloaded to the hardware.

To use the same interface for VFIO devices and the timer, we therefore
have to change the timer (we cannot change VFIO because it doesn't know
the details of the device it is assigning to a VM).

Luckily, changing the timer is simple, we just need to stop 'caching'
the line level, but instead let the VGIC know the state of the timer
every time there is a potential change in the line level, and when the
line level should be asserted from the timer ISR.  The VGIC can ignore
extra notifications using its validate mechanism.

Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Reviewed-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Julien Thierry <julien.thierry@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 virt/kvm/arm/arch_timer.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index f9555b1e7f158..9376fe03bf2e9 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -99,11 +99,9 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
 	}
 	vtimer = vcpu_vtimer(vcpu);
 
-	if (!vtimer->irq.level) {
-		vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
-		if (kvm_timer_irq_can_fire(vtimer))
-			kvm_timer_update_irq(vcpu, true, vtimer);
-	}
+	vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
+	if (kvm_timer_irq_can_fire(vtimer))
+		kvm_timer_update_irq(vcpu, true, vtimer);
 
 	if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
 		kvm_vtimer_update_mask_user(vcpu);
@@ -324,12 +322,20 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+	bool level;
 
 	if (unlikely(!timer->enabled))
 		return;
 
-	if (kvm_timer_should_fire(vtimer) != vtimer->irq.level)
-		kvm_timer_update_irq(vcpu, !vtimer->irq.level, vtimer);
+	/*
+	 * The vtimer virtual interrupt is a 'mapped' interrupt, meaning part
+	 * of its lifecycle is offloaded to the hardware, and we therefore may
+	 * not have lowered the irq.level value before having to signal a new
+	 * interrupt, but have to signal an interrupt every time the level is
+	 * asserted.
+	 */
+	level = kvm_timer_should_fire(vtimer);
+	kvm_timer_update_irq(vcpu, level, vtimer);
 
 	if (kvm_timer_should_fire(ptimer) != ptimer->irq.level)
 		kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer);

From e40cc57bac792713ff6a1b80a1f67b54c05e5e21 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Tue, 29 Aug 2017 10:40:44 +0200
Subject: [PATCH 088/676] KVM: arm/arm64: vgic: Support level-triggered mapped
 interrupts

Level-triggered mapped IRQs are special because we only observe rising
edges as input to the VGIC, and we don't set the EOI flag and therefore
are not told when the level goes down, so that we can re-queue a new
interrupt when the level goes up.

One way to solve this problem is to side-step the logic of the VGIC and
special case the validation in the injection path, but it has the
unfortunate drawback of having to peak into the physical GIC state
whenever we want to know if the interrupt is pending on the virtual
distributor.

Instead, we can maintain the current semantics of a level triggered
interrupt by sort of treating it as an edge-triggered interrupt,
following from the fact that we only observe an asserting edge.  This
requires us to be a bit careful when populating the LRs and when folding
the state back in though:

 * We lower the line level when populating the LR, so that when
   subsequently observing an asserting edge, the VGIC will do the right
   thing.

 * If the guest never acked the interrupt while running (for example if
   it had masked interrupts at the CPU level while running), we have
   to preserve the pending state of the LR and move it back to the
   line_level field of the struct irq when folding LR state.

   If the guest never acked the interrupt while running, but changed the
   device state and lowered the line (again with interrupts masked) then
   we need to observe this change in the line_level.

   Both of the above situations are solved by sampling the physical line
   and set the line level when folding the LR back.

 * Finally, if the guest never acked the interrupt while running and
   sampling the line reveals that the device state has changed and the
   line has been lowered, we must clear the physical active state, since
   we will otherwise never be told when the interrupt becomes asserted
   again.

This has the added benefit of making the timer optimization patches
(https://lists.cs.columbia.edu/pipermail/kvmarm/2017-July/026343.html) a
bit simpler, because the timer code doesn't have to clear the active
state on the sync anymore.  It also potentially improves the performance
of the timer implementation because the GIC knows the state or the LR
and only needs to clear the
active state when the pending bit in the LR is still set, where the
timer has to always clear it when returning from running the guest with
an injected timer interrupt.

Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 virt/kvm/arm/vgic/vgic-v2.c | 29 +++++++++++++++++++++++++++++
 virt/kvm/arm/vgic/vgic-v3.c | 29 +++++++++++++++++++++++++++++
 virt/kvm/arm/vgic/vgic.c    | 23 +++++++++++++++++++++++
 virt/kvm/arm/vgic/vgic.h    |  7 +++++++
 4 files changed, 88 insertions(+)

diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c
index 80897102da26c..c32d7b93ffd19 100644
--- a/virt/kvm/arm/vgic/vgic-v2.c
+++ b/virt/kvm/arm/vgic/vgic-v2.c
@@ -105,6 +105,26 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
 				irq->pending_latch = false;
 		}
 
+		/*
+		 * Level-triggered mapped IRQs are special because we only
+		 * observe rising edges as input to the VGIC.
+		 *
+		 * If the guest never acked the interrupt we have to sample
+		 * the physical line and set the line level, because the
+		 * device state could have changed or we simply need to
+		 * process the still pending interrupt later.
+		 *
+		 * If this causes us to lower the level, we have to also clear
+		 * the physical active state, since we will otherwise never be
+		 * told when the interrupt becomes asserted again.
+		 */
+		if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) {
+			irq->line_level = vgic_get_phys_line_level(irq);
+
+			if (!irq->line_level)
+				vgic_irq_set_phys_active(irq, false);
+		}
+
 		spin_unlock_irqrestore(&irq->irq_lock, flags);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
@@ -162,6 +182,15 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
 			val |= GICH_LR_EOI;
 	}
 
+	/*
+	 * Level-triggered mapped IRQs are special because we only observe
+	 * rising edges as input to the VGIC.  We therefore lower the line
+	 * level here, so that we can take new virtual IRQs.  See
+	 * vgic_v2_fold_lr_state for more info.
+	 */
+	if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT))
+		irq->line_level = false;
+
 	/* The GICv2 LR only holds five bits of priority. */
 	val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT;
 
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index f47e8481fa452..6b329414e57a3 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -96,6 +96,26 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
 				irq->pending_latch = false;
 		}
 
+		/*
+		 * Level-triggered mapped IRQs are special because we only
+		 * observe rising edges as input to the VGIC.
+		 *
+		 * If the guest never acked the interrupt we have to sample
+		 * the physical line and set the line level, because the
+		 * device state could have changed or we simply need to
+		 * process the still pending interrupt later.
+		 *
+		 * If this causes us to lower the level, we have to also clear
+		 * the physical active state, since we will otherwise never be
+		 * told when the interrupt becomes asserted again.
+		 */
+		if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) {
+			irq->line_level = vgic_get_phys_line_level(irq);
+
+			if (!irq->line_level)
+				vgic_irq_set_phys_active(irq, false);
+		}
+
 		spin_unlock_irqrestore(&irq->irq_lock, flags);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
@@ -145,6 +165,15 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
 			val |= ICH_LR_EOI;
 	}
 
+	/*
+	 * Level-triggered mapped IRQs are special because we only observe
+	 * rising edges as input to the VGIC.  We therefore lower the line
+	 * level here, so that we can take new virtual IRQs.  See
+	 * vgic_v3_fold_lr_state for more info.
+	 */
+	if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT))
+		irq->line_level = false;
+
 	/*
 	 * We currently only support Group1 interrupts, which is a
 	 * known defect. This needs to be addressed at some point.
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index ecb8e25f5fe56..4318e9edd075d 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -144,6 +144,29 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
 	kfree(irq);
 }
 
+/* Get the input level of a mapped IRQ directly from the physical GIC */
+bool vgic_get_phys_line_level(struct vgic_irq *irq)
+{
+	bool line_level;
+
+	BUG_ON(!irq->hw);
+
+	WARN_ON(irq_get_irqchip_state(irq->host_irq,
+				      IRQCHIP_STATE_PENDING,
+				      &line_level));
+	return line_level;
+}
+
+/* Set/Clear the physical active state */
+void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active)
+{
+
+	BUG_ON(!irq->hw);
+	WARN_ON(irq_set_irqchip_state(irq->host_irq,
+				      IRQCHIP_STATE_ACTIVE,
+				      active));
+}
+
 /**
  * kvm_vgic_target_oracle - compute the target vcpu for an irq
  *
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index efbcf8f96f9c1..d0787983a3573 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -104,6 +104,11 @@ static inline bool irq_is_pending(struct vgic_irq *irq)
 		return irq->pending_latch || irq->line_level;
 }
 
+static inline bool vgic_irq_is_mapped_level(struct vgic_irq *irq)
+{
+	return irq->config == VGIC_CONFIG_LEVEL && irq->hw;
+}
+
 /*
  * This struct provides an intermediate representation of the fields contained
  * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC
@@ -140,6 +145,8 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
 struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
 			      u32 intid);
 void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
+bool vgic_get_phys_line_level(struct vgic_irq *irq);
+void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
 bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
 			   unsigned long flags);
 void vgic_kick_vcpus(struct kvm *kvm);

From b6909a659f8d5de2df36cabb1c0505d262996a24 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Fri, 27 Oct 2017 19:30:09 +0200
Subject: [PATCH 089/676] KVM: arm/arm64: Support a vgic interrupt line level
 sample function

The GIC sometimes need to sample the physical line of a mapped
interrupt.  As we know this to be notoriously slow, provide a callback
function for devices (such as the timer) which can do this much faster
than talking to the distributor, for example by comparing a few
in-memory values.  Fall back to the good old method of poking the
physical GIC if no callback is provided.

Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 include/kvm/arm_vgic.h    | 13 ++++++++++++-
 virt/kvm/arm/arch_timer.c |  3 ++-
 virt/kvm/arm/vgic/vgic.c  | 13 +++++++++----
 3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 8c896540a72cf..cdbd142ca7f2e 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -130,6 +130,17 @@ struct vgic_irq {
 	u8 priority;
 	enum vgic_irq_config config;	/* Level or edge */
 
+	/*
+	 * Callback function pointer to in-kernel devices that can tell us the
+	 * state of the input level of mapped level-triggered IRQ faster than
+	 * peaking into the physical GIC.
+	 *
+	 * Always called in non-preemptible section and the functions can use
+	 * kvm_arm_get_running_vcpu() to get the vcpu pointer for private
+	 * IRQs.
+	 */
+	bool (*get_input_level)(int vintid);
+
 	void *owner;			/* Opaque pointer to reserve an interrupt
 					   for in-kernel devices. */
 };
@@ -331,7 +342,7 @@ void kvm_vgic_init_cpu_hardware(void);
 int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
 			bool level, void *owner);
 int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
-			  u32 vintid);
+			  u32 vintid, bool (*get_input_level)(int vindid));
 int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid);
 bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid);
 
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 9376fe03bf2e9..fb0533ed903d4 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -834,7 +834,8 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
 		return -EINVAL;
 	}
 
-	ret = kvm_vgic_map_phys_irq(vcpu, host_vtimer_irq, vtimer->irq.irq);
+	ret = kvm_vgic_map_phys_irq(vcpu, host_vtimer_irq, vtimer->irq.irq,
+				    NULL);
 	if (ret)
 		return ret;
 
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 4318e9edd075d..d57b3bf8eb362 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -144,13 +144,15 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
 	kfree(irq);
 }
 
-/* Get the input level of a mapped IRQ directly from the physical GIC */
 bool vgic_get_phys_line_level(struct vgic_irq *irq)
 {
 	bool line_level;
 
 	BUG_ON(!irq->hw);
 
+	if (irq->get_input_level)
+		return irq->get_input_level(irq->intid);
+
 	WARN_ON(irq_get_irqchip_state(irq->host_irq,
 				      IRQCHIP_STATE_PENDING,
 				      &line_level));
@@ -436,7 +438,8 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
 
 /* @irq->irq_lock must be held */
 static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
-			    unsigned int host_irq)
+			    unsigned int host_irq,
+			    bool (*get_input_level)(int vindid))
 {
 	struct irq_desc *desc;
 	struct irq_data *data;
@@ -456,6 +459,7 @@ static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
 	irq->hw = true;
 	irq->host_irq = host_irq;
 	irq->hwintid = data->hwirq;
+	irq->get_input_level = get_input_level;
 	return 0;
 }
 
@@ -464,10 +468,11 @@ static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq)
 {
 	irq->hw = false;
 	irq->hwintid = 0;
+	irq->get_input_level = NULL;
 }
 
 int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
-			  u32 vintid)
+			  u32 vintid, bool (*get_input_level)(int vindid))
 {
 	struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
 	unsigned long flags;
@@ -476,7 +481,7 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
 	BUG_ON(!irq);
 
 	spin_lock_irqsave(&irq->irq_lock, flags);
-	ret = kvm_vgic_map_irq(vcpu, irq, host_irq);
+	ret = kvm_vgic_map_irq(vcpu, irq, host_irq, get_input_level);
 	spin_unlock_irqrestore(&irq->irq_lock, flags);
 	vgic_put_irq(vcpu->kvm, irq);
 

From df635c5b184da05145db1d63e1dcb1c853f425c2 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Fri, 1 Sep 2017 16:25:12 +0200
Subject: [PATCH 090/676] KVM: arm/arm64: Support VGIC dist pend/active changes
 for mapped IRQs

For mapped IRQs (with the HW bit set in the LR) we have to follow some
rules of the architecture.  One of these rules is that VM must not be
allowed to deactivate a virtual interrupt with the HW bit set unless the
physical interrupt is also active.

This works fine when injecting mapped interrupts, because we leave it up
to the injector to either set EOImode==1 or manually set the active
state of the physical interrupt.

However, the guest can set virtual interrupt to be pending or active by
writing to the virtual distributor, which could lead to deactivating a
virtual interrupt with the HW bit set without the physical interrupt
being active.

We could set the physical interrupt to active whenever we are about to
enter the VM with a HW interrupt either pending or active, but that
would be really slow, especially on GICv2.  So we take the long way
around and do the hard work when needed, which is expected to be
extremely rare.

When the VM sets the pending state for a HW interrupt on the virtual
distributor we set the active state on the physical distributor, because
the virtual interrupt can become active and then the guest can
deactivate it.

When the VM clears the pending state we also clear it on the physical
side, because the injector might otherwise raise the interrupt.  We also
clear the physical active state when the virtual interrupt is not
active, since otherwise a SPEND/CPEND sequence from the guest would
prevent signaling of future interrupts.

Changing the state of mapped interrupts from userspace is not supported,
and it's expected that userspace unmaps devices from VFIO before
attempting to set the interrupt state, because the interrupt state is
driven by hardware.

Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 virt/kvm/arm/vgic/vgic-mmio.c | 71 ++++++++++++++++++++++++++++++++---
 virt/kvm/arm/vgic/vgic.c      |  7 ++++
 virt/kvm/arm/vgic/vgic.h      |  1 +
 3 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index fdad95f62fa3b..83d82bd7dc4e7 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -16,6 +16,7 @@
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <kvm/iodev.h>
+#include <kvm/arm_arch_timer.h>
 #include <kvm/arm_vgic.h>
 
 #include "vgic.h"
@@ -143,10 +144,22 @@ static struct kvm_vcpu *vgic_get_mmio_requester_vcpu(void)
 	return vcpu;
 }
 
+/* Must be called with irq->irq_lock held */
+static void vgic_hw_irq_spending(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
+				 bool is_uaccess)
+{
+	if (is_uaccess)
+		return;
+
+	irq->pending_latch = true;
+	vgic_irq_set_phys_active(irq, true);
+}
+
 void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
 			      gpa_t addr, unsigned int len,
 			      unsigned long val)
 {
+	bool is_uaccess = !vgic_get_mmio_requester_vcpu();
 	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
 	int i;
 	unsigned long flags;
@@ -155,17 +168,45 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
 		struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
 
 		spin_lock_irqsave(&irq->irq_lock, flags);
-		irq->pending_latch = true;
-
+		if (irq->hw)
+			vgic_hw_irq_spending(vcpu, irq, is_uaccess);
+		else
+			irq->pending_latch = true;
 		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
 		vgic_put_irq(vcpu->kvm, irq);
 	}
 }
 
+/* Must be called with irq->irq_lock held */
+static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
+				 bool is_uaccess)
+{
+	if (is_uaccess)
+		return;
+
+	irq->pending_latch = false;
+
+	/*
+	 * We don't want the guest to effectively mask the physical
+	 * interrupt by doing a write to SPENDR followed by a write to
+	 * CPENDR for HW interrupts, so we clear the active state on
+	 * the physical side if the virtual interrupt is not active.
+	 * This may lead to taking an additional interrupt on the
+	 * host, but that should not be a problem as the worst that
+	 * can happen is an additional vgic injection.  We also clear
+	 * the pending state to maintain proper semantics for edge HW
+	 * interrupts.
+	 */
+	vgic_irq_set_phys_pending(irq, false);
+	if (!irq->active)
+		vgic_irq_set_phys_active(irq, false);
+}
+
 void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
 			      gpa_t addr, unsigned int len,
 			      unsigned long val)
 {
+	bool is_uaccess = !vgic_get_mmio_requester_vcpu();
 	u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
 	int i;
 	unsigned long flags;
@@ -175,7 +216,10 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
 
 		spin_lock_irqsave(&irq->irq_lock, flags);
 
-		irq->pending_latch = false;
+		if (irq->hw)
+			vgic_hw_irq_cpending(vcpu, irq, is_uaccess);
+		else
+			irq->pending_latch = false;
 
 		spin_unlock_irqrestore(&irq->irq_lock, flags);
 		vgic_put_irq(vcpu->kvm, irq);
@@ -202,8 +246,19 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
 	return value;
 }
 
+/* Must be called with irq->irq_lock held */
+static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
+				      bool active, bool is_uaccess)
+{
+	if (is_uaccess)
+		return;
+
+	irq->active = active;
+	vgic_irq_set_phys_active(irq, active);
+}
+
 static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
-				    bool new_active_state)
+				    bool active)
 {
 	unsigned long flags;
 	struct kvm_vcpu *requester_vcpu = vgic_get_mmio_requester_vcpu();
@@ -231,8 +286,12 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
 	       irq->vcpu->cpu != -1) /* VCPU thread is running */
 		cond_resched_lock(&irq->irq_lock);
 
-	irq->active = new_active_state;
-	if (new_active_state)
+	if (irq->hw)
+		vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu);
+	else
+		irq->active = active;
+
+	if (irq->active)
 		vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
 	else
 		spin_unlock_irqrestore(&irq->irq_lock, flags);
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index d57b3bf8eb362..c7c5ef190afa0 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -144,6 +144,13 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
 	kfree(irq);
 }
 
+void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending)
+{
+	WARN_ON(irq_set_irqchip_state(irq->host_irq,
+				      IRQCHIP_STATE_PENDING,
+				      pending));
+}
+
 bool vgic_get_phys_line_level(struct vgic_irq *irq)
 {
 	bool line_level;
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index d0787983a3573..12c37b89f7a38 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -146,6 +146,7 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
 			      u32 intid);
 void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
 bool vgic_get_phys_line_level(struct vgic_irq *irq);
+void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending);
 void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
 bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
 			   unsigned long flags);

From 4c60e360d6dfa4d9c3586b687f348eeb3fd675dd Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Fri, 27 Oct 2017 19:34:30 +0200
Subject: [PATCH 091/676] KVM: arm/arm64: Provide a get_input_level for the
 arch timer

The VGIC can now support the life-cycle of mapped level-triggered
interrupts, and we no longer have to read back the timer state on every
exit from the VM if we had an asserted timer interrupt signal, because
the VGIC already knows if we hit the unlikely case where the guest
disables the timer without ACKing the virtual timer interrupt.

This means we rework a bit of the code to factor out the functionality
to snapshot the timer state from vtimer_save_state(), and we can reuse
this functionality in the sync path when we have an irqchip in
userspace, and also to support our implementation of the
get_input_level() function for the timer.

This change also means that we can no longer rely on the timer's view of
the interrupt line to set the active state, because we no longer
maintain this state for mapped interrupts when exiting from the guest.
Instead, we only set the active state if the virtual interrupt is
active, and otherwise we simply let the timer fire again and raise the
virtual interrupt from the ISR.

Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 include/kvm/arm_arch_timer.h |  2 +
 virt/kvm/arm/arch_timer.c    | 84 ++++++++++++++++--------------------
 2 files changed, 40 insertions(+), 46 deletions(-)

diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index 6e45608b23998..b1dcfde0a3ef1 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -90,6 +90,8 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu);
 
 void kvm_timer_init_vhe(void);
 
+bool kvm_arch_timer_get_input_level(int vintid);
+
 #define vcpu_vtimer(v)	(&(v)->arch.timer_cpu.vtimer)
 #define vcpu_ptimer(v)	(&(v)->arch.timer_cpu.ptimer)
 
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index fb0533ed903d4..d845d67b7062f 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -343,6 +343,12 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
 	phys_timer_emulate(vcpu);
 }
 
+static void __timer_snapshot_state(struct arch_timer_context *timer)
+{
+	timer->cnt_ctl = read_sysreg_el0(cntv_ctl);
+	timer->cnt_cval = read_sysreg_el0(cntv_cval);
+}
+
 static void vtimer_save_state(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
@@ -354,10 +360,8 @@ static void vtimer_save_state(struct kvm_vcpu *vcpu)
 	if (!vtimer->loaded)
 		goto out;
 
-	if (timer->enabled) {
-		vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
-		vtimer->cnt_cval = read_sysreg_el0(cntv_cval);
-	}
+	if (timer->enabled)
+		__timer_snapshot_state(vtimer);
 
 	/* Disable the virtual timer */
 	write_sysreg_el0(0, cntv_ctl);
@@ -454,8 +458,7 @@ static void kvm_timer_vcpu_load_vgic(struct kvm_vcpu *vcpu)
 	bool phys_active;
 	int ret;
 
-	phys_active = vtimer->irq.level ||
-		      kvm_vgic_map_is_active(vcpu, vtimer->irq.irq);
+	phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq);
 
 	ret = irq_set_irqchip_state(host_vtimer_irq,
 				    IRQCHIP_STATE_ACTIVE,
@@ -535,54 +538,27 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
 	set_cntvoff(0);
 }
 
-static void unmask_vtimer_irq(struct kvm_vcpu *vcpu)
+/*
+ * With a userspace irqchip we have to check if the guest de-asserted the
+ * timer and if so, unmask the timer irq signal on the host interrupt
+ * controller to ensure that we see future timer signals.
+ */
+static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 
 	if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
-		kvm_vtimer_update_mask_user(vcpu);
-		return;
-	}
-
-	/*
-	 * If the guest disabled the timer without acking the interrupt, then
-	 * we must make sure the physical and virtual active states are in
-	 * sync by deactivating the physical interrupt, because otherwise we
-	 * wouldn't see the next timer interrupt in the host.
-	 */
-	if (!kvm_vgic_map_is_active(vcpu, vtimer->irq.irq)) {
-		int ret;
-		ret = irq_set_irqchip_state(host_vtimer_irq,
-					    IRQCHIP_STATE_ACTIVE,
-					    false);
-		WARN_ON(ret);
+		__timer_snapshot_state(vtimer);
+		if (!kvm_timer_should_fire(vtimer)) {
+			kvm_timer_update_irq(vcpu, false, vtimer);
+			kvm_vtimer_update_mask_user(vcpu);
+		}
 	}
 }
 
-/**
- * kvm_timer_sync_hwstate - sync timer state from cpu
- * @vcpu: The vcpu pointer
- *
- * Check if any of the timers have expired while we were running in the guest,
- * and inject an interrupt if that was the case.
- */
 void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
 {
-	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-
-	/*
-	 * If we entered the guest with the vtimer output asserted we have to
-	 * check if the guest has modified the timer so that we should lower
-	 * the line at this point.
-	 */
-	if (vtimer->irq.level) {
-		vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
-		vtimer->cnt_cval = read_sysreg_el0(cntv_cval);
-		if (!kvm_timer_should_fire(vtimer)) {
-			kvm_timer_update_irq(vcpu, false, vtimer);
-			unmask_vtimer_irq(vcpu);
-		}
-	}
+	unmask_vtimer_irq_user(vcpu);
 }
 
 int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -813,6 +789,22 @@ static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu)
 	return true;
 }
 
+bool kvm_arch_timer_get_input_level(int vintid)
+{
+	struct kvm_vcpu *vcpu = kvm_arm_get_running_vcpu();
+	struct arch_timer_context *timer;
+
+	if (vintid == vcpu_vtimer(vcpu)->irq.irq)
+		timer = vcpu_vtimer(vcpu);
+	else
+		BUG(); /* We only map the vtimer so far */
+
+	if (timer->loaded)
+		__timer_snapshot_state(timer);
+
+	return kvm_timer_should_fire(timer);
+}
+
 int kvm_timer_enable(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
@@ -835,7 +827,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
 	}
 
 	ret = kvm_vgic_map_phys_irq(vcpu, host_vtimer_irq, vtimer->irq.irq,
-				    NULL);
+				    kvm_arch_timer_get_input_level);
 	if (ret)
 		return ret;
 

From 61bbe38027334780964e4b2658f722ed0b3cb2f9 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Fri, 27 Oct 2017 19:57:51 +0200
Subject: [PATCH 092/676] KVM: arm/arm64: Avoid work when userspace iqchips are
 not used

We currently check if the VM has a userspace irqchip in several places
along the critical path, and if so, we do some work which is only
required for having an irqchip in userspace.  This is unfortunate, as we
could avoid doing any work entirely, if we didn't have to support
irqchip in userspace.

Realizing the userspace irqchip on ARM is mostly a developer or hobby
feature, and is unlikely to be used in servers or other scenarios where
performance is a priority, we can use a refcounted static key to only
check the irqchip configuration when we have at least one VM that uses
an irqchip in userspace.

Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/kvm_host.h   |  2 ++
 arch/arm64/include/asm/kvm_host.h |  2 ++
 virt/kvm/arm/arch_timer.c         |  6 ++--
 virt/kvm/arm/arm.c                | 59 ++++++++++++++++++++++---------
 4 files changed, 50 insertions(+), 19 deletions(-)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index a9f7d3f47134a..6394fb99da7f0 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -48,6 +48,8 @@
 	KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_IRQ_PENDING	KVM_ARCH_REQ(1)
 
+DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
+
 u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode);
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index ea6cb5b24258b..e7218cf7df2a2 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -47,6 +47,8 @@
 	KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
 #define KVM_REQ_IRQ_PENDING	KVM_ARCH_REQ(1)
 
+DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
+
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
 int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext);
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index d845d67b7062f..cfcd0323deabe 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -103,7 +103,8 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
 	if (kvm_timer_irq_can_fire(vtimer))
 		kvm_timer_update_irq(vcpu, true, vtimer);
 
-	if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
+	if (static_branch_unlikely(&userspace_irqchip_in_use) &&
+	    unlikely(!irqchip_in_kernel(vcpu->kvm)))
 		kvm_vtimer_update_mask_user(vcpu);
 
 	return IRQ_HANDLED;
@@ -284,7 +285,8 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
 	trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq,
 				   timer_ctx->irq.level);
 
-	if (likely(irqchip_in_kernel(vcpu->kvm))) {
+	if (!static_branch_unlikely(&userspace_irqchip_in_use) &&
+	    likely(irqchip_in_kernel(vcpu->kvm))) {
 		ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
 					  timer_ctx->irq.irq,
 					  timer_ctx->irq.level,
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 3610e132df8bd..59d8e04c19fa2 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -74,6 +74,8 @@ static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
 	__this_cpu_write(kvm_arm_running_vcpu, vcpu);
 }
 
+DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
+
 /**
  * kvm_arm_get_running_vcpu - get the vcpu running on the current CPU.
  * Must be called from non-preemptible context
@@ -302,6 +304,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
+	if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
+		static_branch_dec(&userspace_irqchip_in_use);
 	kvm_arch_vcpu_free(vcpu);
 }
 
@@ -522,14 +526,22 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.has_run_once = true;
 
-	/*
-	 * Map the VGIC hardware resources before running a vcpu the first
-	 * time on this VM.
-	 */
-	if (unlikely(irqchip_in_kernel(kvm) && !vgic_ready(kvm))) {
-		ret = kvm_vgic_map_resources(kvm);
-		if (ret)
-			return ret;
+	if (likely(irqchip_in_kernel(kvm))) {
+		/*
+		 * Map the VGIC hardware resources before running a vcpu the
+		 * first time on this VM.
+		 */
+		if (unlikely(!vgic_ready(kvm))) {
+			ret = kvm_vgic_map_resources(kvm);
+			if (ret)
+				return ret;
+		}
+	} else {
+		/*
+		 * Tell the rest of the code that there are userspace irqchip
+		 * VMs in the wild.
+		 */
+		static_branch_inc(&userspace_irqchip_in_use);
 	}
 
 	ret = kvm_timer_enable(vcpu);
@@ -664,18 +676,29 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		kvm_vgic_flush_hwstate(vcpu);
 
 		/*
-		 * If we have a singal pending, or need to notify a userspace
-		 * irqchip about timer or PMU level changes, then we exit (and
-		 * update the timer level state in kvm_timer_update_run
-		 * below).
+		 * Exit if we have a signal pending so that we can deliver the
+		 * signal to user space.
 		 */
-		if (signal_pending(current) ||
-		    kvm_timer_should_notify_user(vcpu) ||
-		    kvm_pmu_should_notify_user(vcpu)) {
+		if (signal_pending(current)) {
 			ret = -EINTR;
 			run->exit_reason = KVM_EXIT_INTR;
 		}
 
+		/*
+		 * If we're using a userspace irqchip, then check if we need
+		 * to tell a userspace irqchip about timer or PMU level
+		 * changes and if so, exit to userspace (the actual level
+		 * state gets updated in kvm_timer_update_run and
+		 * kvm_pmu_update_run below).
+		 */
+		if (static_branch_unlikely(&userspace_irqchip_in_use)) {
+			if (kvm_timer_should_notify_user(vcpu) ||
+			    kvm_pmu_should_notify_user(vcpu)) {
+				ret = -EINTR;
+				run->exit_reason = KVM_EXIT_INTR;
+			}
+		}
+
 		/*
 		 * Ensure we set mode to IN_GUEST_MODE after we disable
 		 * interrupts and before the final VCPU requests check.
@@ -688,7 +711,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		    kvm_request_pending(vcpu)) {
 			vcpu->mode = OUTSIDE_GUEST_MODE;
 			kvm_pmu_sync_hwstate(vcpu);
-			kvm_timer_sync_hwstate(vcpu);
+			if (static_branch_unlikely(&userspace_irqchip_in_use))
+				kvm_timer_sync_hwstate(vcpu);
 			kvm_vgic_sync_hwstate(vcpu);
 			local_irq_enable();
 			preempt_enable();
@@ -732,7 +756,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		 * we don't want vtimer interrupts to race with syncing the
 		 * timer virtual interrupt state.
 		 */
-		kvm_timer_sync_hwstate(vcpu);
+		if (static_branch_unlikely(&userspace_irqchip_in_use))
+			kvm_timer_sync_hwstate(vcpu);
 
 		/*
 		 * We may have taken a host interrupt in HYP mode (ie

From e5a2cfb492d5f25f96fa06170dba7494355c5969 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Tue, 12 Dec 2017 21:37:21 +0100
Subject: [PATCH 093/676] KVM: arm/arm64: Delete outdated forwarded irq
 documentation

The reason I added this documentation originally was that the concept of
"never taking the interrupt", but just use the timer to generate an exit
from the guest, was confusing to most, and we had to explain it several
times over.  But as we can clearly see, we've failed to update the
documentation as the code has evolved, and people who need to understand
these details are probably better off reading the code.

Let's lighten our maintenance burden slightly and just get rid of this.

Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 .../virtual/kvm/arm/vgic-mapped-irqs.txt      | 187 ------------------
 1 file changed, 187 deletions(-)
 delete mode 100644 Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt

diff --git a/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt b/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt
deleted file mode 100644
index 38bca2835278c..0000000000000
--- a/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt
+++ /dev/null
@@ -1,187 +0,0 @@
-KVM/ARM VGIC Forwarded Physical Interrupts
-==========================================
-
-The KVM/ARM code implements software support for the ARM Generic
-Interrupt Controller's (GIC's) hardware support for virtualization by
-allowing software to inject virtual interrupts to a VM, which the guest
-OS sees as regular interrupts.  The code is famously known as the VGIC.
-
-Some of these virtual interrupts, however, correspond to physical
-interrupts from real physical devices.  One example could be the
-architected timer, which itself supports virtualization, and therefore
-lets a guest OS program the hardware device directly to raise an
-interrupt at some point in time.  When such an interrupt is raised, the
-host OS initially handles the interrupt and must somehow signal this
-event as a virtual interrupt to the guest.  Another example could be a
-passthrough device, where the physical interrupts are initially handled
-by the host, but the device driver for the device lives in the guest OS
-and KVM must therefore somehow inject a virtual interrupt on behalf of
-the physical one to the guest OS.
-
-These virtual interrupts corresponding to a physical interrupt on the
-host are called forwarded physical interrupts, but are also sometimes
-referred to as 'virtualized physical interrupts' and 'mapped interrupts'.
-
-Forwarded physical interrupts are handled slightly differently compared
-to virtual interrupts generated purely by a software emulated device.
-
-
-The HW bit
-----------
-Virtual interrupts are signalled to the guest by programming the List
-Registers (LRs) on the GIC before running a VCPU.  The LR is programmed
-with the virtual IRQ number and the state of the interrupt (Pending,
-Active, or Pending+Active).  When the guest ACKs and EOIs a virtual
-interrupt, the LR state moves from Pending to Active, and finally to
-inactive.
-
-The LRs include an extra bit, called the HW bit.  When this bit is set,
-KVM must also program an additional field in the LR, the physical IRQ
-number, to link the virtual with the physical IRQ.
-
-When the HW bit is set, KVM must EITHER set the Pending OR the Active
-bit, never both at the same time.
-
-Setting the HW bit causes the hardware to deactivate the physical
-interrupt on the physical distributor when the guest deactivates the
-corresponding virtual interrupt.
-
-
-Forwarded Physical Interrupts Life Cycle
-----------------------------------------
-
-The state of forwarded physical interrupts is managed in the following way:
-
-  - The physical interrupt is acked by the host, and becomes active on
-    the physical distributor (*).
-  - KVM sets the LR.Pending bit, because this is the only way the GICV
-    interface is going to present it to the guest.
-  - LR.Pending will stay set as long as the guest has not acked the interrupt.
-  - LR.Pending transitions to LR.Active on the guest read of the IAR, as
-    expected.
-  - On guest EOI, the *physical distributor* active bit gets cleared,
-    but the LR.Active is left untouched (set).
-  - KVM clears the LR on VM exits when the physical distributor
-    active state has been cleared.
-
-(*): The host handling is slightly more complicated.  For some forwarded
-interrupts (shared), KVM directly sets the active state on the physical
-distributor before entering the guest, because the interrupt is never actually
-handled on the host (see details on the timer as an example below).  For other
-forwarded interrupts (non-shared) the host does not deactivate the interrupt
-when the host ISR completes, but leaves the interrupt active until the guest
-deactivates it.  Leaving the interrupt active is allowed, because Linux
-configures the physical GIC with EOIMode=1, which causes EOI operations to
-perform a priority drop allowing the GIC to receive other interrupts of the
-default priority.
-
-
-Forwarded Edge and Level Triggered PPIs and SPIs
-------------------------------------------------
-Forwarded physical interrupts injected should always be active on the
-physical distributor when injected to a guest.
-
-Level-triggered interrupts will keep the interrupt line to the GIC
-asserted, typically until the guest programs the device to deassert the
-line.  This means that the interrupt will remain pending on the physical
-distributor until the guest has reprogrammed the device.  Since we
-always run the VM with interrupts enabled on the CPU, a pending
-interrupt will exit the guest as soon as we switch into the guest,
-preventing the guest from ever making progress as the process repeats
-over and over.  Therefore, the active state on the physical distributor
-must be set when entering the guest, preventing the GIC from forwarding
-the pending interrupt to the CPU.  As soon as the guest deactivates the
-interrupt, the physical line is sampled by the hardware again and the host
-takes a new interrupt if and only if the physical line is still asserted.
-
-Edge-triggered interrupts do not exhibit the same problem with
-preventing guest execution that level-triggered interrupts do.  One
-option is to not use HW bit at all, and inject edge-triggered interrupts
-from a physical device as pure virtual interrupts.  But that would
-potentially slow down handling of the interrupt in the guest, because a
-physical interrupt occurring in the middle of the guest ISR would
-preempt the guest for the host to handle the interrupt.  Additionally,
-if you configure the system to handle interrupts on a separate physical
-core from that running your VCPU, you still have to interrupt the VCPU
-to queue the pending state onto the LR, even though the guest won't use
-this information until the guest ISR completes.  Therefore, the HW
-bit should always be set for forwarded edge-triggered interrupts.  With
-the HW bit set, the virtual interrupt is injected and additional
-physical interrupts occurring before the guest deactivates the interrupt
-simply mark the state on the physical distributor as Pending+Active.  As
-soon as the guest deactivates the interrupt, the host takes another
-interrupt if and only if there was a physical interrupt between injecting
-the forwarded interrupt to the guest and the guest deactivating the
-interrupt.
-
-Consequently, whenever we schedule a VCPU with one or more LRs with the
-HW bit set, the interrupt must also be active on the physical
-distributor.
-
-
-Forwarded LPIs
---------------
-LPIs, introduced in GICv3, are always edge-triggered and do not have an
-active state.  They become pending when a device signal them, and as
-soon as they are acked by the CPU, they are inactive again.
-
-It therefore doesn't make sense, and is not supported, to set the HW bit
-for physical LPIs that are forwarded to a VM as virtual interrupts,
-typically virtual SPIs.
-
-For LPIs, there is no other choice than to preempt the VCPU thread if
-necessary, and queue the pending state onto the LR.
-
-
-Putting It Together: The Architected Timer
-------------------------------------------
-The architected timer is a device that signals interrupts with level
-triggered semantics.  The timer hardware is directly accessed by VCPUs
-which program the timer to fire at some point in time.  Each VCPU on a
-system programs the timer to fire at different times, and therefore the
-hardware is multiplexed between multiple VCPUs.  This is implemented by
-context-switching the timer state along with each VCPU thread.
-
-However, this means that a scenario like the following is entirely
-possible, and in fact, typical:
-
-1.  KVM runs the VCPU
-2.  The guest programs the time to fire in T+100
-3.  The guest is idle and calls WFI (wait-for-interrupts)
-4.  The hardware traps to the host
-5.  KVM stores the timer state to memory and disables the hardware timer
-6.  KVM schedules a soft timer to fire in T+(100 - time since step 2)
-7.  KVM puts the VCPU thread to sleep (on a waitqueue)
-8.  The soft timer fires, waking up the VCPU thread
-9.  KVM reprograms the timer hardware with the VCPU's values
-10. KVM marks the timer interrupt as active on the physical distributor
-11. KVM injects a forwarded physical interrupt to the guest
-12. KVM runs the VCPU
-
-Notice that KVM injects a forwarded physical interrupt in step 11 without
-the corresponding interrupt having actually fired on the host.  That is
-exactly why we mark the timer interrupt as active in step 10, because
-the active state on the physical distributor is part of the state
-belonging to the timer hardware, which is context-switched along with
-the VCPU thread.
-
-If the guest does not idle because it is busy, the flow looks like this
-instead:
-
-1.  KVM runs the VCPU
-2.  The guest programs the time to fire in T+100
-4.  At T+100 the timer fires and a physical IRQ causes the VM to exit
-    (note that this initially only traps to EL2 and does not run the host ISR
-    until KVM has returned to the host).
-5.  With interrupts still disabled on the CPU coming back from the guest, KVM
-    stores the virtual timer state to memory and disables the virtual hw timer.
-6.  KVM looks at the timer state (in memory) and injects a forwarded physical
-    interrupt because it concludes the timer has expired.
-7.  KVM marks the timer interrupt as active on the physical distributor
-7.  KVM enables the timer, enables interrupts, and runs the VCPU
-
-Notice that again the forwarded physical interrupt is injected to the
-guest without having actually been handled on the host.  In this case it
-is because the physical interrupt is never actually seen by the host because the
-timer is disabled upon guest return, and the virtual forwarded interrupt is
-injected on the KVM guest entry path.

From f3721c70fc2b1216db26ce8aabf23b4d7cc4a0a6 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Mon, 8 Jan 2018 15:19:31 +0100
Subject: [PATCH 094/676] Revert "arm64: KVM: Hide PMU from guests when
 disabled"

Commit 0c0543a128bd1c6a4c8610d0d9d869053fa2fbf5 breaks migration and
introduces a regression with existing userspace because it introduces an
ordering requirement of setting up all VCPU features before writing ID
registers which we didn't have before.

Revert this commit for now until we have a proper fix.

Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm64/kvm/sys_regs.c | 35 ++++++++++++++---------------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 503144f13af02..1830ebc227d18 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -881,25 +881,18 @@ static bool access_cntp_cval(struct kvm_vcpu *vcpu,
 }
 
 /* Read a sanitised cpufeature ID register by sys_reg_desc */
-static u64 read_id_reg(struct kvm_vcpu *vcpu,
-		       struct sys_reg_desc const *r,
-		       bool raz)
+static u64 read_id_reg(struct sys_reg_desc const *r, bool raz)
 {
 	u32 id = sys_reg((u32)r->Op0, (u32)r->Op1,
 			 (u32)r->CRn, (u32)r->CRm, (u32)r->Op2);
 	u64 val = raz ? 0 : read_sanitised_ftr_reg(id);
 
-	switch (id) {
-	case SYS_ID_AA64DFR0_EL1:
-		if (!kvm_arm_pmu_v3_ready(vcpu))
-			val &= ~(0xfUL << ID_AA64DFR0_PMUVER_SHIFT);
-		break;
-	case SYS_ID_AA64PFR0_EL1:
+	if (id == SYS_ID_AA64PFR0_EL1) {
 		if (val & (0xfUL << ID_AA64PFR0_SVE_SHIFT))
 			pr_err_once("kvm [%i]: SVE unsupported for guests, suppressing\n",
 				    task_pid_nr(current));
+
 		val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT);
-		break;
 	}
 
 	return val;
@@ -915,7 +908,7 @@ static bool __access_id_reg(struct kvm_vcpu *vcpu,
 	if (p->is_write)
 		return write_to_read_only(vcpu, p, r);
 
-	p->regval = read_id_reg(vcpu, r, raz);
+	p->regval = read_id_reg(r, raz);
 	return true;
 }
 
@@ -944,17 +937,17 @@ static u64 sys_reg_to_index(const struct sys_reg_desc *reg);
  * are stored, and for set_id_reg() we don't allow the effective value
  * to be changed.
  */
-static int __get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
-			void __user *uaddr, bool raz)
+static int __get_id_reg(const struct sys_reg_desc *rd, void __user *uaddr,
+			bool raz)
 {
 	const u64 id = sys_reg_to_index(rd);
-	const u64 val = read_id_reg(vcpu, rd, raz);
+	const u64 val = read_id_reg(rd, raz);
 
 	return reg_to_user(uaddr, &val, id);
 }
 
-static int __set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
-			void __user *uaddr, bool raz)
+static int __set_id_reg(const struct sys_reg_desc *rd, void __user *uaddr,
+			bool raz)
 {
 	const u64 id = sys_reg_to_index(rd);
 	int err;
@@ -965,7 +958,7 @@ static int __set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
 		return err;
 
 	/* This is what we mean by invariant: you can't change it. */
-	if (val != read_id_reg(vcpu, rd, raz))
+	if (val != read_id_reg(rd, raz))
 		return -EINVAL;
 
 	return 0;
@@ -974,25 +967,25 @@ static int __set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
 static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
 		      const struct kvm_one_reg *reg, void __user *uaddr)
 {
-	return __get_id_reg(vcpu, rd, uaddr, false);
+	return __get_id_reg(rd, uaddr, false);
 }
 
 static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
 		      const struct kvm_one_reg *reg, void __user *uaddr)
 {
-	return __set_id_reg(vcpu, rd, uaddr, false);
+	return __set_id_reg(rd, uaddr, false);
 }
 
 static int get_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
 			  const struct kvm_one_reg *reg, void __user *uaddr)
 {
-	return __get_id_reg(vcpu, rd, uaddr, true);
+	return __get_id_reg(rd, uaddr, true);
 }
 
 static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
 			  const struct kvm_one_reg *reg, void __user *uaddr)
 {
-	return __set_id_reg(vcpu, rd, uaddr, true);
+	return __set_id_reg(rd, uaddr, true);
 }
 
 /* sys_reg_desc initialiser for known cpufeature ID registers */

From d68119864ef4b253a585a1c897cda6936d4b5de9 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 23 Oct 2017 17:11:14 +0100
Subject: [PATCH 095/676] KVM: arm/arm64: Detangle kvm_mmu.h from kvm_hyp.h

kvm_hyp.h has an odd dependency on kvm_mmu.h, which makes the
opposite inclusion impossible. Let's start with breaking that
useless dependency.

Acked-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/kvm_hyp.h   | 1 -
 arch/arm/kvm/hyp/switch.c        | 1 +
 arch/arm/kvm/hyp/tlb.c           | 1 +
 arch/arm64/include/asm/kvm_hyp.h | 1 -
 arch/arm64/kvm/hyp/debug-sr.c    | 1 +
 arch/arm64/kvm/hyp/switch.c      | 1 +
 arch/arm64/kvm/hyp/tlb.c         | 1 +
 virt/kvm/arm/hyp/vgic-v2-sr.c    | 1 +
 8 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h
index ab20ffa8b9e76..76368de7237ba 100644
--- a/arch/arm/include/asm/kvm_hyp.h
+++ b/arch/arm/include/asm/kvm_hyp.h
@@ -21,7 +21,6 @@
 #include <linux/compiler.h>
 #include <linux/kvm_host.h>
 #include <asm/cp15.h>
-#include <asm/kvm_mmu.h>
 #include <asm/vfp.h>
 
 #define __hyp_text __section(.hyp.text) notrace
diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c
index 330c9ce34ba5f..ae45ae96aac28 100644
--- a/arch/arm/kvm/hyp/switch.c
+++ b/arch/arm/kvm/hyp/switch.c
@@ -18,6 +18,7 @@
 
 #include <asm/kvm_asm.h>
 #include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
 
 __asm__(".arch_extension     virt");
 
diff --git a/arch/arm/kvm/hyp/tlb.c b/arch/arm/kvm/hyp/tlb.c
index 6d810af2d9fd7..c0edd450e1045 100644
--- a/arch/arm/kvm/hyp/tlb.c
+++ b/arch/arm/kvm/hyp/tlb.c
@@ -19,6 +19,7 @@
  */
 
 #include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
 
 /**
  * Flush per-VMID TLBs
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 08d3bb66c8b75..f26f9cd70c721 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -20,7 +20,6 @@
 
 #include <linux/compiler.h>
 #include <linux/kvm_host.h>
-#include <asm/kvm_mmu.h>
 #include <asm/sysreg.h>
 
 #define __hyp_text __section(.hyp.text) notrace
diff --git a/arch/arm64/kvm/hyp/debug-sr.c b/arch/arm64/kvm/hyp/debug-sr.c
index 321c9c05dd9e0..360455f863461 100644
--- a/arch/arm64/kvm/hyp/debug-sr.c
+++ b/arch/arm64/kvm/hyp/debug-sr.c
@@ -21,6 +21,7 @@
 #include <asm/debug-monitors.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
 
 #define read_debug(r,n)		read_sysreg(r##n##_el1)
 #define write_debug(v,r,n)	write_sysreg(v, r##n##_el1)
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index f7c651f3a8c0e..f3d8bed096f50 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -21,6 +21,7 @@
 #include <asm/kvm_asm.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
 #include <asm/fpsimd.h>
 #include <asm/debug-monitors.h>
 
diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c
index 73464a96c3657..131c7772703c2 100644
--- a/arch/arm64/kvm/hyp/tlb.c
+++ b/arch/arm64/kvm/hyp/tlb.c
@@ -16,6 +16,7 @@
  */
 
 #include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
 #include <asm/tlbflush.h>
 
 static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm)
diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c
index d7fd46fe9efb3..4fe6e797e8b3c 100644
--- a/virt/kvm/arm/hyp/vgic-v2-sr.c
+++ b/virt/kvm/arm/hyp/vgic-v2-sr.c
@@ -21,6 +21,7 @@
 
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
 
 static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base)
 {

From a15f693935a9f1fec8241cafaca27be4483d4464 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 23 Oct 2017 17:11:15 +0100
Subject: [PATCH 096/676] KVM: arm/arm64: Split dcache/icache flushing

As we're about to introduce opportunistic invalidation of the icache,
let's split dcache and icache flushing.

Acked-by: Christoffer Dall <cdall@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/kvm_mmu.h   | 60 ++++++++++++++++++++++----------
 arch/arm64/include/asm/kvm_mmu.h | 13 +++++--
 virt/kvm/arm/mmu.c               | 20 ++++++++---
 3 files changed, 67 insertions(+), 26 deletions(-)

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index fa6f2174276bd..9fa4b2520974d 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -126,21 +126,12 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
 	return (vcpu_cp15(vcpu, c1_SCTLR) & 0b101) == 0b101;
 }
 
-static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
-					       kvm_pfn_t pfn,
-					       unsigned long size)
+static inline void __clean_dcache_guest_page(struct kvm_vcpu *vcpu,
+					     kvm_pfn_t pfn,
+					     unsigned long size)
 {
 	/*
-	 * If we are going to insert an instruction page and the icache is
-	 * either VIPT or PIPT, there is a potential problem where the host
-	 * (or another VM) may have used the same page as this guest, and we
-	 * read incorrect data from the icache.  If we're using a PIPT cache,
-	 * we can invalidate just that page, but if we are using a VIPT cache
-	 * we need to invalidate the entire icache - damn shame - as written
-	 * in the ARM ARM (DDI 0406C.b - Page B3-1393).
-	 *
-	 * VIVT caches are tagged using both the ASID and the VMID and doesn't
-	 * need any kind of flushing (DDI 0406C.b - Page B3-1392).
+	 * Clean the dcache to the Point of Coherency.
 	 *
 	 * We need to do this through a kernel mapping (using the
 	 * user-space mapping has proved to be the wrong
@@ -155,19 +146,52 @@ static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
 
 		kvm_flush_dcache_to_poc(va, PAGE_SIZE);
 
-		if (icache_is_pipt())
-			__cpuc_coherent_user_range((unsigned long)va,
-						   (unsigned long)va + PAGE_SIZE);
-
 		size -= PAGE_SIZE;
 		pfn++;
 
 		kunmap_atomic(va);
 	}
+}
 
-	if (!icache_is_pipt() && !icache_is_vivt_asid_tagged()) {
+static inline void __invalidate_icache_guest_page(struct kvm_vcpu *vcpu,
+						  kvm_pfn_t pfn,
+						  unsigned long size)
+{
+	/*
+	 * If we are going to insert an instruction page and the icache is
+	 * either VIPT or PIPT, there is a potential problem where the host
+	 * (or another VM) may have used the same page as this guest, and we
+	 * read incorrect data from the icache.  If we're using a PIPT cache,
+	 * we can invalidate just that page, but if we are using a VIPT cache
+	 * we need to invalidate the entire icache - damn shame - as written
+	 * in the ARM ARM (DDI 0406C.b - Page B3-1393).
+	 *
+	 * VIVT caches are tagged using both the ASID and the VMID and doesn't
+	 * need any kind of flushing (DDI 0406C.b - Page B3-1392).
+	 */
+
+	VM_BUG_ON(size & ~PAGE_MASK);
+
+	if (icache_is_vivt_asid_tagged())
+		return;
+
+	if (!icache_is_pipt()) {
 		/* any kind of VIPT cache */
 		__flush_icache_all();
+		return;
+	}
+
+	/* PIPT cache. As for the d-side, use a temporary kernel mapping. */
+	while (size) {
+		void *va = kmap_atomic_pfn(pfn);
+
+		__cpuc_coherent_user_range((unsigned long)va,
+					   (unsigned long)va + PAGE_SIZE);
+
+		size -= PAGE_SIZE;
+		pfn++;
+
+		kunmap_atomic(va);
 	}
 }
 
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 672c8684d5c2a..8034b96fb3a41 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -230,19 +230,26 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
 	return (vcpu_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
 }
 
-static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
-					       kvm_pfn_t pfn,
-					       unsigned long size)
+static inline void __clean_dcache_guest_page(struct kvm_vcpu *vcpu,
+					     kvm_pfn_t pfn,
+					     unsigned long size)
 {
 	void *va = page_address(pfn_to_page(pfn));
 
 	kvm_flush_dcache_to_poc(va, size);
+}
 
+static inline void __invalidate_icache_guest_page(struct kvm_vcpu *vcpu,
+						  kvm_pfn_t pfn,
+						  unsigned long size)
+{
 	if (icache_is_aliasing()) {
 		/* any kind of VIPT cache */
 		__flush_icache_all();
 	} else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) {
 		/* PIPT or VPIPT at EL2 (see comment in __kvm_tlb_flush_vmid_ipa) */
+		void *va = page_address(pfn_to_page(pfn));
+
 		flush_icache_range((unsigned long)va,
 				   (unsigned long)va + size);
 	}
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index b36945d49986d..2174244f63177 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1257,10 +1257,16 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
 }
 
-static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
-				      unsigned long size)
+static void clean_dcache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
+				    unsigned long size)
 {
-	__coherent_cache_guest_page(vcpu, pfn, size);
+	__clean_dcache_guest_page(vcpu, pfn, size);
+}
+
+static void invalidate_icache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
+					 unsigned long size)
+{
+	__invalidate_icache_guest_page(vcpu, pfn, size);
 }
 
 static void kvm_send_hwpoison_signal(unsigned long address,
@@ -1391,7 +1397,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			new_pmd = kvm_s2pmd_mkwrite(new_pmd);
 			kvm_set_pfn_dirty(pfn);
 		}
-		coherent_cache_guest_page(vcpu, pfn, PMD_SIZE);
+		clean_dcache_guest_page(vcpu, pfn, PMD_SIZE);
+		invalidate_icache_guest_page(vcpu, pfn, PMD_SIZE);
+
 		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
 	} else {
 		pte_t new_pte = pfn_pte(pfn, mem_type);
@@ -1401,7 +1409,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			kvm_set_pfn_dirty(pfn);
 			mark_page_dirty(kvm, gfn);
 		}
-		coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE);
+		clean_dcache_guest_page(vcpu, pfn, PAGE_SIZE);
+		invalidate_icache_guest_page(vcpu, pfn, PAGE_SIZE);
+
 		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
 	}
 

From 4fee94736603cd6fd83c1ea1ee0388d1d2dbe11b Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 23 Oct 2017 17:11:16 +0100
Subject: [PATCH 097/676] arm64: KVM: Add invalidate_icache_range helper

We currently tightly couple dcache clean with icache invalidation,
but KVM could do without the initial flush to PoU, as we've
already flushed things to PoC.

Let's introduce invalidate_icache_range which is limited to
invalidating the icache from the linear mapping (and thus
has none of the userspace fault handling complexity), and
wire it in KVM instead of flush_icache_range.

Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm64/include/asm/assembler.h  | 21 +++++++++++++++++++
 arch/arm64/include/asm/cacheflush.h |  7 +++++++
 arch/arm64/include/asm/kvm_mmu.h    |  4 ++--
 arch/arm64/mm/cache.S               | 32 ++++++++++++++++++++---------
 4 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index aef72d8866777..0884e1fdfd303 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -387,6 +387,27 @@ alternative_endif
 	dsb	\domain
 	.endm
 
+/*
+ * Macro to perform an instruction cache maintenance for the interval
+ * [start, end)
+ *
+ * 	start, end:	virtual addresses describing the region
+ *	label:		A label to branch to on user fault.
+ * 	Corrupts:	tmp1, tmp2
+ */
+	.macro invalidate_icache_by_line start, end, tmp1, tmp2, label
+	icache_line_size \tmp1, \tmp2
+	sub	\tmp2, \tmp1, #1
+	bic	\tmp2, \start, \tmp2
+9997:
+USER(\label, ic	ivau, \tmp2)			// invalidate I line PoU
+	add	\tmp2, \tmp2, \tmp1
+	cmp	\tmp2, \end
+	b.lo	9997b
+	dsb	ish
+	isb
+	.endm
+
 /*
  * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present
  */
diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
index 955130762a3c6..bef9f418f0898 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -52,6 +52,12 @@
  *		- start  - virtual start address
  *		- end    - virtual end address
  *
+ *	invalidate_icache_range(start, end)
+ *
+ *		Invalidate the I-cache in the region described by start, end.
+ *		- start  - virtual start address
+ *		- end    - virtual end address
+ *
  *	__flush_cache_user_range(start, end)
  *
  *		Ensure coherency between the I-cache and the D-cache in the
@@ -66,6 +72,7 @@
  *		- size   - region size
  */
 extern void flush_icache_range(unsigned long start, unsigned long end);
+extern int  invalidate_icache_range(unsigned long start, unsigned long end);
 extern void __flush_dcache_area(void *addr, size_t len);
 extern void __inval_dcache_area(void *addr, size_t len);
 extern void __clean_dcache_area_poc(void *addr, size_t len);
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 8034b96fb3a41..56b3e03c85e74 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -250,8 +250,8 @@ static inline void __invalidate_icache_guest_page(struct kvm_vcpu *vcpu,
 		/* PIPT or VPIPT at EL2 (see comment in __kvm_tlb_flush_vmid_ipa) */
 		void *va = page_address(pfn_to_page(pfn));
 
-		flush_icache_range((unsigned long)va,
-				   (unsigned long)va + size);
+		invalidate_icache_range((unsigned long)va,
+					(unsigned long)va + size);
 	}
 }
 
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 7f1dbe962cf58..bedd23da83f4e 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -60,16 +60,7 @@ user_alt 9f, "dc cvau, x4",  "dc civac, x4",  ARM64_WORKAROUND_CLEAN_CACHE
 	b.lo	1b
 	dsb	ish
 
-	icache_line_size x2, x3
-	sub	x3, x2, #1
-	bic	x4, x0, x3
-1:
-USER(9f, ic	ivau, x4	)		// invalidate I line PoU
-	add	x4, x4, x2
-	cmp	x4, x1
-	b.lo	1b
-	dsb	ish
-	isb
+	invalidate_icache_by_line x0, x1, x2, x3, 9f
 	mov	x0, #0
 1:
 	uaccess_ttbr0_disable x1
@@ -80,6 +71,27 @@ USER(9f, ic	ivau, x4	)		// invalidate I line PoU
 ENDPROC(flush_icache_range)
 ENDPROC(__flush_cache_user_range)
 
+/*
+ *	invalidate_icache_range(start,end)
+ *
+ *	Ensure that the I cache is invalid within specified region.
+ *
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
+ */
+ENTRY(invalidate_icache_range)
+	uaccess_ttbr0_enable x2, x3
+
+	invalidate_icache_by_line x0, x1, x2, x3, 2f
+	mov	x0, xzr
+1:
+	uaccess_ttbr0_disable x1
+	ret
+2:
+	mov	x0, #-EFAULT
+	b	1b
+ENDPROC(invalidate_icache_range)
+
 /*
  *	__flush_dcache_area(kaddr, size)
  *

From 91c703e0382a1212d249adf34af4943a5da90d54 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 23 Oct 2017 17:11:17 +0100
Subject: [PATCH 098/676] arm: KVM: Add optimized PIPT icache flushing

Calling __cpuc_coherent_user_range to invalidate the icache on
a PIPT icache machine has some pointless overhead, as it starts
by cleaning the dcache to the PoU, while we're guaranteed to
have already cleaned it to the PoC.

As KVM is the only user of such a feature, let's implement some
ad-hoc cache flushing in kvm_mmu.h. Should it become useful to
other subsystems, it can be moved to a more global location.

Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/kvm_hyp.h |  2 ++
 arch/arm/include/asm/kvm_mmu.h | 32 +++++++++++++++++++++++++++++---
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h
index 76368de7237ba..1ab8329e9ff75 100644
--- a/arch/arm/include/asm/kvm_hyp.h
+++ b/arch/arm/include/asm/kvm_hyp.h
@@ -68,6 +68,8 @@
 #define HIFAR		__ACCESS_CP15(c6, 4, c0, 2)
 #define HPFAR		__ACCESS_CP15(c6, 4, c0, 4)
 #define ICIALLUIS	__ACCESS_CP15(c7, 0, c1, 0)
+#define BPIALLIS	__ACCESS_CP15(c7, 0, c1, 6)
+#define ICIMVAU		__ACCESS_CP15(c7, 0, c5, 1)
 #define ATS1CPR		__ACCESS_CP15(c7, 0, c8, 0)
 #define TLBIALLIS	__ACCESS_CP15(c8, 0, c3, 0)
 #define TLBIALL		__ACCESS_CP15(c8, 0, c7, 0)
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 9fa4b2520974d..bc8d21e76637f 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -37,6 +37,8 @@
 
 #include <linux/highmem.h>
 #include <asm/cacheflush.h>
+#include <asm/cputype.h>
+#include <asm/kvm_hyp.h>
 #include <asm/pgalloc.h>
 #include <asm/stage2_pgtable.h>
 
@@ -157,6 +159,8 @@ static inline void __invalidate_icache_guest_page(struct kvm_vcpu *vcpu,
 						  kvm_pfn_t pfn,
 						  unsigned long size)
 {
+	u32 iclsz;
+
 	/*
 	 * If we are going to insert an instruction page and the icache is
 	 * either VIPT or PIPT, there is a potential problem where the host
@@ -181,18 +185,40 @@ static inline void __invalidate_icache_guest_page(struct kvm_vcpu *vcpu,
 		return;
 	}
 
-	/* PIPT cache. As for the d-side, use a temporary kernel mapping. */
+	/*
+	 * CTR IminLine contains Log2 of the number of words in the
+	 * cache line, so we can get the number of words as
+	 * 2 << (IminLine - 1).  To get the number of bytes, we
+	 * multiply by 4 (the number of bytes in a 32-bit word), and
+	 * get 4 << (IminLine).
+	 */
+	iclsz = 4 << (read_cpuid(CPUID_CACHETYPE) & 0xf);
+
 	while (size) {
 		void *va = kmap_atomic_pfn(pfn);
+		void *end = va + PAGE_SIZE;
+		void *addr = va;
 
-		__cpuc_coherent_user_range((unsigned long)va,
-					   (unsigned long)va + PAGE_SIZE);
+		do {
+			write_sysreg(addr, ICIMVAU);
+			addr += iclsz;
+		} while (addr < end);
+
+		dsb(ishst);
+		isb();
 
 		size -= PAGE_SIZE;
 		pfn++;
 
 		kunmap_atomic(va);
 	}
+
+	/* Check if we need to invalidate the BTB */
+	if ((read_cpuid_ext(CPUID_EXT_MMFR1) >> 28) != 4) {
+		write_sysreg(0, BPIALLIS);
+		dsb(ishst);
+		isb();
+	}
 }
 
 static inline void __kvm_flush_dcache_pte(pte_t pte)

From fefb876b9b96fa7e4ed3d906979ea45b4cf07349 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 23 Oct 2017 17:11:18 +0100
Subject: [PATCH 099/676] arm64: KVM: PTE/PMD S2 XN bit definition

As we're about to make S2 page-tables eXecute Never by default,
add the required bits for both PMDs and PTEs.

Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm64/include/asm/pgtable-hwdef.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index eb0c2bd90de90..af035331fb096 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -177,9 +177,11 @@
  */
 #define PTE_S2_RDONLY		(_AT(pteval_t, 1) << 6)   /* HAP[2:1] */
 #define PTE_S2_RDWR		(_AT(pteval_t, 3) << 6)   /* HAP[2:1] */
+#define PTE_S2_XN		(_AT(pteval_t, 2) << 53)  /* XN[1:0] */
 
 #define PMD_S2_RDONLY		(_AT(pmdval_t, 1) << 6)   /* HAP[2:1] */
 #define PMD_S2_RDWR		(_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */
+#define PMD_S2_XN		(_AT(pmdval_t, 2) << 53)  /* XN[1:0] */
 
 /*
  * Memory Attribute override for Stage-2 (MemAttr[3:0])

From d0e22b4ac3ba23c611739f554392bf5e217df49f Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 23 Oct 2017 17:11:19 +0100
Subject: [PATCH 100/676] KVM: arm/arm64: Limit icache invalidation to prefetch
 aborts

We've so far eagerly invalidated the icache, no matter how
the page was faulted in (data or prefetch abort).

But we can easily track execution by setting the XN bits
in the S2 page tables, get the prefetch abort at HYP and
perform the icache invalidation at that time only.

As for most VMs, the instruction working set is pretty
small compared to the data set, this is likely to save
some traffic (specially as the invalidation is broadcast).

Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/kvm_mmu.h        | 12 ++++++++++++
 arch/arm/include/asm/pgtable.h        |  4 ++--
 arch/arm64/include/asm/kvm_mmu.h      | 12 ++++++++++++
 arch/arm64/include/asm/pgtable-prot.h |  4 ++--
 virt/kvm/arm/mmu.c                    | 19 +++++++++++++++----
 5 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index bc8d21e76637f..4d7a54cbb3ab5 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -85,6 +85,18 @@ static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
 	return pmd;
 }
 
+static inline pte_t kvm_s2pte_mkexec(pte_t pte)
+{
+	pte_val(pte) &= ~L_PTE_XN;
+	return pte;
+}
+
+static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd)
+{
+	pmd_val(pmd) &= ~PMD_SECT_XN;
+	return pmd;
+}
+
 static inline void kvm_set_s2pte_readonly(pte_t *pte)
 {
 	pte_val(*pte) = (pte_val(*pte) & ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY;
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index 150ece66ddf34..a757401129f95 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -102,8 +102,8 @@ extern pgprot_t		pgprot_s2_device;
 #define PAGE_HYP_EXEC		_MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY)
 #define PAGE_HYP_RO		_MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY | L_PTE_XN)
 #define PAGE_HYP_DEVICE		_MOD_PROT(pgprot_hyp_device, L_PTE_HYP)
-#define PAGE_S2			_MOD_PROT(pgprot_s2, L_PTE_S2_RDONLY)
-#define PAGE_S2_DEVICE		_MOD_PROT(pgprot_s2_device, L_PTE_S2_RDONLY)
+#define PAGE_S2			_MOD_PROT(pgprot_s2, L_PTE_S2_RDONLY | L_PTE_XN)
+#define PAGE_S2_DEVICE		_MOD_PROT(pgprot_s2_device, L_PTE_S2_RDONLY | L_PTE_XN)
 
 #define __PAGE_NONE		__pgprot(_L_PTE_DEFAULT | L_PTE_RDONLY | L_PTE_XN | L_PTE_NONE)
 #define __PAGE_SHARED		__pgprot(_L_PTE_DEFAULT | L_PTE_USER | L_PTE_XN)
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 56b3e03c85e74..1e1b20cb348f2 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -173,6 +173,18 @@ static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
 	return pmd;
 }
 
+static inline pte_t kvm_s2pte_mkexec(pte_t pte)
+{
+	pte_val(pte) &= ~PTE_S2_XN;
+	return pte;
+}
+
+static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd)
+{
+	pmd_val(pmd) &= ~PMD_S2_XN;
+	return pmd;
+}
+
 static inline void kvm_set_s2pte_readonly(pte_t *pte)
 {
 	pteval_t old_pteval, pteval;
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 0a5635fb0ef98..4e12dabd342b0 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -60,8 +60,8 @@
 #define PAGE_HYP_RO		__pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN)
 #define PAGE_HYP_DEVICE		__pgprot(PROT_DEVICE_nGnRE | PTE_HYP)
 
-#define PAGE_S2			__pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY)
-#define PAGE_S2_DEVICE		__pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_UXN)
+#define PAGE_S2			__pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY | PTE_S2_XN)
+#define PAGE_S2_DEVICE		__pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN)
 
 #define PAGE_NONE		__pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_PXN | PTE_UXN)
 #define PAGE_SHARED		__pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE)
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 2174244f63177..0417c8e2a81cb 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1292,7 +1292,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  unsigned long fault_status)
 {
 	int ret;
-	bool write_fault, writable, hugetlb = false, force_pte = false;
+	bool write_fault, exec_fault, writable, hugetlb = false, force_pte = false;
 	unsigned long mmu_seq;
 	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
 	struct kvm *kvm = vcpu->kvm;
@@ -1304,7 +1304,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	unsigned long flags = 0;
 
 	write_fault = kvm_is_write_fault(vcpu);
-	if (fault_status == FSC_PERM && !write_fault) {
+	exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
+	VM_BUG_ON(write_fault && exec_fault);
+
+	if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
 		kvm_err("Unexpected L2 read permission error\n");
 		return -EFAULT;
 	}
@@ -1398,7 +1401,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			kvm_set_pfn_dirty(pfn);
 		}
 		clean_dcache_guest_page(vcpu, pfn, PMD_SIZE);
-		invalidate_icache_guest_page(vcpu, pfn, PMD_SIZE);
+
+		if (exec_fault) {
+			new_pmd = kvm_s2pmd_mkexec(new_pmd);
+			invalidate_icache_guest_page(vcpu, pfn, PMD_SIZE);
+		}
 
 		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
 	} else {
@@ -1410,7 +1417,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			mark_page_dirty(kvm, gfn);
 		}
 		clean_dcache_guest_page(vcpu, pfn, PAGE_SIZE);
-		invalidate_icache_guest_page(vcpu, pfn, PAGE_SIZE);
+
+		if (exec_fault) {
+			new_pte = kvm_s2pte_mkexec(new_pte);
+			invalidate_icache_guest_page(vcpu, pfn, PAGE_SIZE);
+		}
 
 		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
 	}

From a9c0e12ebee56ef06b7eccdbc73bab71d0018df8 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 23 Oct 2017 17:11:20 +0100
Subject: [PATCH 101/676] KVM: arm/arm64: Only clean the dcache on translation
 fault

The only case where we actually need to perform a dcache maintenance
is when we map the page for the first time, and subsequent permission
faults do not require cache maintenance. Let's make it conditional
on not being a permission fault (and thus a translation fault).

Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 virt/kvm/arm/mmu.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 0417c8e2a81cb..f956efbd933d2 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1400,7 +1400,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			new_pmd = kvm_s2pmd_mkwrite(new_pmd);
 			kvm_set_pfn_dirty(pfn);
 		}
-		clean_dcache_guest_page(vcpu, pfn, PMD_SIZE);
+
+		if (fault_status != FSC_PERM)
+			clean_dcache_guest_page(vcpu, pfn, PMD_SIZE);
 
 		if (exec_fault) {
 			new_pmd = kvm_s2pmd_mkexec(new_pmd);
@@ -1416,7 +1418,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			kvm_set_pfn_dirty(pfn);
 			mark_page_dirty(kvm, gfn);
 		}
-		clean_dcache_guest_page(vcpu, pfn, PAGE_SIZE);
+
+		if (fault_status != FSC_PERM)
+			clean_dcache_guest_page(vcpu, pfn, PAGE_SIZE);
 
 		if (exec_fault) {
 			new_pte = kvm_s2pte_mkexec(new_pte);

From 7a3796d2ef5bb948f709467eef1bf96edbfc67a0 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 23 Oct 2017 17:11:21 +0100
Subject: [PATCH 102/676] KVM: arm/arm64: Preserve Exec permission across R/W
 permission faults

So far, we loose the Exec property whenever we take permission
faults, as we always reconstruct the PTE/PMD from scratch. This
can be counter productive as we can end-up with the following
fault sequence:

	X -> RO -> ROX -> RW -> RWX

Instead, we can lookup the existing PTE/PMD and clear the XN bit in the
new entry if it was already cleared in the old one, leadig to a much
nicer fault sequence:

	X -> ROX -> RWX

Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/kvm_mmu.h   | 10 ++++++++++
 arch/arm64/include/asm/kvm_mmu.h | 10 ++++++++++
 virt/kvm/arm/mmu.c               | 27 +++++++++++++++++++++++++++
 3 files changed, 47 insertions(+)

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 4d7a54cbb3ab5..aab64fe521465 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -107,6 +107,11 @@ static inline bool kvm_s2pte_readonly(pte_t *pte)
 	return (pte_val(*pte) & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY;
 }
 
+static inline bool kvm_s2pte_exec(pte_t *pte)
+{
+	return !(pte_val(*pte) & L_PTE_XN);
+}
+
 static inline void kvm_set_s2pmd_readonly(pmd_t *pmd)
 {
 	pmd_val(*pmd) = (pmd_val(*pmd) & ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY;
@@ -117,6 +122,11 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
 	return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY;
 }
 
+static inline bool kvm_s2pmd_exec(pmd_t *pmd)
+{
+	return !(pmd_val(*pmd) & PMD_SECT_XN);
+}
+
 static inline bool kvm_page_empty(void *ptr)
 {
 	struct page *ptr_page = virt_to_page(ptr);
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 1e1b20cb348f2..126abefffe7f6 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -203,6 +203,11 @@ static inline bool kvm_s2pte_readonly(pte_t *pte)
 	return (pte_val(*pte) & PTE_S2_RDWR) == PTE_S2_RDONLY;
 }
 
+static inline bool kvm_s2pte_exec(pte_t *pte)
+{
+	return !(pte_val(*pte) & PTE_S2_XN);
+}
+
 static inline void kvm_set_s2pmd_readonly(pmd_t *pmd)
 {
 	kvm_set_s2pte_readonly((pte_t *)pmd);
@@ -213,6 +218,11 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
 	return kvm_s2pte_readonly((pte_t *)pmd);
 }
 
+static inline bool kvm_s2pmd_exec(pmd_t *pmd)
+{
+	return !(pmd_val(*pmd) & PMD_S2_XN);
+}
+
 static inline bool kvm_page_empty(void *ptr)
 {
 	struct page *ptr_page = virt_to_page(ptr);
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index f956efbd933d2..b83b5a8442bb6 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -926,6 +926,25 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
 	return 0;
 }
 
+static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
+{
+	pmd_t *pmdp;
+	pte_t *ptep;
+
+	pmdp = stage2_get_pmd(kvm, NULL, addr);
+	if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
+		return false;
+
+	if (pmd_thp_or_huge(*pmdp))
+		return kvm_s2pmd_exec(pmdp);
+
+	ptep = pte_offset_kernel(pmdp, addr);
+	if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
+		return false;
+
+	return kvm_s2pte_exec(ptep);
+}
+
 static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 			  phys_addr_t addr, const pte_t *new_pte,
 			  unsigned long flags)
@@ -1407,6 +1426,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		if (exec_fault) {
 			new_pmd = kvm_s2pmd_mkexec(new_pmd);
 			invalidate_icache_guest_page(vcpu, pfn, PMD_SIZE);
+		} else if (fault_status == FSC_PERM) {
+			/* Preserve execute if XN was already cleared */
+			if (stage2_is_exec(kvm, fault_ipa))
+				new_pmd = kvm_s2pmd_mkexec(new_pmd);
 		}
 
 		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
@@ -1425,6 +1448,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		if (exec_fault) {
 			new_pte = kvm_s2pte_mkexec(new_pte);
 			invalidate_icache_guest_page(vcpu, pfn, PAGE_SIZE);
+		} else if (fault_status == FSC_PERM) {
+			/* Preserve execute if XN was already cleared */
+			if (stage2_is_exec(kvm, fault_ipa))
+				new_pte = kvm_s2pte_mkexec(new_pte);
 		}
 
 		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);

From 17ab9d57debaa53d665651e425a0efc4a893c039 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 23 Oct 2017 17:11:22 +0100
Subject: [PATCH 103/676] KVM: arm/arm64: Drop vcpu parameter from guest cache
 maintenance operartions

The vcpu parameter isn't used for anything, and gets in the way of
further cleanups. Let's get rid of it.

Acked-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/kvm_mmu.h   |  7 ++-----
 arch/arm64/include/asm/kvm_mmu.h |  7 ++-----
 virt/kvm/arm/mmu.c               | 18 ++++++++----------
 3 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index aab64fe521465..bc70a1f0f42d4 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -150,9 +150,7 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
 	return (vcpu_cp15(vcpu, c1_SCTLR) & 0b101) == 0b101;
 }
 
-static inline void __clean_dcache_guest_page(struct kvm_vcpu *vcpu,
-					     kvm_pfn_t pfn,
-					     unsigned long size)
+static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
 {
 	/*
 	 * Clean the dcache to the Point of Coherency.
@@ -177,8 +175,7 @@ static inline void __clean_dcache_guest_page(struct kvm_vcpu *vcpu,
 	}
 }
 
-static inline void __invalidate_icache_guest_page(struct kvm_vcpu *vcpu,
-						  kvm_pfn_t pfn,
+static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
 						  unsigned long size)
 {
 	u32 iclsz;
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 126abefffe7f6..06f1f97946799 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -252,17 +252,14 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
 	return (vcpu_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
 }
 
-static inline void __clean_dcache_guest_page(struct kvm_vcpu *vcpu,
-					     kvm_pfn_t pfn,
-					     unsigned long size)
+static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
 {
 	void *va = page_address(pfn_to_page(pfn));
 
 	kvm_flush_dcache_to_poc(va, size);
 }
 
-static inline void __invalidate_icache_guest_page(struct kvm_vcpu *vcpu,
-						  kvm_pfn_t pfn,
+static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
 						  unsigned long size)
 {
 	if (icache_is_aliasing()) {
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index b83b5a8442bb6..a1ea43fa75cfd 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1276,16 +1276,14 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
 }
 
-static void clean_dcache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
-				    unsigned long size)
+static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
 {
-	__clean_dcache_guest_page(vcpu, pfn, size);
+	__clean_dcache_guest_page(pfn, size);
 }
 
-static void invalidate_icache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
-					 unsigned long size)
+static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
 {
-	__invalidate_icache_guest_page(vcpu, pfn, size);
+	__invalidate_icache_guest_page(pfn, size);
 }
 
 static void kvm_send_hwpoison_signal(unsigned long address,
@@ -1421,11 +1419,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		}
 
 		if (fault_status != FSC_PERM)
-			clean_dcache_guest_page(vcpu, pfn, PMD_SIZE);
+			clean_dcache_guest_page(pfn, PMD_SIZE);
 
 		if (exec_fault) {
 			new_pmd = kvm_s2pmd_mkexec(new_pmd);
-			invalidate_icache_guest_page(vcpu, pfn, PMD_SIZE);
+			invalidate_icache_guest_page(pfn, PMD_SIZE);
 		} else if (fault_status == FSC_PERM) {
 			/* Preserve execute if XN was already cleared */
 			if (stage2_is_exec(kvm, fault_ipa))
@@ -1443,11 +1441,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		}
 
 		if (fault_status != FSC_PERM)
-			clean_dcache_guest_page(vcpu, pfn, PAGE_SIZE);
+			clean_dcache_guest_page(pfn, PAGE_SIZE);
 
 		if (exec_fault) {
 			new_pte = kvm_s2pte_mkexec(new_pte);
-			invalidate_icache_guest_page(vcpu, pfn, PAGE_SIZE);
+			invalidate_icache_guest_page(pfn, PAGE_SIZE);
 		} else if (fault_status == FSC_PERM) {
 			/* Preserve execute if XN was already cleared */
 			if (stage2_is_exec(kvm, fault_ipa))

From 448fadc8a4a36cad560e07b4fbc94cba49526956 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Tue, 9 Jan 2018 11:51:58 +0100
Subject: [PATCH 104/676] arm64: mm: Add additional parameter to
 uaccess_ttbr0_enable

Add an extra temporary register parameter to uaccess_ttbr0_enable which
is about to be required for arm64 PAN support.

This patch doesn't introduce any functional change but ensures that the
kernel compiles once the KVM/ARM tree is merged with the arm64 tree by
ensuring a trivially mergable conflict with commit
27a921e75711d924617269e0ba4adb8bae9fd0d1
("arm64: mm: Fix and re-enable ARM64_SW_TTBR0_PAN").

Cc: Will Deacon <will.deacon@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm64/include/asm/asm-uaccess.h | 4 ++--
 arch/arm64/mm/cache.S                | 4 ++--
 arch/arm64/xen/hypercall.S           | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index b3da6c8868359..b67563d2024c7 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -31,7 +31,7 @@ alternative_if_not ARM64_HAS_PAN
 alternative_else_nop_endif
 	.endm
 
-	.macro	uaccess_ttbr0_enable, tmp1, tmp2
+	.macro	uaccess_ttbr0_enable, tmp1, tmp2, tmp3
 alternative_if_not ARM64_HAS_PAN
 	save_and_disable_irq \tmp2		// avoid preemption
 	__uaccess_ttbr0_enable \tmp1
@@ -42,7 +42,7 @@ alternative_else_nop_endif
 	.macro	uaccess_ttbr0_disable, tmp1
 	.endm
 
-	.macro	uaccess_ttbr0_enable, tmp1, tmp2
+	.macro	uaccess_ttbr0_enable, tmp1, tmp2, tmp3
 	.endm
 #endif
 
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index bedd23da83f4e..5a52811f47e9d 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -49,7 +49,7 @@ ENTRY(flush_icache_range)
  *	- end     - virtual end address of region
  */
 ENTRY(__flush_cache_user_range)
-	uaccess_ttbr0_enable x2, x3
+	uaccess_ttbr0_enable x2, x3, x4
 	dcache_line_size x2, x3
 	sub	x3, x2, #1
 	bic	x4, x0, x3
@@ -80,7 +80,7 @@ ENDPROC(__flush_cache_user_range)
  *	- end     - virtual end address of region
  */
 ENTRY(invalidate_icache_range)
-	uaccess_ttbr0_enable x2, x3
+	uaccess_ttbr0_enable x2, x3, x4
 
 	invalidate_icache_by_line x0, x1, x2, x3, 2f
 	mov	x0, xzr
diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S
index 401ceb71540c7..acdbd2c9e899c 100644
--- a/arch/arm64/xen/hypercall.S
+++ b/arch/arm64/xen/hypercall.S
@@ -101,7 +101,7 @@ ENTRY(privcmd_call)
 	 * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation
 	 * is enabled (it implies that hardware UAO and PAN disabled).
 	 */
-	uaccess_ttbr0_enable x6, x7
+	uaccess_ttbr0_enable x6, x7, x8
 	hvc XEN_IMM
 
 	/*

From 81ceca05a42e24768c73d4d6dbd3701a60f1ed85 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 19 Dec 2017 15:56:24 +0100
Subject: [PATCH 105/676] KVM: PPC: Book3S HV: Remove vcpu->arch.dec usage

On Book3S in HV mode, we don't use the vcpu->arch.dec field at all.
Instead, all logic is built around vcpu->arch.dec_expires.

So let's remove the one remaining piece of code that was setting it.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 2659844784b81..c8ffd69adfec7 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -957,7 +957,6 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 	mftb	r7
 	subf	r3,r7,r8
 	mtspr	SPRN_DEC,r3
-	std	r3,VCPU_DEC(r4)
 
 	ld	r5, VCPU_SPRG0(r4)
 	ld	r6, VCPU_SPRG1(r4)

From 1627301020cb460f5a74e13c291f8db3b2a8062e Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Sun, 7 Jan 2018 10:07:36 +0100
Subject: [PATCH 106/676] KVM: PPC: Use seq_puts() in kvmppc_exit_timing_show()

A headline should be quickly put into a sequence. Thus use the
function "seq_puts" instead of "seq_printf" for this purpose.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/timing.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c
index e44d2b2ea97e3..1c03c978eb184 100644
--- a/arch/powerpc/kvm/timing.c
+++ b/arch/powerpc/kvm/timing.c
@@ -143,8 +143,7 @@ static int kvmppc_exit_timing_show(struct seq_file *m, void *private)
 	int i;
 	u64 min, max, sum, sum_quad;
 
-	seq_printf(m, "%s", "type	count	min	max	sum	sum_squared\n");
-
+	seq_puts(m, "type	count	min	max	sum	sum_squared\n");
 
 	for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) {
 

From 094bb5d766cfcdae47e332c6d6713c7029241be1 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <rasmus.villemoes@prevas.dk>
Date: Tue, 21 Nov 2017 01:12:43 +0100
Subject: [PATCH 107/676] target-core: don't use "const char*" for a buffer
 that is written to

iscsi_parse_pr_out_transport_id launders the const away via a call to
strstr(), and then modifies the buffer (writing a nul byte) through
the return value. It's cleaner to be honest and simply declare the
parameter as "char*", fixing up the call chain, and allowing us to
drop the cast in the return statement.

Amusingly, the two current callers found it necessary to cast a
non-const pointer to a const.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_fabric_lib.c | 6 +++---
 drivers/target/target_core_internal.h   | 2 +-
 drivers/target/target_core_pr.c         | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/target/target_core_fabric_lib.c b/drivers/target/target_core_fabric_lib.c
index 508da345b73fd..71a80257a0526 100644
--- a/drivers/target/target_core_fabric_lib.c
+++ b/drivers/target/target_core_fabric_lib.c
@@ -273,7 +273,7 @@ static int iscsi_get_pr_transport_id_len(
 
 static char *iscsi_parse_pr_out_transport_id(
 	struct se_portal_group *se_tpg,
-	const char *buf,
+	char *buf,
 	u32 *out_tid_len,
 	char **port_nexus_ptr)
 {
@@ -356,7 +356,7 @@ static char *iscsi_parse_pr_out_transport_id(
 		}
 	}
 
-	return (char *)&buf[4];
+	return &buf[4];
 }
 
 int target_get_pr_transport_id_len(struct se_node_acl *nacl,
@@ -405,7 +405,7 @@ int target_get_pr_transport_id(struct se_node_acl *nacl,
 }
 
 const char *target_parse_pr_out_transport_id(struct se_portal_group *tpg,
-		const char *buf, u32 *out_tid_len, char **port_nexus_ptr)
+		char *buf, u32 *out_tid_len, char **port_nexus_ptr)
 {
 	u32 offset;
 
diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h
index 9384d19a7326c..6d53d9fcb883b 100644
--- a/drivers/target/target_core_internal.h
+++ b/drivers/target/target_core_internal.h
@@ -102,7 +102,7 @@ int	target_get_pr_transport_id(struct se_node_acl *nacl,
 		struct t10_pr_registration *pr_reg, int *format_code,
 		unsigned char *buf);
 const char *target_parse_pr_out_transport_id(struct se_portal_group *tpg,
-		const char *buf, u32 *out_tid_len, char **port_nexus_ptr);
+		char *buf, u32 *out_tid_len, char **port_nexus_ptr);
 
 /* target_core_hba.c */
 struct se_hba *core_alloc_hba(const char *, u32, u32);
diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c
index b024613f92171..01ac306131c1f 100644
--- a/drivers/target/target_core_pr.c
+++ b/drivers/target/target_core_pr.c
@@ -1601,7 +1601,7 @@ core_scsi3_decode_spec_i_port(
 			dest_rtpi = tmp_lun->lun_rtpi;
 
 			i_str = target_parse_pr_out_transport_id(tmp_tpg,
-					(const char *)ptr, &tid_len, &iport_ptr);
+					ptr, &tid_len, &iport_ptr);
 			if (!i_str)
 				continue;
 
@@ -3287,7 +3287,7 @@ core_scsi3_emulate_pro_register_and_move(struct se_cmd *cmd, u64 res_key,
 		goto out;
 	}
 	initiator_str = target_parse_pr_out_transport_id(dest_se_tpg,
-			(const char *)&buf[24], &tmp_tid_len, &iport_ptr);
+			&buf[24], &tmp_tid_len, &iport_ptr);
 	if (!initiator_str) {
 		pr_err("SPC-3 PR REGISTER_AND_MOVE: Unable to locate"
 			" initiator_str from Transport ID\n");

From 26d2b3106f6015b1d19ae5f8b0cc1fc7fe8e669e Mon Sep 17 00:00:00 2001
From: tangwenji <tang.wenji@zte.com.cn>
Date: Tue, 28 Nov 2017 12:40:27 -0600
Subject: [PATCH 108/676] tcmu: fix page addr in tcmu_flush_dcache_range

The page addr should be update.

Signed-off-by: tangwenji <tang.wenji@zte.com.cn>
Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index a415d87f22d24..6bcaa8b5684c0 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -455,12 +455,13 @@ static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd)
 static inline void tcmu_flush_dcache_range(void *vaddr, size_t size)
 {
 	unsigned long offset = offset_in_page(vaddr);
+	void *start = vaddr - offset;
 
 	size = round_up(size+offset, PAGE_SIZE);
-	vaddr -= offset;
 
 	while (size) {
-		flush_dcache_page(virt_to_page(vaddr));
+		flush_dcache_page(virt_to_page(start));
+		start += PAGE_SIZE;
 		size -= PAGE_SIZE;
 	}
 }

From bf99ec13327bb5b0f6475aea8735c0ca34cc2a26 Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Tue, 28 Nov 2017 12:40:28 -0600
Subject: [PATCH 109/676] tcmu: merge common block release code

Have unmap_thread_fn use tcmu_blocks_release.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 6bcaa8b5684c0..d9fd91ee8282d 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -1302,21 +1302,19 @@ static int tcmu_check_and_free_pending_cmd(struct tcmu_cmd *cmd)
 	return -EINVAL;
 }
 
-static void tcmu_blocks_release(struct tcmu_dev *udev)
+static void tcmu_blocks_release(struct radix_tree_root *blocks,
+				int start, int end)
 {
 	int i;
 	struct page *page;
 
-	/* Try to release all block pages */
-	mutex_lock(&udev->cmdr_lock);
-	for (i = 0; i <= udev->dbi_max; i++) {
-		page = radix_tree_delete(&udev->data_blocks, i);
+	for (i = start; i < end; i++) {
+		page = radix_tree_delete(blocks, i);
 		if (page) {
 			__free_page(page);
 			atomic_dec(&global_db_count);
 		}
 	}
-	mutex_unlock(&udev->cmdr_lock);
 }
 
 static void tcmu_dev_kref_release(struct kref *kref)
@@ -1340,7 +1338,9 @@ static void tcmu_dev_kref_release(struct kref *kref)
 	spin_unlock_irq(&udev->commands_lock);
 	WARN_ON(!all_expired);
 
-	tcmu_blocks_release(udev);
+	mutex_lock(&udev->cmdr_lock);
+	tcmu_blocks_release(&udev->data_blocks, 0, udev->dbi_max + 1);
+	mutex_unlock(&udev->cmdr_lock);
 
 	call_rcu(&dev->rcu_head, tcmu_dev_call_rcu);
 }
@@ -1978,8 +1978,6 @@ static int unmap_thread_fn(void *data)
 	struct tcmu_dev *udev;
 	loff_t off;
 	uint32_t start, end, block;
-	struct page *page;
-	int i;
 
 	while (!kthread_should_stop()) {
 		DEFINE_WAIT(__wait);
@@ -2027,13 +2025,7 @@ static int unmap_thread_fn(void *data)
 			unmap_mapping_range(udev->inode->i_mapping, off, 0, 1);
 
 			/* Release the block pages */
-			for (i = start; i < end; i++) {
-				page = radix_tree_delete(&udev->data_blocks, i);
-				if (page) {
-					__free_page(page);
-					atomic_dec(&global_db_count);
-				}
-			}
+			tcmu_blocks_release(&udev->data_blocks, start, end);
 			mutex_unlock(&udev->cmdr_lock);
 		}
 

From 89ec9cfd3b644fbc36047e36776509130d2fc1ec Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Tue, 28 Nov 2017 12:40:29 -0600
Subject: [PATCH 110/676] tcmu: split unmap_thread_fn

Separate unmap_thread_fn to make it easier to read.

Note: this patch does not fix the bug where we might
miss a wake up call. The next patch will fix that.
This patch only separates the code into functions.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 120 +++++++++++++++++-------------
 1 file changed, 70 insertions(+), 50 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index d9fd91ee8282d..cab6c72eb012a 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -1973,71 +1973,91 @@ static struct target_backend_ops tcmu_ops = {
 	.tb_dev_attrib_attrs	= NULL,
 };
 
-static int unmap_thread_fn(void *data)
+
+static void find_free_blocks(void)
 {
 	struct tcmu_dev *udev;
 	loff_t off;
 	uint32_t start, end, block;
 
-	while (!kthread_should_stop()) {
-		DEFINE_WAIT(__wait);
-
-		prepare_to_wait(&unmap_wait, &__wait, TASK_INTERRUPTIBLE);
-		schedule();
-		finish_wait(&unmap_wait, &__wait);
+	mutex_lock(&root_udev_mutex);
+	list_for_each_entry(udev, &root_udev, node) {
+		mutex_lock(&udev->cmdr_lock);
 
-		if (kthread_should_stop())
-			break;
+		/* Try to complete the finished commands first */
+		tcmu_handle_completions(udev);
 
-		mutex_lock(&root_udev_mutex);
-		list_for_each_entry(udev, &root_udev, node) {
-			mutex_lock(&udev->cmdr_lock);
+		/* Skip the udevs waiting the global pool or in idle */
+		if (udev->waiting_global || !udev->dbi_thresh) {
+			mutex_unlock(&udev->cmdr_lock);
+			continue;
+		}
 
-			/* Try to complete the finished commands first */
-			tcmu_handle_completions(udev);
+		end = udev->dbi_max + 1;
+		block = find_last_bit(udev->data_bitmap, end);
+		if (block == udev->dbi_max) {
+			/*
+			 * The last bit is dbi_max, so there is
+			 * no need to shrink any blocks.
+			 */
+			mutex_unlock(&udev->cmdr_lock);
+			continue;
+		} else if (block == end) {
+			/* The current udev will goto idle state */
+			udev->dbi_thresh = start = 0;
+			udev->dbi_max = 0;
+		} else {
+			udev->dbi_thresh = start = block + 1;
+			udev->dbi_max = block;
+		}
 
-			/* Skip the udevs waiting the global pool or in idle */
-			if (udev->waiting_global || !udev->dbi_thresh) {
-				mutex_unlock(&udev->cmdr_lock);
-				continue;
-			}
+		/* Here will truncate the data area from off */
+		off = udev->data_off + start * DATA_BLOCK_SIZE;
+		unmap_mapping_range(udev->inode->i_mapping, off, 0, 1);
 
-			end = udev->dbi_max + 1;
-			block = find_last_bit(udev->data_bitmap, end);
-			if (block == udev->dbi_max) {
-				/*
-				 * The last bit is dbi_max, so there is
-				 * no need to shrink any blocks.
-				 */
-				mutex_unlock(&udev->cmdr_lock);
-				continue;
-			} else if (block == end) {
-				/* The current udev will goto idle state */
-				udev->dbi_thresh = start = 0;
-				udev->dbi_max = 0;
-			} else {
-				udev->dbi_thresh = start = block + 1;
-				udev->dbi_max = block;
-			}
+		/* Release the block pages */
+		tcmu_blocks_release(&udev->data_blocks, start, end);
+		mutex_unlock(&udev->cmdr_lock);
+	}
+	mutex_unlock(&root_udev_mutex);
+}
 
-			/* Here will truncate the data area from off */
-			off = udev->data_off + start * DATA_BLOCK_SIZE;
-			unmap_mapping_range(udev->inode->i_mapping, off, 0, 1);
+static void run_cmdr_queues(void)
+{
+	struct tcmu_dev *udev;
 
-			/* Release the block pages */
-			tcmu_blocks_release(&udev->data_blocks, start, end);
+	/*
+	 * Try to wake up the udevs who are waiting
+	 * for the global data block pool.
+	 */
+	mutex_lock(&root_udev_mutex);
+	list_for_each_entry(udev, &root_udev, node) {
+		mutex_lock(&udev->cmdr_lock);
+		if (!udev->waiting_global) {
 			mutex_unlock(&udev->cmdr_lock);
+			break;
 		}
+		mutex_unlock(&udev->cmdr_lock);
 
-		/*
-		 * Try to wake up the udevs who are waiting
-		 * for the global data pool.
-		 */
-		list_for_each_entry(udev, &root_udev, node) {
-			if (udev->waiting_global)
-				wake_up(&udev->wait_cmdr);
-		}
-		mutex_unlock(&root_udev_mutex);
+		wake_up(&udev->wait_cmdr);
+	}
+	mutex_unlock(&root_udev_mutex);
+}
+
+static int unmap_thread_fn(void *data)
+{
+	while (!kthread_should_stop()) {
+		DEFINE_WAIT(__wait);
+
+		prepare_to_wait(&unmap_wait, &__wait, TASK_INTERRUPTIBLE);
+		schedule();
+		finish_wait(&unmap_wait, &__wait);
+
+		if (kthread_should_stop())
+			break;
+
+		find_free_blocks();
+		run_cmdr_queues();
 	}
 
 	return 0;

From 9972cebb59a653cca735178a70c8ab09a5f4de1a Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Tue, 28 Nov 2017 12:40:30 -0600
Subject: [PATCH 111/676] tcmu: fix unmap thread race

If the unmap thread has already run find_free_blocks
but not yet run prepare_to_wait when a wake_up(&unmap_wait)
call is done, the unmap thread is going to miss the wake
call. Instead of adding checks for if new waiters were added
this just has us use a work queue which will run us again
in this type of case.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 43 +++++++------------------------
 1 file changed, 10 insertions(+), 33 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index cab6c72eb012a..a9f5c52e8b1d9 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -32,7 +32,7 @@
 #include <linux/highmem.h>
 #include <linux/configfs.h>
 #include <linux/mutex.h>
-#include <linux/kthread.h>
+#include <linux/workqueue.h>
 #include <net/genetlink.h>
 #include <scsi/scsi_common.h>
 #include <scsi/scsi_proto.h>
@@ -176,12 +176,11 @@ struct tcmu_cmd {
 	unsigned long flags;
 };
 
-static struct task_struct *unmap_thread;
-static wait_queue_head_t unmap_wait;
 static DEFINE_MUTEX(root_udev_mutex);
 static LIST_HEAD(root_udev);
 
 static atomic_t global_db_count = ATOMIC_INIT(0);
+static struct work_struct tcmu_unmap_work;
 
 static struct kmem_cache *tcmu_cmd_cache;
 
@@ -389,8 +388,7 @@ static bool tcmu_get_empty_blocks(struct tcmu_dev *udev,
 
 err:
 	udev->waiting_global = true;
-	/* Try to wake up the unmap thread */
-	wake_up(&unmap_wait);
+	schedule_work(&tcmu_unmap_work);
 	return false;
 }
 
@@ -1065,8 +1063,7 @@ static void tcmu_device_timedout(struct timer_list *t)
 	idr_for_each(&udev->commands, tcmu_check_expired_cmd, NULL);
 	spin_unlock_irqrestore(&udev->commands_lock, flags);
 
-	/* Try to wake up the ummap thread */
-	wake_up(&unmap_wait);
+	schedule_work(&tcmu_unmap_work);
 
 	/*
 	 * We don't need to wakeup threads on wait_cmdr since they have their
@@ -2044,23 +2041,10 @@ static void run_cmdr_queues(void)
 	mutex_unlock(&root_udev_mutex);
 }
 
-static int unmap_thread_fn(void *data)
+static void tcmu_unmap_work_fn(struct work_struct *work)
 {
-	while (!kthread_should_stop()) {
-		DEFINE_WAIT(__wait);
-
-		prepare_to_wait(&unmap_wait, &__wait, TASK_INTERRUPTIBLE);
-		schedule();
-		finish_wait(&unmap_wait, &__wait);
-
-		if (kthread_should_stop())
-			break;
-
-		find_free_blocks();
-		run_cmdr_queues();
-	}
-
-	return 0;
+	find_free_blocks();
+	run_cmdr_queues();
 }
 
 static int __init tcmu_module_init(void)
@@ -2069,6 +2053,8 @@ static int __init tcmu_module_init(void)
 
 	BUILD_BUG_ON((sizeof(struct tcmu_cmd_entry) % TCMU_OP_ALIGN_SIZE) != 0);
 
+	INIT_WORK(&tcmu_unmap_work, tcmu_unmap_work_fn);
+
 	tcmu_cmd_cache = kmem_cache_create("tcmu_cmd_cache",
 				sizeof(struct tcmu_cmd),
 				__alignof__(struct tcmu_cmd),
@@ -2114,17 +2100,8 @@ static int __init tcmu_module_init(void)
 	if (ret)
 		goto out_attrs;
 
-	init_waitqueue_head(&unmap_wait);
-	unmap_thread = kthread_run(unmap_thread_fn, NULL, "tcmu_unmap");
-	if (IS_ERR(unmap_thread)) {
-		ret = PTR_ERR(unmap_thread);
-		goto out_unreg_transport;
-	}
-
 	return 0;
 
-out_unreg_transport:
-	target_backend_unregister(&tcmu_ops);
 out_attrs:
 	kfree(tcmu_attrs);
 out_unreg_genl:
@@ -2139,7 +2116,7 @@ static int __init tcmu_module_init(void)
 
 static void __exit tcmu_module_exit(void)
 {
-	kthread_stop(unmap_thread);
+	cancel_work_sync(&tcmu_unmap_work);
 	target_backend_unregister(&tcmu_ops);
 	kfree(tcmu_attrs);
 	genl_unregister_family(&tcmu_genl_family);

From 488ebe4c355fdead39dbb3f6a51329c16cbfcc60 Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Tue, 28 Nov 2017 12:40:31 -0600
Subject: [PATCH 112/676] tcmu: move expired command completion to unmap thread

This moves the expired command completion handling to
the unmap wq, so the next patch can use a mutex
in tcmu_check_expired_cmd.

Note:
tcmu_device_timedout's use of spin_lock_irq was not needed.
The commands_lock is used between thread context (tcmu_queue_cmd_ring
and tcmu_irqcontrol (even though this is named irqcontrol it is not
run in irq context)) and timer/bh context. In the timer/bh context
bhs are disabled, so you need to use the _bh lock calls from the
thread context callers.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 48 +++++++++++++++++++++++++------
 1 file changed, 39 insertions(+), 9 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index a9f5c52e8b1d9..2ccc8e61449bf 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -143,6 +143,7 @@ struct tcmu_dev {
 
 	struct timer_list timeout;
 	unsigned int cmd_time_out;
+	struct list_head timedout_entry;
 
 	spinlock_t nl_cmd_lock;
 	struct tcmu_nl_cmd curr_nl_cmd;
@@ -179,6 +180,9 @@ struct tcmu_cmd {
 static DEFINE_MUTEX(root_udev_mutex);
 static LIST_HEAD(root_udev);
 
+static DEFINE_SPINLOCK(timed_out_udevs_lock);
+static LIST_HEAD(timed_out_udevs);
+
 static atomic_t global_db_count = ATOMIC_INIT(0);
 static struct work_struct tcmu_unmap_work;
 
@@ -1057,18 +1061,15 @@ static int tcmu_check_expired_cmd(int id, void *p, void *data)
 static void tcmu_device_timedout(struct timer_list *t)
 {
 	struct tcmu_dev *udev = from_timer(udev, t, timeout);
-	unsigned long flags;
 
-	spin_lock_irqsave(&udev->commands_lock, flags);
-	idr_for_each(&udev->commands, tcmu_check_expired_cmd, NULL);
-	spin_unlock_irqrestore(&udev->commands_lock, flags);
+	pr_debug("%s cmd timeout has expired\n", udev->name);
 
-	schedule_work(&tcmu_unmap_work);
+	spin_lock(&timed_out_udevs_lock);
+	if (list_empty(&udev->timedout_entry))
+		list_add_tail(&udev->timedout_entry, &timed_out_udevs);
+	spin_unlock(&timed_out_udevs_lock);
 
-	/*
-	 * We don't need to wakeup threads on wait_cmdr since they have their
-	 * own timeout.
-	 */
+	schedule_work(&tcmu_unmap_work);
 }
 
 static int tcmu_attach_hba(struct se_hba *hba, u32 host_id)
@@ -1112,6 +1113,7 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
 	init_waitqueue_head(&udev->wait_cmdr);
 	mutex_init(&udev->cmdr_lock);
 
+	INIT_LIST_HEAD(&udev->timedout_entry);
 	idr_init(&udev->commands);
 	spin_lock_init(&udev->commands_lock);
 
@@ -1325,6 +1327,11 @@ static void tcmu_dev_kref_release(struct kref *kref)
 	vfree(udev->mb_addr);
 	udev->mb_addr = NULL;
 
+	spin_lock_bh(&timed_out_udevs_lock);
+	if (!list_empty(&udev->timedout_entry))
+		list_del(&udev->timedout_entry);
+	spin_unlock_bh(&timed_out_udevs_lock);
+
 	/* Upper layer should drain all requests before calling this */
 	spin_lock_irq(&udev->commands_lock);
 	idr_for_each_entry(&udev->commands, cmd, i) {
@@ -2041,8 +2048,31 @@ static void run_cmdr_queues(void)
 	mutex_unlock(&root_udev_mutex);
 }
 
+static void check_timedout_devices(void)
+{
+	struct tcmu_dev *udev, *tmp_dev;
+	LIST_HEAD(devs);
+
+	spin_lock_bh(&timed_out_udevs_lock);
+	list_splice_init(&timed_out_udevs, &devs);
+
+	list_for_each_entry_safe(udev, tmp_dev, &devs, timedout_entry) {
+		list_del_init(&udev->timedout_entry);
+		spin_unlock_bh(&timed_out_udevs_lock);
+
+		spin_lock(&udev->commands_lock);
+		idr_for_each(&udev->commands, tcmu_check_expired_cmd, NULL);
+		spin_unlock(&udev->commands_lock);
+
+		spin_lock_bh(&timed_out_udevs_lock);
+	}
+
+	spin_unlock_bh(&timed_out_udevs_lock);
+}
+
 static void tcmu_unmap_work_fn(struct work_struct *work)
 {
+	check_timedout_devices();
 	find_free_blocks();
 	run_cmdr_queues();
 }

From 6fddcb775477bb2213bd76ab62145645eb570f33 Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Tue, 28 Nov 2017 12:40:32 -0600
Subject: [PATCH 113/676] tcmu: remove commands_lock

No need for the commands_lock. The cmdr_lock is already held during
idr addition and deletion, so just grab it during traversal.

Note: This also fixes a issue where we should have been using at
least _bh locking in tcmu_handle_completions when taking the commands
lock to prevent the case where tcmu_handle_completions could be
interrupted by a timer softirq while the commands_lock is held.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 2ccc8e61449bf..43583a792439e 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -139,7 +139,6 @@ struct tcmu_dev {
 	struct radix_tree_root data_blocks;
 
 	struct idr commands;
-	spinlock_t commands_lock;
 
 	struct timer_list timeout;
 	unsigned int cmd_time_out;
@@ -1014,10 +1013,7 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev)
 		}
 		WARN_ON(tcmu_hdr_get_op(entry->hdr.len_op) != TCMU_OP_CMD);
 
-		spin_lock(&udev->commands_lock);
 		cmd = idr_remove(&udev->commands, entry->hdr.cmd_id);
-		spin_unlock(&udev->commands_lock);
-
 		if (!cmd) {
 			pr_err("cmd_id not found, ring is broken\n");
 			set_bit(TCMU_DEV_BIT_BROKEN, &udev->flags);
@@ -1115,7 +1111,6 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
 
 	INIT_LIST_HEAD(&udev->timedout_entry);
 	idr_init(&udev->commands);
-	spin_lock_init(&udev->commands_lock);
 
 	timer_setup(&udev->timeout, tcmu_device_timedout, 0);
 
@@ -1333,16 +1328,14 @@ static void tcmu_dev_kref_release(struct kref *kref)
 	spin_unlock_bh(&timed_out_udevs_lock);
 
 	/* Upper layer should drain all requests before calling this */
-	spin_lock_irq(&udev->commands_lock);
+	mutex_lock(&udev->cmdr_lock);
 	idr_for_each_entry(&udev->commands, cmd, i) {
 		if (tcmu_check_and_free_pending_cmd(cmd) != 0)
 			all_expired = false;
 	}
 	idr_destroy(&udev->commands);
-	spin_unlock_irq(&udev->commands_lock);
 	WARN_ON(!all_expired);
 
-	mutex_lock(&udev->cmdr_lock);
 	tcmu_blocks_release(&udev->data_blocks, 0, udev->dbi_max + 1);
 	mutex_unlock(&udev->cmdr_lock);
 
@@ -2060,9 +2053,9 @@ static void check_timedout_devices(void)
 		list_del_init(&udev->timedout_entry);
 		spin_unlock_bh(&timed_out_udevs_lock);
 
-		spin_lock(&udev->commands_lock);
+		mutex_lock(&udev->cmdr_lock);
 		idr_for_each(&udev->commands, tcmu_check_expired_cmd, NULL);
-		spin_unlock(&udev->commands_lock);
+		mutex_unlock(&udev->cmdr_lock);
 
 		spin_lock_bh(&timed_out_udevs_lock);
 	}

From 810b8153c4243d2012a6ec002ddd3bbc9a9ae8c2 Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Tue, 28 Nov 2017 12:40:33 -0600
Subject: [PATCH 114/676] tcmu: release blocks for partially setup cmds

If we cannot setup a cmd because we run out of ring space
or global pages release the blocks before sleeping. This
prevents a deadlock where dev0 has waiting_blocks set and
needs N blocks, but dev1 to devX have each allocated N / X blocks
and also hit the global block limit so they went to sleep.

find_free_blocks is not able to take the sleeping dev's
blocks becaause their waiting_blocks is set and even
if it was not the block returned by find_last_bit could equal
dbi_max. The latter will probably never happen because
DATA_BLOCK_BITS is so high but in the next patches
DATA_BLOCK_BITS and TCMU_GLOBAL_MAX_BLOCKS will be settable so
it might be lower and could happen.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 43583a792439e..c7541f090453d 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -807,6 +807,13 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
 		int ret;
 		DEFINE_WAIT(__wait);
 
+		/*
+		 * Don't leave commands partially setup because the unmap
+		 * thread might need the blocks to make forward progress.
+		 */
+		tcmu_cmd_free_data(tcmu_cmd, tcmu_cmd->dbi_cur);
+		tcmu_cmd_reset_dbi_cur(tcmu_cmd);
+
 		prepare_to_wait(&udev->wait_cmdr, &__wait, TASK_INTERRUPTIBLE);
 
 		pr_debug("sleeping for ring space\n");

From 1a1fc0b88e9019cb3b2f291bdcb2d03d38614690 Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Tue, 28 Nov 2017 12:40:34 -0600
Subject: [PATCH 115/676] tcmu: simplify scatter_data_area error handling

scatter_data_area always returns 0, so stop checking
for errors.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 31 +++++++------------------------
 1 file changed, 7 insertions(+), 24 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index c7541f090453d..965f462eaa22d 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -520,7 +520,7 @@ static inline size_t iov_tail(struct iovec *iov)
 	return (size_t)iov->iov_base + iov->iov_len;
 }
 
-static int scatter_data_area(struct tcmu_dev *udev,
+static void scatter_data_area(struct tcmu_dev *udev,
 	struct tcmu_cmd *tcmu_cmd, struct scatterlist *data_sg,
 	unsigned int data_nents, struct iovec **iov,
 	int *iov_cnt, bool copy_data)
@@ -573,8 +573,6 @@ static int scatter_data_area(struct tcmu_dev *udev,
 	}
 	if (to)
 		kunmap_atomic(to);
-
-	return 0;
 }
 
 static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
@@ -864,33 +862,18 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
 	iov_cnt = 0;
 	copy_to_data_area = (se_cmd->data_direction == DMA_TO_DEVICE
 		|| se_cmd->se_cmd_flags & SCF_BIDI);
-	ret = scatter_data_area(udev, tcmu_cmd, se_cmd->t_data_sg,
-				se_cmd->t_data_nents, &iov, &iov_cnt,
-				copy_to_data_area);
-	if (ret) {
-		tcmu_cmd_free_data(tcmu_cmd, tcmu_cmd->dbi_cnt);
-		mutex_unlock(&udev->cmdr_lock);
-
-		pr_err("tcmu: alloc and scatter data failed\n");
-		return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
-	}
+	scatter_data_area(udev, tcmu_cmd, se_cmd->t_data_sg,
+			  se_cmd->t_data_nents, &iov, &iov_cnt,
+			  copy_to_data_area);
 	entry->req.iov_cnt = iov_cnt;
 
 	/* Handle BIDI commands */
 	iov_cnt = 0;
 	if (se_cmd->se_cmd_flags & SCF_BIDI) {
 		iov++;
-		ret = scatter_data_area(udev, tcmu_cmd,
-					se_cmd->t_bidi_data_sg,
-					se_cmd->t_bidi_data_nents,
-					&iov, &iov_cnt, false);
-		if (ret) {
-			tcmu_cmd_free_data(tcmu_cmd, tcmu_cmd->dbi_cnt);
-			mutex_unlock(&udev->cmdr_lock);
-
-			pr_err("tcmu: alloc and scatter bidi data failed\n");
-			return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
-		}
+		scatter_data_area(udev, tcmu_cmd, se_cmd->t_bidi_data_sg,
+				  se_cmd->t_bidi_data_nents, &iov, &iov_cnt,
+				  false);
 	}
 	entry->req.iov_bidi_cnt = iov_cnt;
 

From 3c0f26ff9d040c6193b33689bbc03103854dba4d Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Tue, 28 Nov 2017 12:40:35 -0600
Subject: [PATCH 116/676] tcmu: fix free block calculation

The blocks_left calculation does not account for free blocks
between 0 and thresh, so we could be queueing/waiting when
there are enough blocks free.

This has us add in the blocks between 0 and thresh as well as
at the end from thresh to DATA_BLOCK_BITS.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 965f462eaa22d..5d1daea510790 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -637,7 +637,7 @@ static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
 
 static inline size_t spc_bitmap_free(unsigned long *bitmap, uint32_t thresh)
 {
-	return DATA_BLOCK_SIZE * (thresh - bitmap_weight(bitmap, thresh));
+	return thresh - bitmap_weight(bitmap, thresh);
 }
 
 /*
@@ -677,8 +677,9 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
 
 	/* try to check and get the data blocks as needed */
 	space = spc_bitmap_free(udev->data_bitmap, udev->dbi_thresh);
-	if (space < data_needed) {
-		unsigned long blocks_left = DATA_BLOCK_BITS - udev->dbi_thresh;
+	if ((space * DATA_BLOCK_SIZE) < data_needed) {
+		unsigned long blocks_left = DATA_BLOCK_BITS - udev->dbi_thresh +
+						space;
 		unsigned long grow;
 
 		if (blocks_left < blocks_needed) {

From 3e60913579b2fefa74eeb3269426e864f4afa7e7 Mon Sep 17 00:00:00 2001
From: Xiubo Li <lixiubo@cmss.chinamobile.com>
Date: Tue, 28 Nov 2017 12:40:36 -0600
Subject: [PATCH 117/676] tcmu: clean up the scatter helper

Add some comments to make the scatter code to be more readable,
and drop unused arg to new_iov.

Signed-off-by: Xiubo Li <lixiubo@cmss.chinamobile.com>
Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 5d1daea510790..8d0dc471fce80 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -492,8 +492,7 @@ static inline size_t head_to_end(size_t head, size_t size)
 	return size - head;
 }
 
-static inline void new_iov(struct iovec **iov, int *iov_cnt,
-			   struct tcmu_dev *udev)
+static inline void new_iov(struct iovec **iov, int *iov_cnt)
 {
 	struct iovec *iovec;
 
@@ -546,19 +545,38 @@ static void scatter_data_area(struct tcmu_dev *udev,
 				to = kmap_atomic(page);
 			}
 
-			copy_bytes = min_t(size_t, sg_remaining,
-					block_remaining);
+			/*
+			 * Covert to virtual offset of the ring data area.
+			 */
 			to_offset = get_block_offset_user(udev, dbi,
 					block_remaining);
 
+			/*
+			 * The following code will gather and map the blocks
+			 * to the same iovec when the blocks are all next to
+			 * each other.
+			 */
+			copy_bytes = min_t(size_t, sg_remaining,
+					block_remaining);
 			if (*iov_cnt != 0 &&
 			    to_offset == iov_tail(*iov)) {
+				/*
+				 * Will append to the current iovec, because
+				 * the current block page is next to the
+				 * previous one.
+				 */
 				(*iov)->iov_len += copy_bytes;
 			} else {
-				new_iov(iov, iov_cnt, udev);
+				/*
+				 * Will allocate a new iovec because we are
+				 * first time here or the current block page
+				 * is not next to the previous one.
+				 */
+				new_iov(iov, iov_cnt);
 				(*iov)->iov_base = (void __user *)to_offset;
 				(*iov)->iov_len = copy_bytes;
 			}
+
 			if (copy_data) {
 				offset = DATA_BLOCK_SIZE - block_remaining;
 				memcpy(to + offset,
@@ -566,11 +584,13 @@ static void scatter_data_area(struct tcmu_dev *udev,
 				       copy_bytes);
 				tcmu_flush_dcache_range(to, copy_bytes);
 			}
+
 			sg_remaining -= copy_bytes;
 			block_remaining -= copy_bytes;
 		}
 		kunmap_atomic(from - sg->offset);
 	}
+
 	if (to)
 		kunmap_atomic(to);
 }

From 6fd0ce79724dabe2cd0bd8aed111cbe94755bf88 Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Tue, 28 Nov 2017 12:40:37 -0600
Subject: [PATCH 118/676] tcmu: prep queue_cmd_ring to be used by unmap wq

In the next patches we will call queue_cmd_ring from the submitting
context and also the completion path. This changes the queue_cmd_ring
return code so in the next patches we can return a sense_reason_t
and also signal if a command was requeued.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 42 ++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 8d0dc471fce80..68d1d7214eeb7 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -776,8 +776,16 @@ static int tcmu_setup_cmd_timer(struct tcmu_cmd *tcmu_cmd)
 	return 0;
 }
 
-static sense_reason_t
-tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
+/**
+ * queue_cmd_ring - queue cmd to ring or internally
+ * @tcmu_cmd: cmd to queue
+ * @scsi_err: TCM error code if failure (-1) returned.
+ *
+ * Returns:
+ * -1 we cannot queue internally or to the ring.
+ *  0 success
+ */
+static sense_reason_t queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, int *scsi_err)
 {
 	struct tcmu_dev *udev = tcmu_cmd->tcmu_dev;
 	struct se_cmd *se_cmd = tcmu_cmd->se_cmd;
@@ -791,8 +799,12 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
 	bool copy_to_data_area;
 	size_t data_length = tcmu_cmd_get_data_length(tcmu_cmd);
 
-	if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags))
-		return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
+	*scsi_err = TCM_NO_SENSE;
+
+	if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) {
+		*scsi_err = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
+		return -1;
+	}
 
 	/*
 	 * Must be a certain minimum size for response sense info, but
@@ -819,7 +831,8 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
 			"cmd ring/data area\n", command_size, data_length,
 			udev->cmdr_size, udev->data_size);
 		mutex_unlock(&udev->cmdr_lock);
-		return TCM_INVALID_CDB_FIELD;
+		*scsi_err = TCM_INVALID_CDB_FIELD;
+		return -1;
 	}
 
 	while (!is_ring_space_avail(udev, tcmu_cmd, command_size, data_length)) {
@@ -845,7 +858,8 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
 		finish_wait(&udev->wait_cmdr, &__wait);
 		if (!ret) {
 			pr_warn("tcmu: command timed out\n");
-			return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
+			*scsi_err = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
+			return -1;
 		}
 
 		mutex_lock(&udev->cmdr_lock);
@@ -902,7 +916,9 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
 	if (ret) {
 		tcmu_cmd_free_data(tcmu_cmd, tcmu_cmd->dbi_cnt);
 		mutex_unlock(&udev->cmdr_lock);
-		return TCM_OUT_OF_RESOURCES;
+
+		*scsi_err = TCM_OUT_OF_RESOURCES;
+		return -1;
 	}
 	entry->hdr.cmd_id = tcmu_cmd->cmd_id;
 
@@ -933,27 +949,23 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
 		mod_timer(&udev->timeout, round_jiffies_up(jiffies +
 			  msecs_to_jiffies(udev->cmd_time_out)));
 
-	return TCM_NO_SENSE;
+	return 0;
 }
 
 static sense_reason_t
 tcmu_queue_cmd(struct se_cmd *se_cmd)
 {
 	struct tcmu_cmd *tcmu_cmd;
-	sense_reason_t ret;
+	sense_reason_t scsi_ret;
 
 	tcmu_cmd = tcmu_alloc_cmd(se_cmd);
 	if (!tcmu_cmd)
 		return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
 
-	ret = tcmu_queue_cmd_ring(tcmu_cmd);
-	if (ret != TCM_NO_SENSE) {
-		pr_err("TCMU: Could not queue command\n");
-
+	if (queue_cmd_ring(tcmu_cmd, &scsi_ret) < 0)
 		tcmu_free_cmd(tcmu_cmd);
-	}
 
-	return ret;
+	return scsi_ret;
 }
 
 static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry *entry)

From f890f5799a6628fe006ae524e625900186074cdb Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Tue, 28 Nov 2017 12:40:38 -0600
Subject: [PATCH 119/676] tcmu: simplify dbi thresh handling

We do not really save a lot by trying to increase thresh
a multiple of the existing value. This just simplifies the
code by increasing it to whatever is needed for the command
being executed.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 22 +++-------------------
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 68d1d7214eeb7..2679e4dcd0f18 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -79,7 +79,6 @@
 #define DATA_BLOCK_SIZE PAGE_SIZE
 #define DATA_BLOCK_BITS (256 * 1024)
 #define DATA_SIZE (DATA_BLOCK_BITS * DATA_BLOCK_SIZE)
-#define DATA_BLOCK_INIT_BITS 128
 
 /* The total size of the ring is 8M + 256K * PAGE_SIZE */
 #define TCMU_RING_SIZE (CMDR_SIZE + DATA_SIZE)
@@ -700,7 +699,6 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
 	if ((space * DATA_BLOCK_SIZE) < data_needed) {
 		unsigned long blocks_left = DATA_BLOCK_BITS - udev->dbi_thresh +
 						space;
-		unsigned long grow;
 
 		if (blocks_left < blocks_needed) {
 			pr_debug("no data space: only %lu available, but ask for %zu\n",
@@ -709,23 +707,9 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
 			return false;
 		}
 
-		/* Try to expand the thresh */
-		if (!udev->dbi_thresh) {
-			/* From idle state */
-			uint32_t init_thresh = DATA_BLOCK_INIT_BITS;
-
-			udev->dbi_thresh = max(blocks_needed, init_thresh);
-		} else {
-			/*
-			 * Grow the data area by max(blocks needed,
-			 * dbi_thresh / 2), but limited to the max
-			 * DATA_BLOCK_BITS size.
-			 */
-			grow = max(blocks_needed, udev->dbi_thresh / 2);
-			udev->dbi_thresh += grow;
-			if (udev->dbi_thresh > DATA_BLOCK_BITS)
-				udev->dbi_thresh = DATA_BLOCK_BITS;
-		}
+		udev->dbi_thresh += blocks_needed;
+		if (udev->dbi_thresh > DATA_BLOCK_BITS)
+			udev->dbi_thresh = DATA_BLOCK_BITS;
 	}
 
 	return tcmu_get_empty_blocks(udev, cmd);

From af1dd7ff46824a94da1d90443bd07db2796bd545 Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Tue, 28 Nov 2017 12:40:39 -0600
Subject: [PATCH 120/676] tcmu: don't block submitting context for block waits

This patch has tcmu internally queue cmds if its ring buffer
is full. It also makes the TCMU_GLOBAL_MAX_BLOCKS limit a
hint instead of a hard limit, so we do not have to add any
new locks/atomics in the main IO path except when IO is not
running.

This fixes the following bugs:

1. We cannot sleep from the submitting context because it might be
called from a target recv context. This results in transport level
commands timing out. For example if the ring is full, we would
sleep, and a iscsi initiator would send a iscsi ping/nop which
times out because the target's recv thread is sleeping here.

2. Devices were not fairly scheduled to run when they hit the global
limit so they could time out waiting for ring space while others
got run.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 264 +++++++++++++++++++-----------
 1 file changed, 169 insertions(+), 95 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 2679e4dcd0f18..52fc1d440d237 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -83,7 +83,10 @@
 /* The total size of the ring is 8M + 256K * PAGE_SIZE */
 #define TCMU_RING_SIZE (CMDR_SIZE + DATA_SIZE)
 
-/* Default maximum of the global data blocks(512K * PAGE_SIZE) */
+/*
+ * Default number of global data blocks(512K * PAGE_SIZE)
+ * when the unmap thread will be started.
+ */
 #define TCMU_GLOBAL_MAX_BLOCKS (512 * 1024)
 
 static u8 tcmu_kern_cmd_reply_supported;
@@ -106,6 +109,7 @@ struct tcmu_nl_cmd {
 struct tcmu_dev {
 	struct list_head node;
 	struct kref kref;
+
 	struct se_device se_dev;
 
 	char *name;
@@ -128,10 +132,9 @@ struct tcmu_dev {
 	size_t data_off;
 	size_t data_size;
 
-	wait_queue_head_t wait_cmdr;
 	struct mutex cmdr_lock;
+	struct list_head cmdr_queue;
 
-	bool waiting_global;
 	uint32_t dbi_max;
 	uint32_t dbi_thresh;
 	DECLARE_BITMAP(data_bitmap, DATA_BLOCK_BITS);
@@ -160,6 +163,7 @@ struct tcmu_dev {
 struct tcmu_cmd {
 	struct se_cmd *se_cmd;
 	struct tcmu_dev *tcmu_dev;
+	struct list_head cmdr_queue_entry;
 
 	uint16_t cmd_id;
 
@@ -174,7 +178,16 @@ struct tcmu_cmd {
 #define TCMU_CMD_BIT_EXPIRED 0
 	unsigned long flags;
 };
-
+/*
+ * To avoid dead lock the mutex lock order should always be:
+ *
+ * mutex_lock(&root_udev_mutex);
+ * ...
+ * mutex_lock(&tcmu_dev->cmdr_lock);
+ * mutex_unlock(&tcmu_dev->cmdr_lock);
+ * ...
+ * mutex_unlock(&root_udev_mutex);
+ */
 static DEFINE_MUTEX(root_udev_mutex);
 static LIST_HEAD(root_udev);
 
@@ -182,7 +195,7 @@ static DEFINE_SPINLOCK(timed_out_udevs_lock);
 static LIST_HEAD(timed_out_udevs);
 
 static atomic_t global_db_count = ATOMIC_INIT(0);
-static struct work_struct tcmu_unmap_work;
+static struct delayed_work tcmu_unmap_work;
 
 static struct kmem_cache *tcmu_cmd_cache;
 
@@ -346,10 +359,8 @@ static inline bool tcmu_get_empty_block(struct tcmu_dev *udev,
 	page = radix_tree_lookup(&udev->data_blocks, dbi);
 	if (!page) {
 		if (atomic_add_return(1, &global_db_count) >
-					TCMU_GLOBAL_MAX_BLOCKS) {
-			atomic_dec(&global_db_count);
-			return false;
-		}
+					TCMU_GLOBAL_MAX_BLOCKS)
+			schedule_delayed_work(&tcmu_unmap_work, 0);
 
 		/* try to get new page from the mm */
 		page = alloc_page(GFP_KERNEL);
@@ -380,18 +391,11 @@ static bool tcmu_get_empty_blocks(struct tcmu_dev *udev,
 {
 	int i;
 
-	udev->waiting_global = false;
-
 	for (i = tcmu_cmd->dbi_cur; i < tcmu_cmd->dbi_cnt; i++) {
 		if (!tcmu_get_empty_block(udev, tcmu_cmd))
-			goto err;
+			return false;
 	}
 	return true;
-
-err:
-	udev->waiting_global = true;
-	schedule_work(&tcmu_unmap_work);
-	return false;
 }
 
 static inline struct page *
@@ -437,6 +441,7 @@ static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd)
 	if (!tcmu_cmd)
 		return NULL;
 
+	INIT_LIST_HEAD(&tcmu_cmd->cmdr_queue_entry);
 	tcmu_cmd->se_cmd = se_cmd;
 	tcmu_cmd->tcmu_dev = udev;
 
@@ -742,6 +747,10 @@ static int tcmu_setup_cmd_timer(struct tcmu_cmd *tcmu_cmd)
 	unsigned long tmo = udev->cmd_time_out;
 	int cmd_id;
 
+	/*
+	 * If it was on the cmdr queue waiting we do not reset the timer
+	 * for requeues and when it is finally sent to userspace.
+	 */
 	if (tcmu_cmd->cmd_id)
 		return 0;
 
@@ -753,13 +762,31 @@ static int tcmu_setup_cmd_timer(struct tcmu_cmd *tcmu_cmd)
 	tcmu_cmd->cmd_id = cmd_id;
 
 	if (!tmo)
-		return 0;
+		tmo = TCMU_TIME_OUT;
+
+	pr_debug("allocated cmd %u for dev %s tmo %lu\n", tcmu_cmd->cmd_id,
+		 udev->name, tmo / MSEC_PER_SEC);
 
 	tcmu_cmd->deadline = round_jiffies_up(jiffies + msecs_to_jiffies(tmo));
 	mod_timer(&udev->timeout, tcmu_cmd->deadline);
 	return 0;
 }
 
+static int add_to_cmdr_queue(struct tcmu_cmd *tcmu_cmd)
+{
+	struct tcmu_dev *udev = tcmu_cmd->tcmu_dev;
+	int ret;
+
+	ret = tcmu_setup_cmd_timer(tcmu_cmd);
+	if (ret)
+		return ret;
+
+	list_add_tail(&tcmu_cmd->cmdr_queue_entry, &udev->cmdr_queue);
+	pr_debug("adding cmd %u on dev %s to ring space wait queue\n",
+		 tcmu_cmd->cmd_id, udev->name);
+	return 0;
+}
+
 /**
  * queue_cmd_ring - queue cmd to ring or internally
  * @tcmu_cmd: cmd to queue
@@ -768,6 +795,7 @@ static int tcmu_setup_cmd_timer(struct tcmu_cmd *tcmu_cmd)
  * Returns:
  * -1 we cannot queue internally or to the ring.
  *  0 success
+ *  1 internally queued to wait for ring memory to free.
  */
 static sense_reason_t queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, int *scsi_err)
 {
@@ -805,7 +833,8 @@ static sense_reason_t queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, int *scsi_err)
 	base_command_size = tcmu_cmd_get_base_cmd_size(tcmu_cmd->dbi_cnt);
 	command_size = tcmu_cmd_get_cmd_size(tcmu_cmd, base_command_size);
 
-	mutex_lock(&udev->cmdr_lock);
+	if (!list_empty(&udev->cmdr_queue))
+		goto queue;
 
 	mb = udev->mb_addr;
 	cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */
@@ -814,42 +843,18 @@ static sense_reason_t queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, int *scsi_err)
 		pr_warn("TCMU: Request of size %zu/%zu is too big for %u/%zu "
 			"cmd ring/data area\n", command_size, data_length,
 			udev->cmdr_size, udev->data_size);
-		mutex_unlock(&udev->cmdr_lock);
 		*scsi_err = TCM_INVALID_CDB_FIELD;
 		return -1;
 	}
 
-	while (!is_ring_space_avail(udev, tcmu_cmd, command_size, data_length)) {
-		int ret;
-		DEFINE_WAIT(__wait);
-
+	if (!is_ring_space_avail(udev, tcmu_cmd, command_size, data_length)) {
 		/*
 		 * Don't leave commands partially setup because the unmap
 		 * thread might need the blocks to make forward progress.
 		 */
 		tcmu_cmd_free_data(tcmu_cmd, tcmu_cmd->dbi_cur);
 		tcmu_cmd_reset_dbi_cur(tcmu_cmd);
-
-		prepare_to_wait(&udev->wait_cmdr, &__wait, TASK_INTERRUPTIBLE);
-
-		pr_debug("sleeping for ring space\n");
-		mutex_unlock(&udev->cmdr_lock);
-		if (udev->cmd_time_out)
-			ret = schedule_timeout(
-					msecs_to_jiffies(udev->cmd_time_out));
-		else
-			ret = schedule_timeout(msecs_to_jiffies(TCMU_TIME_OUT));
-		finish_wait(&udev->wait_cmdr, &__wait);
-		if (!ret) {
-			pr_warn("tcmu: command timed out\n");
-			*scsi_err = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
-			return -1;
-		}
-
-		mutex_lock(&udev->cmdr_lock);
-
-		/* We dropped cmdr_lock, cmd_head is stale */
-		cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */
+		goto queue;
 	}
 
 	/* Insert a PAD if end-of-ring space is too small */
@@ -924,31 +929,39 @@ static sense_reason_t queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, int *scsi_err)
 
 	UPDATE_HEAD(mb->cmd_head, command_size, udev->cmdr_size);
 	tcmu_flush_dcache_range(mb, sizeof(*mb));
-	mutex_unlock(&udev->cmdr_lock);
 
 	/* TODO: only if FLUSH and FUA? */
 	uio_event_notify(&udev->uio_info);
 
-	if (udev->cmd_time_out)
-		mod_timer(&udev->timeout, round_jiffies_up(jiffies +
-			  msecs_to_jiffies(udev->cmd_time_out)));
-
 	return 0;
+
+queue:
+	if (add_to_cmdr_queue(tcmu_cmd)) {
+		*scsi_err = TCM_OUT_OF_RESOURCES;
+		return -1;
+	}
+
+	return 1;
 }
 
 static sense_reason_t
 tcmu_queue_cmd(struct se_cmd *se_cmd)
 {
+	struct se_device *se_dev = se_cmd->se_dev;
+	struct tcmu_dev *udev = TCMU_DEV(se_dev);
 	struct tcmu_cmd *tcmu_cmd;
 	sense_reason_t scsi_ret;
+	int ret;
 
 	tcmu_cmd = tcmu_alloc_cmd(se_cmd);
 	if (!tcmu_cmd)
 		return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
 
-	if (queue_cmd_ring(tcmu_cmd, &scsi_ret) < 0)
+	mutex_lock(&udev->cmdr_lock);
+	ret = queue_cmd_ring(tcmu_cmd, &scsi_ret);
+	mutex_unlock(&udev->cmdr_lock);
+	if (ret < 0)
 		tcmu_free_cmd(tcmu_cmd);
-
 	return scsi_ret;
 }
 
@@ -1036,10 +1049,15 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev)
 		handled++;
 	}
 
-	if (mb->cmd_tail == mb->cmd_head)
-		del_timer(&udev->timeout); /* no more pending cmds */
-
-	wake_up(&udev->wait_cmdr);
+	if (mb->cmd_tail == mb->cmd_head && list_empty(&udev->cmdr_queue)) {
+		del_timer(&udev->timeout);
+		/*
+		 * not more pending or waiting commands so try to reclaim
+		 * blocks if needed.
+		 */
+		if (atomic_read(&global_db_count) > TCMU_GLOBAL_MAX_BLOCKS)
+			schedule_delayed_work(&tcmu_unmap_work, 0);
+	}
 
 	return handled;
 }
@@ -1047,6 +1065,10 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev)
 static int tcmu_check_expired_cmd(int id, void *p, void *data)
 {
 	struct tcmu_cmd *cmd = p;
+	struct tcmu_dev *udev = cmd->tcmu_dev;
+	u8 scsi_status;
+	struct se_cmd *se_cmd;
+	bool is_running;
 
 	if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags))
 		return 0;
@@ -1054,10 +1076,27 @@ static int tcmu_check_expired_cmd(int id, void *p, void *data)
 	if (!time_after(jiffies, cmd->deadline))
 		return 0;
 
-	set_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags);
-	target_complete_cmd(cmd->se_cmd, SAM_STAT_CHECK_CONDITION);
+	is_running = list_empty(&cmd->cmdr_queue_entry);
+	pr_debug("Timing out cmd %u on dev %s that is %s.\n",
+		 id, udev->name, is_running ? "inflight" : "queued");
+
+	se_cmd = cmd->se_cmd;
 	cmd->se_cmd = NULL;
 
+	if (is_running) {
+		set_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags);
+		/*
+		 * target_complete_cmd will translate this to LUN COMM FAILURE
+		 */
+		scsi_status = SAM_STAT_CHECK_CONDITION;
+	} else {
+		list_del_init(&cmd->cmdr_queue_entry);
+
+		idr_remove(&udev->commands, id);
+		tcmu_free_cmd(cmd);
+		scsi_status = SAM_STAT_TASK_SET_FULL;
+	}
+	target_complete_cmd(se_cmd, scsi_status);
 	return 0;
 }
 
@@ -1072,7 +1111,7 @@ static void tcmu_device_timedout(struct timer_list *t)
 		list_add_tail(&udev->timedout_entry, &timed_out_udevs);
 	spin_unlock(&timed_out_udevs_lock);
 
-	schedule_work(&tcmu_unmap_work);
+	schedule_delayed_work(&tcmu_unmap_work, 0);
 }
 
 static int tcmu_attach_hba(struct se_hba *hba, u32 host_id)
@@ -1113,10 +1152,10 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
 	udev->hba = hba;
 	udev->cmd_time_out = TCMU_TIME_OUT;
 
-	init_waitqueue_head(&udev->wait_cmdr);
 	mutex_init(&udev->cmdr_lock);
 
 	INIT_LIST_HEAD(&udev->timedout_entry);
+	INIT_LIST_HEAD(&udev->cmdr_queue);
 	idr_init(&udev->commands);
 
 	timer_setup(&udev->timeout, tcmu_device_timedout, 0);
@@ -1129,13 +1168,63 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
 	return &udev->se_dev;
 }
 
+static bool run_cmdr_queue(struct tcmu_dev *udev)
+{
+	struct tcmu_cmd *tcmu_cmd, *tmp_cmd;
+	LIST_HEAD(cmds);
+	bool drained = true;
+	sense_reason_t scsi_ret;
+	int ret;
+
+	if (list_empty(&udev->cmdr_queue))
+		return true;
+
+	pr_debug("running %s's cmdr queue\n", udev->name);
+
+	list_splice_init(&udev->cmdr_queue, &cmds);
+
+	list_for_each_entry_safe(tcmu_cmd, tmp_cmd, &cmds, cmdr_queue_entry) {
+		list_del_init(&tcmu_cmd->cmdr_queue_entry);
+
+	        pr_debug("removing cmd %u on dev %s from queue\n",
+		         tcmu_cmd->cmd_id, udev->name);
+
+		ret = queue_cmd_ring(tcmu_cmd, &scsi_ret);
+		if (ret < 0) {
+		        pr_debug("cmd %u on dev %s failed with %u\n",
+			         tcmu_cmd->cmd_id, udev->name, scsi_ret);
+
+			idr_remove(&udev->commands, tcmu_cmd->cmd_id);
+			/*
+			 * Ignore scsi_ret for now. target_complete_cmd
+			 * drops it.
+			 */
+			target_complete_cmd(tcmu_cmd->se_cmd,
+					    SAM_STAT_CHECK_CONDITION);
+			tcmu_free_cmd(tcmu_cmd);
+		} else if (ret > 0) {
+			pr_debug("ran out of space during cmdr queue run\n");
+			/*
+			 * cmd was requeued, so just put all cmds back in
+			 * the queue
+			 */
+			list_splice_tail(&cmds, &udev->cmdr_queue);
+			drained = false;
+			goto done;
+		}
+	}
+done:
+	return drained;
+}
+
 static int tcmu_irqcontrol(struct uio_info *info, s32 irq_on)
 {
-	struct tcmu_dev *tcmu_dev = container_of(info, struct tcmu_dev, uio_info);
+	struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info);
 
-	mutex_lock(&tcmu_dev->cmdr_lock);
-	tcmu_handle_completions(tcmu_dev);
-	mutex_unlock(&tcmu_dev->cmdr_lock);
+	mutex_lock(&udev->cmdr_lock);
+	tcmu_handle_completions(udev);
+	run_cmdr_queue(udev);
+	mutex_unlock(&udev->cmdr_lock);
 
 	return 0;
 }
@@ -1531,7 +1620,6 @@ static int tcmu_configure_device(struct se_device *dev)
 	udev->data_off = CMDR_SIZE;
 	udev->data_size = DATA_SIZE;
 	udev->dbi_thresh = 0; /* Default in Idle state */
-	udev->waiting_global = false;
 
 	/* Initialise the mailbox of the ring buffer */
 	mb = udev->mb_addr;
@@ -1977,12 +2065,14 @@ static struct target_backend_ops tcmu_ops = {
 	.tb_dev_attrib_attrs	= NULL,
 };
 
-
 static void find_free_blocks(void)
 {
 	struct tcmu_dev *udev;
 	loff_t off;
-	uint32_t start, end, block;
+	u32 start, end, block, total_freed = 0;
+
+	if (atomic_read(&global_db_count) <= TCMU_GLOBAL_MAX_BLOCKS)
+		return;
 
 	mutex_lock(&root_udev_mutex);
 	list_for_each_entry(udev, &root_udev, node) {
@@ -1991,8 +2081,8 @@ static void find_free_blocks(void)
 		/* Try to complete the finished commands first */
 		tcmu_handle_completions(udev);
 
-		/* Skip the udevs waiting the global pool or in idle */
-		if (udev->waiting_global || !udev->dbi_thresh) {
+		/* Skip the udevs in idle */
+		if (!udev->dbi_thresh) {
 			mutex_unlock(&udev->cmdr_lock);
 			continue;
 		}
@@ -2001,8 +2091,8 @@ static void find_free_blocks(void)
 		block = find_last_bit(udev->data_bitmap, end);
 		if (block == udev->dbi_max) {
 			/*
-			 * The last bit is dbi_max, so there is
-			 * no need to shrink any blocks.
+			 * The last bit is dbi_max, so it is not possible
+			 * reclaim any blocks.
 			 */
 			mutex_unlock(&udev->cmdr_lock);
 			continue;
@@ -2022,30 +2112,15 @@ static void find_free_blocks(void)
 		/* Release the block pages */
 		tcmu_blocks_release(&udev->data_blocks, start, end);
 		mutex_unlock(&udev->cmdr_lock);
-	}
-	mutex_unlock(&root_udev_mutex);
-}
 
-static void run_cmdr_queues(void)
-{
-	struct tcmu_dev *udev;
-
-	/*
-	 * Try to wake up the udevs who are waiting
-	 * for the global data block pool.
-	 */
-	mutex_lock(&root_udev_mutex);
-	list_for_each_entry(udev, &root_udev, node) {
-		mutex_lock(&udev->cmdr_lock);
-		if (!udev->waiting_global) {
-			mutex_unlock(&udev->cmdr_lock);
-			break;
-		}
-		mutex_unlock(&udev->cmdr_lock);
-
-		wake_up(&udev->wait_cmdr);
+		total_freed += end - start;
+		pr_debug("Freed %u blocks (total %u) from %s.\n", end - start,
+			 total_freed, udev->name);
 	}
 	mutex_unlock(&root_udev_mutex);
+
+	if (atomic_read(&global_db_count) > TCMU_GLOBAL_MAX_BLOCKS)
+		schedule_delayed_work(&tcmu_unmap_work, msecs_to_jiffies(5000));
 }
 
 static void check_timedout_devices(void)
@@ -2074,7 +2149,6 @@ static void tcmu_unmap_work_fn(struct work_struct *work)
 {
 	check_timedout_devices();
 	find_free_blocks();
-	run_cmdr_queues();
 }
 
 static int __init tcmu_module_init(void)
@@ -2083,7 +2157,7 @@ static int __init tcmu_module_init(void)
 
 	BUILD_BUG_ON((sizeof(struct tcmu_cmd_entry) % TCMU_OP_ALIGN_SIZE) != 0);
 
-	INIT_WORK(&tcmu_unmap_work, tcmu_unmap_work_fn);
+	INIT_DELAYED_WORK(&tcmu_unmap_work, tcmu_unmap_work_fn);
 
 	tcmu_cmd_cache = kmem_cache_create("tcmu_cmd_cache",
 				sizeof(struct tcmu_cmd),
@@ -2146,7 +2220,7 @@ static int __init tcmu_module_init(void)
 
 static void __exit tcmu_module_exit(void)
 {
-	cancel_work_sync(&tcmu_unmap_work);
+	cancel_delayed_work_sync(&tcmu_unmap_work);
 	target_backend_unregister(&tcmu_ops);
 	kfree(tcmu_attrs);
 	genl_unregister_family(&tcmu_genl_family);

From 9103575ae34e9d60d40940bebf47fc9e9652067a Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Tue, 28 Nov 2017 12:40:40 -0600
Subject: [PATCH 121/676] tcmu: make ring buffer timer configurable

This adds a timer, qfull_time_out, that controls how long a
device will wait for ring buffer space to open before
failing the commands in the queue. It is useful to separate
this timer from the cmd_time_out and default 30 sec one,
because for HA setups cmd_time_out may be disbled and 30
seconds is too long to wait when some OSs like ESX will
timeout commands after as little as 8 - 15 seconds.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 149 +++++++++++++++++++++++-------
 1 file changed, 115 insertions(+), 34 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 52fc1d440d237..c6a0c3198ccc9 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -142,8 +142,12 @@ struct tcmu_dev {
 
 	struct idr commands;
 
-	struct timer_list timeout;
+	struct timer_list cmd_timer;
 	unsigned int cmd_time_out;
+
+	struct timer_list qfull_timer;
+	int qfull_time_out;
+
 	struct list_head timedout_entry;
 
 	spinlock_t nl_cmd_lock;
@@ -741,18 +745,14 @@ static inline size_t tcmu_cmd_get_cmd_size(struct tcmu_cmd *tcmu_cmd,
 	return command_size;
 }
 
-static int tcmu_setup_cmd_timer(struct tcmu_cmd *tcmu_cmd)
+static int tcmu_setup_cmd_timer(struct tcmu_cmd *tcmu_cmd, unsigned int tmo,
+				struct timer_list *timer)
 {
 	struct tcmu_dev *udev = tcmu_cmd->tcmu_dev;
-	unsigned long tmo = udev->cmd_time_out;
 	int cmd_id;
 
-	/*
-	 * If it was on the cmdr queue waiting we do not reset the timer
-	 * for requeues and when it is finally sent to userspace.
-	 */
 	if (tcmu_cmd->cmd_id)
-		return 0;
+		goto setup_timer;
 
 	cmd_id = idr_alloc(&udev->commands, tcmu_cmd, 1, USHRT_MAX, GFP_NOWAIT);
 	if (cmd_id < 0) {
@@ -761,23 +761,38 @@ static int tcmu_setup_cmd_timer(struct tcmu_cmd *tcmu_cmd)
 	}
 	tcmu_cmd->cmd_id = cmd_id;
 
-	if (!tmo)
-		tmo = TCMU_TIME_OUT;
-
 	pr_debug("allocated cmd %u for dev %s tmo %lu\n", tcmu_cmd->cmd_id,
 		 udev->name, tmo / MSEC_PER_SEC);
 
+setup_timer:
+	if (!tmo)
+		return 0;
+
 	tcmu_cmd->deadline = round_jiffies_up(jiffies + msecs_to_jiffies(tmo));
-	mod_timer(&udev->timeout, tcmu_cmd->deadline);
+	mod_timer(timer, tcmu_cmd->deadline);
 	return 0;
 }
 
 static int add_to_cmdr_queue(struct tcmu_cmd *tcmu_cmd)
 {
 	struct tcmu_dev *udev = tcmu_cmd->tcmu_dev;
+	unsigned int tmo;
 	int ret;
 
-	ret = tcmu_setup_cmd_timer(tcmu_cmd);
+	/*
+	 * For backwards compat if qfull_time_out is not set use
+	 * cmd_time_out and if that's not set use the default time out.
+	 */
+	if (!udev->qfull_time_out)
+		return -ETIMEDOUT;
+	else if (udev->qfull_time_out > 0)
+		tmo = udev->qfull_time_out;
+	else if (udev->cmd_time_out)
+		tmo = udev->cmd_time_out;
+	else
+		tmo = TCMU_TIME_OUT;
+
+	ret = tcmu_setup_cmd_timer(tcmu_cmd, tmo, &udev->qfull_timer);
 	if (ret)
 		return ret;
 
@@ -901,7 +916,8 @@ static sense_reason_t queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, int *scsi_err)
 	}
 	entry->req.iov_bidi_cnt = iov_cnt;
 
-	ret = tcmu_setup_cmd_timer(tcmu_cmd);
+	ret = tcmu_setup_cmd_timer(tcmu_cmd, udev->cmd_time_out,
+				   &udev->cmd_timer);
 	if (ret) {
 		tcmu_cmd_free_data(tcmu_cmd, tcmu_cmd->dbi_cnt);
 		mutex_unlock(&udev->cmdr_lock);
@@ -1049,14 +1065,19 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev)
 		handled++;
 	}
 
-	if (mb->cmd_tail == mb->cmd_head && list_empty(&udev->cmdr_queue)) {
-		del_timer(&udev->timeout);
-		/*
-		 * not more pending or waiting commands so try to reclaim
-		 * blocks if needed.
-		 */
-		if (atomic_read(&global_db_count) > TCMU_GLOBAL_MAX_BLOCKS)
-			schedule_delayed_work(&tcmu_unmap_work, 0);
+	if (mb->cmd_tail == mb->cmd_head) {
+		/* no more pending commands */
+		del_timer(&udev->cmd_timer);
+
+		if (list_empty(&udev->cmdr_queue)) {
+			/*
+			 * no more pending or waiting commands so try to
+			 * reclaim blocks if needed.
+			 */
+			if (atomic_read(&global_db_count) >
+			    TCMU_GLOBAL_MAX_BLOCKS)
+				schedule_delayed_work(&tcmu_unmap_work, 0);
+		}
 	}
 
 	return handled;
@@ -1077,13 +1098,15 @@ static int tcmu_check_expired_cmd(int id, void *p, void *data)
 		return 0;
 
 	is_running = list_empty(&cmd->cmdr_queue_entry);
-	pr_debug("Timing out cmd %u on dev %s that is %s.\n",
-		 id, udev->name, is_running ? "inflight" : "queued");
-
-	se_cmd = cmd->se_cmd;
-	cmd->se_cmd = NULL;
 
 	if (is_running) {
+		/*
+		 * If cmd_time_out is disabled but qfull is set deadline
+		 * will only reflect the qfull timeout. Ignore it.
+		 */
+		if (!udev->cmd_time_out)
+			return 0;
+
 		set_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags);
 		/*
 		 * target_complete_cmd will translate this to LUN COMM FAILURE
@@ -1096,16 +1119,18 @@ static int tcmu_check_expired_cmd(int id, void *p, void *data)
 		tcmu_free_cmd(cmd);
 		scsi_status = SAM_STAT_TASK_SET_FULL;
 	}
+
+	pr_debug("Timing out cmd %u on dev %s that is %s.\n",
+		 id, udev->name, is_running ? "inflight" : "queued");
+
+	se_cmd = cmd->se_cmd;
+	cmd->se_cmd = NULL;
 	target_complete_cmd(se_cmd, scsi_status);
 	return 0;
 }
 
-static void tcmu_device_timedout(struct timer_list *t)
+static void tcmu_device_timedout(struct tcmu_dev *udev)
 {
-	struct tcmu_dev *udev = from_timer(udev, t, timeout);
-
-	pr_debug("%s cmd timeout has expired\n", udev->name);
-
 	spin_lock(&timed_out_udevs_lock);
 	if (list_empty(&udev->timedout_entry))
 		list_add_tail(&udev->timedout_entry, &timed_out_udevs);
@@ -1114,6 +1139,22 @@ static void tcmu_device_timedout(struct timer_list *t)
 	schedule_delayed_work(&tcmu_unmap_work, 0);
 }
 
+static void tcmu_cmd_timedout(struct timer_list *t)
+{
+	struct tcmu_dev *udev = from_timer(udev, t, cmd_timer);
+
+	pr_debug("%s cmd timeout has expired\n", udev->name);
+	tcmu_device_timedout(udev);
+}
+
+static void tcmu_qfull_timedout(struct timer_list *t)
+{
+	struct tcmu_dev *udev = from_timer(udev, t, qfull_timer);
+
+	pr_debug("%s qfull timeout has expired\n", udev->name);
+	tcmu_device_timedout(udev);
+}
+
 static int tcmu_attach_hba(struct se_hba *hba, u32 host_id)
 {
 	struct tcmu_hba *tcmu_hba;
@@ -1151,6 +1192,7 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
 
 	udev->hba = hba;
 	udev->cmd_time_out = TCMU_TIME_OUT;
+	udev->qfull_time_out = -1;
 
 	mutex_init(&udev->cmdr_lock);
 
@@ -1158,7 +1200,8 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
 	INIT_LIST_HEAD(&udev->cmdr_queue);
 	idr_init(&udev->commands);
 
-	timer_setup(&udev->timeout, tcmu_device_timedout, 0);
+	timer_setup(&udev->qfull_timer, tcmu_qfull_timedout, 0);
+	timer_setup(&udev->cmd_timer, tcmu_cmd_timedout, 0);
 
 	init_waitqueue_head(&udev->nl_cmd_wq);
 	spin_lock_init(&udev->nl_cmd_lock);
@@ -1213,6 +1256,8 @@ static bool run_cmdr_queue(struct tcmu_dev *udev)
 			goto done;
 		}
 	}
+	if (list_empty(&udev->cmdr_queue))
+		del_timer(&udev->qfull_timer);
 done:
 	return drained;
 }
@@ -1712,7 +1757,8 @@ static void tcmu_destroy_device(struct se_device *dev)
 {
 	struct tcmu_dev *udev = TCMU_DEV(dev);
 
-	del_timer_sync(&udev->timeout);
+	del_timer_sync(&udev->cmd_timer);
+	del_timer_sync(&udev->qfull_timer);
 
 	mutex_lock(&root_udev_mutex);
 	list_del(&udev->node);
@@ -1893,6 +1939,40 @@ static ssize_t tcmu_cmd_time_out_store(struct config_item *item, const char *pag
 }
 CONFIGFS_ATTR(tcmu_, cmd_time_out);
 
+static ssize_t tcmu_qfull_time_out_show(struct config_item *item, char *page)
+{
+	struct se_dev_attrib *da = container_of(to_config_group(item),
+						struct se_dev_attrib, da_group);
+	struct tcmu_dev *udev = TCMU_DEV(da->da_dev);
+
+	return snprintf(page, PAGE_SIZE, "%ld\n", udev->qfull_time_out <= 0 ?
+			udev->qfull_time_out :
+			udev->qfull_time_out / MSEC_PER_SEC);
+}
+
+static ssize_t tcmu_qfull_time_out_store(struct config_item *item,
+					 const char *page, size_t count)
+{
+	struct se_dev_attrib *da = container_of(to_config_group(item),
+					struct se_dev_attrib, da_group);
+	struct tcmu_dev *udev = TCMU_DEV(da->da_dev);
+	s32 val;
+	int ret;
+
+	ret = kstrtos32(page, 0, &val);
+	if (ret < 0)
+		return ret;
+
+	if (val >= 0) {
+		udev->qfull_time_out = val * MSEC_PER_SEC;
+	} else {
+		printk(KERN_ERR "Invalid qfull timeout value %d\n", val);
+		return -EINVAL;
+	}
+	return count;
+}
+CONFIGFS_ATTR(tcmu_, qfull_time_out);
+
 static ssize_t tcmu_dev_config_show(struct config_item *item, char *page)
 {
 	struct se_dev_attrib *da = container_of(to_config_group(item),
@@ -2038,6 +2118,7 @@ CONFIGFS_ATTR(tcmu_, emulate_write_cache);
 
 static struct configfs_attribute *tcmu_attrib_attrs[] = {
 	&tcmu_attr_cmd_time_out,
+	&tcmu_attr_qfull_time_out,
 	&tcmu_attr_dev_config,
 	&tcmu_attr_dev_size,
 	&tcmu_attr_emulate_write_cache,

From 80eb876138a1adc7d30831ce275ea744c050d97e Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Tue, 28 Nov 2017 12:40:41 -0600
Subject: [PATCH 122/676] tcmu: allow max block and global max blocks to be
 settable

Users might have a physical system to a target so they could
have a lot more than 2 gigs of memory they want to devote to
tcmu. OTOH, we could be running in a vm and so a 2 gig
global and 1 gig per dev limit might be too high. This patch
allows the user to specify the limits.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 143 ++++++++++++++++++++++++++----
 1 file changed, 124 insertions(+), 19 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index c6a0c3198ccc9..bac08bc72e3b9 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -77,9 +77,13 @@
  * the total size is 256K * PAGE_SIZE.
  */
 #define DATA_BLOCK_SIZE PAGE_SIZE
-#define DATA_BLOCK_BITS (256 * 1024)
+#define DATA_BLOCK_SHIFT PAGE_SHIFT
+#define DATA_BLOCK_BITS_DEF (256 * 1024)
 #define DATA_SIZE (DATA_BLOCK_BITS * DATA_BLOCK_SIZE)
 
+#define TCMU_MBS_TO_BLOCKS(_mbs) (_mbs << (20 - DATA_BLOCK_SHIFT))
+#define TCMU_BLOCKS_TO_MBS(_blocks) (_blocks >> (20 - DATA_BLOCK_SHIFT))
+
 /* The total size of the ring is 8M + 256K * PAGE_SIZE */
 #define TCMU_RING_SIZE (CMDR_SIZE + DATA_SIZE)
 
@@ -87,7 +91,7 @@
  * Default number of global data blocks(512K * PAGE_SIZE)
  * when the unmap thread will be started.
  */
-#define TCMU_GLOBAL_MAX_BLOCKS (512 * 1024)
+#define TCMU_GLOBAL_MAX_BLOCKS_DEF (512 * 1024)
 
 static u8 tcmu_kern_cmd_reply_supported;
 
@@ -131,13 +135,15 @@ struct tcmu_dev {
 	/* Must add data_off and mb_addr to get the address */
 	size_t data_off;
 	size_t data_size;
+	uint32_t max_blocks;
+	size_t ring_size;
 
 	struct mutex cmdr_lock;
 	struct list_head cmdr_queue;
 
 	uint32_t dbi_max;
 	uint32_t dbi_thresh;
-	DECLARE_BITMAP(data_bitmap, DATA_BLOCK_BITS);
+	unsigned long *data_bitmap;
 	struct radix_tree_root data_blocks;
 
 	struct idr commands;
@@ -198,10 +204,51 @@ static LIST_HEAD(root_udev);
 static DEFINE_SPINLOCK(timed_out_udevs_lock);
 static LIST_HEAD(timed_out_udevs);
 
+static struct kmem_cache *tcmu_cmd_cache;
+
 static atomic_t global_db_count = ATOMIC_INIT(0);
 static struct delayed_work tcmu_unmap_work;
+static int tcmu_global_max_blocks = TCMU_GLOBAL_MAX_BLOCKS_DEF;
 
-static struct kmem_cache *tcmu_cmd_cache;
+static int tcmu_set_global_max_data_area(const char *str,
+					 const struct kernel_param *kp)
+{
+	int ret, max_area_mb;
+
+	ret = kstrtoint(str, 10, &max_area_mb);
+	if (ret)
+		return -EINVAL;
+
+	if (max_area_mb <= 0) {
+		pr_err("global_max_data_area must be larger than 0.\n");
+		return -EINVAL;
+	}
+
+	tcmu_global_max_blocks = TCMU_MBS_TO_BLOCKS(max_area_mb);
+	if (atomic_read(&global_db_count) > tcmu_global_max_blocks)
+		schedule_delayed_work(&tcmu_unmap_work, 0);
+	else
+		cancel_delayed_work_sync(&tcmu_unmap_work);
+
+	return 0;
+}
+
+static int tcmu_get_global_max_data_area(char *buffer,
+					 const struct kernel_param *kp)
+{
+	return sprintf(buffer, "%d", TCMU_BLOCKS_TO_MBS(tcmu_global_max_blocks));
+}
+
+static const struct kernel_param_ops tcmu_global_max_data_area_op = {
+	.set = tcmu_set_global_max_data_area,
+	.get = tcmu_get_global_max_data_area,
+};
+
+module_param_cb(global_max_data_area_mb, &tcmu_global_max_data_area_op, NULL,
+		S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(global_max_data_area_mb,
+		 "Max MBs allowed to be allocated to all the tcmu device's "
+		 "data areas.");
 
 /* multicast group */
 enum tcmu_multicast_groups {
@@ -363,7 +410,7 @@ static inline bool tcmu_get_empty_block(struct tcmu_dev *udev,
 	page = radix_tree_lookup(&udev->data_blocks, dbi);
 	if (!page) {
 		if (atomic_add_return(1, &global_db_count) >
-					TCMU_GLOBAL_MAX_BLOCKS)
+				      tcmu_global_max_blocks)
 			schedule_delayed_work(&tcmu_unmap_work, 0);
 
 		/* try to get new page from the mm */
@@ -706,8 +753,8 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
 	/* try to check and get the data blocks as needed */
 	space = spc_bitmap_free(udev->data_bitmap, udev->dbi_thresh);
 	if ((space * DATA_BLOCK_SIZE) < data_needed) {
-		unsigned long blocks_left = DATA_BLOCK_BITS - udev->dbi_thresh +
-						space;
+		unsigned long blocks_left =
+				(udev->max_blocks - udev->dbi_thresh) + space;
 
 		if (blocks_left < blocks_needed) {
 			pr_debug("no data space: only %lu available, but ask for %zu\n",
@@ -717,8 +764,8 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
 		}
 
 		udev->dbi_thresh += blocks_needed;
-		if (udev->dbi_thresh > DATA_BLOCK_BITS)
-			udev->dbi_thresh = DATA_BLOCK_BITS;
+		if (udev->dbi_thresh > udev->max_blocks)
+			udev->dbi_thresh = udev->max_blocks;
 	}
 
 	return tcmu_get_empty_blocks(udev, cmd);
@@ -1075,7 +1122,7 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev)
 			 * reclaim blocks if needed.
 			 */
 			if (atomic_read(&global_db_count) >
-			    TCMU_GLOBAL_MAX_BLOCKS)
+			    tcmu_global_max_blocks)
 				schedule_delayed_work(&tcmu_unmap_work, 0);
 		}
 	}
@@ -1194,6 +1241,7 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
 	udev->cmd_time_out = TCMU_TIME_OUT;
 	udev->qfull_time_out = -1;
 
+	udev->max_blocks = DATA_BLOCK_BITS_DEF;
 	mutex_init(&udev->cmdr_lock);
 
 	INIT_LIST_HEAD(&udev->timedout_entry);
@@ -1335,7 +1383,7 @@ static struct page *tcmu_try_get_block_page(struct tcmu_dev *udev, uint32_t dbi)
 
 		/*
 		 * Since this case is rare in page fault routine, here we
-		 * will allow the global_db_count >= TCMU_GLOBAL_MAX_BLOCKS
+		 * will allow the global_db_count >= tcmu_global_max_blocks
 		 * to reduce possible page fault call trace.
 		 */
 		atomic_inc(&global_db_count);
@@ -1396,7 +1444,7 @@ static int tcmu_mmap(struct uio_info *info, struct vm_area_struct *vma)
 	vma->vm_private_data = udev;
 
 	/* Ensure the mmap is exactly the right size */
-	if (vma_pages(vma) != (TCMU_RING_SIZE >> PAGE_SHIFT))
+	if (vma_pages(vma) != (udev->ring_size >> PAGE_SHIFT))
 		return -EINVAL;
 
 	return 0;
@@ -1478,6 +1526,7 @@ static void tcmu_dev_kref_release(struct kref *kref)
 	WARN_ON(!all_expired);
 
 	tcmu_blocks_release(&udev->data_blocks, 0, udev->dbi_max + 1);
+	kfree(udev->data_bitmap);
 	mutex_unlock(&udev->cmdr_lock);
 
 	call_rcu(&dev->rcu_head, tcmu_dev_call_rcu);
@@ -1654,6 +1703,11 @@ static int tcmu_configure_device(struct se_device *dev)
 
 	info = &udev->uio_info;
 
+	udev->data_bitmap = kzalloc(BITS_TO_LONGS(udev->max_blocks) *
+				    sizeof(unsigned long), GFP_KERNEL);
+	if (!udev->data_bitmap)
+		goto err_bitmap_alloc;
+
 	udev->mb_addr = vzalloc(CMDR_SIZE);
 	if (!udev->mb_addr) {
 		ret = -ENOMEM;
@@ -1663,7 +1717,7 @@ static int tcmu_configure_device(struct se_device *dev)
 	/* mailbox fits in first part of CMDR space */
 	udev->cmdr_size = CMDR_SIZE - CMDR_OFF;
 	udev->data_off = CMDR_SIZE;
-	udev->data_size = DATA_SIZE;
+	udev->data_size = udev->max_blocks * DATA_BLOCK_SIZE;
 	udev->dbi_thresh = 0; /* Default in Idle state */
 
 	/* Initialise the mailbox of the ring buffer */
@@ -1681,7 +1735,7 @@ static int tcmu_configure_device(struct se_device *dev)
 
 	info->mem[0].name = "tcm-user command & data buffer";
 	info->mem[0].addr = (phys_addr_t)(uintptr_t)udev->mb_addr;
-	info->mem[0].size = TCMU_RING_SIZE;
+	info->mem[0].size = udev->ring_size = udev->data_size + CMDR_SIZE;
 	info->mem[0].memtype = UIO_MEM_NONE;
 
 	info->irqcontrol = tcmu_irqcontrol;
@@ -1734,6 +1788,9 @@ static int tcmu_configure_device(struct se_device *dev)
 	vfree(udev->mb_addr);
 	udev->mb_addr = NULL;
 err_vzalloc:
+	kfree(udev->data_bitmap);
+	udev->data_bitmap = NULL;
+err_bitmap_alloc:
 	kfree(info->name);
 	info->name = NULL;
 
@@ -1774,7 +1831,7 @@ static void tcmu_destroy_device(struct se_device *dev)
 
 enum {
 	Opt_dev_config, Opt_dev_size, Opt_hw_block_size, Opt_hw_max_sectors,
-	Opt_nl_reply_supported, Opt_err,
+	Opt_nl_reply_supported, Opt_max_data_area_mb, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -1783,6 +1840,7 @@ static match_table_t tokens = {
 	{Opt_hw_block_size, "hw_block_size=%u"},
 	{Opt_hw_max_sectors, "hw_max_sectors=%u"},
 	{Opt_nl_reply_supported, "nl_reply_supported=%d"},
+	{Opt_max_data_area_mb, "max_data_area_mb=%u"},
 	{Opt_err, NULL}
 };
 
@@ -1816,7 +1874,7 @@ static ssize_t tcmu_set_configfs_dev_params(struct se_device *dev,
 	struct tcmu_dev *udev = TCMU_DEV(dev);
 	char *orig, *ptr, *opts, *arg_p;
 	substring_t args[MAX_OPT_ARGS];
-	int ret = 0, token;
+	int ret = 0, token, tmpval;
 
 	opts = kstrdup(page, GFP_KERNEL);
 	if (!opts)
@@ -1868,6 +1926,39 @@ static ssize_t tcmu_set_configfs_dev_params(struct se_device *dev,
 			if (ret < 0)
 				pr_err("kstrtoint() failed for nl_reply_supported=\n");
 			break;
+		case Opt_max_data_area_mb:
+			if (dev->export_count) {
+				pr_err("Unable to set max_data_area_mb while exports exist\n");
+				ret = -EINVAL;
+				break;
+			}
+
+			arg_p = match_strdup(&args[0]);
+			if (!arg_p) {
+				ret = -ENOMEM;
+				break;
+			}
+			ret = kstrtoint(arg_p, 0, &tmpval);
+			kfree(arg_p);
+			if (ret < 0) {
+				pr_err("kstrtoint() failed for max_data_area_mb=\n");
+				break;
+			}
+
+			if (tmpval <= 0) {
+				pr_err("Invalid max_data_area %d\n", tmpval);
+				ret = -EINVAL;
+				break;
+			}
+
+			udev->max_blocks = TCMU_MBS_TO_BLOCKS(tmpval);
+			if (udev->max_blocks > tcmu_global_max_blocks) {
+				pr_err("%d is too large. Adjusting max_data_area_mb to global limit of %u\n",
+				       tmpval,
+				       TCMU_BLOCKS_TO_MBS(tcmu_global_max_blocks));
+				udev->max_blocks = tcmu_global_max_blocks;
+			}
+			break;
 		default:
 			break;
 		}
@@ -1887,7 +1978,9 @@ static ssize_t tcmu_show_configfs_dev_params(struct se_device *dev, char *b)
 
 	bl = sprintf(b + bl, "Config: %s ",
 		     udev->dev_config[0] ? udev->dev_config : "NULL");
-	bl += sprintf(b + bl, "Size: %zu\n", udev->dev_size);
+	bl += sprintf(b + bl, "Size: %zu ", udev->dev_size);
+	bl += sprintf(b + bl, "MaxDataAreaMB: %u\n",
+		      TCMU_BLOCKS_TO_MBS(udev->max_blocks));
 
 	return bl;
 }
@@ -1973,6 +2066,17 @@ static ssize_t tcmu_qfull_time_out_store(struct config_item *item,
 }
 CONFIGFS_ATTR(tcmu_, qfull_time_out);
 
+static ssize_t tcmu_max_data_area_mb_show(struct config_item *item, char *page)
+{
+	struct se_dev_attrib *da = container_of(to_config_group(item),
+						struct se_dev_attrib, da_group);
+	struct tcmu_dev *udev = TCMU_DEV(da->da_dev);
+
+	return snprintf(page, PAGE_SIZE, "%u\n",
+			TCMU_BLOCKS_TO_MBS(udev->max_blocks));
+}
+CONFIGFS_ATTR_RO(tcmu_, max_data_area_mb);
+
 static ssize_t tcmu_dev_config_show(struct config_item *item, char *page)
 {
 	struct se_dev_attrib *da = container_of(to_config_group(item),
@@ -2119,6 +2223,7 @@ CONFIGFS_ATTR(tcmu_, emulate_write_cache);
 static struct configfs_attribute *tcmu_attrib_attrs[] = {
 	&tcmu_attr_cmd_time_out,
 	&tcmu_attr_qfull_time_out,
+	&tcmu_attr_max_data_area_mb,
 	&tcmu_attr_dev_config,
 	&tcmu_attr_dev_size,
 	&tcmu_attr_emulate_write_cache,
@@ -2152,7 +2257,7 @@ static void find_free_blocks(void)
 	loff_t off;
 	u32 start, end, block, total_freed = 0;
 
-	if (atomic_read(&global_db_count) <= TCMU_GLOBAL_MAX_BLOCKS)
+	if (atomic_read(&global_db_count) <= tcmu_global_max_blocks)
 		return;
 
 	mutex_lock(&root_udev_mutex);
@@ -2200,7 +2305,7 @@ static void find_free_blocks(void)
 	}
 	mutex_unlock(&root_udev_mutex);
 
-	if (atomic_read(&global_db_count) > TCMU_GLOBAL_MAX_BLOCKS)
+	if (atomic_read(&global_db_count) > tcmu_global_max_blocks)
 		schedule_delayed_work(&tcmu_unmap_work, msecs_to_jiffies(5000));
 }
 

From e0f3d4c23e2158beaa403b343a75dfb5381a5ebe Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Sun, 10 Dec 2017 19:54:11 +0100
Subject: [PATCH 123/676] sbp-target: Delete an error message for a failed
 memory allocation in three functions

Omit an extra message for a memory allocation failure in these functions.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Acked-by: Chris Boot <bootc@boo.tc>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/sbp/sbp_target.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/drivers/target/sbp/sbp_target.c b/drivers/target/sbp/sbp_target.c
index e5c3e5f827d0b..fb1003921d85a 100644
--- a/drivers/target/sbp/sbp_target.c
+++ b/drivers/target/sbp/sbp_target.c
@@ -201,10 +201,9 @@ static struct sbp_session *sbp_session_create(
 	snprintf(guid_str, sizeof(guid_str), "%016llx", guid);
 
 	sess = kmalloc(sizeof(*sess), GFP_KERNEL);
-	if (!sess) {
-		pr_err("failed to allocate session descriptor\n");
+	if (!sess)
 		return ERR_PTR(-ENOMEM);
-	}
+
 	spin_lock_init(&sess->lock);
 	INIT_LIST_HEAD(&sess->login_list);
 	INIT_DELAYED_WORK(&sess->maint_work, session_maintenance_work);
@@ -2029,10 +2028,8 @@ static struct se_portal_group *sbp_make_tpg(
 	}
 
 	tpg = kzalloc(sizeof(*tpg), GFP_KERNEL);
-	if (!tpg) {
-		pr_err("Unable to allocate struct sbp_tpg\n");
+	if (!tpg)
 		return ERR_PTR(-ENOMEM);
-	}
 
 	tpg->tport = tport;
 	tpg->tport_tpgt = tpgt;
@@ -2088,10 +2085,8 @@ static struct se_wwn *sbp_make_tport(
 		return ERR_PTR(-EINVAL);
 
 	tport = kzalloc(sizeof(*tport), GFP_KERNEL);
-	if (!tport) {
-		pr_err("Unable to allocate struct sbp_tport\n");
+	if (!tport)
 		return ERR_PTR(-ENOMEM);
-	}
 
 	tport->guid = guid;
 	sbp_format_wwn(tport->tport_name, SBP_NAMELEN, guid);

From cd3fb32a72c17ff45fd671b12e8b5f018df527b4 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Sun, 10 Dec 2017 21:18:15 +0100
Subject: [PATCH 124/676] target: tcm_loop: Delete an error message for a
 failed memory allocation in four functions

Omit an extra message for a memory allocation failure in these functions.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/loopback/tcm_loop.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c
index b6a913e38b301..6bd58a064924d 100644
--- a/drivers/target/loopback/tcm_loop.c
+++ b/drivers/target/loopback/tcm_loop.c
@@ -184,7 +184,6 @@ static int tcm_loop_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
 
 	tl_cmd = kmem_cache_zalloc(tcm_loop_cmd_cache, GFP_ATOMIC);
 	if (!tl_cmd) {
-		pr_err("Unable to allocate struct tcm_loop_cmd\n");
 		set_host_byte(sc, DID_ERROR);
 		sc->scsi_done(sc);
 		return 0;
@@ -221,10 +220,8 @@ static int tcm_loop_issue_tmr(struct tcm_loop_tpg *tl_tpg,
 	}
 
 	tl_cmd = kmem_cache_zalloc(tcm_loop_cmd_cache, GFP_KERNEL);
-	if (!tl_cmd) {
-		pr_err("Unable to allocate memory for tl_cmd\n");
+	if (!tl_cmd)
 		return ret;
-	}
 
 	init_completion(&tl_cmd->tmr_done);
 
@@ -773,10 +770,8 @@ static int tcm_loop_make_nexus(
 	}
 
 	tl_nexus = kzalloc(sizeof(struct tcm_loop_nexus), GFP_KERNEL);
-	if (!tl_nexus) {
-		pr_err("Unable to allocate struct tcm_loop_nexus\n");
+	if (!tl_nexus)
 		return -ENOMEM;
-	}
 
 	tl_nexus->se_sess = target_alloc_session(&tl_tpg->tl_se_tpg, 0, 0,
 					TARGET_PROT_DIN_PASS | TARGET_PROT_DOUT_PASS,
@@ -1082,10 +1077,9 @@ static struct se_wwn *tcm_loop_make_scsi_hba(
 	int ret, off = 0;
 
 	tl_hba = kzalloc(sizeof(struct tcm_loop_hba), GFP_KERNEL);
-	if (!tl_hba) {
-		pr_err("Unable to allocate struct tcm_loop_hba\n");
+	if (!tl_hba)
 		return ERR_PTR(-ENOMEM);
-	}
+
 	/*
 	 * Determine the emulated Protocol Identifier and Target Port Name
 	 * based on the incoming configfs directory name.

From a572dba9e0a72a51d3a624cbb511c39a702c9cf3 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Sun, 10 Dec 2017 21:23:43 +0100
Subject: [PATCH 125/676] target: tcm_loop: Improve a size determination in two
 functions

Replace the specification of data structures by pointer dereferences
as the parameter for the operator "sizeof" to make the corresponding size
determination a bit safer according to the Linux coding style convention.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/loopback/tcm_loop.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c
index 6bd58a064924d..60e4185e75088 100644
--- a/drivers/target/loopback/tcm_loop.c
+++ b/drivers/target/loopback/tcm_loop.c
@@ -769,7 +769,7 @@ static int tcm_loop_make_nexus(
 		return -EEXIST;
 	}
 
-	tl_nexus = kzalloc(sizeof(struct tcm_loop_nexus), GFP_KERNEL);
+	tl_nexus = kzalloc(sizeof(*tl_nexus), GFP_KERNEL);
 	if (!tl_nexus)
 		return -ENOMEM;
 
@@ -1076,7 +1076,7 @@ static struct se_wwn *tcm_loop_make_scsi_hba(
 	char *ptr;
 	int ret, off = 0;
 
-	tl_hba = kzalloc(sizeof(struct tcm_loop_hba), GFP_KERNEL);
+	tl_hba = kzalloc(sizeof(*tl_hba), GFP_KERNEL);
 	if (!tl_hba)
 		return ERR_PTR(-ENOMEM);
 

From c8d1f4b76be3361b881bd88b0ebf16b306a4cab8 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Mon, 11 Dec 2017 10:40:23 +0100
Subject: [PATCH 126/676] target: tcm_loop: Combine substrings for 26 messages

The script "checkpatch.pl" pointed information out like the following.

WARNING: quoted string split across lines

Thus fix the affected source code places.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/loopback/tcm_loop.c | 120 +++++++++++++----------------
 1 file changed, 52 insertions(+), 68 deletions(-)

diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c
index 60e4185e75088..d91109c1c4aee 100644
--- a/drivers/target/loopback/tcm_loop.c
+++ b/drivers/target/loopback/tcm_loop.c
@@ -123,8 +123,8 @@ static void tcm_loop_submission_work(struct work_struct *work)
 	}
 	tl_nexus = tl_tpg->tl_nexus;
 	if (!tl_nexus) {
-		scmd_printk(KERN_ERR, sc, "TCM_Loop I_T Nexus"
-				" does not exist\n");
+		scmd_printk(KERN_ERR, sc,
+			    "TCM_Loop I_T Nexus does not exist\n");
 		set_host_byte(sc, DID_ERROR);
 		goto out_done;
 	}
@@ -177,10 +177,10 @@ static int tcm_loop_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
 {
 	struct tcm_loop_cmd *tl_cmd;
 
-	pr_debug("tcm_loop_queuecommand() %d:%d:%d:%llu got CDB: 0x%02x"
-		" scsi_buf_len: %u\n", sc->device->host->host_no,
-		sc->device->id, sc->device->channel, sc->device->lun,
-		sc->cmnd[0], scsi_bufflen(sc));
+	pr_debug("%s() %d:%d:%d:%llu got CDB: 0x%02x scsi_buf_len: %u\n",
+		 __func__, sc->device->host->host_no, sc->device->id,
+		 sc->device->channel, sc->device->lun, sc->cmnd[0],
+		 scsi_bufflen(sc));
 
 	tl_cmd = kmem_cache_zalloc(tcm_loop_cmd_cache, GFP_ATOMIC);
 	if (!tl_cmd) {
@@ -214,8 +214,7 @@ static int tcm_loop_issue_tmr(struct tcm_loop_tpg *tl_tpg,
 	 */
 	tl_nexus = tl_tpg->tl_nexus;
 	if (!tl_nexus) {
-		pr_err("Unable to perform device reset without"
-				" active I_T Nexus\n");
+		pr_err("Unable to perform device reset without active I_T Nexus\n");
 		return ret;
 	}
 
@@ -295,8 +294,7 @@ static int tcm_loop_target_reset(struct scsi_cmnd *sc)
 	 */
 	tl_hba = *(struct tcm_loop_hba **)shost_priv(sc->device->host);
 	if (!tl_hba) {
-		pr_err("Unable to perform device reset without"
-				" active I_T Nexus\n");
+		pr_err("Unable to perform device reset without active I_T Nexus\n");
 		return FAILED;
 	}
 	/*
@@ -414,8 +412,7 @@ static int tcm_loop_setup_hba_bus(struct tcm_loop_hba *tl_hba, int tcm_loop_host
 
 	ret = device_register(&tl_hba->dev);
 	if (ret) {
-		pr_err("device_register() failed for"
-				" tl_hba->dev: %d\n", ret);
+		pr_err("device_register() failed for tl_hba->dev: %d\n", ret);
 		return -ENODEV;
 	}
 
@@ -444,8 +441,7 @@ static int tcm_loop_alloc_core_bus(void)
 
 	ret = driver_register(&tcm_loop_driverfs);
 	if (ret) {
-		pr_err("driver_register() failed for"
-				"tcm_loop_driverfs\n");
+		pr_err("driver_register() failed for tcm_loop_driverfs\n");
 		goto bus_unreg;
 	}
 
@@ -584,8 +580,8 @@ static int tcm_loop_queue_data_in(struct se_cmd *se_cmd)
 				struct tcm_loop_cmd, tl_se_cmd);
 	struct scsi_cmnd *sc = tl_cmd->sc;
 
-	pr_debug("tcm_loop_queue_data_in() called for scsi_cmnd: %p"
-		     " cdb: 0x%02x\n", sc, sc->cmnd[0]);
+	pr_debug("%s() called for scsi_cmnd: %p cdb: 0x%02x\n",
+		 __func__, sc, sc->cmnd[0]);
 
 	sc->result = SAM_STAT_GOOD;
 	set_host_byte(sc, DID_OK);
@@ -602,8 +598,8 @@ static int tcm_loop_queue_status(struct se_cmd *se_cmd)
 				struct tcm_loop_cmd, tl_se_cmd);
 	struct scsi_cmnd *sc = tl_cmd->sc;
 
-	pr_debug("tcm_loop_queue_status() called for scsi_cmnd: %p"
-			" cdb: 0x%02x\n", sc, sc->cmnd[0]);
+	pr_debug("%s() called for scsi_cmnd: %p cdb: 0x%02x\n",
+		 __func__, sc, sc->cmnd[0]);
 
 	if (se_cmd->sense_buffer &&
 	   ((se_cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) ||
@@ -688,8 +684,8 @@ static void tcm_loop_port_unlink(
 	sd = scsi_device_lookup(tl_hba->sh, 0, tl_tpg->tl_tpgt,
 				se_lun->unpacked_lun);
 	if (!sd) {
-		pr_err("Unable to locate struct scsi_device for %d:%d:"
-			"%llu\n", 0, tl_tpg->tl_tpgt, se_lun->unpacked_lun);
+		pr_err("Unable to locate struct scsi_device for %d:%d:%llu\n",
+		       0, tl_tpg->tl_tpgt, se_lun->unpacked_lun);
 		return;
 	}
 	/*
@@ -782,9 +778,8 @@ static int tcm_loop_make_nexus(
 		return ret;
 	}
 
-	pr_debug("TCM_Loop_ConfigFS: Established I_T Nexus to emulated"
-		" %s Initiator Port: %s\n", tcm_loop_dump_proto_id(tl_hba),
-		name);
+	pr_debug("TCM_Loop_ConfigFS: Established I_T Nexus to emulated %s Initiator Port: %s\n",
+		 tcm_loop_dump_proto_id(tl_hba), name);
 	return 0;
 }
 
@@ -803,15 +798,14 @@ static int tcm_loop_drop_nexus(
 		return -ENODEV;
 
 	if (atomic_read(&tpg->tl_tpg_port_count)) {
-		pr_err("Unable to remove TCM_Loop I_T Nexus with"
-			" active TPG port count: %d\n",
-			atomic_read(&tpg->tl_tpg_port_count));
+		pr_err("Unable to remove TCM_Loop I_T Nexus with active TPG port count: %d\n",
+		       atomic_read(&tpg->tl_tpg_port_count));
 		return -EPERM;
 	}
 
-	pr_debug("TCM_Loop_ConfigFS: Removing I_T Nexus to emulated"
-		" %s Initiator Port: %s\n", tcm_loop_dump_proto_id(tpg->tl_hba),
-		tl_nexus->se_sess->se_node_acl->initiatorname);
+	pr_debug("TCM_Loop_ConfigFS: Removing I_T Nexus to emulated %s Initiator Port: %s\n",
+		 tcm_loop_dump_proto_id(tpg->tl_hba),
+		 tl_nexus->se_sess->se_node_acl->initiatorname);
 	/*
 	 * Release the SCSI I_T Nexus to the emulated Target Port
 	 */
@@ -863,8 +857,8 @@ static ssize_t tcm_loop_tpg_nexus_store(struct config_item *item,
 	 * tcm_loop_make_nexus()
 	 */
 	if (strlen(page) >= TL_WWN_ADDR_LEN) {
-		pr_err("Emulated NAA Sas Address: %s, exceeds"
-				" max: %d\n", page, TL_WWN_ADDR_LEN);
+		pr_err("Emulated NAA Sas Address: %s, exceeds max: %d\n",
+		       page, TL_WWN_ADDR_LEN);
 		return -EINVAL;
 	}
 	snprintf(&i_port[0], TL_WWN_ADDR_LEN, "%s", page);
@@ -872,9 +866,8 @@ static ssize_t tcm_loop_tpg_nexus_store(struct config_item *item,
 	ptr = strstr(i_port, "naa.");
 	if (ptr) {
 		if (tl_hba->tl_proto_id != SCSI_PROTOCOL_SAS) {
-			pr_err("Passed SAS Initiator Port %s does not"
-				" match target port protoid: %s\n", i_port,
-				tcm_loop_dump_proto_id(tl_hba));
+			pr_err("Passed SAS Initiator Port %s does not match target port protoid: %s\n",
+			       i_port, tcm_loop_dump_proto_id(tl_hba));
 			return -EINVAL;
 		}
 		port_ptr = &i_port[0];
@@ -883,9 +876,8 @@ static ssize_t tcm_loop_tpg_nexus_store(struct config_item *item,
 	ptr = strstr(i_port, "fc.");
 	if (ptr) {
 		if (tl_hba->tl_proto_id != SCSI_PROTOCOL_FCP) {
-			pr_err("Passed FCP Initiator Port %s does not"
-				" match target port protoid: %s\n", i_port,
-				tcm_loop_dump_proto_id(tl_hba));
+			pr_err("Passed FCP Initiator Port %s does not match target port protoid: %s\n",
+			       i_port, tcm_loop_dump_proto_id(tl_hba));
 			return -EINVAL;
 		}
 		port_ptr = &i_port[3]; /* Skip over "fc." */
@@ -894,16 +886,15 @@ static ssize_t tcm_loop_tpg_nexus_store(struct config_item *item,
 	ptr = strstr(i_port, "iqn.");
 	if (ptr) {
 		if (tl_hba->tl_proto_id != SCSI_PROTOCOL_ISCSI) {
-			pr_err("Passed iSCSI Initiator Port %s does not"
-				" match target port protoid: %s\n", i_port,
-				tcm_loop_dump_proto_id(tl_hba));
+			pr_err("Passed iSCSI Initiator Port %s does not match target port protoid: %s\n",
+			       i_port, tcm_loop_dump_proto_id(tl_hba));
 			return -EINVAL;
 		}
 		port_ptr = &i_port[0];
 		goto check_newline;
 	}
-	pr_err("Unable to locate prefix for emulated Initiator Port:"
-			" %s\n", i_port);
+	pr_err("Unable to locate prefix for emulated Initiator Port: %s\n",
+	       i_port);
 	return -EINVAL;
 	/*
 	 * Clear any trailing newline for the NAA WWN
@@ -1005,16 +996,15 @@ static struct se_portal_group *tcm_loop_make_naa_tpg(
 	unsigned long tpgt;
 
 	if (strstr(name, "tpgt_") != name) {
-		pr_err("Unable to locate \"tpgt_#\" directory"
-				" group\n");
+		pr_err("Unable to locate \"tpgt_#\" directory group\n");
 		return ERR_PTR(-EINVAL);
 	}
 	if (kstrtoul(name+5, 10, &tpgt))
 		return ERR_PTR(-EINVAL);
 
 	if (tpgt >= TL_TPGS_PER_HBA) {
-		pr_err("Passed tpgt: %lu exceeds TL_TPGS_PER_HBA:"
-				" %u\n", tpgt, TL_TPGS_PER_HBA);
+		pr_err("Passed tpgt: %lu exceeds TL_TPGS_PER_HBA: %u\n",
+		       tpgt, TL_TPGS_PER_HBA);
 		return ERR_PTR(-EINVAL);
 	}
 	tl_tpg = &tl_hba->tl_hba_tpgs[tpgt];
@@ -1027,10 +1017,9 @@ static struct se_portal_group *tcm_loop_make_naa_tpg(
 	if (ret < 0)
 		return ERR_PTR(-ENOMEM);
 
-	pr_debug("TCM_Loop_ConfigFS: Allocated Emulated %s"
-		" Target Port %s,t,0x%04lx\n", tcm_loop_dump_proto_id(tl_hba),
-		config_item_name(&wwn->wwn_group.cg_item), tpgt);
-
+	pr_debug("TCM_Loop_ConfigFS: Allocated Emulated %s Target Port %s,t,0x%04lx\n",
+		 tcm_loop_dump_proto_id(tl_hba),
+		 config_item_name(&wwn->wwn_group.cg_item), tpgt);
 	return &tl_tpg->tl_se_tpg;
 }
 
@@ -1057,9 +1046,9 @@ static void tcm_loop_drop_naa_tpg(
 	tl_tpg->tl_hba = NULL;
 	tl_tpg->tl_tpgt = 0;
 
-	pr_debug("TCM_Loop_ConfigFS: Deallocated Emulated %s"
-		" Target Port %s,t,0x%04x\n", tcm_loop_dump_proto_id(tl_hba),
-		config_item_name(&wwn->wwn_group.cg_item), tpgt);
+	pr_debug("TCM_Loop_ConfigFS: Deallocated Emulated %s Target Port %s,t,0x%04x\n",
+		 tcm_loop_dump_proto_id(tl_hba),
+		 config_item_name(&wwn->wwn_group.cg_item), tpgt);
 }
 
 /* End items for tcm_loop_naa_cit */
@@ -1097,8 +1086,8 @@ static struct se_wwn *tcm_loop_make_scsi_hba(
 	}
 	ptr = strstr(name, "iqn.");
 	if (!ptr) {
-		pr_err("Unable to locate prefix for emulated Target "
-				"Port: %s\n", name);
+		pr_err("Unable to locate prefix for emulated Target Port: %s\n",
+		       name);
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1106,9 +1095,8 @@ static struct se_wwn *tcm_loop_make_scsi_hba(
 
 check_len:
 	if (strlen(name) >= TL_WWN_ADDR_LEN) {
-		pr_err("Emulated NAA %s Address: %s, exceeds"
-			" max: %d\n", name, tcm_loop_dump_proto_id(tl_hba),
-			TL_WWN_ADDR_LEN);
+		pr_err("Emulated NAA %s Address: %s, exceeds max: %d\n",
+		       name, tcm_loop_dump_proto_id(tl_hba), TL_WWN_ADDR_LEN);
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1125,10 +1113,8 @@ static struct se_wwn *tcm_loop_make_scsi_hba(
 
 	sh = tl_hba->sh;
 	tcm_loop_hba_no_cnt++;
-	pr_debug("TCM_Loop_ConfigFS: Allocated emulated Target"
-		" %s Address: %s at Linux/SCSI Host ID: %d\n",
-		tcm_loop_dump_proto_id(tl_hba), name, sh->host_no);
-
+	pr_debug("TCM_Loop_ConfigFS: Allocated emulated Target %s Address: %s at Linux/SCSI Host ID: %d\n",
+		 tcm_loop_dump_proto_id(tl_hba), name, sh->host_no);
 	return &tl_hba->tl_hba_wwn;
 out:
 	kfree(tl_hba);
@@ -1141,10 +1127,9 @@ static void tcm_loop_drop_scsi_hba(
 	struct tcm_loop_hba *tl_hba = container_of(wwn,
 				struct tcm_loop_hba, tl_hba_wwn);
 
-	pr_debug("TCM_Loop_ConfigFS: Deallocating emulated Target"
-		" %s Address: %s at Linux/SCSI Host ID: %d\n",
-		tcm_loop_dump_proto_id(tl_hba), tl_hba->tl_wwn_address,
-		tl_hba->sh->host_no);
+	pr_debug("TCM_Loop_ConfigFS: Deallocating emulated Target %s Address: %s at Linux/SCSI Host ID: %d\n",
+		 tcm_loop_dump_proto_id(tl_hba), tl_hba->tl_wwn_address,
+		 tl_hba->sh->host_no);
 	/*
 	 * Call device_unregister() on the original tl_hba->dev.
 	 * tcm_loop_fabric_scsi.c:tcm_loop_release_adapter() will
@@ -1217,8 +1202,7 @@ static int __init tcm_loop_fabric_init(void)
 				__alignof__(struct tcm_loop_cmd),
 				0, NULL);
 	if (!tcm_loop_cmd_cache) {
-		pr_debug("kmem_cache_create() for"
-			" tcm_loop_cmd_cache failed\n");
+		pr_debug("kmem_cache_create() for tcm_loop_cmd_cache failed\n");
 		goto out_destroy_workqueue;
 	}
 

From 7deeceb4c7c53efeba54fbc7aab9e6b393f5d984 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Mon, 11 Dec 2017 10:56:42 +0100
Subject: [PATCH 127/676] target: tcm_loop: Delete two unnecessary variable
 initialisations in tcm_loop_issue_tmr()

The variables "se_cmd" and "tl_cmd" will eventually be set to appropriate
pointers a bit later.
Thus omit the explicit initialisation at the beginning.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/loopback/tcm_loop.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c
index d91109c1c4aee..08dbf5b4ff15e 100644
--- a/drivers/target/loopback/tcm_loop.c
+++ b/drivers/target/loopback/tcm_loop.c
@@ -203,10 +203,10 @@ static int tcm_loop_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
 static int tcm_loop_issue_tmr(struct tcm_loop_tpg *tl_tpg,
 			      u64 lun, int task, enum tcm_tmreq_table tmr)
 {
-	struct se_cmd *se_cmd = NULL;
+	struct se_cmd *se_cmd;
 	struct se_session *se_sess;
 	struct tcm_loop_nexus *tl_nexus;
-	struct tcm_loop_cmd *tl_cmd = NULL;
+	struct tcm_loop_cmd *tl_cmd;
 	int ret = TMR_FUNCTION_FAILED, rc;
 
 	/*

From 2dfe3511ce8c15f25c121f30353dcf02b074ef15 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Mon, 11 Dec 2017 10:58:33 +0100
Subject: [PATCH 128/676] target: tcm_loop: Delete an unnecessary return
 statement in tcm_loop_submission_work()

The script "checkpatch.pl" pointed information out like the following.

WARNING: void function return statements are not generally useful

Thus remove such a statement in the affected function.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/loopback/tcm_loop.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c
index 08dbf5b4ff15e..ec01070bbb40e 100644
--- a/drivers/target/loopback/tcm_loop.c
+++ b/drivers/target/loopback/tcm_loop.c
@@ -166,7 +166,6 @@ static void tcm_loop_submission_work(struct work_struct *work)
 out_done:
 	kmem_cache_free(tcm_loop_cmd_cache, tl_cmd);
 	sc->scsi_done(sc);
-	return;
 }
 
 /*

From 09f99a3dfadce47ba868cee318d55d249eb4e413 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Mon, 11 Dec 2017 11:01:57 +0100
Subject: [PATCH 129/676] target: tcm_loop: Use seq_puts() in
 tcm_loop_show_info()

The script "checkpatch.pl" pointed information out like the following.

WARNING: Prefer seq_puts to seq_printf

Thus fix the affected source code place.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/loopback/tcm_loop.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c
index ec01070bbb40e..9cd4ffe76c07f 100644
--- a/drivers/target/loopback/tcm_loop.c
+++ b/drivers/target/loopback/tcm_loop.c
@@ -64,7 +64,7 @@ static void tcm_loop_release_cmd(struct se_cmd *se_cmd)
 
 static int tcm_loop_show_info(struct seq_file *m, struct Scsi_Host *host)
 {
-	seq_printf(m, "tcm_loop_proc_info()\n");
+	seq_puts(m, "tcm_loop_proc_info()\n");
 	return 0;
 }
 

From 093ec1430f41063d65dfc68d7eddec9d80f8efbb Mon Sep 17 00:00:00 2001
From: Varun Prakash <varun@chelsio.com>
Date: Mon, 11 Dec 2017 21:00:25 +0530
Subject: [PATCH 130/676] cxgbit: call neigh_event_send() to update MAC address

If nud_state is not valid then call neigh_event_send() to update MAC
address.

Signed-off-by: Varun Prakash <varun@chelsio.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/iscsi/cxgbit/cxgbit_cm.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/target/iscsi/cxgbit/cxgbit_cm.c b/drivers/target/iscsi/cxgbit/cxgbit_cm.c
index 92eb57e2adaf5..8de16016b6de9 100644
--- a/drivers/target/iscsi/cxgbit/cxgbit_cm.c
+++ b/drivers/target/iscsi/cxgbit/cxgbit_cm.c
@@ -893,6 +893,9 @@ cxgbit_offload_init(struct cxgbit_sock *csk, int iptype, __u8 *peer_ip,
 		return -ENODEV;
 
 	rcu_read_lock();
+	if (!(n->nud_state & NUD_VALID))
+		neigh_event_send(n, NULL);
+
 	ret = -ENOMEM;
 	if (n->dev->flags & IFF_LOOPBACK) {
 		if (iptype == 4)

From ce512d79d0466a604793addb6b769d12ee326822 Mon Sep 17 00:00:00 2001
From: David Disseldorp <ddiss@suse.de>
Date: Wed, 13 Dec 2017 18:22:30 +0100
Subject: [PATCH 131/676] target/iscsi: avoid NULL dereference in CHAP auth
 error path

If chap_server_compute_md5() fails early, e.g. via CHAP_N mismatch, then
crypto_free_shash() is called with a NULL pointer which gets
dereferenced in crypto_shash_tfm().

Fixes: 69110e3cedbb ("iscsi-target: Use shash and ahash")
Suggested-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: David Disseldorp <ddiss@suse.de>
Cc: stable@vger.kernel.org # 4.6+
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/iscsi/iscsi_target_auth.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/target/iscsi/iscsi_target_auth.c b/drivers/target/iscsi/iscsi_target_auth.c
index f9bc8ec6fb6b5..9518ffd8b8bac 100644
--- a/drivers/target/iscsi/iscsi_target_auth.c
+++ b/drivers/target/iscsi/iscsi_target_auth.c
@@ -421,7 +421,8 @@ static int chap_server_compute_md5(
 	auth_ret = 0;
 out:
 	kzfree(desc);
-	crypto_free_shash(tfm);
+	if (tfm)
+		crypto_free_shash(tfm);
 	kfree(challenge);
 	kfree(challenge_binhex);
 	return auth_ret;

From 9960f85181dd08cda03fddcf0bc8d81190bec4eb Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@openvz.org>
Date: Wed, 13 Dec 2017 13:55:13 -0800
Subject: [PATCH 132/676] target: don't call an unmap callback if a range
 length is zero

If a length of a range is zero, it means there is nothing to unmap
and we can skip this range.

Here is one more reason, why we have to skip such ranges.  An unmap
callback calls file_operations->fallocate(), but the man page for the
fallocate syscall says that fallocate(fd, mode, offset, let) returns
EINVAL, if len is zero. It means that file_operations->fallocate() isn't
obligated to handle zero ranges too.

Signed-off-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_sbc.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c
index 750a04ed0e93a..b054682e974f9 100644
--- a/drivers/target/target_core_sbc.c
+++ b/drivers/target/target_core_sbc.c
@@ -1216,9 +1216,11 @@ sbc_execute_unmap(struct se_cmd *cmd)
 			goto err;
 		}
 
-		ret = ops->execute_unmap(cmd, lba, range);
-		if (ret)
-			goto err;
+		if (range) {
+			ret = ops->execute_unmap(cmd, lba, range);
+			if (ret)
+				goto err;
+		}
 
 		ptr += 16;
 		size -= 16;

From c1c390ba53195aef36e94b2354bc0e603057c293 Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Tue, 19 Dec 2017 04:03:54 -0600
Subject: [PATCH 133/676] tcmu: prevent corruption when invalid data page
 requested

We will always have a page mapped for cmd data if it is
valid command. If the mapping does not exist then something
bad happened in userspace and it should not proceed. This
has us return VM_FAULT_SIGBUS when this happens instead of
returning a freshly allocated paged. The latter can cause
corruption because userspace might write the pages data
overwriting valid data or return it to the initiator.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 43 +++++--------------------------
 1 file changed, 6 insertions(+), 37 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index bac08bc72e3b9..a746fedbb4869 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -1342,7 +1342,6 @@ static int tcmu_find_mem_index(struct vm_area_struct *vma)
 static struct page *tcmu_try_get_block_page(struct tcmu_dev *udev, uint32_t dbi)
 {
 	struct page *page;
-	int ret;
 
 	mutex_lock(&udev->cmdr_lock);
 	page = tcmu_get_block_page(udev, dbi);
@@ -1352,42 +1351,12 @@ static struct page *tcmu_try_get_block_page(struct tcmu_dev *udev, uint32_t dbi)
 	}
 
 	/*
-	 * Normally it shouldn't be here:
-	 * Only when the userspace has touched the blocks which
-	 * are out of the tcmu_cmd's data iov[], and will return
-	 * one zeroed page.
+	 * Userspace messed up and passed in a address not in the
+	 * data iov passed to it.
 	 */
-	pr_warn("Block(%u) out of cmd's iov[] has been touched!\n", dbi);
-	pr_warn("Mostly it will be a bug of userspace, please have a check!\n");
-
-	if (dbi >= udev->dbi_thresh) {
-		/* Extern the udev->dbi_thresh to dbi + 1 */
-		udev->dbi_thresh = dbi + 1;
-		udev->dbi_max = dbi;
-	}
-
-	page = radix_tree_lookup(&udev->data_blocks, dbi);
-	if (!page) {
-		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-		if (!page) {
-			mutex_unlock(&udev->cmdr_lock);
-			return NULL;
-		}
-
-		ret = radix_tree_insert(&udev->data_blocks, dbi, page);
-		if (ret) {
-			mutex_unlock(&udev->cmdr_lock);
-			__free_page(page);
-			return NULL;
-		}
-
-		/*
-		 * Since this case is rare in page fault routine, here we
-		 * will allow the global_db_count >= tcmu_global_max_blocks
-		 * to reduce possible page fault call trace.
-		 */
-		atomic_inc(&global_db_count);
-	}
+	pr_err("Invalid addr to data block mapping  (dbi %u) on device %s\n",
+	       dbi, udev->name);
+	page = NULL;
 	mutex_unlock(&udev->cmdr_lock);
 
 	return page;
@@ -1422,7 +1391,7 @@ static int tcmu_vma_fault(struct vm_fault *vmf)
 		dbi = (offset - udev->data_off) / DATA_BLOCK_SIZE;
 		page = tcmu_try_get_block_page(udev, dbi);
 		if (!page)
-			return VM_FAULT_NOPAGE;
+			return VM_FAULT_SIGBUS;
 	}
 
 	get_page(page);

From d120c7083f49d177e6765afad543faa0f243590e Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Tue, 19 Dec 2017 04:03:55 -0600
Subject: [PATCH 134/676] target: add SAM_STAT_BUSY sense reason

Add SAM_STAT_BUSY sense_reason. The next patch will have
target_core_user return this value while it is temporarily
blocked and restarting.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_transport.c | 3 +++
 include/target/target_core_base.h      | 1 +
 2 files changed, 4 insertions(+)

diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 58caacd54a3b2..74b646f165d4e 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -1774,6 +1774,9 @@ void transport_generic_request_failure(struct se_cmd *cmd,
 	case TCM_OUT_OF_RESOURCES:
 		cmd->scsi_status = SAM_STAT_TASK_SET_FULL;
 		goto queue_status;
+	case TCM_LUN_BUSY:
+		cmd->scsi_status = SAM_STAT_BUSY;
+		goto queue_status;
 	case TCM_RESERVATION_CONFLICT:
 		/*
 		 * No SENSE Data payload for this case, set SCSI Status
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index 2c8d8115469dc..00482f903decb 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -183,6 +183,7 @@ enum tcm_sense_reason_table {
 	TCM_TOO_MANY_SEGMENT_DESCS		= R(0x1b),
 	TCM_UNSUPPORTED_SEGMENT_DESC_TYPE_CODE	= R(0x1c),
 	TCM_INSUFFICIENT_REGISTRATION_RESOURCES	= R(0x1d),
+	TCM_LUN_BUSY				= R(0x1e),
 #undef R
 };
 

From 88cf107325fc7e37925de1960bd34ab917924362 Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Tue, 19 Dec 2017 04:03:56 -0600
Subject: [PATCH 135/676] target_core_user: add cmd id to broken ring message

Log cmd id that was not found in the tcmu_handle_completions
lookup failure path.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index a746fedbb4869..1238480cd4c4f 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -1098,7 +1098,8 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev)
 
 		cmd = idr_remove(&udev->commands, entry->hdr.cmd_id);
 		if (!cmd) {
-			pr_err("cmd_id not found, ring is broken\n");
+			pr_err("cmd_id %u not found, ring is broken\n",
+			       entry->hdr.cmd_id);
 			set_bit(TCMU_DEV_BIT_BROKEN, &udev->flags);
 			break;
 		}

From a24e7917e1758daf9ac0da4fac0f7b48f0b4b624 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Thu, 11 Jan 2018 11:12:25 +0000
Subject: [PATCH 136/676] tcmu: fix error return code in
 tcmu_configure_device()

Fix to return error code -ENOMEM from the kzalloc() error handling
case instead of 0, as done elsewhere in this function.

Fixes: 80eb876 ("tcmu: allow max block and global max blocks to be settable")
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Acked-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 1238480cd4c4f..60c8a87b7a882 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -1675,8 +1675,10 @@ static int tcmu_configure_device(struct se_device *dev)
 
 	udev->data_bitmap = kzalloc(BITS_TO_LONGS(udev->max_blocks) *
 				    sizeof(unsigned long), GFP_KERNEL);
-	if (!udev->data_bitmap)
+	if (!udev->data_bitmap) {
+		ret = -ENOMEM;
 		goto err_bitmap_alloc;
+	}
 
 	udev->mb_addr = vzalloc(CMDR_SIZE);
 	if (!udev->mb_addr) {

From 5855564c8ab2d9cefca7b2933bd19818eb795e40 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 12 Jan 2018 20:55:20 +1100
Subject: [PATCH 137/676] KVM: PPC: Book3S HV: Enable migration of decrementer
 register

This adds a register identifier for use with the one_reg interface
to allow the decrementer expiry time to be read and written by
userspace.  The decrementer expiry time is in guest timebase units
and is equal to the sum of the decrementer and the guest timebase.
(The expiry time is used rather than the decrementer value itself
because the expiry time is not constantly changing, though the
decrementer value is, while the guest vcpu is not running.)

Without this, a guest vcpu migrated to a new host will see its
decrementer set to some random value.  On POWER8 and earlier, the
decrementer is 32 bits wide and counts down at 512MHz, so the
guest vcpu will potentially see no decrementer interrupts for up
to about 4 seconds, which will lead to a stall.  With POWER9, the
decrementer is now 56 bits side, so the stall can be much longer
(up to 2.23 years) and more noticeable.

To help work around the problem in cases where userspace has not been
updated to migrate the decrementer expiry time, we now set the
default decrementer expiry at vcpu creation time to the current time
rather than the maximum possible value.  This should mean an
immediate decrementer interrupt when a migrated vcpu starts
running.  In cases where the decrementer is 32 bits wide and more
than 4 seconds elapse between the creation of the vcpu and when it
first runs, the decrementer would have wrapped around to positive
values and there may still be a stall - but this is no worse than
the current situation.  In the large-decrementer case, we are sure
to get an immediate decrementer interrupt (assuming the time from
vcpu creation to first run is less than 2.23 years) and we thus
avoid a very long stall.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 Documentation/virtual/kvm/api.txt   | 1 +
 arch/powerpc/include/uapi/asm/kvm.h | 2 ++
 arch/powerpc/kvm/book3s_hv.c        | 8 ++++++++
 arch/powerpc/kvm/powerpc.c          | 2 +-
 4 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index f670e4b9e7f33..c6f9eebb79f2c 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1841,6 +1841,7 @@ registers, find a list below:
   PPC	| KVM_REG_PPC_DBSR              | 32
   PPC   | KVM_REG_PPC_TIDR              | 64
   PPC   | KVM_REG_PPC_PSSCR             | 64
+  PPC   | KVM_REG_PPC_DEC_EXPIRY        | 64
   PPC   | KVM_REG_PPC_TM_GPR0           | 64
           ...
   PPC   | KVM_REG_PPC_TM_GPR31          | 64
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 61d6049f4c1ec..8aaec831053af 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -607,6 +607,8 @@ struct kvm_ppc_rmmu_info {
 #define KVM_REG_PPC_TIDR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbc)
 #define KVM_REG_PPC_PSSCR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbd)
 
+#define KVM_REG_PPC_DEC_EXPIRY	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe)
+
 /* Transactional Memory checkpointed state:
  * This is all GPRs, all VSX regs and a subset of SPRs
  */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index c4f0bebfc5ba4..b2d448c750086 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1497,6 +1497,10 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_ARCH_COMPAT:
 		*val = get_reg_val(id, vcpu->arch.vcore->arch_compat);
 		break;
+	case KVM_REG_PPC_DEC_EXPIRY:
+		*val = get_reg_val(id, vcpu->arch.dec_expires +
+				   vcpu->arch.vcore->tb_offset);
+		break;
 	default:
 		r = -EINVAL;
 		break;
@@ -1724,6 +1728,10 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_ARCH_COMPAT:
 		r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));
 		break;
+	case KVM_REG_PPC_DEC_EXPIRY:
+		vcpu->arch.dec_expires = set_reg_val(id, *val) -
+			vcpu->arch.vcore->tb_offset;
+		break;
 	default:
 		r = -EINVAL;
 		break;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index c2c7ef3305532..7c9e45f54186d 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -758,7 +758,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 
 	hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
 	vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
-	vcpu->arch.dec_expires = ~(u64)0;
+	vcpu->arch.dec_expires = get_tb();
 
 #ifdef CONFIG_KVM_EXIT_TIMING
 	mutex_init(&vcpu->arch.exit_timing_lock);

From 9696594158ec43d210137c1b93313efde8f7315a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 10 Nov 2017 16:18:05 +0100
Subject: [PATCH 138/676] s390x/mm: cleanup gmap_pte_op_walk()

gmap_mprotect_notify() refuses shadow gmaps. Turns out that
a) gmap_protect_range()
b) gmap_read_table()
c) gmap_pte_op_walk()

Are never called for gmap shadows. And never should be. This dates back
to gmap shadow prototypes where we allowed to call mprotect_notify() on
the gmap shadow (to get notified about the prefix pages getting removed).
This is avoided by always getting notified about any change on the gmap
shadow.

The only real function for walking page tables on shadow gmaps is
gmap_table_walk().

So, essentially, these functions should never get called and
gmap_pte_op_walk() can be cleaned up. Add some checks to callers of
gmap_pte_op_walk().

Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20171110151805.7541-1-david@redhat.com>
Reviewed-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Acked-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/mm/gmap.c | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 05d459b638f55..54cfd51a5a27e 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -815,27 +815,17 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
  * @ptl: pointer to the spinlock pointer
  *
  * Returns a pointer to the locked pte for a guest address, or NULL
- *
- * Note: Can also be called for shadow gmaps.
  */
 static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
 			       spinlock_t **ptl)
 {
 	unsigned long *table;
 
-	if (gmap_is_shadow(gmap))
-		spin_lock(&gmap->guest_table_lock);
+	BUG_ON(gmap_is_shadow(gmap));
 	/* Walk the gmap page table, lock and get pte pointer */
 	table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
-	if (!table || *table & _SEGMENT_ENTRY_INVALID) {
-		if (gmap_is_shadow(gmap))
-			spin_unlock(&gmap->guest_table_lock);
+	if (!table || *table & _SEGMENT_ENTRY_INVALID)
 		return NULL;
-	}
-	if (gmap_is_shadow(gmap)) {
-		*ptl = &gmap->guest_table_lock;
-		return pte_offset_map((pmd_t *) table, gaddr);
-	}
 	return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
 }
 
@@ -889,8 +879,6 @@ static void gmap_pte_op_end(spinlock_t *ptl)
  * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
  *
  * Called with sg->mm->mmap_sem in read.
- *
- * Note: Can also be called for shadow gmaps.
  */
 static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
 			      unsigned long len, int prot, unsigned long bits)
@@ -900,6 +888,7 @@ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
 	pte_t *ptep;
 	int rc;
 
+	BUG_ON(gmap_is_shadow(gmap));
 	while (len) {
 		rc = -EAGAIN;
 		ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
@@ -960,7 +949,8 @@ EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
  * @val: pointer to the unsigned long value to return
  *
  * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
- * if reading using the virtual address failed.
+ * if reading using the virtual address failed. -EINVAL if called on a gmap
+ * shadow.
  *
  * Called with gmap->mm->mmap_sem in read.
  */
@@ -971,6 +961,9 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
 	pte_t *ptep, pte;
 	int rc;
 
+	if (gmap_is_shadow(gmap))
+		return -EINVAL;
+
 	while (1) {
 		rc = -EAGAIN;
 		ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);

From 241e3ec0faf5ab1a0d9b1f6c43eefa919fb9c112 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Thu, 16 Nov 2017 15:12:52 +0100
Subject: [PATCH 139/676] KVM: s390: use created_vcpus in more places

commit a03825bbd0c3 ("KVM: s390: use kvm->created_vcpus") introduced
kvm->created_vcpus to avoid races with the existing kvm->online_vcpus
scheme. One place was "forgotten" and one new place was "added".
Let's fix those.

Reported-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Fixes: 4e0b1ab72b8a ("KVM: s390: gs support for kvm guests")
Fixes: a03825bbd0c3 ("KVM: s390: use kvm->created_vcpus")
---
 arch/s390/kvm/kvm-s390.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ec8b68e97d3cd..f579bf5c26203 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -573,7 +573,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 	case KVM_CAP_S390_GS:
 		r = -EINVAL;
 		mutex_lock(&kvm->lock);
-		if (atomic_read(&kvm->online_vcpus)) {
+		if (kvm->created_vcpus) {
 			r = -EBUSY;
 		} else if (test_facility(133)) {
 			set_kvm_facility(kvm->arch.model.fac_mask, 133);
@@ -1094,7 +1094,7 @@ static int kvm_s390_set_processor_feat(struct kvm *kvm,
 		return -EINVAL;
 
 	mutex_lock(&kvm->lock);
-	if (!atomic_read(&kvm->online_vcpus)) {
+	if (!kvm->created_vcpus) {
 		bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat,
 			    KVM_S390_VM_CPU_FEAT_NR_BITS);
 		ret = 0;

From 2f8311c912ab67084ce3657096df601c87f49a58 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Thu, 16 Nov 2017 12:30:15 +0100
Subject: [PATCH 140/676] KVM: s390: add debug tracing for cpu features of CPU
 model

The cpu model already traces the cpu facilities, the ibc and
guest CPU ids. We should do the same for the cpu features (on
success only).

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
---
 arch/s390/kvm/kvm-s390.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index f579bf5c26203..d9f94bf241487 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1084,7 +1084,6 @@ static int kvm_s390_set_processor_feat(struct kvm *kvm,
 				       struct kvm_device_attr *attr)
 {
 	struct kvm_s390_vm_cpu_feat data;
-	int ret = -EBUSY;
 
 	if (copy_from_user(&data, (void __user *)attr->addr, sizeof(data)))
 		return -EFAULT;
@@ -1094,13 +1093,18 @@ static int kvm_s390_set_processor_feat(struct kvm *kvm,
 		return -EINVAL;
 
 	mutex_lock(&kvm->lock);
-	if (!kvm->created_vcpus) {
-		bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat,
-			    KVM_S390_VM_CPU_FEAT_NR_BITS);
-		ret = 0;
+	if (kvm->created_vcpus) {
+		mutex_unlock(&kvm->lock);
+		return -EBUSY;
 	}
+	bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat,
+		    KVM_S390_VM_CPU_FEAT_NR_BITS);
 	mutex_unlock(&kvm->lock);
-	return ret;
+	VM_EVENT(kvm, 3, "SET: guest feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx",
+			 data.feat[0],
+			 data.feat[1],
+			 data.feat[2]);
+	return 0;
 }
 
 static int kvm_s390_set_processor_subfunc(struct kvm *kvm,
@@ -1202,6 +1206,10 @@ static int kvm_s390_get_processor_feat(struct kvm *kvm,
 		    KVM_S390_VM_CPU_FEAT_NR_BITS);
 	if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
 		return -EFAULT;
+	VM_EVENT(kvm, 3, "GET: guest feat: 0x%16.16llx.0x%16.16llx.0x%16.16llx",
+			 data.feat[0],
+			 data.feat[1],
+			 data.feat[2]);
 	return 0;
 }
 
@@ -1215,6 +1223,10 @@ static int kvm_s390_get_machine_feat(struct kvm *kvm,
 		    KVM_S390_VM_CPU_FEAT_NR_BITS);
 	if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
 		return -EFAULT;
+	VM_EVENT(kvm, 3, "GET: host feat:  0x%16.16llx.0x%16.16llx.0x%16.16llx",
+			 data.feat[0],
+			 data.feat[1],
+			 data.feat[2]);
 	return 0;
 }
 

From 588629385c4b96c436cef55dff75eef7f90cd5f0 Mon Sep 17 00:00:00 2001
From: Michael Mueller <mimu@linux.vnet.ibm.com>
Date: Mon, 20 Nov 2017 10:37:30 +0100
Subject: [PATCH 141/676] KVM: s390: drop use of spin lock in
 __floating_irq_kick

It is not required to take to a lock to protect access to the cpuflags
of the local interrupt structure of a vcpu as the performed operation
is an atomic_or.

Signed-off-by: Michael Mueller <mimu@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/interrupt.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 024ad8bcc5165..818aa4248b0f3 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -1569,7 +1569,6 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type)
 
 	/* make the VCPU drop out of the SIE, or wake it up if sleeping */
 	li = &dst_vcpu->arch.local_int;
-	spin_lock(&li->lock);
 	switch (type) {
 	case KVM_S390_MCHK:
 		atomic_or(CPUSTAT_STOP_INT, li->cpuflags);
@@ -1581,7 +1580,6 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type)
 		atomic_or(CPUSTAT_EXT_INT, li->cpuflags);
 		break;
 	}
-	spin_unlock(&li->lock);
 	kvm_s390_vcpu_wakeup(dst_vcpu);
 }
 

From 1a5c79125afdc16e999c1700b0d3dfef1bef1c84 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 12 Dec 2017 09:29:09 +0100
Subject: [PATCH 142/676] kvm_config: add CONFIG_S390_GUEST

make kvmconfig currently does not select CONFIG_S390_GUEST. Since
the virtio-ccw transport depends on CONFIG_S390_GUEST, we want
to add CONFIG_S390_GUEST to kvmconfig.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Cornelia Huck <cohuck@redhat.com>
---
 kernel/configs/kvm_guest.config | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config
index 8d96437671429..108fecc20fc14 100644
--- a/kernel/configs/kvm_guest.config
+++ b/kernel/configs/kvm_guest.config
@@ -18,6 +18,7 @@ CONFIG_VIRTUALIZATION=y
 CONFIG_HYPERVISOR_GUEST=y
 CONFIG_PARAVIRT=y
 CONFIG_KVM_GUEST=y
+CONFIG_S390_GUEST=y
 CONFIG_VIRTIO=y
 CONFIG_VIRTIO_PCI=y
 CONFIG_VIRTIO_BLK=y

From a9f6c9a92f3771000493f6bbacbd7677b46d8706 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Mon, 8 Jan 2018 20:37:47 +0100
Subject: [PATCH 143/676] KVM: s390: cleanup struct kvm_s390_float_interrupt

"wq" is not used at all. "cpuflags" can be access directly via the vcpu,
just as "float_int" via vcpu->kvm.
While at it, reuse _set_cpuflag() to make the code look nicer.

Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180108193747.10818-1-david@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/kvm_host.h |  3 ---
 arch/s390/kvm/interrupt.c        | 25 +++++++++++--------------
 arch/s390/kvm/kvm-s390.c         |  3 ---
 arch/s390/kvm/kvm-s390.h         |  2 +-
 arch/s390/kvm/sigp.c             | 12 ++++--------
 5 files changed, 16 insertions(+), 29 deletions(-)

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index e14f381757f67..e16a9f2a44ad6 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -515,9 +515,6 @@ struct kvm_s390_irq_payload {
 
 struct kvm_s390_local_interrupt {
 	spinlock_t lock;
-	struct kvm_s390_float_interrupt *float_int;
-	struct swait_queue_head *wq;
-	atomic_t *cpuflags;
 	DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS);
 	struct kvm_s390_irq_payload irq;
 	unsigned long pending_irqs;
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 818aa4248b0f3..f8eb2cfa763ac 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -107,12 +107,11 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id)
 
 static void sca_clear_ext_call(struct kvm_vcpu *vcpu)
 {
-	struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
 	int rc, expect;
 
 	if (!kvm_s390_use_sca_entries())
 		return;
-	atomic_andnot(CPUSTAT_ECALL_PEND, li->cpuflags);
+	atomic_andnot(CPUSTAT_ECALL_PEND, &vcpu->arch.sie_block->cpuflags);
 	read_lock(&vcpu->kvm->arch.sca_lock);
 	if (vcpu->kvm->arch.use_esca) {
 		struct esca_block *sca = vcpu->kvm->arch.sca;
@@ -279,13 +278,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
 static void __set_cpu_idle(struct kvm_vcpu *vcpu)
 {
 	atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
-	set_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
+	set_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask);
 }
 
 static void __unset_cpu_idle(struct kvm_vcpu *vcpu)
 {
 	atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
-	clear_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
+	clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask);
 }
 
 static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
@@ -1228,7 +1227,7 @@ static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
 
 	li->irq.ext = irq->u.ext;
 	set_bit(IRQ_PEND_PFAULT_INIT, &li->pending_irqs);
-	atomic_or(CPUSTAT_EXT_INT, li->cpuflags);
+	__set_cpuflag(vcpu, CPUSTAT_EXT_INT);
 	return 0;
 }
 
@@ -1253,7 +1252,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
 	if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs))
 		return -EBUSY;
 	*extcall = irq->u.extcall;
-	atomic_or(CPUSTAT_EXT_INT, li->cpuflags);
+	__set_cpuflag(vcpu, CPUSTAT_EXT_INT);
 	return 0;
 }
 
@@ -1329,7 +1328,7 @@ static int __inject_sigp_emergency(struct kvm_vcpu *vcpu,
 
 	set_bit(irq->u.emerg.code, li->sigp_emerg_pending);
 	set_bit(IRQ_PEND_EXT_EMERGENCY, &li->pending_irqs);
-	atomic_or(CPUSTAT_EXT_INT, li->cpuflags);
+	__set_cpuflag(vcpu, CPUSTAT_EXT_INT);
 	return 0;
 }
 
@@ -1373,7 +1372,7 @@ static int __inject_ckc(struct kvm_vcpu *vcpu)
 				   0, 0);
 
 	set_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs);
-	atomic_or(CPUSTAT_EXT_INT, li->cpuflags);
+	__set_cpuflag(vcpu, CPUSTAT_EXT_INT);
 	return 0;
 }
 
@@ -1386,7 +1385,7 @@ static int __inject_cpu_timer(struct kvm_vcpu *vcpu)
 				   0, 0);
 
 	set_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs);
-	atomic_or(CPUSTAT_EXT_INT, li->cpuflags);
+	__set_cpuflag(vcpu, CPUSTAT_EXT_INT);
 	return 0;
 }
 
@@ -1546,7 +1545,6 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
 static void __floating_irq_kick(struct kvm *kvm, u64 type)
 {
 	struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
-	struct kvm_s390_local_interrupt *li;
 	struct kvm_vcpu *dst_vcpu;
 	int sigcpu, online_vcpus, nr_tries = 0;
 
@@ -1568,16 +1566,15 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type)
 	dst_vcpu = kvm_get_vcpu(kvm, sigcpu);
 
 	/* make the VCPU drop out of the SIE, or wake it up if sleeping */
-	li = &dst_vcpu->arch.local_int;
 	switch (type) {
 	case KVM_S390_MCHK:
-		atomic_or(CPUSTAT_STOP_INT, li->cpuflags);
+		__set_cpuflag(dst_vcpu, CPUSTAT_STOP_INT);
 		break;
 	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
-		atomic_or(CPUSTAT_IO_INT, li->cpuflags);
+		__set_cpuflag(dst_vcpu, CPUSTAT_IO_INT);
 		break;
 	default:
-		atomic_or(CPUSTAT_EXT_INT, li->cpuflags);
+		__set_cpuflag(dst_vcpu, CPUSTAT_EXT_INT);
 		break;
 	}
 	kvm_s390_vcpu_wakeup(dst_vcpu);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d9f94bf241487..cc5a1b4d74b7d 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2509,9 +2509,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 
 	vcpu->arch.sie_block->icpua = id;
 	spin_lock_init(&vcpu->arch.local_int.lock);
-	vcpu->arch.local_int.float_int = &kvm->arch.float_int;
-	vcpu->arch.local_int.wq = &vcpu->wq;
-	vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags;
 	seqcount_init(&vcpu->arch.cputm_seqcount);
 
 	rc = kvm_vcpu_init(vcpu, kvm, id);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 5e46ba429bcb4..8877116f01595 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -54,7 +54,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
 
 static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
 {
-	return test_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
+	return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask);
 }
 
 static inline int kvm_is_ucontrol(struct kvm *kvm)
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index c1f5cde2c878e..5cafd1e2651bc 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -20,14 +20,11 @@
 static int __sigp_sense(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu,
 			u64 *reg)
 {
-	struct kvm_s390_local_interrupt *li;
 	int cpuflags;
 	int rc;
 	int ext_call_pending;
 
-	li = &dst_vcpu->arch.local_int;
-
-	cpuflags = atomic_read(li->cpuflags);
+	cpuflags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags);
 	ext_call_pending = kvm_s390_ext_call_pending(dst_vcpu);
 	if (!(cpuflags & CPUSTAT_STOPPED) && !ext_call_pending)
 		rc = SIGP_CC_ORDER_CODE_ACCEPTED;
@@ -211,7 +208,7 @@ static int __sigp_store_status_at_addr(struct kvm_vcpu *vcpu,
 	int flags;
 	int rc;
 
-	flags = atomic_read(dst_vcpu->arch.local_int.cpuflags);
+	flags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags);
 	if (!(flags & CPUSTAT_STOPPED)) {
 		*reg &= 0xffffffff00000000UL;
 		*reg |= SIGP_STATUS_INCORRECT_STATE;
@@ -231,7 +228,6 @@ static int __sigp_store_status_at_addr(struct kvm_vcpu *vcpu,
 static int __sigp_sense_running(struct kvm_vcpu *vcpu,
 				struct kvm_vcpu *dst_vcpu, u64 *reg)
 {
-	struct kvm_s390_local_interrupt *li;
 	int rc;
 
 	if (!test_kvm_facility(vcpu->kvm, 9)) {
@@ -240,8 +236,8 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu,
 		return SIGP_CC_STATUS_STORED;
 	}
 
-	li = &dst_vcpu->arch.local_int;
-	if (atomic_read(li->cpuflags) & CPUSTAT_RUNNING) {
+	if (atomic_read(&dst_vcpu->arch.sie_block->cpuflags) &
+	    CPUSTAT_RUNNING) {
 		/* running */
 		rc = SIGP_CC_ORDER_CODE_ACCEPTED;
 	} else {

From fa55eedd6328d3072e82218a2346b8752253af2d Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Tue, 12 Dec 2017 17:33:01 -0800
Subject: [PATCH 144/676] KVM: X86: Add KVM_VCPU_PREEMPTED
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The next patch will add another bit to the preempted field in
kvm_steal_time.  Define a constant for bit 0 (the only one that is
currently used).

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/include/uapi/asm/kvm_para.h | 2 ++
 arch/x86/kernel/kvm.c                | 2 +-
 arch/x86/kvm/x86.c                   | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 09cc06483bed4..15685bd22faa4 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -51,6 +51,8 @@ struct kvm_steal_time {
 	__u32 pad[11];
 };
 
+#define KVM_VCPU_PREEMPTED          (1 << 0)
+
 #define KVM_CLOCK_PAIRING_WALLCLOCK 0
 struct kvm_clock_pairing {
 	__s64 sec;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b40ffbf156c18..6610b92fc6a59 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -643,7 +643,7 @@ __visible bool __kvm_vcpu_is_preempted(long cpu)
 {
 	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
 
-	return !!src->preempted;
+	return !!(src->preempted & KVM_VCPU_PREEMPTED);
 }
 PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fafaef072f1b5..897f4795513fb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2916,7 +2916,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 		return;
 
-	vcpu->arch.st.steal.preempted = 1;
+	vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
 
 	kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
 			&vcpu->arch.st.steal.preempted,

From 858a43aae23672d46fe802a41f4748f322965182 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Tue, 12 Dec 2017 17:33:02 -0800
Subject: [PATCH 145/676] KVM: X86: use paravirtualized TLB Shootdown
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remote TLB flush does a busy wait which is fine in bare-metal
scenario. But with-in the guest, the vcpus might have been pre-empted or
blocked. In this scenario, the initator vcpu would end up busy-waiting
for a long amount of time; it also consumes CPU unnecessarily to wake
up the target of the shootdown.

This patch set adds support for KVM's new paravirtualized TLB flush;
remote TLB flush does not wait for vcpus that are sleeping, instead
KVM will flush the TLB as soon as the vCPU starts running again.

The improvement is clearly visible when the host is overcommitted; in this
case, the PV TLB flush (in addition to avoiding the wait on the main CPU)
prevents preempted vCPUs from stealing precious execution time from the
running ones.

Testing on a Xeon Gold 6142 2.6GHz 2 sockets, 32 cores, 64 threads,
so 64 pCPUs, and each VM is 64 vCPUs.

ebizzy -M
              vanilla    optimized     boost
1VM            46799       48670         4%
2VM            23962       42691        78%
3VM            16152       37539       132%

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 Documentation/virtual/kvm/cpuid.txt  |  4 +++
 arch/x86/include/uapi/asm/kvm_para.h |  2 ++
 arch/x86/kernel/kvm.c                | 47 ++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+)

diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt
index 3c65feb830101..dcab6dc11e3b0 100644
--- a/Documentation/virtual/kvm/cpuid.txt
+++ b/Documentation/virtual/kvm/cpuid.txt
@@ -54,6 +54,10 @@ KVM_FEATURE_PV_UNHALT              ||     7 || guest checks this feature bit
                                    ||       || before enabling paravirtualized
                                    ||       || spinlock support.
 ------------------------------------------------------------------------------
+KVM_FEATURE_PV_TLB_FLUSH           ||     9 || guest checks this feature bit
+                                   ||       || before enabling paravirtualized
+                                   ||       || tlb flush.
+------------------------------------------------------------------------------
 KVM_FEATURE_CLOCKSOURCE_STABLE_BIT ||    24 || host will warn if no guest-side
                                    ||       || per-cpu warps are expected in
                                    ||       || kvmclock.
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 15685bd22faa4..7a2ade4aa2353 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -25,6 +25,7 @@
 #define KVM_FEATURE_STEAL_TIME		5
 #define KVM_FEATURE_PV_EOI		6
 #define KVM_FEATURE_PV_UNHALT		7
+#define KVM_FEATURE_PV_TLB_FLUSH	9
 
 /* The last 8 bits are used to indicate how to interpret the flags field
  * in pvclock structure. If no bits are set, all flags are ignored.
@@ -52,6 +53,7 @@ struct kvm_steal_time {
 };
 
 #define KVM_VCPU_PREEMPTED          (1 << 0)
+#define KVM_VCPU_FLUSH_TLB          (1 << 1)
 
 #define KVM_CLOCK_PAIRING_WALLCLOCK 0
 struct kvm_clock_pairing {
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 6610b92fc6a59..4e37d1a851a62 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -498,6 +498,34 @@ static void __init kvm_apf_trap_init(void)
 	update_intr_gate(X86_TRAP_PF, async_page_fault);
 }
 
+static DEFINE_PER_CPU(cpumask_var_t, __pv_tlb_mask);
+
+static void kvm_flush_tlb_others(const struct cpumask *cpumask,
+			const struct flush_tlb_info *info)
+{
+	u8 state;
+	int cpu;
+	struct kvm_steal_time *src;
+	struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_tlb_mask);
+
+	cpumask_copy(flushmask, cpumask);
+	/*
+	 * We have to call flush only on online vCPUs. And
+	 * queue flush_on_enter for pre-empted vCPUs
+	 */
+	for_each_cpu(cpu, flushmask) {
+		src = &per_cpu(steal_time, cpu);
+		state = READ_ONCE(src->preempted);
+		if ((state & KVM_VCPU_PREEMPTED)) {
+			if (try_cmpxchg(&src->preempted, &state,
+					state | KVM_VCPU_FLUSH_TLB))
+				__cpumask_clear_cpu(cpu, flushmask);
+		}
+	}
+
+	native_flush_tlb_others(flushmask, info);
+}
+
 static void __init kvm_guest_init(void)
 {
 	int i;
@@ -517,6 +545,9 @@ static void __init kvm_guest_init(void)
 		pv_time_ops.steal_clock = kvm_steal_clock;
 	}
 
+	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH))
+		pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
+
 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
 		apic_set_eoi_write(kvm_guest_apic_eoi_write);
 
@@ -598,6 +629,22 @@ static __init int activate_jump_labels(void)
 }
 arch_initcall(activate_jump_labels);
 
+static __init int kvm_setup_pv_tlb_flush(void)
+{
+	int cpu;
+
+	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH)) {
+		for_each_possible_cpu(cpu) {
+			zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu),
+				GFP_KERNEL, cpu_to_node(cpu));
+		}
+		pr_info("KVM setup pv remote TLB flush\n");
+	}
+
+	return 0;
+}
+arch_initcall(kvm_setup_pv_tlb_flush);
+
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 
 /* Kick a cpu by its apicid. Used to wake up a halted vcpu */

From c2ba05ccfde2f069a66c0462e5b5ef8a517dcc9c Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Tue, 12 Dec 2017 17:33:03 -0800
Subject: [PATCH 146/676] KVM: X86: introduce invalidate_gpa argument to tlb
 flush
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce a new bool invalidate_gpa argument to kvm_x86_ops->tlb_flush,
it will be used by later patches to just flush guest tlb.

For VMX, this will use INVVPID instead of INVEPT, which will invalidate
combined mappings while keeping guest-physical mappings.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/svm.c              | 14 +++++++-------
 arch/x86/kvm/vmx.c              | 21 +++++++++++----------
 arch/x86/kvm/x86.c              |  6 +++---
 4 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 340e3604dcc77..44de261e9223d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -966,7 +966,7 @@ struct kvm_x86_ops {
 	unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
 	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 
-	void (*tlb_flush)(struct kvm_vcpu *vcpu);
+	void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa);
 
 	void (*run)(struct kvm_vcpu *vcpu);
 	int (*handle_exit)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1f3e7f210374a..14cca8c601a91 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -285,7 +285,7 @@ static int vgif = true;
 module_param(vgif, int, 0444);
 
 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-static void svm_flush_tlb(struct kvm_vcpu *vcpu);
+static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
 
 static int nested_svm_exit_handled(struct vcpu_svm *svm);
@@ -2035,7 +2035,7 @@ static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		return 1;
 
 	if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
-		svm_flush_tlb(vcpu);
+		svm_flush_tlb(vcpu, true);
 
 	vcpu->arch.cr4 = cr4;
 	if (!npt_enabled)
@@ -2385,7 +2385,7 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
 
 	svm->vmcb->control.nested_cr3 = __sme_set(root);
 	mark_dirty(svm->vmcb, VMCB_NPT);
-	svm_flush_tlb(vcpu);
+	svm_flush_tlb(vcpu, true);
 }
 
 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
@@ -2989,7 +2989,7 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
 	svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
 	svm->nested.intercept            = nested_vmcb->control.intercept;
 
-	svm_flush_tlb(&svm->vcpu);
+	svm_flush_tlb(&svm->vcpu, true);
 	svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
 	if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
 		svm->vcpu.arch.hflags |= HF_VINTR_MASK;
@@ -4785,7 +4785,7 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
 	return 0;
 }
 
-static void svm_flush_tlb(struct kvm_vcpu *vcpu)
+static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -5076,7 +5076,7 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 
 	svm->vmcb->save.cr3 = __sme_set(root);
 	mark_dirty(svm->vmcb, VMCB_CR);
-	svm_flush_tlb(vcpu);
+	svm_flush_tlb(vcpu, true);
 }
 
 static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
@@ -5090,7 +5090,7 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 	svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
 	mark_dirty(svm->vmcb, VMCB_CR);
 
-	svm_flush_tlb(vcpu);
+	svm_flush_tlb(vcpu, true);
 }
 
 static int is_disabled(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ef7d13e536a4f..c1791759f1e64 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4140,9 +4140,10 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 
 #endif
 
-static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
+static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
+				bool invalidate_gpa)
 {
-	if (enable_ept) {
+	if (enable_ept && (invalidate_gpa || !enable_vpid)) {
 		if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 			return;
 		ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
@@ -4151,15 +4152,15 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
 	}
 }
 
-static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 {
-	__vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
+	__vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
 }
 
 static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu)
 {
 	if (enable_ept)
-		vmx_flush_tlb(vcpu);
+		vmx_flush_tlb(vcpu, true);
 }
 
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -4357,7 +4358,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 		ept_load_pdptrs(vcpu);
 	}
 
-	vmx_flush_tlb(vcpu);
+	vmx_flush_tlb(vcpu, true);
 	vmcs_writel(GUEST_CR3, guest_cr3);
 }
 
@@ -7932,7 +7933,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
 		return kvm_skip_emulated_instruction(vcpu);
 	}
 
-	__vmx_flush_tlb(vcpu, vmx->nested.vpid02);
+	__vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
 	nested_vmx_succeed(vcpu);
 
 	return kvm_skip_emulated_instruction(vcpu);
@@ -10614,11 +10615,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
 			if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
 				vmx->nested.last_vpid = vmcs12->virtual_processor_id;
-				__vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
+				__vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02, true);
 			}
 		} else {
 			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
-			vmx_flush_tlb(vcpu);
+			vmx_flush_tlb(vcpu, true);
 		}
 
 	}
@@ -11323,7 +11324,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 		 * L1's vpid. TODO: move to a more elaborate solution, giving
 		 * each L2 its own vpid and exposing the vpid feature to L1.
 		 */
-		vmx_flush_tlb(vcpu);
+		vmx_flush_tlb(vcpu, true);
 	}
 	/* Restore posted intr vector. */
 	if (nested_cpu_has_posted_intr(vmcs12))
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 897f4795513fb..1e6d9f2d1f0bc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6781,10 +6781,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
 }
 
-static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
+static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 {
 	++vcpu->stat.tlb_flush;
-	kvm_x86_ops->tlb_flush(vcpu);
+	kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
 }
 
 void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
@@ -6855,7 +6855,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
 			kvm_mmu_sync_roots(vcpu);
 		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
-			kvm_vcpu_flush_tlb(vcpu);
+			kvm_vcpu_flush_tlb(vcpu, true);
 		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
 			vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
 			r = 0;

From f38a7b75267f1fb240a8178cbcb16d66dd37aac8 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Tue, 12 Dec 2017 17:33:04 -0800
Subject: [PATCH 147/676] KVM: X86: support paravirtualized help for TLB
 shootdowns
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When running on a virtual machine, IPIs are expensive when the target
CPU is sleeping.  Thus, it is nice to be able to avoid them for TLB
shootdowns.  KVM can just do the flush via INVVPID on the guest's behalf
the next time the CPU is scheduled.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
[Use "&" to test the bit instead of "==". - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/cpuid.c |  3 ++-
 arch/x86/kvm/x86.c   | 19 ++++++++++++-------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 3db9e1cd4f63b..a0e6c975f3a8b 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -602,7 +602,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			     (1 << KVM_FEATURE_ASYNC_PF) |
 			     (1 << KVM_FEATURE_PV_EOI) |
 			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
-			     (1 << KVM_FEATURE_PV_UNHALT);
+			     (1 << KVM_FEATURE_PV_UNHALT) |
+			     (1 << KVM_FEATURE_PV_TLB_FLUSH);
 
 		if (sched_info_on())
 			entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1e6d9f2d1f0bc..35d0b3de697bc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2122,6 +2122,12 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
 	vcpu->arch.pv_time_enabled = false;
 }
 
+static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
+{
+	++vcpu->stat.tlb_flush;
+	kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
+}
+
 static void record_steal_time(struct kvm_vcpu *vcpu)
 {
 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
@@ -2131,7 +2137,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
 		return;
 
-	vcpu->arch.st.steal.preempted = 0;
+	/*
+	 * Doing a TLB flush here, on the guest's behalf, can avoid
+	 * expensive IPIs.
+	 */
+	if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB)
+		kvm_vcpu_flush_tlb(vcpu, false);
 
 	if (vcpu->arch.st.steal.version & 1)
 		vcpu->arch.st.steal.version += 1;  /* first time write, random junk */
@@ -6781,12 +6792,6 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
 }
 
-static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
-{
-	++vcpu->stat.tlb_flush;
-	kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
-}
-
 void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 		unsigned long start, unsigned long end)
 {

From efdab992813fb2ed825745625b83c05032e9cda2 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Wed, 13 Dec 2017 10:46:40 +0100
Subject: [PATCH 148/676] KVM: x86: fix escape of guest dr6 to the host
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

syzkaller reported:

   WARNING: CPU: 0 PID: 12927 at arch/x86/kernel/traps.c:780 do_debug+0x222/0x250
   CPU: 0 PID: 12927 Comm: syz-executor Tainted: G           OE    4.15.0-rc2+ #16
   RIP: 0010:do_debug+0x222/0x250
   Call Trace:
    <#DB>
    debug+0x3e/0x70
   RIP: 0010:copy_user_enhanced_fast_string+0x10/0x20
    </#DB>
    _copy_from_user+0x5b/0x90
    SyS_timer_create+0x33/0x80
    entry_SYSCALL_64_fastpath+0x23/0x9a

The testcase sets a watchpoint (with perf_event_open) on a buffer that is
passed to timer_create() as the struct sigevent argument.  In timer_create(),
copy_from_user()'s rep movsb triggers the BP.  The testcase also sets
the debug registers for the guest.

However, KVM only restores host debug registers when the host has active
watchpoints, which triggers a race condition when running the testcase with
multiple threads.  The guest's DR6.BS bit can escape to the host before
another thread invokes timer_create(), and do_debug() complains.

The fix is to respect do_debug()'s dr6 invariant when leaving KVM.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/x86.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 35d0b3de697bc..a3dd44bb6f1e6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2961,6 +2961,12 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	pagefault_enable();
 	kvm_x86_ops->vcpu_put(vcpu);
 	vcpu->arch.last_host_tsc = rdtsc();
+	/*
+	 * If userspace has set any breakpoints or watchpoints, dr6 is restored
+	 * on every vmexit, but if not, we might have a stale dr6 from the
+	 * guest. do_debug expects dr6 to be cleared after it runs, do the same.
+	 */
+	set_debugreg(0, 6);
 }
 
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,

From 476b7adaa3272557168b287175b1e9e943913404 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 13 Dec 2017 13:51:32 +0100
Subject: [PATCH 149/676] KVM: x86: avoid unnecessary XSETBV on guest entry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

xsetbv can be expensive when running on nested virtualization, try to
avoid it.

Reviewed-by: Jim Mattson <jmattson@google.com>
Reviewed-by: Wanpeng Li <wanpeng.li@hotmail.com>
Reviewed-by: Quan Xu <quan.xu0@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/x86.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a3dd44bb6f1e6..56d8a1e11e50b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -702,7 +702,8 @@ static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
 	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
 			!vcpu->guest_xcr0_loaded) {
 		/* kvm_set_xcr() also depends on this */
-		xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
+		if (vcpu->arch.xcr0 != host_xcr0)
+			xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
 		vcpu->guest_xcr0_loaded = 1;
 	}
 }

From 505c9e94d8329dc215617cf99505629e924f001e Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 11 Jan 2018 13:10:38 +0100
Subject: [PATCH 150/676] KVM: x86: prefer "depends on" to "select" for SEV
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Avoid reverse dependencies.  Instead, SEV will only be enabled if
the PSP driver is available.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/Kconfig | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 148ea324b90c4..92fd433c50b9b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -85,9 +85,7 @@ config KVM_AMD_SEV
 	def_bool y
 	bool "AMD Secure Encrypted Virtualization (SEV) support"
 	depends on KVM_AMD && X86_64
-	select CRYPTO_DEV_CCP
-	select CRYPTO_DEV_CCP_DD
-	select CRYPTO_DEV_SP_PSP
+	depends on CRYPTO_DEV_CCP && CRYPTO_DEV_CCP_DD && CRYPTO_DEV_SP_PSP
 	---help---
 	Provides support for launching Encrypted VMs on AMD processors.
 

From b8d7044bcff7a955257b242515bcf1e5045edd9b Mon Sep 17 00:00:00 2001
From: Haozhong Zhang <haozhong.zhang@intel.com>
Date: Wed, 20 Dec 2017 15:29:28 +0800
Subject: [PATCH 151/676] x86/mm: add a function to check if a pfn is UC/UC-/WC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Check whether the PAT memory type of a pfn cannot be overridden by
MTRR UC memory type, i.e. the PAT memory type is UC, UC- or WC. This
function will be used by KVM to distinguish MMIO pfns and give them
UC memory type in the EPT page tables (on Intel processors, EPT
memory types work like MTRRs).

Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
Reviewed-by: Xiao Guangrong <xiaoguangrong@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/include/asm/pat.h |  2 ++
 arch/x86/mm/pat.c          | 19 +++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index 8a3ee355b4222..92015c65fa2ac 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -22,4 +22,6 @@ int io_reserve_memtype(resource_size_t start, resource_size_t end,
 
 void io_free_memtype(resource_size_t start, resource_size_t end);
 
+bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn);
+
 #endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index fe7d57a8fb600..1555bd7d34493 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -677,6 +677,25 @@ static enum page_cache_mode lookup_memtype(u64 paddr)
 	return rettype;
 }
 
+/**
+ * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type
+ * of @pfn cannot be overridden by UC MTRR memory type.
+ *
+ * Only to be called when PAT is enabled.
+ *
+ * Returns true, if the PAT memory type of @pfn is UC, UC-, or WC.
+ * Returns false in other cases.
+ */
+bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn)
+{
+	enum page_cache_mode cm = lookup_memtype(PFN_PHYS(pfn));
+
+	return cm == _PAGE_CACHE_MODE_UC ||
+	       cm == _PAGE_CACHE_MODE_UC_MINUS ||
+	       cm == _PAGE_CACHE_MODE_WC;
+}
+EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr);
+
 /**
  * io_reserve_memtype - Request a memory type mapping for a region of memory
  * @start: start (physical address) of the region

From aa2e063aea7961fb58eafecb924f8818e3d4ecfc Mon Sep 17 00:00:00 2001
From: Haozhong Zhang <haozhong.zhang@intel.com>
Date: Wed, 20 Dec 2017 15:29:29 +0800
Subject: [PATCH 152/676] KVM: MMU: consider host cache mode in MMIO page check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some reserved pages, such as those from NVDIMM DAX devices, are not
for MMIO, and can be mapped with cached memory type for better
performance. However, the above check misconceives those pages as
MMIO.  Because KVM maps MMIO pages with UC memory type, the
performance of guest accesses to those pages would be harmed.
Therefore, we check the host memory type in addition and only treat
UC/UC-/WC pages as MMIO.

Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
Reported-by: Cuevas Escareno, Ivan D <ivan.d.cuevas.escareno@intel.com>
Reported-by: Kumar, Karthik <karthik.kumar@intel.com>
Reviewed-by: Xiao Guangrong <xiaoguangrong@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/mmu.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ff1e9ee259cf7..1f1da400fcdef 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -42,6 +42,7 @@
 #include <linux/kern_levels.h>
 
 #include <asm/page.h>
+#include <asm/pat.h>
 #include <asm/cmpxchg.h>
 #include <asm/io.h>
 #include <asm/vmx.h>
@@ -2708,7 +2709,18 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
 {
 	if (pfn_valid(pfn))
-		return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
+		return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
+			/*
+			 * Some reserved pages, such as those from NVDIMM
+			 * DAX devices, are not for MMIO, and can be mapped
+			 * with cached memory type for better performance.
+			 * However, the above check misconceives those pages
+			 * as MMIO, and results in KVM mapping them with UC
+			 * memory type, which would hurt the performance.
+			 * Therefore, we check the host memory type in addition
+			 * and only treat UC/UC-/WC pages as MMIO.
+			 */
+			(!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
 
 	return true;
 }

From a6cb099a43314ddb2b0214c122f61288d3d6c0ba Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Wed, 20 Dec 2017 12:50:28 +0100
Subject: [PATCH 153/676] kvm/vmx: Use local vmx variable in vmx_get_msr()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

... just like in vmx_set_msr().

No functionality change.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7b1d71665bc2f..14976cd4e0c10 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3248,6 +3248,7 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
  */
 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct shared_msr_entry *msr;
 
 	switch (msr_info->index) {
@@ -3259,8 +3260,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		msr_info->data = vmcs_readl(GUEST_GS_BASE);
 		break;
 	case MSR_KERNEL_GS_BASE:
-		vmx_load_host_state(to_vmx(vcpu));
-		msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
+		vmx_load_host_state(vmx);
+		msr_info->data = vmx->msr_guest_kernel_gs_base;
 		break;
 #endif
 	case MSR_EFER:
@@ -3286,13 +3287,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		break;
 	case MSR_IA32_MCG_EXT_CTL:
 		if (!msr_info->host_initiated &&
-		    !(to_vmx(vcpu)->msr_ia32_feature_control &
+		    !(vmx->msr_ia32_feature_control &
 		      FEATURE_CONTROL_LMCE))
 			return 1;
 		msr_info->data = vcpu->arch.mcg_ext_ctl;
 		break;
 	case MSR_IA32_FEATURE_CONTROL:
-		msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control;
+		msr_info->data = vmx->msr_ia32_feature_control;
 		break;
 	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
 		if (!nested_vmx_allowed(vcpu))
@@ -3309,7 +3310,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		/* Otherwise falls through */
 	default:
-		msr = find_msr_entry(to_vmx(vcpu), msr_info->index);
+		msr = find_msr_entry(vmx, msr_info->index);
 		if (msr) {
 			msr_info->data = msr->data;
 			break;

From 5c7d4f9ad39d980728b39752304ce10bb2960cbf Mon Sep 17 00:00:00 2001
From: Liran Alon <liran.alon@oracle.com>
Date: Sun, 19 Nov 2017 18:25:43 +0200
Subject: [PATCH 154/676] KVM: nVMX: Fix bug of injecting L2 exception into L1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

kvm_clear_exception_queue() should clear pending exception.
This also includes exceptions which were only marked pending but not
yet injected. This is because exception.pending is used for both L1
and L2 to determine if an exception should be raised to guest.
Note that an exception which is pending but not yet injected will
be raised again once the guest will be resumed.

Consider the following scenario:
1) L0 KVM with ignore_msrs=false.
2) L1 prepare vmcs12 with the following:
    a) No intercepts on MSR (MSR_BITMAP exist and is filled with 0).
    b) No intercept for #GP.
    c) vmx-preemption-timer is configured.
3) L1 enters into L2.
4) L2 reads an unhandled MSR that exists in MSR_BITMAP
(such as 0x1fff).

L2 RDMSR could be handled as described below:
1) L2 exits to L0 on RDMSR and calls handle_rdmsr().
2) handle_rdmsr() calls kvm_inject_gp() which sets
KVM_REQ_EVENT, exception.pending=true and exception.injected=false.
3) vcpu_enter_guest() consumes KVM_REQ_EVENT and calls
inject_pending_event() which calls vmx_check_nested_events()
which sees that exception.pending=true but
nested_vmx_check_exception() returns 0 and therefore does nothing at
this point. However let's assume it later sees vmx-preemption-timer
expired and therefore exits from L2 to L1 by calling
nested_vmx_vmexit().
4) nested_vmx_vmexit() calls prepare_vmcs12()
which calls vmcs12_save_pending_event() but it does nothing as
exception.injected is false. Also prepare_vmcs12() calls
kvm_clear_exception_queue() which does nothing as
exception.injected is already false.
5) We now return from vmx_check_nested_events() with 0 while still
having exception.pending=true!
6) Therefore inject_pending_event() continues
and we inject L2 exception to L1!...

This commit will fix above issue by changing step (4) to
clear exception.pending in kvm_clear_exception_queue().

Fixes: 664f8e26b00c ("KVM: X86: Fix loss of exception which has not yet been injected")
Signed-off-by: Liran Alon <liran.alon@oracle.com>
Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com>
Reviewed-by: Krish Sadhukhan <krish.sadhukhan@oracle.com>
Signed-off-by: Krish Sadhukhan <krish.sadhukhan@oracle.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 1 -
 arch/x86/kvm/x86.h | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 14976cd4e0c10..866c867b558a6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11052,7 +11052,6 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
 		if (block_nested_events)
 			return -EBUSY;
 		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
-		vcpu->arch.exception.pending = false;
 		return 0;
 	}
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index c69f973111cb2..b91215d1fd80d 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -12,6 +12,7 @@
 
 static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
 {
+	vcpu->arch.exception.pending = false;
 	vcpu->arch.exception.injected = false;
 }
 

From fa59cc003804f7606b3d0aa8e16509f9103f7377 Mon Sep 17 00:00:00 2001
From: Liran Alon <liran.alon@oracle.com>
Date: Sun, 24 Dec 2017 18:12:53 +0200
Subject: [PATCH 155/676] KVM: x86: Optimization: Create SVM stubs for
 sync_pir_to_irr()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

sync_pir_to_irr() is only called if vcpu->arch.apicv_active()==true.
In case it is false, VMX code make sure to set sync_pir_to_irr
to NULL.

Therefore, having SVM stubs allows to remove check for if
sync_pir_to_irr != NULL from all calling sites.

Signed-off-by: Liran Alon <liran.alon@oracle.com>
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com>
Reviewed-by: Liam Merwick <liam.merwick@oracle.com>
[Return highest IRR in the SVM case. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/lapic.c |  2 +-
 arch/x86/kvm/svm.c   |  1 +
 arch/x86/kvm/x86.c   | 10 ++++------
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index e2c1fb8d35cea..0928608750e37 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -581,7 +581,7 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
 static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
 {
 	int highest_irr;
-	if (kvm_x86_ops->sync_pir_to_irr && apic->vcpu->arch.apicv_active)
+	if (apic->vcpu->arch.apicv_active)
 		highest_irr = kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
 	else
 		highest_irr = apic_find_highest_irr(apic);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5d83f0474020c..b613d331d0310 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -6740,6 +6740,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.load_eoi_exitmap = svm_load_eoi_exitmap,
 	.hwapic_irr_update = svm_hwapic_irr_update,
 	.hwapic_isr_update = svm_hwapic_isr_update,
+	.sync_pir_to_irr = kvm_lapic_find_highest_irr,
 	.apicv_post_state_restore = avic_post_state_restore,
 
 	.set_tss_addr = svm_set_tss_addr,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6d2e1459adc97..0204b2b8a293f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2973,7 +2973,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 				    struct kvm_lapic_state *s)
 {
-	if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
+	if (vcpu->arch.apicv_active)
 		kvm_x86_ops->sync_pir_to_irr(vcpu);
 
 	return kvm_apic_get_state(vcpu, s);
@@ -6820,7 +6820,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 	if (irqchip_split(vcpu->kvm))
 		kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
 	else {
-		if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
+		if (vcpu->arch.apicv_active)
 			kvm_x86_ops->sync_pir_to_irr(vcpu);
 		kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
 	}
@@ -7046,10 +7046,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	 * This handles the case where a posted interrupt was
 	 * notified with kvm_vcpu_kick.
 	 */
-	if (kvm_lapic_enabled(vcpu)) {
-		if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
-			kvm_x86_ops->sync_pir_to_irr(vcpu);
-	}
+	if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
+		kvm_x86_ops->sync_pir_to_irr(vcpu);
 
 	if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
 	    || need_resched() || signal_pending(current)) {

From e7387b0e27ec3a203bda4910dc4a107f6c5f912f Mon Sep 17 00:00:00 2001
From: Liran Alon <liran.alon@oracle.com>
Date: Sun, 24 Dec 2017 18:12:54 +0200
Subject: [PATCH 156/676] KVM: x86: Change __kvm_apic_update_irr() to also
 return if max IRR updated
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit doesn't change semantics.
It is done as a preparation for future commits.

Signed-off-by: Liran Alon <liran.alon@oracle.com>
Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com>
Reviewed-by: Liam Merwick <liam.merwick@oracle.com>
Signed-off-by: Liam Merwick <liam.merwick@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/lapic.c | 23 ++++++++++++++++-------
 arch/x86/kvm/lapic.h |  4 ++--
 arch/x86/kvm/vmx.c   |  5 +++--
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0928608750e37..924ac8ce9d500 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -364,32 +364,41 @@ static u8 count_vectors(void *bitmap)
 	return count;
 }
 
-int __kvm_apic_update_irr(u32 *pir, void *regs)
+bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr)
 {
 	u32 i, vec;
-	u32 pir_val, irr_val;
-	int max_irr = -1;
+	u32 pir_val, irr_val, prev_irr_val;
+	int max_updated_irr;
+
+	max_updated_irr = -1;
+	*max_irr = -1;
 
 	for (i = vec = 0; i <= 7; i++, vec += 32) {
 		pir_val = READ_ONCE(pir[i]);
 		irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10));
 		if (pir_val) {
+			prev_irr_val = irr_val;
 			irr_val |= xchg(&pir[i], 0);
 			*((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val;
+			if (prev_irr_val != irr_val) {
+				max_updated_irr =
+					__fls(irr_val ^ prev_irr_val) + vec;
+			}
 		}
 		if (irr_val)
-			max_irr = __fls(irr_val) + vec;
+			*max_irr = __fls(irr_val) + vec;
 	}
 
-	return max_irr;
+	return ((max_updated_irr != -1) &&
+		(max_updated_irr == *max_irr));
 }
 EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
 
-int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	return __kvm_apic_update_irr(pir, apic->regs);
+	return __kvm_apic_update_irr(pir, apic->regs, max_irr);
 }
 EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
 
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 4b9935a383479..56c36014f7b76 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -75,8 +75,8 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
 bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 			   int short_hand, unsigned int dest, int dest_mode);
 
-int __kvm_apic_update_irr(u32 *pir, void *regs);
-int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
+bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr);
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr);
 void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
 		     struct dest_map *dest_map);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 866c867b558a6..5ea482bb1b9c1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5085,7 +5085,8 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 	max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
 	if (max_irr != 256) {
 		vapic_page = kmap(vmx->nested.virtual_apic_page);
-		__kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
+		__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
+			vapic_page, &max_irr);
 		kunmap(vmx->nested.virtual_apic_page);
 
 		status = vmcs_read16(GUEST_INTR_STATUS);
@@ -8986,7 +8987,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 		 * But on x86 this is just a compiler barrier anyway.
 		 */
 		smp_mb__after_atomic();
-		max_irr = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
+		kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
 	} else {
 		max_irr = kvm_lapic_find_highest_irr(vcpu);
 	}

From f27a85c4988d408a2918a3bcbc3d7fe4fb2dc2b3 Mon Sep 17 00:00:00 2001
From: Liran Alon <liran.alon@oracle.com>
Date: Sun, 24 Dec 2017 18:12:55 +0200
Subject: [PATCH 157/676] KVM: nVMX: Re-evaluate L1 pending events when running
 L2 and L1 got posted-interrupt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In case posted-interrupt was delivered to CPU while it is in host
(outside guest), then posted-interrupt delivery will be done by
calling sync_pir_to_irr() at vmentry after interrupts are disabled.

sync_pir_to_irr() will check vmx->pi_desc.control ON bit and if
set, it will sync vmx->pi_desc.pir to IRR and afterwards update RVI to
ensure virtual-interrupt-delivery will dispatch interrupt to guest.

However, it is possible that L1 will receive a posted-interrupt while
CPU runs at host and is about to enter L2. In this case, the call to
sync_pir_to_irr() will indeed update the L1's APIC IRR but
vcpu_enter_guest() will then just resume into L2 guest without
re-evaluating if it should exit from L2 to L1 as a result of this
new pending L1 event.

To address this case, if sync_pir_to_irr() has a new L1 injectable
interrupt and CPU is running L2, we force exit GUEST_MODE which will
result in another iteration of vcpu_run() run loop which will call
kvm_vcpu_running() which will call check_nested_events() which will
handle the pending L1 event properly.

Signed-off-by: Liran Alon <liran.alon@oracle.com>
Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com>
Reviewed-by: Krish Sadhukhan <krish.sadhukhan@oracle.com>
Reviewed-by: Liam Merwick <liam.merwick@oracle.com>
Signed-off-by: Liam Merwick <liam.merwick@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5ea482bb1b9c1..5fe94e375d2d5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8978,6 +8978,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	int max_irr;
+	bool max_irr_updated;
 
 	WARN_ON(!vcpu->arch.apicv_active);
 	if (pi_test_on(&vmx->pi_desc)) {
@@ -8987,7 +8988,16 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 		 * But on x86 this is just a compiler barrier anyway.
 		 */
 		smp_mb__after_atomic();
-		kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
+		max_irr_updated =
+			kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
+
+		/*
+		 * If we are running L2 and L1 has a new pending interrupt
+		 * which can be injected, we should re-evaluate
+		 * what should be done with this new L1 interrupt.
+		 */
+		if (is_guest_mode(vcpu) && max_irr_updated)
+			kvm_vcpu_exiting_guest_mode(vcpu);
 	} else {
 		max_irr = kvm_lapic_find_highest_irr(vcpu);
 	}

From 851c1a18c5412fd321e387cfe60739387cdbf37d Mon Sep 17 00:00:00 2001
From: Liran Alon <liran.alon@oracle.com>
Date: Sun, 24 Dec 2017 18:12:56 +0200
Subject: [PATCH 158/676] KVM: nVMX: Fix injection to L2 when L1 don't
 intercept external-interrupts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before each vmentry to guest, vcpu_enter_guest() calls sync_pir_to_irr()
which calls vmx_hwapic_irr_update() to update RVI.
Currently, vmx_hwapic_irr_update() contains a tweak in case it is called
when CPU is running L2 and L1 don't intercept external-interrupts.
In that case, code injects interrupt directly into L2 instead of
updating RVI.

Besides being hacky (wouldn't expect function updating RVI to also
inject interrupt), it also doesn't handle this case correctly.
The code contains several issues:
1. When code calls kvm_queue_interrupt() it just passes it max_irr which
represents the highest IRR currently pending in L1 LAPIC.
This is problematic as interrupt was injected to guest but it's bit is
still set in LAPIC IRR instead of being cleared from IRR and set in ISR.
2. Code doesn't check if LAPIC PPR is set to accept an interrupt of
max_irr priority. It just checks if interrupts are enabled in guest with
vmx_interrupt_allowed().

To fix the above issues:
1. Simplify vmx_hwapic_irr_update() to just update RVI.
Note that this shouldn't happen when CPU is running L2
(See comment in code).
2. Since now vmx_hwapic_irr_update() only does logic for L1
virtual-interrupt-delivery, inject_pending_event() should be the
one responsible for injecting the interrupt directly into L2.
Therefore, change kvm_cpu_has_injectable_intr() to check L1
LAPIC when CPU is running L2.
3. Change vmx_sync_pir_to_irr() to set KVM_REQ_EVENT when L1
has a pending injectable interrupt.

Fixes: 963fee165660 ("KVM: nVMX: Fix virtual interrupt delivery
injection")

Signed-off-by: Liran Alon <liran.alon@oracle.com>
Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com>
Reviewed-by: Krish Sadhukhan <krish.sadhukhan@oracle.com>
Reviewed-by: Liam Merwick <liam.merwick@oracle.com>
Signed-off-by: Liam Merwick <liam.merwick@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/irq.c |  2 +-
 arch/x86/kvm/vmx.c | 41 +++++++++++++++++------------------------
 2 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 5c24811e8b0bc..f171051eecf34 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -79,7 +79,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
 	if (kvm_cpu_has_extint(v))
 		return 1;
 
-	if (kvm_vcpu_apicv_active(v))
+	if (!is_guest_mode(v) && kvm_vcpu_apicv_active(v))
 		return 0;
 
 	return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5fe94e375d2d5..2a96b1abe77c8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8948,30 +8948,16 @@ static void vmx_set_rvi(int vector)
 
 static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
 {
-	if (!is_guest_mode(vcpu)) {
-		vmx_set_rvi(max_irr);
-		return;
-	}
-
-	if (max_irr == -1)
-		return;
-
 	/*
-	 * In guest mode.  If a vmexit is needed, vmx_check_nested_events
-	 * handles it.
+	 * When running L2, updating RVI is only relevant when
+	 * vmcs12 virtual-interrupt-delivery enabled.
+	 * However, it can be enabled only when L1 also
+	 * intercepts external-interrupts and in that case
+	 * we should not update vmcs02 RVI but instead intercept
+	 * interrupt. Therefore, do nothing when running L2.
 	 */
-	if (nested_exit_on_intr(vcpu))
-		return;
-
-	/*
-	 * Else, fall back to pre-APICv interrupt injection since L2
-	 * is run without virtual interrupt delivery.
-	 */
-	if (!kvm_event_needs_reinjection(vcpu) &&
-	    vmx_interrupt_allowed(vcpu)) {
-		kvm_queue_interrupt(vcpu, max_irr, false);
-		vmx_inject_irq(vcpu);
-	}
+	if (!is_guest_mode(vcpu))
+		vmx_set_rvi(max_irr);
 }
 
 static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
@@ -8995,9 +8981,16 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 		 * If we are running L2 and L1 has a new pending interrupt
 		 * which can be injected, we should re-evaluate
 		 * what should be done with this new L1 interrupt.
+		 * If L1 intercepts external-interrupts, we should
+		 * exit from L2 to L1. Otherwise, interrupt should be
+		 * delivered directly to L2.
 		 */
-		if (is_guest_mode(vcpu) && max_irr_updated)
-			kvm_vcpu_exiting_guest_mode(vcpu);
+		if (is_guest_mode(vcpu) && max_irr_updated) {
+			if (nested_exit_on_intr(vcpu))
+				kvm_vcpu_exiting_guest_mode(vcpu);
+			else
+				kvm_make_request(KVM_REQ_EVENT, vcpu);
+		}
 	} else {
 		max_irr = kvm_lapic_find_highest_irr(vcpu);
 	}

From 6b6977117f50d60455ace86b2d256f6fb4f3de05 Mon Sep 17 00:00:00 2001
From: Liran Alon <liran.alon@oracle.com>
Date: Thu, 9 Nov 2017 20:27:20 +0200
Subject: [PATCH 159/676] KVM: nVMX: Fix races when sending nested PI while
 dest enters/leaves L2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Consider the following scenario:
1. CPU A calls vmx_deliver_nested_posted_interrupt() to send an IPI
to CPU B via virtual posted-interrupt mechanism.
2. CPU B is currently executing L2 guest.
3. vmx_deliver_nested_posted_interrupt() calls
kvm_vcpu_trigger_posted_interrupt() which will note that
vcpu->mode == IN_GUEST_MODE.
4. Assume that before CPU A sends the physical POSTED_INTR_NESTED_VECTOR
IPI, CPU B exits from L2 to L0 during event-delivery
(valid IDT-vectoring-info).
5. CPU A now sends the physical IPI. The IPI is received in host and
it's handler (smp_kvm_posted_intr_nested_ipi()) does nothing.
6. Assume that before CPU A sets pi_pending=true and KVM_REQ_EVENT,
CPU B continues to run in L0 and reach vcpu_enter_guest(). As
KVM_REQ_EVENT is not set yet, vcpu_enter_guest() will continue and resume
L2 guest.
7. At this point, CPU A sets pi_pending=true and KVM_REQ_EVENT but
it's too late! CPU B already entered L2 and KVM_REQ_EVENT will only be
consumed at next L2 entry!

Another scenario to consider:
1. CPU A calls vmx_deliver_nested_posted_interrupt() to send an IPI
to CPU B via virtual posted-interrupt mechanism.
2. Assume that before CPU A calls kvm_vcpu_trigger_posted_interrupt(),
CPU B is at L0 and is about to resume into L2. Further assume that it is
in vcpu_enter_guest() after check for KVM_REQ_EVENT.
3. At this point, CPU A calls kvm_vcpu_trigger_posted_interrupt() which
will note that vcpu->mode != IN_GUEST_MODE. Therefore, do nothing and
return false. Then, will set pi_pending=true and KVM_REQ_EVENT.
4. Now CPU B continue and resumes into L2 guest without processing
the posted-interrupt until next L2 entry!

To fix both issues, we just need to change
vmx_deliver_nested_posted_interrupt() to set pi_pending=true and
KVM_REQ_EVENT before calling kvm_vcpu_trigger_posted_interrupt().

It will fix the first scenario by chaging step (6) to note that
KVM_REQ_EVENT and pi_pending=true and therefore process
nested posted-interrupt.

It will fix the second scenario by two possible ways:
1. If kvm_vcpu_trigger_posted_interrupt() is called while CPU B has changed
vcpu->mode to IN_GUEST_MODE, physical IPI will be sent and will be received
when CPU resumes into L2.
2. If kvm_vcpu_trigger_posted_interrupt() is called while CPU B hasn't yet
changed vcpu->mode to IN_GUEST_MODE, then after CPU B will change
vcpu->mode it will call kvm_request_pending() which will return true and
therefore force another round of vcpu_enter_guest() which will note that
KVM_REQ_EVENT and pi_pending=true and therefore process nested
posted-interrupt.

Cc: stable@vger.kernel.org
Fixes: 705699a13994 ("KVM: nVMX: Enable nested posted interrupt processing")
Signed-off-by: Liran Alon <liran.alon@oracle.com>
Reviewed-by: Nikita Leshenko <nikita.leshchenko@oracle.com>
Reviewed-by: Krish Sadhukhan <krish.sadhukhan@oracle.com>
[Add kvm_vcpu_kick to also handle the case where L1 doesn't intercept L2 HLT
 and L2 executes HLT instruction. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2a96b1abe77c8..2c8749ee3221d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5146,14 +5146,15 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
 
 	if (is_guest_mode(vcpu) &&
 	    vector == vmx->nested.posted_intr_nv) {
-		/* the PIR and ON have been set by L1. */
-		kvm_vcpu_trigger_posted_interrupt(vcpu, true);
 		/*
 		 * If a posted intr is not recognized by hardware,
 		 * we will accomplish it in the next vmentry.
 		 */
 		vmx->nested.pi_pending = true;
 		kvm_make_request(KVM_REQ_EVENT, vcpu);
+		/* the PIR and ON have been set by L1. */
+		if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
+			kvm_vcpu_kick(vcpu);
 		return 0;
 	}
 	return -1;

From c5d167b27e00026711ad19a33a23d5d3d562148a Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 13 Dec 2017 11:05:19 +0100
Subject: [PATCH 160/676] KVM: vmx: shadow more fields that are read/written on
 every vmexits
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Compared to when VMCS shadowing was added to KVM, we are reading/writing
a few more fields: the PML index, the interrupt status and the preemption
timer value.  The first two are because we are exposing more features
to nested guests, the preemption timer is simply because we have grown
a new optimization.  Adding them to the shadow VMCS field lists reduces
the cost of a vmexit by about 1000 clock cycles for each field that exists
on bare metal.

On the other hand, the guest BNDCFGS and TSC offset are not written on
fast paths, so remove them.

Suggested-by: Jim Mattson <jmattson@google.com>
Cc: Jim Mattson <jmattson@google.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2c8749ee3221d..7b114a293c663 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -726,11 +726,12 @@ static unsigned long shadow_read_write_fields[] = {
 	GUEST_CS_LIMIT,
 	GUEST_CS_BASE,
 	GUEST_ES_BASE,
-	GUEST_BNDCFGS,
+	GUEST_PML_INDEX,
+	GUEST_INTR_STATUS,
+	VMX_PREEMPTION_TIMER_VALUE,
 	CR0_GUEST_HOST_MASK,
 	CR0_READ_SHADOW,
 	CR4_READ_SHADOW,
-	TSC_OFFSET,
 	EXCEPTION_BITMAP,
 	CPU_BASED_VM_EXEC_CONTROL,
 	VM_ENTRY_EXCEPTION_ERROR_CODE,
@@ -3903,9 +3904,22 @@ static void init_vmcs_shadow_fields(void)
 	/* No checks for read only fields yet */
 
 	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
+		/*
+		 * PML and the preemption timer can be emulated, but the
+		 * processor cannot vmwrite to fields that don't exist
+		 * on bare metal.
+		 */
 		switch (shadow_read_write_fields[i]) {
-		case GUEST_BNDCFGS:
-			if (!kvm_mpx_supported())
+		case GUEST_PML_INDEX:
+			if (!cpu_has_vmx_pml())
+				continue;
+			break;
+		case VMX_PREEMPTION_TIMER_VALUE:
+			if (!cpu_has_vmx_preemption_timer())
+				continue;
+			break;
+		case GUEST_INTR_STATUS:
+			if (!cpu_has_vmx_apicv())
 				continue;
 			break;
 		default:
@@ -6802,11 +6816,6 @@ static __init int hardware_setup(void)
 		!(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
 		enable_vpid = 0;
 
-	if (!cpu_has_vmx_shadow_vmcs())
-		enable_shadow_vmcs = 0;
-	if (enable_shadow_vmcs)
-		init_vmcs_shadow_fields();
-
 	if (!cpu_has_vmx_ept() ||
 	    !cpu_has_vmx_ept_4levels() ||
 	    !cpu_has_vmx_ept_mt_wb() ||
@@ -6926,6 +6935,11 @@ static __init int hardware_setup(void)
 		kvm_x86_ops->cancel_hv_timer = NULL;
 	}
 
+	if (!cpu_has_vmx_shadow_vmcs())
+		enable_shadow_vmcs = 0;
+	if (enable_shadow_vmcs)
+		init_vmcs_shadow_fields();
+
 	kvm_set_posted_intr_wakeup_handler(wakeup_handler);
 
 	kvm_mce_cap_supported |= MCG_LMCE_P;

From 44900ba65e16ab3c6608e105654f38f54d030caa Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 13 Dec 2017 12:58:02 +0100
Subject: [PATCH 161/676] KVM: VMX: optimize shadow VMCS copying
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Because all fields can be read/written with a single vmread/vmwrite on
64-bit kernels, the switch statements in copy_vmcs12_to_shadow and
copy_shadow_to_vmcs12 are unnecessary.

What I did in this patch is to copy the two parts of 64-bit fields
separately on 32-bit kernels, to keep all complicated #ifdef-ery
in init_vmcs_shadow_fields.  The disadvantage is that 64-bit fields
have to be listed separately in shadow_read_only/read_write_fields,
but those are few and we can validate the arrays when building the
VMREAD and VMWRITE bitmaps.  This saves a few hundred clock cycles
per nested vmexit.

However there is still a "switch" in vmcs_read_any and vmcs_write_any.
So, while at it, this patch reorders the fields by type, hoping that
the branch predictor appreciates it.

Cc: Jim Mattson <jmattson@google.com>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 143 +++++++++++++++++++++------------------------
 1 file changed, 65 insertions(+), 78 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7b114a293c663..bdd3f155efae0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -686,7 +686,7 @@ static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
 				[number##_HIGH] = VMCS12_OFFSET(name)+4
 
 
-static unsigned long shadow_read_only_fields[] = {
+static u16 shadow_read_only_fields[] = {
 	/*
 	 * We do NOT shadow fields that are modified when L0
 	 * traps and emulates any vmx instruction (e.g. VMPTRLD,
@@ -699,49 +699,59 @@ static unsigned long shadow_read_only_fields[] = {
 	 * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
 	 * by nested_vmx_failValid)
 	 */
+	/* 32-bits */
 	VM_EXIT_REASON,
 	VM_EXIT_INTR_INFO,
 	VM_EXIT_INSTRUCTION_LEN,
 	IDT_VECTORING_INFO_FIELD,
 	IDT_VECTORING_ERROR_CODE,
 	VM_EXIT_INTR_ERROR_CODE,
+
+	/* Natural width */
 	EXIT_QUALIFICATION,
 	GUEST_LINEAR_ADDRESS,
-	GUEST_PHYSICAL_ADDRESS
+
+	/* 64-bit */
+	GUEST_PHYSICAL_ADDRESS,
+	GUEST_PHYSICAL_ADDRESS_HIGH,
 };
 static int max_shadow_read_only_fields =
 	ARRAY_SIZE(shadow_read_only_fields);
 
-static unsigned long shadow_read_write_fields[] = {
+static u16 shadow_read_write_fields[] = {
+	/* 16-bits */
+	GUEST_CS_SELECTOR,
+	GUEST_INTR_STATUS,
+	GUEST_PML_INDEX,
+	HOST_FS_SELECTOR,
+	HOST_GS_SELECTOR,
+
+	/* 32-bits */
+	CPU_BASED_VM_EXEC_CONTROL,
+	EXCEPTION_BITMAP,
+	VM_ENTRY_EXCEPTION_ERROR_CODE,
+	VM_ENTRY_INTR_INFO_FIELD,
+	VM_ENTRY_INSTRUCTION_LEN,
 	TPR_THRESHOLD,
+	GUEST_CS_LIMIT,
+	GUEST_CS_AR_BYTES,
+	GUEST_INTERRUPTIBILITY_INFO,
+	VMX_PREEMPTION_TIMER_VALUE,
+
+	/* Natural width */
 	GUEST_RIP,
 	GUEST_RSP,
 	GUEST_CR0,
 	GUEST_CR3,
 	GUEST_CR4,
-	GUEST_INTERRUPTIBILITY_INFO,
 	GUEST_RFLAGS,
-	GUEST_CS_SELECTOR,
-	GUEST_CS_AR_BYTES,
-	GUEST_CS_LIMIT,
 	GUEST_CS_BASE,
 	GUEST_ES_BASE,
-	GUEST_PML_INDEX,
-	GUEST_INTR_STATUS,
-	VMX_PREEMPTION_TIMER_VALUE,
 	CR0_GUEST_HOST_MASK,
 	CR0_READ_SHADOW,
 	CR4_READ_SHADOW,
-	EXCEPTION_BITMAP,
-	CPU_BASED_VM_EXEC_CONTROL,
-	VM_ENTRY_EXCEPTION_ERROR_CODE,
-	VM_ENTRY_INTR_INFO_FIELD,
-	VM_ENTRY_INSTRUCTION_LEN,
-	VM_ENTRY_EXCEPTION_ERROR_CODE,
 	HOST_FS_BASE,
 	HOST_GS_BASE,
-	HOST_FS_SELECTOR,
-	HOST_GS_SELECTOR
 };
 static int max_shadow_read_write_fields =
 	ARRAY_SIZE(shadow_read_write_fields);
@@ -3901,15 +3911,39 @@ static void init_vmcs_shadow_fields(void)
 {
 	int i, j;
 
-	/* No checks for read only fields yet */
+	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
+		u16 field = shadow_read_only_fields[i];
+		if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64 &&
+		    (i + 1 == max_shadow_read_only_fields ||
+		     shadow_read_only_fields[i + 1] != field + 1))
+			pr_err("Missing field from shadow_read_only_field %x\n",
+			       field + 1);
+
+		clear_bit(field, vmx_vmread_bitmap);
+#ifdef CONFIG_X86_64
+		if (field & 1)
+			continue;
+#endif
+		if (j < i)
+			shadow_read_only_fields[j] = field;
+		j++;
+	}
+	max_shadow_read_only_fields = j;
 
 	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
+		u16 field = shadow_read_write_fields[i];
+		if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64 &&
+		    (i + 1 == max_shadow_read_write_fields ||
+		     shadow_read_write_fields[i + 1] != field + 1))
+			pr_err("Missing field from shadow_read_write_field %x\n",
+			       field + 1);
+
 		/*
 		 * PML and the preemption timer can be emulated, but the
 		 * processor cannot vmwrite to fields that don't exist
 		 * on bare metal.
 		 */
-		switch (shadow_read_write_fields[i]) {
+		switch (field) {
 		case GUEST_PML_INDEX:
 			if (!cpu_has_vmx_pml())
 				continue;
@@ -3926,31 +3960,17 @@ static void init_vmcs_shadow_fields(void)
 			break;
 		}
 
+		clear_bit(field, vmx_vmwrite_bitmap);
+		clear_bit(field, vmx_vmread_bitmap);
+#ifdef CONFIG_X86_64
+		if (field & 1)
+			continue;
+#endif
 		if (j < i)
-			shadow_read_write_fields[j] =
-				shadow_read_write_fields[i];
+			shadow_read_write_fields[j] = field;
 		j++;
 	}
 	max_shadow_read_write_fields = j;
-
-	/* shadowed fields guest access without vmexit */
-	for (i = 0; i < max_shadow_read_write_fields; i++) {
-		unsigned long field = shadow_read_write_fields[i];
-
-		clear_bit(field, vmx_vmwrite_bitmap);
-		clear_bit(field, vmx_vmread_bitmap);
-		if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) {
-			clear_bit(field + 1, vmx_vmwrite_bitmap);
-			clear_bit(field + 1, vmx_vmread_bitmap);
-		}
-	}
-	for (i = 0; i < max_shadow_read_only_fields; i++) {
-		unsigned long field = shadow_read_only_fields[i];
-
-		clear_bit(field, vmx_vmread_bitmap);
-		if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64)
-			clear_bit(field + 1, vmx_vmread_bitmap);
-	}
 }
 
 static __init int alloc_kvm_area(void)
@@ -7538,7 +7558,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
 	unsigned long field;
 	u64 field_value;
 	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
-	const unsigned long *fields = shadow_read_write_fields;
+	const u16 *fields = shadow_read_write_fields;
 	const int num_fields = max_shadow_read_write_fields;
 
 	preempt_disable();
@@ -7547,23 +7567,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
 
 	for (i = 0; i < num_fields; i++) {
 		field = fields[i];
-		switch (vmcs_field_type(field)) {
-		case VMCS_FIELD_TYPE_U16:
-			field_value = vmcs_read16(field);
-			break;
-		case VMCS_FIELD_TYPE_U32:
-			field_value = vmcs_read32(field);
-			break;
-		case VMCS_FIELD_TYPE_U64:
-			field_value = vmcs_read64(field);
-			break;
-		case VMCS_FIELD_TYPE_NATURAL_WIDTH:
-			field_value = vmcs_readl(field);
-			break;
-		default:
-			WARN_ON(1);
-			continue;
-		}
+		field_value = __vmcs_readl(field);
 		vmcs12_write_any(&vmx->vcpu, field, field_value);
 	}
 
@@ -7575,7 +7579,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
 
 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
 {
-	const unsigned long *fields[] = {
+	const u16 *fields[] = {
 		shadow_read_write_fields,
 		shadow_read_only_fields
 	};
@@ -7594,24 +7598,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
 		for (i = 0; i < max_fields[q]; i++) {
 			field = fields[q][i];
 			vmcs12_read_any(&vmx->vcpu, field, &field_value);
-
-			switch (vmcs_field_type(field)) {
-			case VMCS_FIELD_TYPE_U16:
-				vmcs_write16(field, (u16)field_value);
-				break;
-			case VMCS_FIELD_TYPE_U32:
-				vmcs_write32(field, (u32)field_value);
-				break;
-			case VMCS_FIELD_TYPE_U64:
-				vmcs_write64(field, (u64)field_value);
-				break;
-			case VMCS_FIELD_TYPE_NATURAL_WIDTH:
-				vmcs_writel(field, (long)field_value);
-				break;
-			default:
-				WARN_ON(1);
-				break;
-			}
+			__vmcs_writel(field, field_value);
 		}
 	}
 

From 5b15706dbf5bdf0c9708ca9bb1f37e95e315daf2 Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Fri, 22 Dec 2017 12:11:12 -0800
Subject: [PATCH 162/676] kvm: vmx: Introduce VMCS12_MAX_FIELD_INDEX
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is the highest index value used in any supported VMCS12 field
encoding. It is used to populate the IA32_VMX_VMCS_ENUM MSR.

Signed-off-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bdd3f155efae0..bbfbed714decd 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -407,6 +407,12 @@ struct __packed vmcs12 {
  */
 #define VMCS12_SIZE 0x1000
 
+/*
+ * VMCS12_MAX_FIELD_INDEX is the highest index value used in any
+ * supported VMCS12 field encoding.
+ */
+#define VMCS12_MAX_FIELD_INDEX 0x17
+
 /*
  * The nested_vmx structure is part of vcpu_vmx, and holds information we need
  * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -2923,7 +2929,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 	rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1);
 
 	/* highest index: VMX_PREEMPTION_TIMER_VALUE */
-	vmx->nested.nested_vmx_vmcs_enum = 0x2e;
+	vmx->nested.nested_vmx_vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
 }
 
 /*

From d37f4267a79f7d423103cb9fdd67a4a3a8194335 Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Fri, 22 Dec 2017 12:12:16 -0800
Subject: [PATCH 163/676] kvm: vmx: Change vmcs_field_type to vmcs_field_width
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per the SDM, "[VMCS] Fields are grouped by width (16-bit, 32-bit,
etc.) and type (guest-state, host-state, etc.)." Previously, the width
was indicated by vmcs_field_type. To avoid confusion when we start
dealing with both field width and field type, change vmcs_field_type
to vmcs_field_width.

Signed-off-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bbfbed714decd..f1d41dd3744c1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3894,17 +3894,17 @@ static void free_kvm_area(void)
 	}
 }
 
-enum vmcs_field_type {
-	VMCS_FIELD_TYPE_U16 = 0,
-	VMCS_FIELD_TYPE_U64 = 1,
-	VMCS_FIELD_TYPE_U32 = 2,
-	VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
+enum vmcs_field_width {
+	VMCS_FIELD_WIDTH_U16 = 0,
+	VMCS_FIELD_WIDTH_U64 = 1,
+	VMCS_FIELD_WIDTH_U32 = 2,
+	VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3
 };
 
-static inline int vmcs_field_type(unsigned long field)
+static inline int vmcs_field_width(unsigned long field)
 {
 	if (0x1 & field)	/* the *_HIGH fields are all 32 bit */
-		return VMCS_FIELD_TYPE_U32;
+		return VMCS_FIELD_WIDTH_U32;
 	return (field >> 13) & 0x3 ;
 }
 
@@ -3919,7 +3919,7 @@ static void init_vmcs_shadow_fields(void)
 
 	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
 		u16 field = shadow_read_only_fields[i];
-		if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64 &&
+		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
 		    (i + 1 == max_shadow_read_only_fields ||
 		     shadow_read_only_fields[i + 1] != field + 1))
 			pr_err("Missing field from shadow_read_only_field %x\n",
@@ -3938,7 +3938,7 @@ static void init_vmcs_shadow_fields(void)
 
 	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
 		u16 field = shadow_read_write_fields[i];
-		if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64 &&
+		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
 		    (i + 1 == max_shadow_read_write_fields ||
 		     shadow_read_write_fields[i + 1] != field + 1))
 			pr_err("Missing field from shadow_read_write_field %x\n",
@@ -7511,17 +7511,17 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu,
 
 	p = ((char *)(get_vmcs12(vcpu))) + offset;
 
-	switch (vmcs_field_type(field)) {
-	case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+	switch (vmcs_field_width(field)) {
+	case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
 		*ret = *((natural_width *)p);
 		return 0;
-	case VMCS_FIELD_TYPE_U16:
+	case VMCS_FIELD_WIDTH_U16:
 		*ret = *((u16 *)p);
 		return 0;
-	case VMCS_FIELD_TYPE_U32:
+	case VMCS_FIELD_WIDTH_U32:
 		*ret = *((u32 *)p);
 		return 0;
-	case VMCS_FIELD_TYPE_U64:
+	case VMCS_FIELD_WIDTH_U64:
 		*ret = *((u64 *)p);
 		return 0;
 	default:
@@ -7538,17 +7538,17 @@ static inline int vmcs12_write_any(struct kvm_vcpu *vcpu,
 	if (offset < 0)
 		return offset;
 
-	switch (vmcs_field_type(field)) {
-	case VMCS_FIELD_TYPE_U16:
+	switch (vmcs_field_width(field)) {
+	case VMCS_FIELD_WIDTH_U16:
 		*(u16 *)p = field_value;
 		return 0;
-	case VMCS_FIELD_TYPE_U32:
+	case VMCS_FIELD_WIDTH_U32:
 		*(u32 *)p = field_value;
 		return 0;
-	case VMCS_FIELD_TYPE_U64:
+	case VMCS_FIELD_WIDTH_U64:
 		*(u64 *)p = field_value;
 		return 0;
-	case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+	case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
 		*(natural_width *)p = field_value;
 		return 0;
 	default:

From 58e9ffae5efd3ee9b655ee36702f02c36d51ead9 Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Fri, 22 Dec 2017 12:13:13 -0800
Subject: [PATCH 164/676] kvm: vmx: Reduce size of vmcs_field_to_offset_table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The vmcs_field_to_offset_table was a rather sparse table of short
integers with a maximum index of 0x6c16, amounting to 55342 bytes. Now
that we are considering support for multiple VMCS12 formats, it would
be unfortunate to replicate that large, sparse table. Rotating the
field encoding (as a 16-bit integer) left by 6 reduces that table to
5926 bytes.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f1d41dd3744c1..d49bb747ebea5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -686,10 +686,12 @@ static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
 	return &(to_vmx(vcpu)->pi_desc);
 }
 
+#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
 #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
-#define FIELD(number, name)	[number] = VMCS12_OFFSET(name)
-#define FIELD64(number, name)	[number] = VMCS12_OFFSET(name), \
-				[number##_HIGH] = VMCS12_OFFSET(name)+4
+#define FIELD(number, name)	[ROL16(number, 6)] = VMCS12_OFFSET(name)
+#define FIELD64(number, name)						\
+	FIELD(number, name),						\
+	[ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
 
 
 static u16 shadow_read_only_fields[] = {
@@ -908,9 +910,13 @@ static const unsigned short vmcs_field_to_offset_table[] = {
 
 static inline short vmcs_field_to_offset(unsigned long field)
 {
-	BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
+	unsigned index;
+
+	if (field >> 15)
+		return -ENOENT;
 
-	if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
+	index = ROL16(field, 6);
+	if (index >= ARRAY_SIZE(vmcs_field_to_offset_table))
 		return -ENOENT;
 
 	/*
@@ -919,10 +925,10 @@ static inline short vmcs_field_to_offset(unsigned long field)
 	 */
 	asm("lfence");
 
-	if (vmcs_field_to_offset_table[field] == 0)
+	if (vmcs_field_to_offset_table[index] == 0)
 		return -ENOENT;
 
-	return vmcs_field_to_offset_table[field];
+	return vmcs_field_to_offset_table[index];
 }
 
 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)

From c9e9deae76b8fbbd48e7357247689016bb0d6242 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 20 Dec 2017 13:16:29 +0100
Subject: [PATCH 165/676] KVM: VMX: split list of shadowed VMCS field to a
 separate file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prepare for multiple inclusions of the list.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c               | 64 ++--------------------------
 arch/x86/kvm/vmx_shadow_fields.h | 71 ++++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 60 deletions(-)
 create mode 100644 arch/x86/kvm/vmx_shadow_fields.h

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d49bb747ebea5..4530b2ba63b98 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -695,71 +695,15 @@ static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
 
 
 static u16 shadow_read_only_fields[] = {
-	/*
-	 * We do NOT shadow fields that are modified when L0
-	 * traps and emulates any vmx instruction (e.g. VMPTRLD,
-	 * VMXON...) executed by L1.
-	 * For example, VM_INSTRUCTION_ERROR is read
-	 * by L1 if a vmx instruction fails (part of the error path).
-	 * Note the code assumes this logic. If for some reason
-	 * we start shadowing these fields then we need to
-	 * force a shadow sync when L0 emulates vmx instructions
-	 * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
-	 * by nested_vmx_failValid)
-	 */
-	/* 32-bits */
-	VM_EXIT_REASON,
-	VM_EXIT_INTR_INFO,
-	VM_EXIT_INSTRUCTION_LEN,
-	IDT_VECTORING_INFO_FIELD,
-	IDT_VECTORING_ERROR_CODE,
-	VM_EXIT_INTR_ERROR_CODE,
-
-	/* Natural width */
-	EXIT_QUALIFICATION,
-	GUEST_LINEAR_ADDRESS,
-
-	/* 64-bit */
-	GUEST_PHYSICAL_ADDRESS,
-	GUEST_PHYSICAL_ADDRESS_HIGH,
+#define SHADOW_FIELD_RO(x) x,
+#include "vmx_shadow_fields.h"
 };
 static int max_shadow_read_only_fields =
 	ARRAY_SIZE(shadow_read_only_fields);
 
 static u16 shadow_read_write_fields[] = {
-	/* 16-bits */
-	GUEST_CS_SELECTOR,
-	GUEST_INTR_STATUS,
-	GUEST_PML_INDEX,
-	HOST_FS_SELECTOR,
-	HOST_GS_SELECTOR,
-
-	/* 32-bits */
-	CPU_BASED_VM_EXEC_CONTROL,
-	EXCEPTION_BITMAP,
-	VM_ENTRY_EXCEPTION_ERROR_CODE,
-	VM_ENTRY_INTR_INFO_FIELD,
-	VM_ENTRY_INSTRUCTION_LEN,
-	TPR_THRESHOLD,
-	GUEST_CS_LIMIT,
-	GUEST_CS_AR_BYTES,
-	GUEST_INTERRUPTIBILITY_INFO,
-	VMX_PREEMPTION_TIMER_VALUE,
-
-	/* Natural width */
-	GUEST_RIP,
-	GUEST_RSP,
-	GUEST_CR0,
-	GUEST_CR3,
-	GUEST_CR4,
-	GUEST_RFLAGS,
-	GUEST_CS_BASE,
-	GUEST_ES_BASE,
-	CR0_GUEST_HOST_MASK,
-	CR0_READ_SHADOW,
-	CR4_READ_SHADOW,
-	HOST_FS_BASE,
-	HOST_GS_BASE,
+#define SHADOW_FIELD_RW(x) x,
+#include "vmx_shadow_fields.h"
 };
 static int max_shadow_read_write_fields =
 	ARRAY_SIZE(shadow_read_write_fields);
diff --git a/arch/x86/kvm/vmx_shadow_fields.h b/arch/x86/kvm/vmx_shadow_fields.h
new file mode 100644
index 0000000000000..31d7a15338acd
--- /dev/null
+++ b/arch/x86/kvm/vmx_shadow_fields.h
@@ -0,0 +1,71 @@
+#ifndef SHADOW_FIELD_RO
+#define SHADOW_FIELD_RO(x)
+#endif
+#ifndef SHADOW_FIELD_RW
+#define SHADOW_FIELD_RW(x)
+#endif
+
+/*
+ * We do NOT shadow fields that are modified when L0
+ * traps and emulates any vmx instruction (e.g. VMPTRLD,
+ * VMXON...) executed by L1.
+ * For example, VM_INSTRUCTION_ERROR is read
+ * by L1 if a vmx instruction fails (part of the error path).
+ * Note the code assumes this logic. If for some reason
+ * we start shadowing these fields then we need to
+ * force a shadow sync when L0 emulates vmx instructions
+ * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
+ * by nested_vmx_failValid)
+ *
+ * Keeping the fields ordered by size is an attempt at improving
+ * branch prediction in vmcs_read_any and vmcs_write_any.
+ */
+
+/* 16-bits */
+SHADOW_FIELD_RW(GUEST_CS_SELECTOR)
+SHADOW_FIELD_RW(GUEST_INTR_STATUS)
+SHADOW_FIELD_RW(GUEST_PML_INDEX)
+SHADOW_FIELD_RW(HOST_FS_SELECTOR)
+SHADOW_FIELD_RW(HOST_GS_SELECTOR)
+
+/* 32-bits */
+SHADOW_FIELD_RO(VM_EXIT_REASON)
+SHADOW_FIELD_RO(VM_EXIT_INTR_INFO)
+SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN)
+SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD)
+SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE)
+SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE)
+SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL)
+SHADOW_FIELD_RW(EXCEPTION_BITMAP)
+SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE)
+SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD)
+SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN)
+SHADOW_FIELD_RW(TPR_THRESHOLD)
+SHADOW_FIELD_RW(GUEST_CS_LIMIT)
+SHADOW_FIELD_RW(GUEST_CS_AR_BYTES)
+SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO)
+SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE)
+
+/* Natural width */
+SHADOW_FIELD_RO(EXIT_QUALIFICATION)
+SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS)
+SHADOW_FIELD_RW(GUEST_RIP)
+SHADOW_FIELD_RW(GUEST_RSP)
+SHADOW_FIELD_RW(GUEST_CR0)
+SHADOW_FIELD_RW(GUEST_CR3)
+SHADOW_FIELD_RW(GUEST_CR4)
+SHADOW_FIELD_RW(GUEST_RFLAGS)
+SHADOW_FIELD_RW(GUEST_CS_BASE)
+SHADOW_FIELD_RW(GUEST_ES_BASE)
+SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK)
+SHADOW_FIELD_RW(CR0_READ_SHADOW)
+SHADOW_FIELD_RW(CR4_READ_SHADOW)
+SHADOW_FIELD_RW(HOST_FS_BASE)
+SHADOW_FIELD_RW(HOST_GS_BASE)
+
+/* 64-bit */
+SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS)
+SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH)
+
+#undef SHADOW_FIELD_RO
+#undef SHADOW_FIELD_RW

From 74a497fae754bd970ad0573c26d0a3c5e784fa0c Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 20 Dec 2017 13:55:39 +0100
Subject: [PATCH 166/676] KVM: nVMX: track dirty state of non-shadowed VMCS
 fields
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

VMCS12 fields that are not handled through shadow VMCS are rarely
written, and thus they are also almost constant in the vmcs02.  We can
thus optimize prepare_vmcs02 by skipping all the work for non-shadowed
fields in the common case.

This patch introduces the (pretty simple) tracking infrastructure; the
next patches will move work to prepare_vmcs02_full and save a few hundred
clock cycles per VMRESUME on a Haswell Xeon E5 system:

	                                before  after
	cpuid                           14159   13869
	vmcall                          15290   14951
	inl_from_kernel                 17703   17447
	outl_to_kernel                  16011   14692
	self_ipi_sti_nop                16763   15825
	self_ipi_tpr_sti_nop            17341   15935
	wr_tsc_adjust_msr               14510   14264
	rd_tsc_adjust_msr               15018   14311
	mmio-wildcard-eventfd:pci-mem   16381   14947
	mmio-datamatch-eventfd:pci-mem  18620   17858
	portio-wildcard-eventfd:pci-io  15121   14769
	portio-datamatch-eventfd:pci-io 15761   14831

(average savings 748, stdev 460).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c               | 29 ++++++++++++++++++++++++++++-
 arch/x86/kvm/vmx_shadow_fields.h |  6 ++++++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4530b2ba63b98..1cc787f639726 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -436,6 +436,7 @@ struct nested_vmx {
 	 * data hold by vmcs12
 	 */
 	bool sync_shadow_vmcs;
+	bool dirty_vmcs12;
 
 	bool change_vmcs01_virtual_x2apic_mode;
 	/* L2 must run next, and mustn't decide to exit to L1. */
@@ -7623,8 +7624,10 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 {
 	unsigned long field;
 	gva_t gva;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+
 	/* The value to write might be 32 or 64 bits, depending on L1's long
 	 * mode, and eventually we need to write that into a field of several
 	 * possible lengths. The code below first zero-extends the value to 64
@@ -7667,6 +7670,20 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 		return kvm_skip_emulated_instruction(vcpu);
 	}
 
+	switch (field) {
+#define SHADOW_FIELD_RW(x) case x:
+#include "vmx_shadow_fields.h"
+		/*
+		 * The fields that can be updated by L1 without a vmexit are
+		 * always updated in the vmcs02, the others go down the slow
+		 * path of prepare_vmcs02.
+		 */
+		break;
+	default:
+		vmx->nested.dirty_vmcs12 = true;
+		break;
+	}
+
 	nested_vmx_succeed(vcpu);
 	return kvm_skip_emulated_instruction(vcpu);
 }
@@ -7681,6 +7698,7 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
 			     __pa(vmx->vmcs01.shadow_vmcs));
 		vmx->nested.sync_shadow_vmcs = true;
 	}
+	vmx->nested.dirty_vmcs12 = true;
 }
 
 /* Emulate the VMPTRLD instruction */
@@ -10297,6 +10315,11 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
 	return 0;
 }
 
+static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+			       bool from_vmentry)
+{
+}
+
 /*
  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -10592,7 +10615,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 			vmx_flush_tlb(vcpu, true);
 		}
-
 	}
 
 	if (enable_pml) {
@@ -10641,6 +10663,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	/* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
 	vmx_set_efer(vcpu, vcpu->arch.efer);
 
+	if (vmx->nested.dirty_vmcs12) {
+		prepare_vmcs02_full(vcpu, vmcs12, from_vmentry);
+		vmx->nested.dirty_vmcs12 = false;
+	}
+
 	/* Shadow page tables on either EPT or shadow page tables. */
 	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
 				entry_failure_code))
diff --git a/arch/x86/kvm/vmx_shadow_fields.h b/arch/x86/kvm/vmx_shadow_fields.h
index 31d7a15338acd..cd0c75f6d037a 100644
--- a/arch/x86/kvm/vmx_shadow_fields.h
+++ b/arch/x86/kvm/vmx_shadow_fields.h
@@ -17,6 +17,12 @@
  * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
  * by nested_vmx_failValid)
  *
+ * When adding or removing fields here, note that shadowed
+ * fields must always be synced by prepare_vmcs02, not just
+ * prepare_vmcs02_full.
+ */
+
+/*
  * Keeping the fields ordered by size is an attempt at improving
  * branch prediction in vmcs_read_any and vmcs_write_any.
  */

From 8665c3f973203269fdc799da833f28c5dc217523 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 20 Dec 2017 13:56:53 +0100
Subject: [PATCH 167/676] KVM: nVMX: initialize descriptor cache fields in
 prepare_vmcs02_full
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This part is separate for ease of review, because git prefers to move
prepare_vmcs02 below the initial long sequence of vmcs_write* operations.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 56 ++++++++++++++++++++++++++++------------------
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1cc787f639726..f83f5e7bf87a4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -10317,28 +10317,10 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
 
 static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 			       bool from_vmentry)
-{
-}
-
-/*
- * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
- * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
- * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
- * guest in a way that will both be appropriate to L1's requests, and our
- * needs. In addition to modifying the active vmcs (which is vmcs02), this
- * function also has additional necessary side-effects, like setting various
- * vcpu->arch fields.
- * Returns 0 on success, 1 on failure. Invalid state exit qualification code
- * is assigned to entry_failure_code on failure.
- */
-static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
-			  bool from_vmentry, u32 *entry_failure_code)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u32 exec_control, vmcs12_exec_ctrl;
 
 	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
-	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
 	vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
 	vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
 	vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
@@ -10346,7 +10328,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
 	vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
 	vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
-	vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
 	vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
 	vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
 	vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
@@ -10356,15 +10337,12 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
 	vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
 	vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
-	vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
 	vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
 	vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
 	vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
 	vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
 	vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
 	vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
-	vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
-	vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
 	vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
 	vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
 	vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
@@ -10373,6 +10351,40 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
 	vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
 	vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+}
+
+/*
+ * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
+ * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
+ * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
+ * guest in a way that will both be appropriate to L1's requests, and our
+ * needs. In addition to modifying the active vmcs (which is vmcs02), this
+ * function also has additional necessary side-effects, like setting various
+ * vcpu->arch fields.
+ * Returns 0 on success, 1 on failure. Invalid state exit qualification code
+ * is assigned to entry_failure_code on failure.
+ */
+static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+			  bool from_vmentry, u32 *entry_failure_code)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 exec_control, vmcs12_exec_ctrl;
+
+	/*
+	 * First, the fields that are shadowed.  This must be kept in sync
+	 * with vmx_shadow_fields.h.
+	 */
+
+	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
+	vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
+	vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
+	vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
+	vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
+
+	/*
+	 * Not in vmcs02: GUEST_PML_INDEX, HOST_FS_SELECTOR, HOST_GS_SELECTOR,
+	 * HOST_FS_BASE, HOST_GS_BASE.
+	 */
 
 	if (from_vmentry &&
 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {

From 25a2e4fe8ef719de564370349dc408ac7263f455 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 20 Dec 2017 14:05:21 +0100
Subject: [PATCH 168/676] KVM: nVMX: initialize more non-shadowed fields in
 prepare_vmcs02_full
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These fields are also simple copies of the data in the vmcs12 struct.
For some of them, prepare_vmcs02 was skipping the copy when the field
was unused.  In prepare_vmcs02_full, we copy them always as long as the
field exists on the host, because the corresponding execution control
might be one of the shadowed fields.

Optimization opportunities remain for MSRs that, depending on the
entry/exit controls, have to be copied from either the vmcs01 or
the vmcs12: EFER (whose value is partly stored in the entry controls
too), PAT, DEBUGCTL (and also DR7).  Before moving these three and
the entry/exit controls to prepare_vmcs02_full, KVM would have to set
dirty_vmcs12 on writes to the L1 MSRs.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 162 +++++++++++++++++++++++----------------------
 1 file changed, 83 insertions(+), 79 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f83f5e7bf87a4..9b9e02c0f25e2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -10351,6 +10351,88 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
 	vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
 	vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+
+	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
+	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+		vmcs12->guest_pending_dbg_exceptions);
+	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
+	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+
+	if (nested_cpu_has_xsaves(vmcs12))
+		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
+	vmcs_write64(VMCS_LINK_POINTER, -1ull);
+
+	if (cpu_has_vmx_posted_intr())
+		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
+
+	/*
+	 * Whether page-faults are trapped is determined by a combination of
+	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
+	 * If enable_ept, L0 doesn't care about page faults and we should
+	 * set all of these to L1's desires. However, if !enable_ept, L0 does
+	 * care about (at least some) page faults, and because it is not easy
+	 * (if at all possible?) to merge L0 and L1's desires, we simply ask
+	 * to exit on each and every L2 page fault. This is done by setting
+	 * MASK=MATCH=0 and (see below) EB.PF=1.
+	 * Note that below we don't need special code to set EB.PF beyond the
+	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
+	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
+	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
+	 */
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+		enable_ept ? vmcs12->page_fault_error_code_mask : 0);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+		enable_ept ? vmcs12->page_fault_error_code_match : 0);
+
+	/* All VMFUNCs are currently emulated through L0 vmexits.  */
+	if (cpu_has_vmx_vmfunc())
+		vmcs_write64(VM_FUNCTION_CONTROL, 0);
+
+	if (cpu_has_vmx_apicv()) {
+		vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
+		vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
+		vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
+		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
+	}
+
+	/*
+	 * Set host-state according to L0's settings (vmcs12 is irrelevant here)
+	 * Some constant fields are set here by vmx_set_constant_host_state().
+	 * Other fields are different per CPU, and will be set later when
+	 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
+	 */
+	vmx_set_constant_host_state(vmx);
+
+	/*
+	 * Set the MSR load/store lists to match L0's settings.
+	 */
+	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+
+	set_cr4_guest_host_mask(vmx);
+
+	if (vmx_mpx_supported())
+		vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+
+	if (enable_vpid) {
+		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
+			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
+		else
+			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+	}
+
+	/*
+	 * L1 may access the L2's PDPTR, so save them to construct vmcs12
+	 */
+	if (enable_ept) {
+		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+	}
 }
 
 /*
@@ -10408,16 +10490,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	} else {
 		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
 	}
-	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
 	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
-	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
-		vmcs12->guest_pending_dbg_exceptions);
-	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
-	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
-
-	if (nested_cpu_has_xsaves(vmcs12))
-		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
-	vmcs_write64(VMCS_LINK_POINTER, -1ull);
 
 	exec_control = vmcs12->pin_based_vm_exec_control;
 
@@ -10431,7 +10504,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	if (nested_cpu_has_posted_intr(vmcs12)) {
 		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
 		vmx->nested.pi_pending = false;
-		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
 	} else {
 		exec_control &= ~PIN_BASED_POSTED_INTR;
 	}
@@ -10442,25 +10514,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	if (nested_cpu_has_preemption_timer(vmcs12))
 		vmx_start_preemption_timer(vcpu);
 
-	/*
-	 * Whether page-faults are trapped is determined by a combination of
-	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
-	 * If enable_ept, L0 doesn't care about page faults and we should
-	 * set all of these to L1's desires. However, if !enable_ept, L0 does
-	 * care about (at least some) page faults, and because it is not easy
-	 * (if at all possible?) to merge L0 and L1's desires, we simply ask
-	 * to exit on each and every L2 page fault. This is done by setting
-	 * MASK=MATCH=0 and (see below) EB.PF=1.
-	 * Note that below we don't need special code to set EB.PF beyond the
-	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
-	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
-	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
-	 */
-	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
-		enable_ept ? vmcs12->page_fault_error_code_mask : 0);
-	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
-		enable_ept ? vmcs12->page_fault_error_code_match : 0);
-
 	if (cpu_has_secondary_exec_ctrls()) {
 		exec_control = vmx->secondary_exec_control;
 
@@ -10479,22 +10532,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 			exec_control |= vmcs12_exec_ctrl;
 		}
 
-		/* All VMFUNCs are currently emulated through L0 vmexits.  */
-		if (exec_control & SECONDARY_EXEC_ENABLE_VMFUNC)
-			vmcs_write64(VM_FUNCTION_CONTROL, 0);
-
-		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
-			vmcs_write64(EOI_EXIT_BITMAP0,
-				vmcs12->eoi_exit_bitmap0);
-			vmcs_write64(EOI_EXIT_BITMAP1,
-				vmcs12->eoi_exit_bitmap1);
-			vmcs_write64(EOI_EXIT_BITMAP2,
-				vmcs12->eoi_exit_bitmap2);
-			vmcs_write64(EOI_EXIT_BITMAP3,
-				vmcs12->eoi_exit_bitmap3);
+		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
 			vmcs_write16(GUEST_INTR_STATUS,
 				vmcs12->guest_intr_status);
-		}
 
 		/*
 		 * Write an illegal value to APIC_ACCESS_ADDR. Later,
@@ -10507,24 +10547,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
 	}
 
-
-	/*
-	 * Set host-state according to L0's settings (vmcs12 is irrelevant here)
-	 * Some constant fields are set here by vmx_set_constant_host_state().
-	 * Other fields are different per CPU, and will be set later when
-	 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
-	 */
-	vmx_set_constant_host_state(vmx);
-
-	/*
-	 * Set the MSR load/store lists to match L0's settings.
-	 */
-	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
-	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
-	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
-	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
-	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
-
 	/*
 	 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
 	 * entry, but only if the current (host) sp changed from the value
@@ -10594,12 +10616,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
 	}
 
-	set_cr4_guest_host_mask(vmx);
-
-	if (from_vmentry &&
-	    vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
-		vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
-
 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
 		vmcs_write64(TSC_OFFSET,
 			vcpu->arch.tsc_offset + vmcs12->tsc_offset);
@@ -10618,13 +10634,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		 * even if spawn a lot of nested vCPUs.
 		 */
 		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
-			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
 			if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
 				vmx->nested.last_vpid = vmcs12->virtual_processor_id;
 				__vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02, true);
 			}
 		} else {
-			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 			vmx_flush_tlb(vcpu, true);
 		}
 	}
@@ -10688,16 +10702,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	if (!enable_ept)
 		vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
 
-	/*
-	 * L1 may access the L2's PDPTR, so save them to construct vmcs12
-	 */
-	if (enable_ept) {
-		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
-		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
-		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
-		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
-	}
-
 	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
 	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
 	return 0;

From 07f36616cde482a9fd65da9a64f504190c7a0edb Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 1 Jan 2018 22:53:42 +0100
Subject: [PATCH 169/676] KVM: nVMX: remove unnecessary vmwrite from L2->L1
 vmexit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The POSTED_INTR_NV field is constant (though it differs between the vmcs01 and
vmcs02), there is no need to reload it on vmexit to L1.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9b9e02c0f25e2..bbadd8c7e5922 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -11339,9 +11339,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 		 */
 		vmx_flush_tlb(vcpu, true);
 	}
-	/* Restore posted intr vector. */
-	if (nested_cpu_has_posted_intr(vmcs12))
-		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
 
 	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
 	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);

From 1f6e5b25643e539e55a4c7553e4be6562c88fb76 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 20 Dec 2017 11:32:16 +0100
Subject: [PATCH 170/676] KVM: vmx: simplify MSR bitmap setup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The APICv-enabled MSR bitmap is a superset of the APICv-disabled bitmap.
Make that obvious in vmx_disable_intercept_msr_x2apic.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
[Resolved rebase conflict after removing Intel PT. - Radim]
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bbadd8c7e5922..1f4a3037b99e0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5017,14 +5017,13 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
 						msr, MSR_TYPE_R | MSR_TYPE_W);
 }
 
-static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active)
+static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_only)
 {
-	if (apicv_active) {
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
-				msr, type);
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv,
-				msr, type);
-	} else {
+	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
+					msr, type);
+	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv,
+					msr, type);
+	if (!apicv_only) {
 		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
 				msr, type);
 		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
@@ -6872,7 +6871,6 @@ static __init int hardware_setup(void)
 	 * TPR reads and writes can be virtualized even if virtual interrupt
 	 * delivery is not in use.
 	 */
-	vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true);
 	vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false);
 
 	/* EOI */

From c992384bde84fb2f0318bdb4f6c710605dd7a217 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 13 Dec 2017 14:16:30 +0100
Subject: [PATCH 171/676] KVM: vmx: speed up MSR bitmap merge
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The bulk of the MSR bitmap is either immutable, or can be copied from
the L1 bitmap.  By initializing it at VMXON time, and copying the mutable
parts one long at a time on vmentry (rather than one bit), about 4000
clock cycles (30%) can be saved on a nested VMLAUNCH/VMRESUME.

The resulting for loop only has four iterations, so it is cheap enough
to reinitialize the MSR write bitmaps on every iteration, and it makes
the code simpler.

Suggested-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 78 +++++++++++++++++++++++++---------------------
 1 file changed, 42 insertions(+), 36 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1f4a3037b99e0..71b2c93544236 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4972,11 +4972,6 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
 {
 	int f = sizeof(unsigned long);
 
-	if (!cpu_has_vmx_msr_bitmap()) {
-		WARN_ON(1);
-		return;
-	}
-
 	/*
 	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
 	 * have the write-low and read-high bitmap offsets the wrong way round.
@@ -7177,6 +7172,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 				(unsigned long *)__get_free_page(GFP_KERNEL);
 		if (!vmx->nested.msr_bitmap)
 			goto out_msr_bitmap;
+		memset(vmx->nested.msr_bitmap, 0xff, PAGE_SIZE);
 	}
 
 	vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
@@ -9844,8 +9840,8 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 	}
 }
 
-static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
-					       struct vmcs12 *vmcs12);
+static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
+						 struct vmcs12 *vmcs12);
 
 static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
 					struct vmcs12 *vmcs12)
@@ -9934,11 +9930,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
 			(unsigned long)(vmcs12->posted_intr_desc_addr &
 			(PAGE_SIZE - 1)));
 	}
-	if (cpu_has_vmx_msr_bitmap() &&
-	    nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS) &&
-	    nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
-		;
-	else
+	if (!nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
 		vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
 				CPU_BASED_USE_MSR_BITMAPS);
 }
@@ -10006,14 +9998,19 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
  * Merge L0's and L1's MSR bitmap, return false to indicate that
  * we do not use the hardware.
  */
-static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
-					       struct vmcs12 *vmcs12)
+static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
+						 struct vmcs12 *vmcs12)
 {
 	int msr;
 	struct page *page;
 	unsigned long *msr_bitmap_l1;
 	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
 
+	/* Nothing to do if the MSR bitmap is not in use.  */
+	if (!cpu_has_vmx_msr_bitmap() ||
+	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+		return false;
+
 	/* This shortcut is ok because we support only x2APIC MSRs so far. */
 	if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
 		return false;
@@ -10021,32 +10018,41 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
 	page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
 	if (is_error_page(page))
 		return false;
-	msr_bitmap_l1 = (unsigned long *)kmap(page);
 
-	memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
+	msr_bitmap_l1 = (unsigned long *)kmap(page);
+	if (nested_cpu_has_apic_reg_virt(vmcs12)) {
+		/*
+		 * L0 need not intercept reads for MSRs between 0x800 and 0x8ff, it
+		 * just lets the processor take the value from the virtual-APIC page;
+		 * take those 256 bits directly from the L1 bitmap.
+		 */
+		for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+			unsigned word = msr / BITS_PER_LONG;
+			msr_bitmap_l0[word] = msr_bitmap_l1[word];
+			msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
+		}
+	} else {
+		for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+			unsigned word = msr / BITS_PER_LONG;
+			msr_bitmap_l0[word] = ~0;
+			msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0;
+		}
+	}
 
-	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
-		if (nested_cpu_has_apic_reg_virt(vmcs12))
-			for (msr = 0x800; msr <= 0x8ff; msr++)
-				nested_vmx_disable_intercept_for_msr(
-					msr_bitmap_l1, msr_bitmap_l0,
-					msr, MSR_TYPE_R);
+	nested_vmx_disable_intercept_for_msr(
+		msr_bitmap_l1, msr_bitmap_l0,
+		APIC_BASE_MSR + (APIC_TASKPRI >> 4),
+		MSR_TYPE_W);
 
+	if (nested_cpu_has_vid(vmcs12)) {
 		nested_vmx_disable_intercept_for_msr(
-				msr_bitmap_l1, msr_bitmap_l0,
-				APIC_BASE_MSR + (APIC_TASKPRI >> 4),
-				MSR_TYPE_R | MSR_TYPE_W);
-
-		if (nested_cpu_has_vid(vmcs12)) {
-			nested_vmx_disable_intercept_for_msr(
-				msr_bitmap_l1, msr_bitmap_l0,
-				APIC_BASE_MSR + (APIC_EOI >> 4),
-				MSR_TYPE_W);
-			nested_vmx_disable_intercept_for_msr(
-				msr_bitmap_l1, msr_bitmap_l0,
-				APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
-				MSR_TYPE_W);
-		}
+			msr_bitmap_l1, msr_bitmap_l0,
+			APIC_BASE_MSR + (APIC_EOI >> 4),
+			MSR_TYPE_W);
+		nested_vmx_disable_intercept_for_msr(
+			msr_bitmap_l1, msr_bitmap_l0,
+			APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
+			MSR_TYPE_W);
 	}
 	kunmap(page);
 	kvm_release_page_clean(page);

From d7231e75f73f424068ad205358bd8ba6a7a2e113 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 21 Dec 2017 00:47:55 +0100
Subject: [PATCH 172/676] KVM: VMX: introduce X2APIC_MSR macro
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove duplicate expression in nested_vmx_prepare_msr_bitmap, and make
the register names clearer in hardware_setup.

Suggested-by: Jim Mattson <jmattson@google.com>
Reviewed-by: Jim Mattson <jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
[Resolved rebase conflict after removing Intel PT. - Radim]
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 71b2c93544236..1e2ca9e8662ff 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5012,6 +5012,8 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
 						msr, MSR_TYPE_R | MSR_TYPE_W);
 }
 
+#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
+
 static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_only)
 {
 	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
@@ -6857,7 +6859,7 @@ static __init int hardware_setup(void)
 	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
 	for (msr = 0x800; msr <= 0x8ff; msr++) {
-		if (msr == 0x839 /* TMCCT */)
+		if (msr == X2APIC_MSR(APIC_TMCCT))
 			continue;
 		vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true);
 	}
@@ -6866,12 +6868,9 @@ static __init int hardware_setup(void)
 	 * TPR reads and writes can be virtualized even if virtual interrupt
 	 * delivery is not in use.
 	 */
-	vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false);
-
-	/* EOI */
-	vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true);
-	/* SELF-IPI */
-	vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
+	vmx_disable_intercept_msr_x2apic(X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_R | MSR_TYPE_W, false);
+	vmx_disable_intercept_msr_x2apic(X2APIC_MSR(APIC_EOI), MSR_TYPE_W, true);
+	vmx_disable_intercept_msr_x2apic(X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W, true);
 
 	if (enable_ept)
 		vmx_enable_tdp();
@@ -10041,17 +10040,17 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 
 	nested_vmx_disable_intercept_for_msr(
 		msr_bitmap_l1, msr_bitmap_l0,
-		APIC_BASE_MSR + (APIC_TASKPRI >> 4),
+		X2APIC_MSR(APIC_TASKPRI),
 		MSR_TYPE_W);
 
 	if (nested_cpu_has_vid(vmcs12)) {
 		nested_vmx_disable_intercept_for_msr(
 			msr_bitmap_l1, msr_bitmap_l0,
-			APIC_BASE_MSR + (APIC_EOI >> 4),
+			X2APIC_MSR(APIC_EOI),
 			MSR_TYPE_W);
 		nested_vmx_disable_intercept_for_msr(
 			msr_bitmap_l1, msr_bitmap_l0,
-			APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
+			X2APIC_MSR(APIC_SELF_IPI),
 			MSR_TYPE_W);
 	}
 	kunmap(page);

From 43ff3f65234061e08d234bdef5a9aadc19832b74 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 11 Jan 2018 14:31:43 +1100
Subject: [PATCH 173/676] KVM: PPC: Book3S HV: Make sure we don't re-enter
 guest without XIVE loaded

This fixes a bug where it is possible to enter a guest on a POWER9
system without having the XIVE (interrupt controller) context loaded.
This can happen because we unload the XIVE context from the CPU
before doing the real-mode handling for machine checks.  After the
real-mode handler runs, it is possible that we re-enter the guest
via a fast path which does not load the XIVE context.

To fix this, we move the unloading of the XIVE context to come after
the real-mode machine check handler is called.

Fixes: 5af50993850a ("KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller")
Cc: stable@vger.kernel.org # v4.11+
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 40 ++++++++++++-------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index c8ffd69adfec7..76332a3f6c0dc 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1423,6 +1423,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	blt	deliver_guest_interrupt
 
 guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
+	/* Save more register state  */
+	mfdar	r6
+	mfdsisr	r7
+	std	r6, VCPU_DAR(r9)
+	stw	r7, VCPU_DSISR(r9)
+	/* don't overwrite fault_dar/fault_dsisr if HDSI */
+	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
+	beq	mc_cont
+	std	r6, VCPU_FAULT_DAR(r9)
+	stw	r7, VCPU_FAULT_DSISR(r9)
+
+	/* See if it is a machine check */
+	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
+	beq	machine_check_realmode
+mc_cont:
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	addi	r3, r9, VCPU_TB_RMEXIT
+	mr	r4, r9
+	bl	kvmhv_accumulate_time
+#endif
 #ifdef CONFIG_KVM_XICS
 	/* We are exiting, pull the VP from the XIVE */
 	lwz	r0, VCPU_XIVE_PUSHED(r9)
@@ -1460,26 +1480,6 @@ guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
 	eieio
 1:
 #endif /* CONFIG_KVM_XICS */
-	/* Save more register state  */
-	mfdar	r6
-	mfdsisr	r7
-	std	r6, VCPU_DAR(r9)
-	stw	r7, VCPU_DSISR(r9)
-	/* don't overwrite fault_dar/fault_dsisr if HDSI */
-	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
-	beq	mc_cont
-	std	r6, VCPU_FAULT_DAR(r9)
-	stw	r7, VCPU_FAULT_DSISR(r9)
-
-	/* See if it is a machine check */
-	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
-	beq	machine_check_realmode
-mc_cont:
-#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
-	addi	r3, r9, VCPU_TB_RMEXIT
-	mr	r4, r9
-	bl	kvmhv_accumulate_time
-#endif
 
 	mr 	r3, r12
 	/* Increment exit count, poke other threads to exit */

From 6964e6a4e4894c707e42d51d9d30683c57f43201 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 11 Jan 2018 14:51:02 +1100
Subject: [PATCH 174/676] KVM: PPC: Book3S HV: Do SLB load/unload with guest
 LPCR value loaded

This moves the code that loads and unloads the guest SLB values so that
it is done while the guest LPCR value is loaded in the LPCR register.
The reason for doing this is that on POWER9, the behaviour of the
slbmte instruction depends on the LPCR[UPRT] bit.  If UPRT is 1, as
it is for a radix host (or guest), the SLB index is truncated to
2 bits.  This means that for a HPT guest on a radix host, the SLB
was not being loaded correctly, causing the guest to crash.

The SLB is now loaded much later in the guest entry path, after the
LPCR is loaded, which for a secondary thread is after it sees that
the primary thread has switched the MMU to the guest.  The loop that
waits for the primary thread has a branch out to the exit code that
is taken if it sees that other threads have commenced exiting the
guest.  Since we have now not loaded the SLB at this point, we make
this path branch to a new label 'guest_bypass' and we move the SLB
unload code to before this label.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 109 ++++++++++++------------
 1 file changed, 55 insertions(+), 54 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 76332a3f6c0dc..30ece4cebaf53 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -617,13 +617,6 @@ kvmppc_hv_entry:
 	lbz	r0, KVM_RADIX(r9)
 	cmpwi	cr7, r0, 0
 
-	/* Clear out SLB if hash */
-	bne	cr7, 2f
-	li	r6,0
-	slbmte	r6,r6
-	slbia
-	ptesync
-2:
 	/*
 	 * POWER7/POWER8 host -> guest partition switch code.
 	 * We don't have to lock against concurrent tlbies,
@@ -738,19 +731,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 10:	cmpdi	r4, 0
 	beq	kvmppc_primary_no_guest
 kvmppc_got_guest:
-
-	/* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */
-	lwz	r5,VCPU_SLB_MAX(r4)
-	cmpwi	r5,0
-	beq	9f
-	mtctr	r5
-	addi	r6,r4,VCPU_SLB
-1:	ld	r8,VCPU_SLB_E(r6)
-	ld	r9,VCPU_SLB_V(r6)
-	slbmte	r9,r8
-	addi	r6,r6,VCPU_SLB_SIZE
-	bdnz	1b
-9:
 	/* Increment yield count if they have a VPA */
 	ld	r3, VCPU_VPA(r4)
 	cmpdi	r3, 0
@@ -1017,6 +997,29 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 	cmpdi	r3, 512		/* 1 microsecond */
 	blt	hdec_soon
 
+	/* For hash guest, clear out and reload the SLB */
+	ld	r6, VCPU_KVM(r4)
+	lbz	r0, KVM_RADIX(r6)
+	cmpwi	r0, 0
+	bne	9f
+	li	r6, 0
+	slbmte	r6, r6
+	slbia
+	ptesync
+
+	/* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */
+	lwz	r5,VCPU_SLB_MAX(r4)
+	cmpwi	r5,0
+	beq	9f
+	mtctr	r5
+	addi	r6,r4,VCPU_SLB
+1:	ld	r8,VCPU_SLB_E(r6)
+	ld	r9,VCPU_SLB_V(r6)
+	slbmte	r9,r8
+	addi	r6,r6,VCPU_SLB_SIZE
+	bdnz	1b
+9:
+
 #ifdef CONFIG_KVM_XICS
 	/* We are entering the guest on that thread, push VCPU to XIVE */
 	ld	r10, HSTATE_XIVE_TIMA_PHYS(r13)
@@ -1193,7 +1196,7 @@ hdec_soon:
 	addi	r3, r4, VCPU_TB_RMEXIT
 	bl	kvmhv_accumulate_time
 #endif
-	b	guest_exit_cont
+	b	guest_bypass
 
 /******************************************************************************
  *                                                                            *
@@ -1481,34 +1484,12 @@ mc_cont:
 1:
 #endif /* CONFIG_KVM_XICS */
 
-	mr 	r3, r12
-	/* Increment exit count, poke other threads to exit */
-	bl	kvmhv_commence_exit
-	nop
-	ld	r9, HSTATE_KVM_VCPU(r13)
-	lwz	r12, VCPU_TRAP(r9)
-
-	/* Stop others sending VCPU interrupts to this physical CPU */
-	li	r0, -1
-	stw	r0, VCPU_CPU(r9)
-	stw	r0, VCPU_THREAD_CPU(r9)
-
-	/* Save guest CTRL register, set runlatch to 1 */
-	mfspr	r6,SPRN_CTRLF
-	stw	r6,VCPU_CTRL(r9)
-	andi.	r0,r6,1
-	bne	4f
-	ori	r6,r6,1
-	mtspr	SPRN_CTRLT,r6
-4:
-	/* Check if we are running hash or radix and store it in cr2 */
+	/* For hash guest, read the guest SLB and save it away */
 	ld	r5, VCPU_KVM(r9)
 	lbz	r0, KVM_RADIX(r5)
-	cmpwi	cr2,r0,0
-
-	/* Read the guest SLB and save it away */
 	li	r5, 0
-	bne	cr2, 3f			/* for radix, save 0 entries */
+	cmpwi	r0, 0
+	bne	3f			/* for radix, save 0 entries */
 	lwz	r0,VCPU_SLB_NR(r9)	/* number of entries in SLB */
 	mtctr	r0
 	li	r6,0
@@ -1524,8 +1505,34 @@ mc_cont:
 	addi	r5,r5,1
 2:	addi	r6,r6,1
 	bdnz	1b
+	/* Finally clear out the SLB */
+	li	r0,0
+	slbmte	r0,r0
+	slbia
+	ptesync
 3:	stw	r5,VCPU_SLB_MAX(r9)
 
+guest_bypass:
+	mr 	r3, r12
+	/* Increment exit count, poke other threads to exit */
+	bl	kvmhv_commence_exit
+	nop
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	lwz	r12, VCPU_TRAP(r9)
+
+	/* Stop others sending VCPU interrupts to this physical CPU */
+	li	r0, -1
+	stw	r0, VCPU_CPU(r9)
+	stw	r0, VCPU_THREAD_CPU(r9)
+
+	/* Save guest CTRL register, set runlatch to 1 */
+	mfspr	r6,SPRN_CTRLF
+	stw	r6,VCPU_CTRL(r9)
+	andi.	r0,r6,1
+	bne	4f
+	ori	r6,r6,1
+	mtspr	SPRN_CTRLT,r6
+4:
 	/*
 	 * Save the guest PURR/SPURR
 	 */
@@ -1803,7 +1810,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	ld	r5, VCPU_KVM(r9)
 	lbz	r0, KVM_RADIX(r5)
 	cmpwi	cr2, r0, 0
-	beq	cr2, 3f
+	beq	cr2, 4f
 
 	/* Radix: Handle the case where the guest used an illegal PID */
 	LOAD_REG_ADDR(r4, mmu_base_pid)
@@ -1839,15 +1846,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 BEGIN_FTR_SECTION
 	PPC_INVALIDATE_ERAT
 END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
-	b	4f
+4:
 #endif /* CONFIG_PPC_RADIX_MMU */
 
-	/* Hash: clear out SLB */
-3:	li	r5,0
-	slbmte	r5,r5
-	slbia
-	ptesync
-4:
 	/*
 	 * POWER7/POWER8 guest -> host partition switch code.
 	 * We don't have to lock against tlbies but we do

From 8dc31ff9298963425f5c4bb6011bc2bedb76b1e9 Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Tue, 19 Dec 2017 04:03:57 -0600
Subject: [PATCH 175/676] target core: add device action configfs files

This patch adds a new group of files that are to be used to
have the kernel module execution some action. The next patch
will have target_core_user use the group/files to be able to block
a device and to reset its memory buffer used to pass commands
between user/kernel space.

This type of file is different from the existing device attributes
in that they may be write only and when written to they result in
the kernel module executing some function. These need to be
separate from the normal device attributes which get/set device
values so userspace can continue to loop over all the attribs and
get/set them during initialization.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_configfs.c | 6 ++++++
 drivers/target/target_core_internal.h | 1 +
 include/target/target_core_backend.h  | 1 +
 include/target/target_core_base.h     | 1 +
 4 files changed, 9 insertions(+)

diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c
index 72b1cd1bf9d9f..3f4bf126eed06 100644
--- a/drivers/target/target_core_configfs.c
+++ b/drivers/target/target_core_configfs.c
@@ -1197,6 +1197,7 @@ struct configfs_attribute *passthrough_attrib_attrs[] = {
 EXPORT_SYMBOL(passthrough_attrib_attrs);
 
 TB_CIT_SETUP_DRV(dev_attrib, NULL, NULL);
+TB_CIT_SETUP_DRV(dev_action, NULL, NULL);
 
 /* End functions for struct config_item_type tb_dev_attrib_cit */
 
@@ -2940,6 +2941,10 @@ static struct config_group *target_core_make_subdev(
 
 	config_group_init_type_name(&dev->dev_group, name, &tb->tb_dev_cit);
 
+	config_group_init_type_name(&dev->dev_action_group, "action",
+			&tb->tb_dev_action_cit);
+	configfs_add_default_group(&dev->dev_action_group, &dev->dev_group);
+
 	config_group_init_type_name(&dev->dev_attrib.da_group, "attrib",
 			&tb->tb_dev_attrib_cit);
 	configfs_add_default_group(&dev->dev_attrib.da_group, &dev->dev_group);
@@ -3200,6 +3205,7 @@ static const struct config_item_type target_core_cit = {
 void target_setup_backend_cits(struct target_backend *tb)
 {
 	target_core_setup_dev_cit(tb);
+	target_core_setup_dev_action_cit(tb);
 	target_core_setup_dev_attrib_cit(tb);
 	target_core_setup_dev_pr_cit(tb);
 	target_core_setup_dev_wwn_cit(tb);
diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h
index 6d53d9fcb883b..1d5afc3ae017c 100644
--- a/drivers/target/target_core_internal.h
+++ b/drivers/target/target_core_internal.h
@@ -17,6 +17,7 @@ struct target_backend {
 
 	struct config_item_type tb_dev_cit;
 	struct config_item_type tb_dev_attrib_cit;
+	struct config_item_type tb_dev_action_cit;
 	struct config_item_type tb_dev_pr_cit;
 	struct config_item_type tb_dev_wwn_cit;
 	struct config_item_type tb_dev_alua_tg_pt_gps_cit;
diff --git a/include/target/target_core_backend.h b/include/target/target_core_backend.h
index b6b3fb444a920..34a15d59ed887 100644
--- a/include/target/target_core_backend.h
+++ b/include/target/target_core_backend.h
@@ -53,6 +53,7 @@ struct target_backend_ops {
 	void (*free_prot)(struct se_device *);
 
 	struct configfs_attribute **tb_dev_attrib_attrs;
+	struct configfs_attribute **tb_dev_action_attrs;
 };
 
 struct sbc_ops {
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index 00482f903decb..9f9f5902af386 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -809,6 +809,7 @@ struct se_device {
 	/* T10 SPC-2 + SPC-3 Reservations */
 	struct t10_reservation	t10_pr;
 	struct se_dev_attrib	dev_attrib;
+	struct config_group	dev_action_group;
 	struct config_group	dev_group;
 	struct config_group	dev_pr_group;
 	struct se_dev_stat_grps dev_stat_grps;

From 892782caf19a97ccc95df51b3bb659ecacff986a Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Tue, 19 Dec 2017 04:03:58 -0600
Subject: [PATCH 176/676] tcmu: allow userspace to reset ring

This patch adds 2 tcmu attrs to block/unblock a device and
reset the ring buffer. They are used when the userspace
daemon has crashed or forced to shutdown while IO is executing.
On restart, the daemon can block the device so new IO is not
sent to userspace while it puts the ring in a clean state.

Notes: The reset ring opreation is specific to tcmu, but the
block one could be generic. I kept it tcmu specific, because
it requires some extra locking/state checks in the main IO
path and since other backend modules did not need this
functionality I thought only tcmu should take the perf hit.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 170 +++++++++++++++++++++++++++++-
 1 file changed, 166 insertions(+), 4 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 60c8a87b7a882..511168bec1593 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -121,6 +121,7 @@ struct tcmu_dev {
 
 #define TCMU_DEV_BIT_OPEN 0
 #define TCMU_DEV_BIT_BROKEN 1
+#define TCMU_DEV_BIT_BLOCKED 2
 	unsigned long flags;
 
 	struct uio_info uio_info;
@@ -875,6 +876,11 @@ static sense_reason_t queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, int *scsi_err)
 
 	*scsi_err = TCM_NO_SENSE;
 
+	if (test_bit(TCMU_DEV_BIT_BLOCKED, &udev->flags)) {
+		*scsi_err = TCM_LUN_BUSY;
+		return -1;
+	}
+
 	if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) {
 		*scsi_err = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
 		return -1;
@@ -1260,7 +1266,7 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
 	return &udev->se_dev;
 }
 
-static bool run_cmdr_queue(struct tcmu_dev *udev)
+static bool run_cmdr_queue(struct tcmu_dev *udev, bool fail)
 {
 	struct tcmu_cmd *tcmu_cmd, *tmp_cmd;
 	LIST_HEAD(cmds);
@@ -1271,7 +1277,7 @@ static bool run_cmdr_queue(struct tcmu_dev *udev)
 	if (list_empty(&udev->cmdr_queue))
 		return true;
 
-	pr_debug("running %s's cmdr queue\n", udev->name);
+	pr_debug("running %s's cmdr queue forcefail %d\n", udev->name, fail);
 
 	list_splice_init(&udev->cmdr_queue, &cmds);
 
@@ -1281,6 +1287,20 @@ static bool run_cmdr_queue(struct tcmu_dev *udev)
 	        pr_debug("removing cmd %u on dev %s from queue\n",
 		         tcmu_cmd->cmd_id, udev->name);
 
+		if (fail) {
+			idr_remove(&udev->commands, tcmu_cmd->cmd_id);
+			/*
+			 * We were not able to even start the command, so
+			 * fail with busy to allow a retry in case runner
+			 * was only temporarily down. If the device is being
+			 * removed then LIO core will do the right thing and
+			 * fail the retry.
+			 */
+			target_complete_cmd(tcmu_cmd->se_cmd, SAM_STAT_BUSY);
+			tcmu_free_cmd(tcmu_cmd);
+			continue;
+		}
+
 		ret = queue_cmd_ring(tcmu_cmd, &scsi_ret);
 		if (ret < 0) {
 		        pr_debug("cmd %u on dev %s failed with %u\n",
@@ -1317,7 +1337,7 @@ static int tcmu_irqcontrol(struct uio_info *info, s32 irq_on)
 
 	mutex_lock(&udev->cmdr_lock);
 	tcmu_handle_completions(udev);
-	run_cmdr_queue(udev);
+	run_cmdr_queue(udev, false);
 	mutex_unlock(&udev->cmdr_lock);
 
 	return 0;
@@ -1801,6 +1821,78 @@ static void tcmu_destroy_device(struct se_device *dev)
 	kref_put(&udev->kref, tcmu_dev_kref_release);
 }
 
+static void tcmu_unblock_dev(struct tcmu_dev *udev)
+{
+	mutex_lock(&udev->cmdr_lock);
+	clear_bit(TCMU_DEV_BIT_BLOCKED, &udev->flags);
+	mutex_unlock(&udev->cmdr_lock);
+}
+
+static void tcmu_block_dev(struct tcmu_dev *udev)
+{
+	mutex_lock(&udev->cmdr_lock);
+
+	if (test_and_set_bit(TCMU_DEV_BIT_BLOCKED, &udev->flags))
+		goto unlock;
+
+	/* complete IO that has executed successfully */
+	tcmu_handle_completions(udev);
+	/* fail IO waiting to be queued */
+	run_cmdr_queue(udev, true);
+
+unlock:
+	mutex_unlock(&udev->cmdr_lock);
+}
+
+static void tcmu_reset_ring(struct tcmu_dev *udev, u8 err_level)
+{
+	struct tcmu_mailbox *mb;
+	struct tcmu_cmd *cmd;
+	int i;
+
+	mutex_lock(&udev->cmdr_lock);
+
+	idr_for_each_entry(&udev->commands, cmd, i) {
+		if (!list_empty(&cmd->cmdr_queue_entry))
+			continue;
+
+		pr_debug("removing cmd %u on dev %s from ring (is expired %d)\n",
+			  cmd->cmd_id, udev->name,
+			  test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags));
+
+		idr_remove(&udev->commands, i);
+		if (!test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) {
+			if (err_level == 1) {
+				/*
+				 * Userspace was not able to start the
+				 * command or it is retryable.
+				 */
+				target_complete_cmd(cmd->se_cmd, SAM_STAT_BUSY);
+			} else {
+				/* hard failure */
+				target_complete_cmd(cmd->se_cmd,
+						    SAM_STAT_CHECK_CONDITION);
+			}
+		}
+		tcmu_cmd_free_data(cmd, cmd->dbi_cnt);
+		tcmu_free_cmd(cmd);
+	}
+
+	mb = udev->mb_addr;
+	tcmu_flush_dcache_range(mb, sizeof(*mb));
+	pr_debug("mb last %u head %u tail %u\n", udev->cmdr_last_cleaned,
+		 mb->cmd_tail, mb->cmd_head);
+
+	udev->cmdr_last_cleaned = 0;
+	mb->cmd_tail = 0;
+	mb->cmd_head = 0;
+	tcmu_flush_dcache_range(mb, sizeof(*mb));
+
+	del_timer(&udev->cmd_timer);
+
+	mutex_unlock(&udev->cmdr_lock);
+}
+
 enum {
 	Opt_dev_config, Opt_dev_size, Opt_hw_block_size, Opt_hw_max_sectors,
 	Opt_nl_reply_supported, Opt_max_data_area_mb, Opt_err,
@@ -2192,6 +2284,70 @@ static ssize_t tcmu_emulate_write_cache_store(struct config_item *item,
 }
 CONFIGFS_ATTR(tcmu_, emulate_write_cache);
 
+static ssize_t tcmu_block_dev_show(struct config_item *item, char *page)
+{
+	struct se_device *se_dev = container_of(to_config_group(item),
+						struct se_device,
+						dev_action_group);
+	struct tcmu_dev *udev = TCMU_DEV(se_dev);
+
+	if (test_bit(TCMU_DEV_BIT_BLOCKED, &udev->flags))
+		return snprintf(page, PAGE_SIZE, "%s\n", "blocked");
+	else
+		return snprintf(page, PAGE_SIZE, "%s\n", "unblocked");
+}
+
+static ssize_t tcmu_block_dev_store(struct config_item *item, const char *page,
+				    size_t count)
+{
+	struct se_device *se_dev = container_of(to_config_group(item),
+						struct se_device,
+						dev_action_group);
+	struct tcmu_dev *udev = TCMU_DEV(se_dev);
+	u8 val;
+	int ret;
+
+	ret = kstrtou8(page, 0, &val);
+	if (ret < 0)
+		return ret;
+
+	if (val > 1) {
+		pr_err("Invalid block value %d\n", val);
+		return -EINVAL;
+	}
+
+	if (!val)
+		tcmu_unblock_dev(udev);
+	else
+		tcmu_block_dev(udev);
+	return count;
+}
+CONFIGFS_ATTR(tcmu_, block_dev);
+
+static ssize_t tcmu_reset_ring_store(struct config_item *item, const char *page,
+				     size_t count)
+{
+	struct se_device *se_dev = container_of(to_config_group(item),
+						struct se_device,
+						dev_action_group);
+	struct tcmu_dev *udev = TCMU_DEV(se_dev);
+	u8 val;
+	int ret;
+
+	ret = kstrtou8(page, 0, &val);
+	if (ret < 0)
+		return ret;
+
+	if (val != 1 && val != 2) {
+		pr_err("Invalid reset ring value %d\n", val);
+		return -EINVAL;
+	}
+
+	tcmu_reset_ring(udev, val);
+	return count;
+}
+CONFIGFS_ATTR_WO(tcmu_, reset_ring);
+
 static struct configfs_attribute *tcmu_attrib_attrs[] = {
 	&tcmu_attr_cmd_time_out,
 	&tcmu_attr_qfull_time_out,
@@ -2205,6 +2361,12 @@ static struct configfs_attribute *tcmu_attrib_attrs[] = {
 
 static struct configfs_attribute **tcmu_attrs;
 
+static struct configfs_attribute *tcmu_action_attrs[] = {
+	&tcmu_attr_block_dev,
+	&tcmu_attr_reset_ring,
+	NULL,
+};
+
 static struct target_backend_ops tcmu_ops = {
 	.name			= "user",
 	.owner			= THIS_MODULE,
@@ -2220,7 +2382,7 @@ static struct target_backend_ops tcmu_ops = {
 	.show_configfs_dev_params = tcmu_show_configfs_dev_params,
 	.get_device_type	= sbc_get_device_type,
 	.get_blocks		= tcmu_get_blocks,
-	.tb_dev_attrib_attrs	= NULL,
+	.tb_dev_action_attrs	= tcmu_action_attrs,
 };
 
 static void find_free_blocks(void)

From 0482b50546b12e82346903500af081ed329f915c Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Wed, 17 Jan 2018 12:35:27 +0100
Subject: [PATCH 177/676] arm64: mm: Add additional parameter to
 uaccess_ttbr0_disable

Add an extra temporary register parameter to uaccess_ttbr0_disable which
is about to be required for arm64 PAN support.

This patch doesn't introduce any functional change but ensures that the
kernel compiles once the KVM/ARM tree is merged with the arm64 tree by
ensuring a trivially mergable conflict with commit
6b88a32c7af68895134872cdec3b6bfdb532d94e
("arm64: kpti: Fix the interaction between ASID switching and software PAN").

Cc: Will Deacon <will.deacon@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm64/include/asm/asm-uaccess.h | 8 ++++----
 arch/arm64/lib/clear_user.S          | 2 +-
 arch/arm64/lib/copy_from_user.S      | 2 +-
 arch/arm64/lib/copy_in_user.S        | 2 +-
 arch/arm64/lib/copy_to_user.S        | 2 +-
 arch/arm64/mm/cache.S                | 4 ++--
 arch/arm64/xen/hypercall.S           | 2 +-
 7 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index b67563d2024c7..03064261ee0bb 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -25,7 +25,7 @@
 	isb
 	.endm
 
-	.macro	uaccess_ttbr0_disable, tmp1
+	.macro	uaccess_ttbr0_disable, tmp1, tmp2
 alternative_if_not ARM64_HAS_PAN
 	__uaccess_ttbr0_disable \tmp1
 alternative_else_nop_endif
@@ -39,7 +39,7 @@ alternative_if_not ARM64_HAS_PAN
 alternative_else_nop_endif
 	.endm
 #else
-	.macro	uaccess_ttbr0_disable, tmp1
+	.macro	uaccess_ttbr0_disable, tmp1, tmp2
 	.endm
 
 	.macro	uaccess_ttbr0_enable, tmp1, tmp2, tmp3
@@ -49,8 +49,8 @@ alternative_else_nop_endif
 /*
  * These macros are no-ops when UAO is present.
  */
-	.macro	uaccess_disable_not_uao, tmp1
-	uaccess_ttbr0_disable \tmp1
+	.macro	uaccess_disable_not_uao, tmp1, tmp2
+	uaccess_ttbr0_disable \tmp1, \tmp2
 alternative_if ARM64_ALT_PAN_NOT_UAO
 	SET_PSTATE_PAN(1)
 alternative_else_nop_endif
diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
index e88fb99c15616..8932e5f7a6f39 100644
--- a/arch/arm64/lib/clear_user.S
+++ b/arch/arm64/lib/clear_user.S
@@ -50,7 +50,7 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2
 	b.mi	5f
 uao_user_alternative 9f, strb, sttrb, wzr, x0, 0
 5:	mov	x0, #0
-	uaccess_disable_not_uao x2
+	uaccess_disable_not_uao x2, x3
 	ret
 ENDPROC(__clear_user)
 
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 4b5d826895ff1..bc108634992c1 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -67,7 +67,7 @@ ENTRY(__arch_copy_from_user)
 	uaccess_enable_not_uao x3, x4
 	add	end, x0, x2
 #include "copy_template.S"
-	uaccess_disable_not_uao x3
+	uaccess_disable_not_uao x3, x4
 	mov	x0, #0				// Nothing to copy
 	ret
 ENDPROC(__arch_copy_from_user)
diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S
index b24a830419ad9..e6dd59dd40534 100644
--- a/arch/arm64/lib/copy_in_user.S
+++ b/arch/arm64/lib/copy_in_user.S
@@ -68,7 +68,7 @@ ENTRY(raw_copy_in_user)
 	uaccess_enable_not_uao x3, x4
 	add	end, x0, x2
 #include "copy_template.S"
-	uaccess_disable_not_uao x3
+	uaccess_disable_not_uao x3, x4
 	mov	x0, #0
 	ret
 ENDPROC(raw_copy_in_user)
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 351f0766f7a61..bd20f9f7dd842 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -66,7 +66,7 @@ ENTRY(__arch_copy_to_user)
 	uaccess_enable_not_uao x3, x4
 	add	end, x0, x2
 #include "copy_template.S"
-	uaccess_disable_not_uao x3
+	uaccess_disable_not_uao x3, x4
 	mov	x0, #0
 	ret
 ENDPROC(__arch_copy_to_user)
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 5a52811f47e9d..758bde7e2fa68 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -63,7 +63,7 @@ user_alt 9f, "dc cvau, x4",  "dc civac, x4",  ARM64_WORKAROUND_CLEAN_CACHE
 	invalidate_icache_by_line x0, x1, x2, x3, 9f
 	mov	x0, #0
 1:
-	uaccess_ttbr0_disable x1
+	uaccess_ttbr0_disable x1, x2
 	ret
 9:
 	mov	x0, #-EFAULT
@@ -85,7 +85,7 @@ ENTRY(invalidate_icache_range)
 	invalidate_icache_by_line x0, x1, x2, x3, 2f
 	mov	x0, xzr
 1:
-	uaccess_ttbr0_disable x1
+	uaccess_ttbr0_disable x1, x2
 	ret
 2:
 	mov	x0, #-EFAULT
diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S
index acdbd2c9e899c..c5f05c4a4d008 100644
--- a/arch/arm64/xen/hypercall.S
+++ b/arch/arm64/xen/hypercall.S
@@ -107,6 +107,6 @@ ENTRY(privcmd_call)
 	/*
 	 * Disable userspace access from kernel once the hyp call completed.
 	 */
-	uaccess_ttbr0_disable x6
+	uaccess_ttbr0_disable x6, x7
 	ret
 ENDPROC(privcmd_call);

From 00608e1f007e4cf6031485c5630e0e504bceef9b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 11 Jan 2018 16:54:26 +1100
Subject: [PATCH 178/676] KVM: PPC: Book3S HV: Allow HPT and radix on the same
 core for POWER9 v2.2

POWER9 chip versions starting with "Nimbus" v2.2 can support running
with some threads of a core in HPT mode and others in radix mode.
This means that we don't have to prohibit independent-threads mode
when running a HPT guest on a radix host, and we don't have to do any
of the synchronization between threads that was introduced in commit
c01015091a77 ("KVM: PPC: Book3S HV: Run HPT guests on POWER9 radix
hosts", 2017-10-19).

Rather than using up another CPU feature bit, we just do an
explicit test on the PVR (processor version register) at module
startup time to determine whether we have to take steps to avoid
having some threads in HPT mode and some in radix mode (so-called
"mixed mode").  We test for "Nimbus" (indicated by 0 or 1 in the top
nibble of the lower 16 bits) v2.2 or later, or "Cumulus" (indicated by
2 or 3 in that nibble) v1.1 or later.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_hv.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index b2d448c750086..76cf48051eb3e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -118,6 +118,9 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
 MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
 #endif
 
+/* If set, the threads on each CPU core have to be in the same MMU mode */
+static bool no_mixing_hpt_and_radix;
+
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
@@ -2386,8 +2389,8 @@ static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
 static bool subcore_config_ok(int n_subcores, int n_threads)
 {
 	/*
-	 * POWER9 "SMT4" cores are permanently in what is effectively a 4-way split-core
-	 * mode, with one thread per subcore.
+	 * POWER9 "SMT4" cores are permanently in what is effectively a 4-way
+	 * split-core mode, with one thread per subcore.
 	 */
 	if (cpu_has_feature(CPU_FTR_ARCH_300))
 		return n_subcores <= 4 && n_threads == 1;
@@ -2423,8 +2426,8 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
 	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
 		return false;
 
-	/* POWER9 currently requires all threads to be in the same MMU mode */
-	if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+	/* Some POWER9 chips require all threads to be in the same MMU mode */
+	if (no_mixing_hpt_and_radix &&
 	    kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
 		return false;
 
@@ -2687,9 +2690,11 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	 * threads are offline.  Also check if the number of threads in this
 	 * guest are greater than the current system threads per guest.
 	 * On POWER9, we need to be not in independent-threads mode if
-	 * this is a HPT guest on a radix host.
+	 * this is a HPT guest on a radix host machine where the
+	 * CPU threads may not be in different MMU modes.
 	 */
-	hpt_on_radix = radix_enabled() && !kvm_is_radix(vc->kvm);
+	hpt_on_radix = no_mixing_hpt_and_radix && radix_enabled() &&
+		!kvm_is_radix(vc->kvm);
 	if (((controlled_threads > 1) &&
 	     ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) ||
 	    (hpt_on_radix && vc->kvm->arch.threads_indep)) {
@@ -4446,6 +4451,19 @@ static int kvmppc_book3s_init_hv(void)
 
 	if (kvmppc_radix_possible())
 		r = kvmppc_radix_init();
+
+	/*
+	 * POWER9 chips before version 2.02 can't have some threads in
+	 * HPT mode and some in radix mode on the same core.
+	 */
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		unsigned int pvr = mfspr(SPRN_PVR);
+		if ((pvr >> 16) == PVR_POWER9 &&
+		    (((pvr & 0xe000) == 0 && (pvr & 0xfff) < 0x202) ||
+		     ((pvr & 0xe000) == 0x2000 && (pvr & 0xfff) < 0x101)))
+			no_mixing_hpt_and_radix = true;
+	}
+
 	return r;
 }
 

From c82b59e7c3f81059b1d280e21028c7ac8451dd52 Mon Sep 17 00:00:00 2001
From: tangwenji <tang.wenji@zte.com.cn>
Date: Mon, 15 Jan 2018 20:09:37 +0800
Subject: [PATCH 179/676] target: fix destroy device in target_configure_device

After dev->transport->configure_device succeeds, target_configure_device
exits abnormally, dev_flags has not set DF_CONFIGURED yet, does not call
destroy_device function in free_device.

Signed-off-by: tangwenji <tang.wenji@zte.com.cn>
Acked-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_device.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
index e8dd6da164b28..e27db4d45a9d3 100644
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -997,7 +997,7 @@ int target_configure_device(struct se_device *dev)
 
 	ret = core_setup_alua(dev);
 	if (ret)
-		goto out_free_index;
+		goto out_destroy_device;
 
 	/*
 	 * Startup the struct se_device processing thread
@@ -1041,6 +1041,8 @@ int target_configure_device(struct se_device *dev)
 
 out_free_alua:
 	core_alua_free_lu_gp_mem(dev);
+out_destroy_device:
+	dev->transport->destroy_device(dev);
 out_free_index:
 	mutex_lock(&device_mutex);
 	idr_remove(&devices_idr, dev->dev_index);

From 45dc488c0ee19ba5cba7a67be473aeaf88a7447e Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Mon, 15 Jan 2018 14:37:59 -0600
Subject: [PATCH 180/676] tcmu: fix cmd user after free

If we are failing the command due to a qfull timeout we are
also freeing the tcmu command, so we cannot access it later
to get the se_cmd.

Note: The clearing of cmd->se_cmd is not needed. We do not check
it later for something like determining if the command was failed
due to a timeout. As a result I am dropping it.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 511168bec1593..3096257a00d91 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -1152,6 +1152,7 @@ static int tcmu_check_expired_cmd(int id, void *p, void *data)
 		return 0;
 
 	is_running = list_empty(&cmd->cmdr_queue_entry);
+	se_cmd = cmd->se_cmd;
 
 	if (is_running) {
 		/*
@@ -1177,8 +1178,6 @@ static int tcmu_check_expired_cmd(int id, void *p, void *data)
 	pr_debug("Timing out cmd %u on dev %s that is %s.\n",
 		 id, udev->name, is_running ? "inflight" : "queued");
 
-	se_cmd = cmd->se_cmd;
-	cmd->se_cmd = NULL;
 	target_complete_cmd(se_cmd, scsi_status);
 	return 0;
 }

From 85fae482a9dd3560b9dcd35136fb29ccab6fae48 Mon Sep 17 00:00:00 2001
From: Luis de Bethencourt <luisbg@kernel.org>
Date: Tue, 16 Jan 2018 15:34:32 +0000
Subject: [PATCH 181/676] tcmu: Fix trailing semicolon

The trailing semicolon is an empty statement that does no operation.
It is completely stripped out by the compiler. Removing it since it doesn't do
anything.

Signed-off-by: Luis de Bethencourt <luisbg@kernel.org>
Acked-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 3096257a00d91..4ad89ea71a701 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -1583,7 +1583,7 @@ static int tcmu_wait_genl_cmd_reply(struct tcmu_dev *udev)
 
 	wake_up_all(&udev->nl_cmd_wq);
 
-	return ret;;
+	return ret;
 }
 
 static int tcmu_netlink_event(struct tcmu_dev *udev, enum tcmu_genl_cmd cmd,

From c424c108233dc422a9a29ee833154006a5bdf9fc Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Fri, 12 Jan 2018 13:37:11 +1100
Subject: [PATCH 182/676] KVM: PPC: Book3S HV: Add more info about XIVE queues
 in debugfs

Add details about enabled queues and escalation interrupts.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_xive.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index bf457843e0321..6cff5bdfd6b7e 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -1794,6 +1794,7 @@ static int xive_debug_show(struct seq_file *m, void *private)
 
 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+		unsigned int i;
 
 		if (!xc)
 			continue;
@@ -1803,6 +1804,33 @@ static int xive_debug_show(struct seq_file *m, void *private)
 			   xc->server_num, xc->cppr, xc->hw_cppr,
 			   xc->mfrr, xc->pending,
 			   xc->stat_rm_h_xirr, xc->stat_vm_h_xirr);
+		for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
+			struct xive_q *q = &xc->queues[i];
+			u32 i0, i1, idx;
+
+			if (!q->qpage && !xc->esc_virq[i])
+				continue;
+
+			seq_printf(m, " [q%d]: ", i);
+
+			if (q->qpage) {
+				idx = q->idx;
+				i0 = be32_to_cpup(q->qpage + idx);
+				idx = (idx + 1) & q->msk;
+				i1 = be32_to_cpup(q->qpage + idx);
+				seq_printf(m, "T=%d %08x %08x... \n", q->toggle, i0, i1);
+			}
+			if (xc->esc_virq[i]) {
+				struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]);
+				struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+				u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET);
+				seq_printf(m, "E:%c%c I(%d:%llx:%llx)",
+					   (pq & XIVE_ESB_VAL_P) ? 'P' : 'p',
+					   (pq & XIVE_ESB_VAL_Q) ? 'Q' : 'q',
+					   xc->esc_virq[i], pq, xd->eoi_page);
+				seq_printf(m, "\n");
+			}
+		}
 
 		t_rm_h_xirr += xc->stat_rm_h_xirr;
 		t_rm_h_ipoll += xc->stat_rm_h_ipoll;

From bf4159da4751ab8eea43ca6e7c49193dbce8398c Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Fri, 12 Jan 2018 13:37:12 +1100
Subject: [PATCH 183/676] KVM: PPC: Book3S HV: Enable use of the new XIVE
 "single escalation" feature

That feature, provided by Power9 DD2.0 and later, when supported
by newer OPAL versions, allows us to sacrifice a queue (priority 7)
in favor of merging all the escalation interrupts of the queues
of a single VP into a single interrupt.

This reduces the number of host interrupts used up by KVM guests
especially when those guests use multiple priorities.

It will also enable a future change to control the masking of the
escalation interrupts more precisely to avoid spurious ones.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/opal-api.h |  1 +
 arch/powerpc/include/asm/xive.h     |  3 +-
 arch/powerpc/kvm/book3s_xive.c      | 48 +++++++++++++++++++----------
 arch/powerpc/kvm/book3s_xive.h      | 15 ++++-----
 arch/powerpc/sysdev/xive/native.c   | 18 +++++++++--
 5 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index 233c7504b1f20..fc926743647ec 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -1073,6 +1073,7 @@ enum {
 /* Flags for OPAL_XIVE_GET/SET_VP_INFO */
 enum {
 	OPAL_XIVE_VP_ENABLED		= 0x00000001,
+	OPAL_XIVE_VP_SINGLE_ESCALATION	= 0x00000002,
 };
 
 /* "Any chip" replacement for chip ID for allocation functions */
diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index b619a5585cd68..e602903c3029e 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -111,9 +111,10 @@ extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio);
 
 extern void xive_native_sync_source(u32 hw_irq);
 extern bool is_xive_irq(struct irq_chip *chip);
-extern int xive_native_enable_vp(u32 vp_id);
+extern int xive_native_enable_vp(u32 vp_id, bool single_escalation);
 extern int xive_native_disable_vp(u32 vp_id);
 extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id);
+extern bool xive_native_has_single_escalation(void);
 
 #else
 
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 6cff5bdfd6b7e..a102efeabf05a 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -112,19 +112,21 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
 		return -EIO;
 	}
 
-	/*
-	 * Future improvement: start with them disabled
-	 * and handle DD2 and later scheme of merged escalation
-	 * interrupts
-	 */
-	name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d",
-			 vcpu->kvm->arch.lpid, xc->server_num, prio);
+	if (xc->xive->single_escalation)
+		name = kasprintf(GFP_KERNEL, "kvm-%d-%d",
+				 vcpu->kvm->arch.lpid, xc->server_num);
+	else
+		name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d",
+				 vcpu->kvm->arch.lpid, xc->server_num, prio);
 	if (!name) {
 		pr_err("Failed to allocate escalation irq name for queue %d of VCPU %d\n",
 		       prio, xc->server_num);
 		rc = -ENOMEM;
 		goto error;
 	}
+
+	pr_devel("Escalation %s irq %d (prio %d)\n", name, xc->esc_virq[prio], prio);
+
 	rc = request_irq(xc->esc_virq[prio], xive_esc_irq,
 			 IRQF_NO_THREAD, name, vcpu);
 	if (rc) {
@@ -191,12 +193,12 @@ static int xive_check_provisioning(struct kvm *kvm, u8 prio)
 
 	pr_devel("Provisioning prio... %d\n", prio);
 
-	/* Provision each VCPU and enable escalations */
+	/* Provision each VCPU and enable escalations if needed */
 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		if (!vcpu->arch.xive_vcpu)
 			continue;
 		rc = xive_provision_queue(vcpu, prio);
-		if (rc == 0)
+		if (rc == 0 && !xive->single_escalation)
 			xive_attach_escalation(vcpu, prio);
 		if (rc)
 			return rc;
@@ -1081,6 +1083,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
 	/* Allocate IPI */
 	xc->vp_ipi = xive_native_alloc_irq();
 	if (!xc->vp_ipi) {
+		pr_err("Failed to allocate xive irq for VCPU IPI\n");
 		r = -EIO;
 		goto bail;
 	}
@@ -1090,19 +1093,34 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
 	if (r)
 		goto bail;
 
+	/*
+	 * Enable the VP first as the single escalation mode will
+	 * affect escalation interrupts numbering
+	 */
+	r = xive_native_enable_vp(xc->vp_id, xive->single_escalation);
+	if (r) {
+		pr_err("Failed to enable VP in OPAL, err %d\n", r);
+		goto bail;
+	}
+
 	/*
 	 * Initialize queues. Initially we set them all for no queueing
 	 * and we enable escalation for queue 0 only which we'll use for
 	 * our mfrr change notifications. If the VCPU is hot-plugged, we
-	 * do handle provisioning however.
+	 * do handle provisioning however based on the existing "map"
+	 * of enabled queues.
 	 */
 	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
 		struct xive_q *q = &xc->queues[i];
 
+		/* Single escalation, no queue 7 */
+		if (i == 7 && xive->single_escalation)
+			break;
+
 		/* Is queue already enabled ? Provision it */
 		if (xive->qmap & (1 << i)) {
 			r = xive_provision_queue(vcpu, i);
-			if (r == 0)
+			if (r == 0 && !xive->single_escalation)
 				xive_attach_escalation(vcpu, i);
 			if (r)
 				goto bail;
@@ -1122,11 +1140,6 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
 	if (r)
 		goto bail;
 
-	/* Enable the VP */
-	r = xive_native_enable_vp(xc->vp_id);
-	if (r)
-		goto bail;
-
 	/* Route the IPI */
 	r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI);
 	if (!r)
@@ -1473,6 +1486,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
 
 	pr_devel("  val=0x016%llx (server=0x%x, guest_prio=%d)\n",
 		 val, server, guest_prio);
+
 	/*
 	 * If the source doesn't already have an IPI, allocate
 	 * one and get the corresponding data
@@ -1761,6 +1775,8 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
 	if (xive->vp_base == XIVE_INVALID_VP)
 		ret = -ENOMEM;
 
+	xive->single_escalation = xive_native_has_single_escalation();
+
 	if (ret) {
 		kfree(xive);
 		return ret;
diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
index 6ba63f8e8a614..a08ae6fd4c51f 100644
--- a/arch/powerpc/kvm/book3s_xive.h
+++ b/arch/powerpc/kvm/book3s_xive.h
@@ -120,6 +120,8 @@ struct kvmppc_xive {
 	u32	q_order;
 	u32	q_page_order;
 
+	/* Flags */
+	u8	single_escalation;
 };
 
 #define KVMPPC_XIVE_Q_COUNT	8
@@ -201,25 +203,20 @@ static inline struct kvmppc_xive_src_block *kvmppc_xive_find_source(struct kvmpp
  * is as follow.
  *
  * Guest request for 0...6 are honored. Guest request for anything
- * higher results in a priority of 7 being applied.
- *
- * However, when XIRR is returned via H_XIRR, 7 is translated to 0xb
- * in order to match AIX expectations
+ * higher results in a priority of 6 being applied.
  *
  * Similar mapping is done for CPPR values
  */
 static inline u8 xive_prio_from_guest(u8 prio)
 {
-	if (prio == 0xff || prio < 8)
+	if (prio == 0xff || prio < 6)
 		return prio;
-	return 7;
+	return 6;
 }
 
 static inline u8 xive_prio_to_guest(u8 prio)
 {
-	if (prio == 0xff || prio < 7)
-		return prio;
-	return 0xb;
+	return prio;
 }
 
 static inline u32 __xive_read_eq(__be32 *qpage, u32 msk, u32 *idx, u32 *toggle)
diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
index ebc244b08d674..d22aeb0b69e10 100644
--- a/arch/powerpc/sysdev/xive/native.c
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -42,6 +42,7 @@ static u32 xive_provision_chip_count;
 static u32 xive_queue_shift;
 static u32 xive_pool_vps = XIVE_INVALID_VP;
 static struct kmem_cache *xive_provision_cache;
+static bool xive_has_single_esc;
 
 int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data)
 {
@@ -571,6 +572,10 @@ bool __init xive_native_init(void)
 			break;
 	}
 
+	/* Do we support single escalation */
+	if (of_get_property(np, "single-escalation-support", NULL) != NULL)
+		xive_has_single_esc = true;
+
 	/* Configure Thread Management areas for KVM */
 	for_each_possible_cpu(cpu)
 		kvmppc_set_xive_tima(cpu, r.start, tima);
@@ -667,12 +672,15 @@ void xive_native_free_vp_block(u32 vp_base)
 }
 EXPORT_SYMBOL_GPL(xive_native_free_vp_block);
 
-int xive_native_enable_vp(u32 vp_id)
+int xive_native_enable_vp(u32 vp_id, bool single_escalation)
 {
 	s64 rc;
+	u64 flags = OPAL_XIVE_VP_ENABLED;
 
+	if (single_escalation)
+		flags |= OPAL_XIVE_VP_SINGLE_ESCALATION;
 	for (;;) {
-		rc = opal_xive_set_vp_info(vp_id, OPAL_XIVE_VP_ENABLED, 0);
+		rc = opal_xive_set_vp_info(vp_id, flags, 0);
 		if (rc != OPAL_BUSY)
 			break;
 		msleep(1);
@@ -710,3 +718,9 @@ int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xive_native_get_vp_info);
+
+bool xive_native_has_single_escalation(void)
+{
+	return xive_has_single_esc;
+}
+EXPORT_SYMBOL_GPL(xive_native_has_single_escalation);

From 2267ea7661798a42f0da648a2970e2a03f4bc370 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Fri, 12 Jan 2018 13:37:13 +1100
Subject: [PATCH 184/676] KVM: PPC: Book3S HV: Don't use existing "prodded"
 flag for XIVE escalations

The prodded flag is only cleared at the beginning of H_CEDE,
so every time we have an escalation, we will cause the *next*
H_CEDE to return immediately.

Instead use a dedicated "irq_pending" flag to indicate that
a guest interrupt is pending for the VCPU. We don't reuse the
existing exception bitmap so as to avoid expensive atomic ops.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/kvm_host.h     |  1 +
 arch/powerpc/kernel/asm-offsets.c       |  1 +
 arch/powerpc/kvm/book3s_hv.c            |  2 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 10 ++++++++++
 arch/powerpc/kvm/book3s_xive.c          |  3 +--
 5 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 3aa5b577cd609..bfe51356af5e1 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -709,6 +709,7 @@ struct kvm_vcpu_arch {
 	u8 ceded;
 	u8 prodded;
 	u8 doorbell_request;
+	u8 irq_pending; /* Used by XIVE to signal pending guest irqs */
 	u32 last_inst;
 
 	struct swait_queue_head *wqp;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 6b958414b4e03..825089cf3e235 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -514,6 +514,7 @@ int main(void)
 	OFFSET(VCPU_PENDING_EXC, kvm_vcpu, arch.pending_exceptions);
 	OFFSET(VCPU_CEDED, kvm_vcpu, arch.ceded);
 	OFFSET(VCPU_PRODDED, kvm_vcpu, arch.prodded);
+	OFFSET(VCPU_IRQ_PENDING, kvm_vcpu, arch.irq_pending);
 	OFFSET(VCPU_DBELL_REQ, kvm_vcpu, arch.doorbell_request);
 	OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr);
 	OFFSET(VCPU_PMC, kvm_vcpu, arch.pmc);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 76cf48051eb3e..e5f81fc108e09 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2999,7 +2999,7 @@ static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
 {
 	if (!xive_enabled())
 		return false;
-	return vcpu->arch.xive_saved_state.pipr <
+	return vcpu->arch.irq_pending || vcpu->arch.xive_saved_state.pipr <
 		vcpu->arch.xive_saved_state.cppr;
 }
 #else
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 7daf21be33d0b..34dbab7deb39e 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1035,6 +1035,16 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 	li	r9, 1
 	stw	r9, VCPU_XIVE_PUSHED(r4)
 	eieio
+
+	/*
+	 * We clear the irq_pending flag. There is a small chance of a
+	 * race vs. the escalation interrupt happening on another
+	 * processor setting it again, but the only consequence is to
+	 * cause a spurrious wakeup on the next H_CEDE which is not an
+	 * issue.
+	 */
+	li	r0,0
+	stb	r0, VCPU_IRQ_PENDING(r4)
 no_xive:
 #endif /* CONFIG_KVM_XICS */
 
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index a102efeabf05a..eef9ccafdc098 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -84,8 +84,7 @@ static irqreturn_t xive_esc_irq(int irq, void *data)
 {
 	struct kvm_vcpu *vcpu = data;
 
-	/* We use the existing H_PROD mechanism to wake up the target */
-	vcpu->arch.prodded = 1;
+	vcpu->arch.irq_pending = 1;
 	smp_mb();
 	if (vcpu->arch.ceded)
 		kvmppc_fast_vcpu_kick(vcpu);

From 2662efd050953824de5c9b24449d6b5b342db10b Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Fri, 12 Jan 2018 13:37:14 +1100
Subject: [PATCH 185/676] KVM: PPC: Book3S HV: Check DR not IR to chose real vs
 virt mode MMIOs

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 34dbab7deb39e..948f21cf84d56 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1464,7 +1464,7 @@ mc_cont:
 	li	r7, TM_SPC_PULL_OS_CTX
 	li	r6, TM_QW1_OS
 	mfmsr	r0
-	andi.	r0, r0, MSR_IR		/* in real mode? */
+	andi.	r0, r0, MSR_DR		/* in real mode? */
 	beq	2f
 	ld	r10, HSTATE_XIVE_TIMA_VIRT(r13)
 	cmpldi	cr0, r10, 0

From 35c2405efc0142860c4b698f4c6331567c4ca1ef Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Fri, 12 Jan 2018 13:37:15 +1100
Subject: [PATCH 186/676] KVM: PPC: Book3S HV: Make xive_pushed a byte, not a
 word

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/kvm_host.h     | 2 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index bfe51356af5e1..0c44fa67608d9 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -739,7 +739,7 @@ struct kvm_vcpu_arch {
 	struct kvmppc_icp *icp; /* XICS presentation controller */
 	struct kvmppc_xive_vcpu *xive_vcpu; /* XIVE virtual CPU data */
 	__be32 xive_cam_word;    /* Cooked W2 in proper endian with valid bit */
-	u32 xive_pushed;	 /* Is the VP pushed on the physical CPU ? */
+	u8 xive_pushed;		 /* Is the VP pushed on the physical CPU ? */
 	union xive_tma_w01 xive_saved_state; /* W0..1 of XIVE thread state */
 #endif
 
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 948f21cf84d56..a7f429bc6de07 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1033,7 +1033,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 	li	r9, TM_QW1_OS + TM_WORD2
 	stwcix	r11,r9,r10
 	li	r9, 1
-	stw	r9, VCPU_XIVE_PUSHED(r4)
+	stb	r9, VCPU_XIVE_PUSHED(r4)
 	eieio
 
 	/*
@@ -1458,7 +1458,7 @@ mc_cont:
 #endif
 #ifdef CONFIG_KVM_XICS
 	/* We are exiting, pull the VP from the XIVE */
-	lwz	r0, VCPU_XIVE_PUSHED(r9)
+	lbz	r0, VCPU_XIVE_PUSHED(r9)
 	cmpwi	cr0, r0, 0
 	beq	1f
 	li	r7, TM_SPC_PULL_OS_CTX
@@ -1487,7 +1487,7 @@ mc_cont:
 	/* Fixup some of the state for the next load */
 	li	r10, 0
 	li	r0, 0xff
-	stw	r10, VCPU_XIVE_PUSHED(r9)
+	stb	r10, VCPU_XIVE_PUSHED(r9)
 	stb	r10, (VCPU_XIVE_SAVED_STATE+3)(r9)
 	stb	r0, (VCPU_XIVE_SAVED_STATE+4)(r9)
 	eieio

From 9b9b13a6d1537ddc4caccd6f1c41b78edbc08437 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Fri, 12 Jan 2018 13:37:16 +1100
Subject: [PATCH 187/676] KVM: PPC: Book3S HV: Keep XIVE escalation interrupt
 masked unless ceded

This works on top of the single escalation support. When in single
escalation, with this change, we will keep the escalation interrupt
disabled unless the VCPU is in H_CEDE (idle). In any other case, we
know the VCPU will be rescheduled and thus there is no need to take
escalation interrupts in the host whenever a guest interrupt fires.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/kvm_host.h     |  3 ++
 arch/powerpc/kernel/asm-offsets.c       |  3 ++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 62 ++++++++++++++++++++++++-
 arch/powerpc/kvm/book3s_xive.c          | 30 ++++++++++++
 4 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 0c44fa67608d9..fef8133becc85 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -740,7 +740,10 @@ struct kvm_vcpu_arch {
 	struct kvmppc_xive_vcpu *xive_vcpu; /* XIVE virtual CPU data */
 	__be32 xive_cam_word;    /* Cooked W2 in proper endian with valid bit */
 	u8 xive_pushed;		 /* Is the VP pushed on the physical CPU ? */
+	u8 xive_esc_on;		 /* Is the escalation irq enabled ? */
 	union xive_tma_w01 xive_saved_state; /* W0..1 of XIVE thread state */
+	u64 xive_esc_raddr;	 /* Escalation interrupt ESB real addr */
+	u64 xive_esc_vaddr;	 /* Escalation interrupt ESB virt addr */
 #endif
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 825089cf3e235..1672dffd94e2e 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -734,6 +734,9 @@ int main(void)
 	DEFINE(VCPU_XIVE_CAM_WORD, offsetof(struct kvm_vcpu,
 					    arch.xive_cam_word));
 	DEFINE(VCPU_XIVE_PUSHED, offsetof(struct kvm_vcpu, arch.xive_pushed));
+	DEFINE(VCPU_XIVE_ESC_ON, offsetof(struct kvm_vcpu, arch.xive_esc_on));
+	DEFINE(VCPU_XIVE_ESC_RADDR, offsetof(struct kvm_vcpu, arch.xive_esc_raddr));
+	DEFINE(VCPU_XIVE_ESC_VADDR, offsetof(struct kvm_vcpu, arch.xive_esc_vaddr));
 #endif
 
 #ifdef CONFIG_KVM_EXIT_TIMING
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index a7f429bc6de07..a7a20b85d8eb8 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1045,6 +1045,41 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 	 */
 	li	r0,0
 	stb	r0, VCPU_IRQ_PENDING(r4)
+
+	/*
+	 * In single escalation mode, if the escalation interrupt is
+	 * on, we mask it.
+	 */
+	lbz	r0, VCPU_XIVE_ESC_ON(r4)
+	cmpwi	r0,0
+	beq	1f
+	ld	r10, VCPU_XIVE_ESC_RADDR(r4)
+	li	r9, XIVE_ESB_SET_PQ_01
+	ldcix	r0, r10, r9
+	sync
+
+	/* We have a possible subtle race here: The escalation interrupt might
+	 * have fired and be on its way to the host queue while we mask it,
+	 * and if we unmask it early enough (re-cede right away), there is
+	 * a theorical possibility that it fires again, thus landing in the
+	 * target queue more than once which is a big no-no.
+	 *
+	 * Fortunately, solving this is rather easy. If the above load setting
+	 * PQ to 01 returns a previous value where P is set, then we know the
+	 * escalation interrupt is somewhere on its way to the host. In that
+	 * case we simply don't clear the xive_esc_on flag below. It will be
+	 * eventually cleared by the handler for the escalation interrupt.
+	 *
+	 * Then, when doing a cede, we check that flag again before re-enabling
+	 * the escalation interrupt, and if set, we abort the cede.
+	 */
+	andi.	r0, r0, XIVE_ESB_VAL_P
+	bne-	1f
+
+	/* Now P is 0, we can clear the flag */
+	li	r0, 0
+	stb	r0, VCPU_XIVE_ESC_ON(r4)
+1:
 no_xive:
 #endif /* CONFIG_KVM_XICS */
 
@@ -2756,7 +2791,32 @@ kvm_cede_prodded:
 	/* we've ceded but we want to give control to the host */
 kvm_cede_exit:
 	ld	r9, HSTATE_KVM_VCPU(r13)
-	b	guest_exit_cont
+#ifdef CONFIG_KVM_XICS
+	/* Abort if we still have a pending escalation */
+	lbz	r5, VCPU_XIVE_ESC_ON(r9)
+	cmpwi	r5, 0
+	beq	1f
+	li	r0, 0
+	stb	r0, VCPU_CEDED(r9)
+1:	/* Enable XIVE escalation */
+	li	r5, XIVE_ESB_SET_PQ_00
+	mfmsr	r0
+	andi.	r0, r0, MSR_DR		/* in real mode? */
+	beq	1f
+	ld	r10, VCPU_XIVE_ESC_VADDR(r9)
+	cmpdi	r10, 0
+	beq	3f
+	ldx	r0, r10, r5
+	b	2f
+1:	ld	r10, VCPU_XIVE_ESC_RADDR(r9)
+	cmpdi	r10, 0
+	beq	3f
+	ldcix	r0, r10, r5
+2:	sync
+	li	r0, 1
+	stb	r0, VCPU_XIVE_ESC_ON(r9)
+#endif /* CONFIG_KVM_XICS */
+3:	b	guest_exit_cont
 
 	/* Try to handle a machine check in real mode */
 machine_check_realmode:
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index eef9ccafdc098..7a047bc88f110 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -89,6 +89,17 @@ static irqreturn_t xive_esc_irq(int irq, void *data)
 	if (vcpu->arch.ceded)
 		kvmppc_fast_vcpu_kick(vcpu);
 
+	/* Since we have the no-EOI flag, the interrupt is effectively
+	 * disabled now. Clearing xive_esc_on means we won't bother
+	 * doing so on the next entry.
+	 *
+	 * This also allows the entry code to know that if a PQ combination
+	 * of 10 is observed while xive_esc_on is true, it means the queue
+	 * contains an unprocessed escalation interrupt. We don't make use of
+	 * that knowledge today but might (see comment in book3s_hv_rmhandler.S)
+	 */
+	vcpu->arch.xive_esc_on = false;
+
 	return IRQ_HANDLED;
 }
 
@@ -134,6 +145,25 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
 		goto error;
 	}
 	xc->esc_virq_names[prio] = name;
+
+	/* In single escalation mode, we grab the ESB MMIO of the
+	 * interrupt and mask it. Also populate the VCPU v/raddr
+	 * of the ESB page for use by asm entry/exit code. Finally
+	 * set the XIVE_IRQ_NO_EOI flag which will prevent the
+	 * core code from performing an EOI on the escalation
+	 * interrupt, thus leaving it effectively masked after
+	 * it fires once.
+	 */
+	if (xc->xive->single_escalation) {
+		struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]);
+		struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+		xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01);
+		vcpu->arch.xive_esc_raddr = xd->eoi_page;
+		vcpu->arch.xive_esc_vaddr = (__force u64)xd->eoi_mmio;
+		xd->flags |= XIVE_IRQ_NO_EOI;
+	}
+
 	return 0;
 error:
 	irq_dispose_mapping(xc->esc_virq[prio]);

From ba87977a49913129962af8ac35b0e13e0fa4382d Mon Sep 17 00:00:00 2001
From: Ivan Vecera <ivecera@redhat.com>
Date: Fri, 19 Jan 2018 09:18:54 +0100
Subject: [PATCH 188/676] kernfs: fix regression in kernfs_fop_write caused by
 wrong type

Commit b7ce40cff0b9 ("kernfs: cache atomic_write_len in
kernfs_open_file") changes type of local variable 'len' from ssize_t
to size_t. This change caused that the *ppos value is updated also
when the previous write callback failed.

Mentioned snippet:
...
len = ops->write(...); <- return value can be negative
...
if (len > 0)           <- true here in this case
        *ppos += len;
...

Fixes: b7ce40cff0b9 ("kernfs: cache atomic_write_len in kernfs_open_file")
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/kernfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 9698e51656b10..d8f49c412f508 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -275,7 +275,7 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
 {
 	struct kernfs_open_file *of = kernfs_of(file);
 	const struct kernfs_ops *ops;
-	size_t len;
+	ssize_t len;
 	char *buf;
 
 	if (of->atomic_write_len) {

From cf5eebae2cd28d37581507668605f4d23cd7218d Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Wed, 15 Nov 2017 11:34:58 +0100
Subject: [PATCH 189/676] seq_file: fix incomplete reset on read from zero
 offset

When resetting iterator on a zero offset we need to discard any data
already in the buffer (count), and private state of the iterator (version).

For example this bug results in first line being repeated in /proc/mounts
if doing a zero size read before a non-zero size read.

Reported-by: Rich Felker <dalias@libc.org>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Fixes: e522751d605d ("seq_file: reset iterator to first record for zero offset")
Cc: <stable@vger.kernel.org> # v4.10
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/seq_file.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/seq_file.c b/fs/seq_file.c
index 4be761c1a03d1..eea09f6d88305 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -181,8 +181,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 	 * if request is to read from zero offset, reset iterator to first
 	 * record as it might have been already advanced by previous requests
 	 */
-	if (*ppos == 0)
+	if (*ppos == 0) {
 		m->index = 0;
+		m->version = 0;
+		m->count = 0;
+	}
 
 	/* Don't assume *ppos is where we left it */
 	if (unlikely(*ppos != m->read_pos)) {

From c0b4bd219124fdb7306fb67b571aff4c56a5e8f9 Mon Sep 17 00:00:00 2001
From: Janosch Frank <frankja@linux.vnet.ibm.com>
Date: Wed, 13 Dec 2017 13:53:22 +0100
Subject: [PATCH 190/676] s390/mm: Remove superfluous parameter

It seems it hasn't even been used before the last cleanup and was
overlooked.

Signed-off-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Message-Id: <1513169613-13509-12-git-send-email-frankja@linux.vnet.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/mm/gmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 54cfd51a5a27e..4e9d12af95cc9 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -1998,7 +1998,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_page);
  * Called with sg->parent->shadow_lock.
  */
 static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
-			       unsigned long gaddr, pte_t *pte)
+			       unsigned long gaddr)
 {
 	struct gmap_rmap *rmap, *rnext, *head;
 	unsigned long start, end, bits, raddr;
@@ -2083,7 +2083,7 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
 			spin_lock(&gmap->shadow_lock);
 			list_for_each_entry_safe(sg, next,
 						 &gmap->children, list)
-				gmap_shadow_notify(sg, vmaddr, gaddr, pte);
+				gmap_shadow_notify(sg, vmaddr, gaddr);
 			spin_unlock(&gmap->shadow_lock);
 		}
 		if (bits & PGSTE_IN_BIT)

From 58d6b15e9da5042a99c9c30ad725792e4569150e Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 22 Jan 2018 18:19:06 +0000
Subject: [PATCH 191/676] KVM: arm/arm64: Handle CPU_PM_ENTER_FAILED

cpu_pm_enter() calls the pm notifier chain with CPU_PM_ENTER, then if
there is a failure: CPU_PM_ENTER_FAILED.

When KVM receives CPU_PM_ENTER it calls cpu_hyp_reset() which will
return us to the hyp-stub. If we subsequently get a CPU_PM_ENTER_FAILED,
KVM does nothing, leaving the CPU running with the hyp-stub, at odds
with kvm_arm_hardware_enabled.

Add CPU_PM_ENTER_FAILED as a fallthrough for CPU_PM_EXIT, this reloads
KVM based on kvm_arm_hardware_enabled. This is safe even if CPU_PM_ENTER
never gets as far as KVM, as cpu_hyp_reinit() calls cpu_hyp_reset()
to make sure the hyp-stub is loaded before reloading KVM.

Fixes: 67f691976662 ("arm64: kvm: allows kvm cpu hotplug")
Cc: <stable@vger.kernel.org> # v4.7+
CC: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 virt/kvm/arm/arm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 59d8e04c19fa2..639dca0c05609 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -1262,6 +1262,7 @@ static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
 			cpu_hyp_reset();
 
 		return NOTIFY_OK;
+	case CPU_PM_ENTER_FAILED:
 	case CPU_PM_EXIT:
 		if (__this_cpu_read(kvm_arm_hardware_enabled))
 			/* The hardware was enabled before suspend. */

From b276f1b3b1eb52697f6645bb98f7d32ffb89df69 Mon Sep 17 00:00:00 2001
From: Luis de Bethencourt <luisbg@kernel.org>
Date: Tue, 23 Jan 2018 15:11:14 +0000
Subject: [PATCH 192/676] KVM: arm/arm64: Fix trailing semicolon

The trailing semicolon is an empty statement that does no operation.
Removing it since it doesn't do anything.

Signed-off-by: Luis de Bethencourt <luisbg@kernel.org>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 arch/arm/include/asm/kvm_emulate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index 3d22eb87f919a..9003bd19cb701 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -131,7 +131,7 @@ static inline bool mode_has_spsr(struct kvm_vcpu *vcpu)
 static inline bool vcpu_mode_priv(struct kvm_vcpu *vcpu)
 {
 	unsigned long cpsr_mode = vcpu->arch.ctxt.gp_regs.usr_regs.ARM_cpsr & MODE_MASK;
-	return cpsr_mode > USR_MODE;;
+	return cpsr_mode > USR_MODE;
 }
 
 static inline u32 kvm_vcpu_get_hsr(const struct kvm_vcpu *vcpu)

From 1c130ae00b769a2e2df41bad3d6051ee8234b636 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 19 Jan 2018 14:36:29 +0100
Subject: [PATCH 193/676] iscsi-target: make sure to wake up sleeping login
 worker

Mike Christie reports:
  Starting in 4.14 iscsi logins will fail around 50% of the time.

Problem appears to be that iscsi_target_sk_data_ready() callback may
return without doing anything in case it finds the login work queue
is still blocked in sock_recvmsg().

Nicholas Bellinger says:
  It would indicate users providing their own ->sk_data_ready() callback
  must be responsible for waking up a kthread context blocked on
  sock_recvmsg(..., MSG_WAITALL), when a second ->sk_data_ready() is
  received before the first sock_recvmsg(..., MSG_WAITALL) completes.

So, do this and invoke the original data_ready() callback -- in
case of tcp sockets this takes care of waking the thread.

Disclaimer: I do not understand why this problem did not show up before
tcp prequeue removal.

(Drop WARN_ON usage - nab)

Reported-by: Mike Christie <mchristi@redhat.com>
Bisected-by: Mike Christie <mchristi@redhat.com>
Tested-by: Mike Christie <mchristi@redhat.com>
Diagnosed-by: Nicholas Bellinger <nab@linux-iscsi.org>
Fixes: e7942d0633c4 ("tcp: remove prequeue support")
Signed-off-by: Florian Westphal <fw@strlen.de>
Cc: stable@vger.kernel.org # 4.14+
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/iscsi/iscsi_target_nego.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/target/iscsi/iscsi_target_nego.c b/drivers/target/iscsi/iscsi_target_nego.c
index b686e2ce9c0e5..8a5e8d17a9426 100644
--- a/drivers/target/iscsi/iscsi_target_nego.c
+++ b/drivers/target/iscsi/iscsi_target_nego.c
@@ -432,6 +432,9 @@ static void iscsi_target_sk_data_ready(struct sock *sk)
 	if (test_and_set_bit(LOGIN_FLAGS_READ_ACTIVE, &conn->login_flags)) {
 		write_unlock_bh(&sk->sk_callback_lock);
 		pr_debug("Got LOGIN_FLAGS_READ_ACTIVE=1, conn: %p >>>>\n", conn);
+		if (iscsi_target_sk_data_ready == conn->orig_data_ready)
+			return;
+		conn->orig_data_ready(sk);
 		return;
 	}
 

From b3ecd4aa8632a86428605ab73393d14779019d82 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 16 Jan 2018 18:15:25 +0100
Subject: [PATCH 194/676] KVM: s390: vsie: use READ_ONCE to access some SCB
 fields

Another VCPU might try to modify the SCB while we are creating the
shadow SCB. In general this is no problem - unless the compiler decides
to not load values once, but e.g. twice.

For us, this is only relevant when checking/working with such values.
E.g. the prefix value, the mso, state of transactional execution and
addresses of satellite blocks.

E.g. if we blindly forward values (e.g. general purpose registers or
execution controls after masking), we don't care.

Leaving unpin_blocks() untouched for now, will handle it separately.

The worst thing right now that I can see would be a missed prefix
un/remap (mso, prefix, tx) or using wrong guest addresses. Nothing
critical, but let's try to avoid unpredictable behavior.

Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180116171526.12343-2-david@redhat.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/vsie.c | 50 +++++++++++++++++++++++++++-----------------
 1 file changed, 31 insertions(+), 19 deletions(-)

diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 5d6ae0326d9e8..b2066fbb6341f 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -28,7 +28,11 @@ struct vsie_page {
 	 * the same offset as that in struct sie_page!
 	 */
 	struct mcck_volatile_info mcck_info;    /* 0x0200 */
-	/* the pinned originial scb */
+	/*
+	 * The pinned original scb. Be aware that other VCPUs can modify
+	 * it while we read from it. Values that are used for conditions or
+	 * are reused conditionally, should be accessed via READ_ONCE.
+	 */
 	struct kvm_s390_sie_block *scb_o;	/* 0x0218 */
 	/* the shadow gmap in use by the vsie_page */
 	struct gmap *gmap;			/* 0x0220 */
@@ -140,12 +144,13 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
-	u32 crycb_addr = scb_o->crycbd & 0x7ffffff8U;
+	const uint32_t crycbd_o = READ_ONCE(scb_o->crycbd);
+	const u32 crycb_addr = crycbd_o & 0x7ffffff8U;
 	unsigned long *b1, *b2;
 	u8 ecb3_flags;
 
 	scb_s->crycbd = 0;
-	if (!(scb_o->crycbd & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1))
+	if (!(crycbd_o & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1))
 		return 0;
 	/* format-1 is supported with message-security-assist extension 3 */
 	if (!test_kvm_facility(vcpu->kvm, 76))
@@ -183,12 +188,15 @@ static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+	/* READ_ONCE does not work on bitfields - use a temporary variable */
+	const uint32_t __new_ibc = scb_o->ibc;
+	const uint32_t new_ibc = READ_ONCE(__new_ibc) & 0x0fffU;
 	__u64 min_ibc = (sclp.ibc >> 16) & 0x0fffU;
 
 	scb_s->ibc = 0;
 	/* ibc installed in g2 and requested for g3 */
-	if (vcpu->kvm->arch.model.ibc && (scb_o->ibc & 0x0fffU)) {
-		scb_s->ibc = scb_o->ibc & 0x0fffU;
+	if (vcpu->kvm->arch.model.ibc && new_ibc) {
+		scb_s->ibc = new_ibc;
 		/* takte care of the minimum ibc level of the machine */
 		if (scb_s->ibc < min_ibc)
 			scb_s->ibc = min_ibc;
@@ -253,6 +261,10 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
 	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+	/* READ_ONCE does not work on bitfields - use a temporary variable */
+	const uint32_t __new_prefix = scb_o->prefix;
+	const uint32_t new_prefix = READ_ONCE(__new_prefix);
+	const bool wants_tx = READ_ONCE(scb_o->ecb) & ECB_TE;
 	bool had_tx = scb_s->ecb & ECB_TE;
 	unsigned long new_mso = 0;
 	int rc;
@@ -299,14 +311,14 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	scb_s->icpua = scb_o->icpua;
 
 	if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM))
-		new_mso = scb_o->mso & 0xfffffffffff00000UL;
+		new_mso = READ_ONCE(scb_o->mso) & 0xfffffffffff00000UL;
 	/* if the hva of the prefix changes, we have to remap the prefix */
-	if (scb_s->mso != new_mso || scb_s->prefix != scb_o->prefix)
+	if (scb_s->mso != new_mso || scb_s->prefix != new_prefix)
 		prefix_unmapped(vsie_page);
 	 /* SIE will do mso/msl validity and exception checks for us */
 	scb_s->msl = scb_o->msl & 0xfffffffffff00000UL;
 	scb_s->mso = new_mso;
-	scb_s->prefix = scb_o->prefix;
+	scb_s->prefix = new_prefix;
 
 	/* We have to definetly flush the tlb if this scb never ran */
 	if (scb_s->ihcpu != 0xffffU)
@@ -318,11 +330,11 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
 		scb_s->ecb |= scb_o->ecb & ECB_HOSTPROTINT;
 	/* transactional execution */
-	if (test_kvm_facility(vcpu->kvm, 73)) {
+	if (test_kvm_facility(vcpu->kvm, 73) && wants_tx) {
 		/* remap the prefix is tx is toggled on */
-		if ((scb_o->ecb & ECB_TE) && !had_tx)
+		if (!had_tx)
 			prefix_unmapped(vsie_page);
-		scb_s->ecb |= scb_o->ecb & ECB_TE;
+		scb_s->ecb |= ECB_TE;
 	}
 	/* SIMD */
 	if (test_kvm_facility(vcpu->kvm, 129)) {
@@ -529,9 +541,9 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	gpa_t gpa;
 	int rc = 0;
 
-	gpa = scb_o->scaol & ~0xfUL;
+	gpa = READ_ONCE(scb_o->scaol) & ~0xfUL;
 	if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
-		gpa |= (u64) scb_o->scaoh << 32;
+		gpa |= (u64) READ_ONCE(scb_o->scaoh) << 32;
 	if (gpa) {
 		if (!(gpa & ~0x1fffUL))
 			rc = set_validity_icpt(scb_s, 0x0038U);
@@ -551,7 +563,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		scb_s->scaol = (u32)(u64)hpa;
 	}
 
-	gpa = scb_o->itdba & ~0xffUL;
+	gpa = READ_ONCE(scb_o->itdba) & ~0xffUL;
 	if (gpa && (scb_s->ecb & ECB_TE)) {
 		if (!(gpa & ~0x1fffU)) {
 			rc = set_validity_icpt(scb_s, 0x0080U);
@@ -566,7 +578,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		scb_s->itdba = hpa;
 	}
 
-	gpa = scb_o->gvrd & ~0x1ffUL;
+	gpa = READ_ONCE(scb_o->gvrd) & ~0x1ffUL;
 	if (gpa && (scb_s->eca & ECA_VX) && !(scb_s->ecd & ECD_HOSTREGMGMT)) {
 		if (!(gpa & ~0x1fffUL)) {
 			rc = set_validity_icpt(scb_s, 0x1310U);
@@ -584,7 +596,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		scb_s->gvrd = hpa;
 	}
 
-	gpa = scb_o->riccbd & ~0x3fUL;
+	gpa = READ_ONCE(scb_o->riccbd) & ~0x3fUL;
 	if (gpa && (scb_s->ecb3 & ECB3_RI)) {
 		if (!(gpa & ~0x1fffUL)) {
 			rc = set_validity_icpt(scb_s, 0x0043U);
@@ -602,8 +614,8 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	if ((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) {
 		unsigned long sdnxc;
 
-		gpa = scb_o->sdnxo & ~0xfUL;
-		sdnxc = scb_o->sdnxo & 0xfUL;
+		gpa = READ_ONCE(scb_o->sdnxo) & ~0xfUL;
+		sdnxc = READ_ONCE(scb_o->sdnxo) & 0xfUL;
 		if (!gpa || !(gpa & ~0x1fffUL)) {
 			rc = set_validity_icpt(scb_s, 0x10b0U);
 			goto unpin;
@@ -768,7 +780,7 @@ static void retry_vsie_icpt(struct vsie_page *vsie_page)
 static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
-	__u32 fac = vsie_page->scb_o->fac & 0x7ffffff8U;
+	__u32 fac = READ_ONCE(vsie_page->scb_o->fac) & 0x7ffffff8U;
 
 	if (fac && test_kvm_facility(vcpu->kvm, 7)) {
 		retry_vsie_icpt(vsie_page);

From 15e5020e575d0c1a4eddd99bf7ffdc1f34a3b17d Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 16 Jan 2018 18:15:26 +0100
Subject: [PATCH 195/676] KVM: s390: vsie: store guest addresses of satellite
 blocks in vsie_page

This way, the values cannot change, even if another VCPU might try to
mess with the nested SCB currently getting executed by another VCPU.

We now always use the same gpa for pinning and unpinning a page (for
unpinning, it is only relevant to mark the guest page dirty for
migration).

Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180116171526.12343-3-david@redhat.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/vsie.c | 37 ++++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index b2066fbb6341f..6090f4235cfc0 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -38,7 +38,13 @@ struct vsie_page {
 	struct gmap *gmap;			/* 0x0220 */
 	/* address of the last reported fault to guest2 */
 	unsigned long fault_addr;		/* 0x0228 */
-	__u8 reserved[0x0700 - 0x0230];		/* 0x0230 */
+	/* calculated guest addresses of satellite control blocks */
+	gpa_t sca_gpa;				/* 0x0230 */
+	gpa_t itdba_gpa;			/* 0x0238 */
+	gpa_t gvrd_gpa;				/* 0x0240 */
+	gpa_t riccbd_gpa;			/* 0x0248 */
+	gpa_t sdnx_gpa;				/* 0x0250 */
+	__u8 reserved[0x0700 - 0x0258];		/* 0x0258 */
 	struct kvm_s390_crypto_cb crycb;	/* 0x0700 */
 	__u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE];	/* 0x0800 */
 };
@@ -475,46 +481,42 @@ static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
 /* unpin all blocks previously pinned by pin_blocks(), marking them dirty */
 static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 {
-	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 	hpa_t hpa;
-	gpa_t gpa;
 
 	hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol;
 	if (hpa) {
-		gpa = scb_o->scaol & ~0xfUL;
-		if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
-			gpa |= (u64) scb_o->scaoh << 32;
-		unpin_guest_page(vcpu->kvm, gpa, hpa);
+		unpin_guest_page(vcpu->kvm, vsie_page->sca_gpa, hpa);
+		vsie_page->sca_gpa = 0;
 		scb_s->scaol = 0;
 		scb_s->scaoh = 0;
 	}
 
 	hpa = scb_s->itdba;
 	if (hpa) {
-		gpa = scb_o->itdba & ~0xffUL;
-		unpin_guest_page(vcpu->kvm, gpa, hpa);
+		unpin_guest_page(vcpu->kvm, vsie_page->itdba_gpa, hpa);
+		vsie_page->itdba_gpa = 0;
 		scb_s->itdba = 0;
 	}
 
 	hpa = scb_s->gvrd;
 	if (hpa) {
-		gpa = scb_o->gvrd & ~0x1ffUL;
-		unpin_guest_page(vcpu->kvm, gpa, hpa);
+		unpin_guest_page(vcpu->kvm, vsie_page->gvrd_gpa, hpa);
+		vsie_page->gvrd_gpa = 0;
 		scb_s->gvrd = 0;
 	}
 
 	hpa = scb_s->riccbd;
 	if (hpa) {
-		gpa = scb_o->riccbd & ~0x3fUL;
-		unpin_guest_page(vcpu->kvm, gpa, hpa);
+		unpin_guest_page(vcpu->kvm, vsie_page->riccbd_gpa, hpa);
+		vsie_page->riccbd_gpa = 0;
 		scb_s->riccbd = 0;
 	}
 
 	hpa = scb_s->sdnxo;
 	if (hpa) {
-		gpa = scb_o->sdnxo;
-		unpin_guest_page(vcpu->kvm, gpa, hpa);
+		unpin_guest_page(vcpu->kvm, vsie_page->sdnx_gpa, hpa);
+		vsie_page->sdnx_gpa = 0;
 		scb_s->sdnxo = 0;
 	}
 }
@@ -559,6 +561,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		}
 		if (rc)
 			goto unpin;
+		vsie_page->sca_gpa = gpa;
 		scb_s->scaoh = (u32)((u64)hpa >> 32);
 		scb_s->scaol = (u32)(u64)hpa;
 	}
@@ -575,6 +578,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 			rc = set_validity_icpt(scb_s, 0x0080U);
 			goto unpin;
 		}
+		vsie_page->itdba_gpa = gpa;
 		scb_s->itdba = hpa;
 	}
 
@@ -593,6 +597,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 			rc = set_validity_icpt(scb_s, 0x1310U);
 			goto unpin;
 		}
+		vsie_page->gvrd_gpa = gpa;
 		scb_s->gvrd = hpa;
 	}
 
@@ -609,6 +614,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 			goto unpin;
 		}
 		/* Validity 0x0044 will be checked by SIE */
+		vsie_page->riccbd_gpa = gpa;
 		scb_s->riccbd = hpa;
 	}
 	if ((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) {
@@ -636,6 +642,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 			rc = set_validity_icpt(scb_s, 0x10b0U);
 			goto unpin;
 		}
+		vsie_page->sdnx_gpa = gpa;
 		scb_s->sdnxo = hpa | sdnxc;
 	}
 	return 0;

From 5c528db0df6999cc7bf2151b0f9fb6446109ec54 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 23 Jan 2018 22:26:18 +0100
Subject: [PATCH 196/676] s390x/mm: simplify gmap_protect_rmap()

We never call it with anything but PROT_READ. This is a left over from
an old prototype. For creation of shadow page tables, we always only
have to protect the original table in guest memory from write accesses,
so we can properly invalidate the shadow on writes. Other protections
are not needed.

Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180123212618.32611-1-david@redhat.com>
Reviewed-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/mm/gmap.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 4e9d12af95cc9..2c55a2b9d6c65 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -1021,18 +1021,17 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
 }
 
 /**
- * gmap_protect_rmap - modify access rights to memory and create an rmap
+ * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap
  * @sg: pointer to the shadow guest address space structure
  * @raddr: rmap address in the shadow gmap
  * @paddr: address in the parent guest address space
  * @len: length of the memory area to protect
- * @prot: indicates access rights: none, read-only or read-write
  *
  * Returns 0 if successfully protected and the rmap was created, -ENOMEM
  * if out of memory and -EFAULT if paddr is invalid.
  */
 static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
-			     unsigned long paddr, unsigned long len, int prot)
+			     unsigned long paddr, unsigned long len)
 {
 	struct gmap *parent;
 	struct gmap_rmap *rmap;
@@ -1060,7 +1059,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
 		ptep = gmap_pte_op_walk(parent, paddr, &ptl);
 		if (ptep) {
 			spin_lock(&sg->guest_table_lock);
-			rc = ptep_force_prot(parent->mm, paddr, ptep, prot,
+			rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ,
 					     PGSTE_VSIE_BIT);
 			if (!rc)
 				gmap_insert_rmap(sg, vmaddr, rmap);
@@ -1070,7 +1069,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
 		radix_tree_preload_end();
 		if (rc) {
 			kfree(rmap);
-			rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+			rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ);
 			if (rc)
 				return rc;
 			continue;
@@ -1609,7 +1608,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
 	origin = r2t & _REGION_ENTRY_ORIGIN;
 	offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
 	len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
-	rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
 	spin_lock(&sg->guest_table_lock);
 	if (!rc) {
 		table = gmap_table_walk(sg, saddr, 4);
@@ -1692,7 +1691,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
 	origin = r3t & _REGION_ENTRY_ORIGIN;
 	offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
 	len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
-	rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
 	spin_lock(&sg->guest_table_lock);
 	if (!rc) {
 		table = gmap_table_walk(sg, saddr, 3);
@@ -1776,7 +1775,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
 	origin = sgt & _REGION_ENTRY_ORIGIN;
 	offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
 	len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
-	rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+	rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
 	spin_lock(&sg->guest_table_lock);
 	if (!rc) {
 		table = gmap_table_walk(sg, saddr, 2);
@@ -1895,7 +1894,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
 	/* Make pgt read-only in parent gmap page table (not the pgste) */
 	raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT;
 	origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
-	rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ);
+	rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE);
 	spin_lock(&sg->guest_table_lock);
 	if (!rc) {
 		table = gmap_table_walk(sg, saddr, 1);

From 866c138c32547b690e4cab3cd209c763508a95ab Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Wed, 24 Jan 2018 12:27:01 +0100
Subject: [PATCH 197/676] KVM: s390: diagnoses are instructions as well

Make the diagnose counters also appear as instruction counters.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
---
 arch/s390/kvm/kvm-s390.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index cc5a1b4d74b7d..ce8ea8b94478e 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -118,12 +118,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "instruction_sigp_cpu_reset", VCPU_STAT(instruction_sigp_cpu_reset) },
 	{ "instruction_sigp_init_cpu_reset", VCPU_STAT(instruction_sigp_init_cpu_reset) },
 	{ "instruction_sigp_unknown", VCPU_STAT(instruction_sigp_unknown) },
-	{ "diagnose_10", VCPU_STAT(diagnose_10) },
-	{ "diagnose_44", VCPU_STAT(diagnose_44) },
-	{ "diagnose_9c", VCPU_STAT(diagnose_9c) },
-	{ "diagnose_258", VCPU_STAT(diagnose_258) },
-	{ "diagnose_308", VCPU_STAT(diagnose_308) },
-	{ "diagnose_500", VCPU_STAT(diagnose_500) },
+	{ "instruction_diag_10", VCPU_STAT(diagnose_10) },
+	{ "instruction_diag_44", VCPU_STAT(diagnose_44) },
+	{ "instruction_diag_9c", VCPU_STAT(diagnose_9c) },
+	{ "instruction_diag_258", VCPU_STAT(diagnose_258) },
+	{ "instruction_diag_308", VCPU_STAT(diagnose_308) },
+	{ "instruction_diag_500", VCPU_STAT(diagnose_500) },
 	{ NULL }
 };
 

From a37cb07a30f0a181bc45c6970e486ac2992e9cde Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 23 Jan 2018 13:28:40 +0100
Subject: [PATCH 198/676] KVM: s390: add vcpu stat counters for many
 instruction

The overall instruction counter is larger than the sum of the
single counters. We should try to catch all instruction handlers
to make this match the summary counter.
Let us add sck,tb,sske,iske,rrbe,tb,tpi,tsch,lpsw,pswe....
and remove other unused ones.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
---
 arch/s390/include/asm/kvm_host.h | 21 +++++++++++++++++----
 arch/s390/kvm/diag.c             |  1 +
 arch/s390/kvm/kvm-s390.c         | 21 +++++++++++++++++----
 arch/s390/kvm/priv.c             | 32 ++++++++++++++++++++++++++++++--
 4 files changed, 65 insertions(+), 10 deletions(-)

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index e16a9f2a44ad6..029e8144c6ecc 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -2,7 +2,7 @@
 /*
  * definition for kernel virtual machines on s390
  *
- * Copyright IBM Corp. 2008, 2009
+ * Copyright IBM Corp. 2008, 2018
  *
  *    Author(s): Carsten Otte <cotte@de.ibm.com>
  */
@@ -316,18 +316,30 @@ struct kvm_vcpu_stat {
 	u64 deliver_program_int;
 	u64 deliver_io_int;
 	u64 exit_wait_state;
+	u64 instruction_epsw;
+	u64 instruction_gs;
+	u64 instruction_io_other;
+	u64 instruction_lpsw;
+	u64 instruction_lpswe;
 	u64 instruction_pfmf;
+	u64 instruction_ptff;
+	u64 instruction_sck;
+	u64 instruction_sckpf;
 	u64 instruction_stidp;
 	u64 instruction_spx;
 	u64 instruction_stpx;
 	u64 instruction_stap;
-	u64 instruction_storage_key;
+	u64 instruction_iske;
+	u64 instruction_ri;
+	u64 instruction_rrbe;
+	u64 instruction_sske;
 	u64 instruction_ipte_interlock;
-	u64 instruction_stsch;
-	u64 instruction_chsc;
 	u64 instruction_stsi;
 	u64 instruction_stfl;
+	u64 instruction_tb;
+	u64 instruction_tpi;
 	u64 instruction_tprot;
+	u64 instruction_tsch;
 	u64 instruction_sie;
 	u64 instruction_essa;
 	u64 instruction_sthyi;
@@ -353,6 +365,7 @@ struct kvm_vcpu_stat {
 	u64 diagnose_258;
 	u64 diagnose_308;
 	u64 diagnose_500;
+	u64 diagnose_other;
 };
 
 #define PGM_OPERATION			0x01
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 89aa114a2cbad..45634b3d2e0ae 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -257,6 +257,7 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu)
 	case 0x500:
 		return __diag_virtio_hypercall(vcpu);
 	default:
+		vcpu->stat.diagnose_other++;
 		return -EOPNOTSUPP;
 	}
 }
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ce8ea8b94478e..8a3a1d50ef99d 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2,7 +2,7 @@
 /*
  * hosting IBM Z kernel virtual machines (s390x)
  *
- * Copyright IBM Corp. 2008, 2017
+ * Copyright IBM Corp. 2008, 2018
  *
  *    Author(s): Carsten Otte <cotte@de.ibm.com>
  *               Christian Borntraeger <borntraeger@de.ibm.com>
@@ -87,19 +87,31 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) },
 	{ "deliver_program_interruption", VCPU_STAT(deliver_program_int) },
 	{ "exit_wait_state", VCPU_STAT(exit_wait_state) },
+	{ "instruction_epsw", VCPU_STAT(instruction_epsw) },
+	{ "instruction_gs", VCPU_STAT(instruction_gs) },
+	{ "instruction_io_other", VCPU_STAT(instruction_io_other) },
+	{ "instruction_lpsw", VCPU_STAT(instruction_lpsw) },
+	{ "instruction_lpswe", VCPU_STAT(instruction_lpswe) },
 	{ "instruction_pfmf", VCPU_STAT(instruction_pfmf) },
+	{ "instruction_ptff", VCPU_STAT(instruction_ptff) },
 	{ "instruction_stidp", VCPU_STAT(instruction_stidp) },
+	{ "instruction_sck", VCPU_STAT(instruction_sck) },
+	{ "instruction_sckpf", VCPU_STAT(instruction_sckpf) },
 	{ "instruction_spx", VCPU_STAT(instruction_spx) },
 	{ "instruction_stpx", VCPU_STAT(instruction_stpx) },
 	{ "instruction_stap", VCPU_STAT(instruction_stap) },
-	{ "instruction_storage_key", VCPU_STAT(instruction_storage_key) },
+	{ "instruction_iske", VCPU_STAT(instruction_iske) },
+	{ "instruction_ri", VCPU_STAT(instruction_ri) },
+	{ "instruction_rrbe", VCPU_STAT(instruction_rrbe) },
+	{ "instruction_sske", VCPU_STAT(instruction_sske) },
 	{ "instruction_ipte_interlock", VCPU_STAT(instruction_ipte_interlock) },
-	{ "instruction_stsch", VCPU_STAT(instruction_stsch) },
-	{ "instruction_chsc", VCPU_STAT(instruction_chsc) },
 	{ "instruction_essa", VCPU_STAT(instruction_essa) },
 	{ "instruction_stsi", VCPU_STAT(instruction_stsi) },
 	{ "instruction_stfl", VCPU_STAT(instruction_stfl) },
+	{ "instruction_tb", VCPU_STAT(instruction_tb) },
+	{ "instruction_tpi", VCPU_STAT(instruction_tpi) },
 	{ "instruction_tprot", VCPU_STAT(instruction_tprot) },
+	{ "instruction_tsch", VCPU_STAT(instruction_tsch) },
 	{ "instruction_sthyi", VCPU_STAT(instruction_sthyi) },
 	{ "instruction_sie", VCPU_STAT(instruction_sie) },
 	{ "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
@@ -124,6 +136,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "instruction_diag_258", VCPU_STAT(diagnose_258) },
 	{ "instruction_diag_308", VCPU_STAT(diagnose_308) },
 	{ "instruction_diag_500", VCPU_STAT(diagnose_500) },
+	{ "instruction_diag_other", VCPU_STAT(diagnose_other) },
 	{ NULL }
 };
 
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 572496c688cc0..dea0217922d6f 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -2,7 +2,7 @@
 /*
  * handling privileged instructions
  *
- * Copyright IBM Corp. 2008, 2013
+ * Copyright IBM Corp. 2008, 2018
  *
  *    Author(s): Carsten Otte <cotte@de.ibm.com>
  *               Christian Borntraeger <borntraeger@de.ibm.com>
@@ -34,6 +34,8 @@
 
 static int handle_ri(struct kvm_vcpu *vcpu)
 {
+	vcpu->stat.instruction_ri++;
+
 	if (test_kvm_facility(vcpu->kvm, 64)) {
 		VCPU_EVENT(vcpu, 3, "%s", "ENABLE: RI (lazy)");
 		vcpu->arch.sie_block->ecb3 |= ECB3_RI;
@@ -53,6 +55,8 @@ int kvm_s390_handle_aa(struct kvm_vcpu *vcpu)
 
 static int handle_gs(struct kvm_vcpu *vcpu)
 {
+	vcpu->stat.instruction_gs++;
+
 	if (test_kvm_facility(vcpu->kvm, 133)) {
 		VCPU_EVENT(vcpu, 3, "%s", "ENABLE: GS (lazy)");
 		preempt_disable();
@@ -85,6 +89,8 @@ static int handle_set_clock(struct kvm_vcpu *vcpu)
 	u8 ar;
 	u64 op2, val;
 
+	vcpu->stat.instruction_sck++;
+
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
@@ -222,7 +228,6 @@ static int try_handle_skey(struct kvm_vcpu *vcpu)
 {
 	int rc;
 
-	vcpu->stat.instruction_storage_key++;
 	rc = kvm_s390_skey_check_enable(vcpu);
 	if (rc)
 		return rc;
@@ -242,6 +247,8 @@ static int handle_iske(struct kvm_vcpu *vcpu)
 	int reg1, reg2;
 	int rc;
 
+	vcpu->stat.instruction_iske++;
+
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
@@ -274,6 +281,8 @@ static int handle_rrbe(struct kvm_vcpu *vcpu)
 	int reg1, reg2;
 	int rc;
 
+	vcpu->stat.instruction_rrbe++;
+
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
@@ -312,6 +321,8 @@ static int handle_sske(struct kvm_vcpu *vcpu)
 	int reg1, reg2;
 	int rc;
 
+	vcpu->stat.instruction_sske++;
+
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
@@ -392,6 +403,8 @@ static int handle_test_block(struct kvm_vcpu *vcpu)
 	gpa_t addr;
 	int reg2;
 
+	vcpu->stat.instruction_tb++;
+
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
@@ -424,6 +437,8 @@ static int handle_tpi(struct kvm_vcpu *vcpu)
 	u64 addr;
 	u8 ar;
 
+	vcpu->stat.instruction_tpi++;
+
 	addr = kvm_s390_get_base_disp_s(vcpu, &ar);
 	if (addr & 3)
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
@@ -484,6 +499,8 @@ static int handle_tsch(struct kvm_vcpu *vcpu)
 	struct kvm_s390_interrupt_info *inti = NULL;
 	const u64 isc_mask = 0xffUL << 24; /* all iscs set */
 
+	vcpu->stat.instruction_tsch++;
+
 	/* a valid schid has at least one bit set */
 	if (vcpu->run->s.regs.gprs[1])
 		inti = kvm_s390_get_io_int(vcpu->kvm, isc_mask,
@@ -527,6 +544,7 @@ static int handle_io_inst(struct kvm_vcpu *vcpu)
 		if (vcpu->arch.sie_block->ipa == 0xb235)
 			return handle_tsch(vcpu);
 		/* Handle in userspace. */
+		vcpu->stat.instruction_io_other++;
 		return -EOPNOTSUPP;
 	} else {
 		/*
@@ -592,6 +610,8 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu)
 	int rc;
 	u8 ar;
 
+	vcpu->stat.instruction_lpsw++;
+
 	if (gpsw->mask & PSW_MASK_PSTATE)
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
@@ -619,6 +639,8 @@ static int handle_lpswe(struct kvm_vcpu *vcpu)
 	int rc;
 	u8 ar;
 
+	vcpu->stat.instruction_lpswe++;
+
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
@@ -828,6 +850,8 @@ static int handle_epsw(struct kvm_vcpu *vcpu)
 {
 	int reg1, reg2;
 
+	vcpu->stat.instruction_epsw++;
+
 	kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
 
 	/* This basically extracts the mask half of the psw. */
@@ -1332,6 +1356,8 @@ static int handle_sckpf(struct kvm_vcpu *vcpu)
 {
 	u32 value;
 
+	vcpu->stat.instruction_sckpf++;
+
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
@@ -1347,6 +1373,8 @@ static int handle_sckpf(struct kvm_vcpu *vcpu)
 
 static int handle_ptff(struct kvm_vcpu *vcpu)
 {
+	vcpu->stat.instruction_ptff++;
+
 	/* we don't emulate any control instructions yet */
 	kvm_s390_set_psw_cc(vcpu, 3);
 	return 0;

From 2018224df3174763d4ece26c103c99438e0c42fc Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 23 Jan 2018 18:05:28 +0100
Subject: [PATCH 199/676] KVM: s390: rename __set_cpuflag() to
 kvm_s390_set_cpuflags()

No need to make this function special. Move it to a header right away.

Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180123170531.13687-2-david@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/interrupt.c | 29 ++++++++++++-----------------
 arch/s390/kvm/kvm-s390.h  |  5 +++++
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index f8eb2cfa763ac..96ea3b80b67a4 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -301,17 +301,12 @@ static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
 	}
 }
 
-static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag)
-{
-	atomic_or(flag, &vcpu->arch.sie_block->cpuflags);
-}
-
 static void set_intercept_indicators_io(struct kvm_vcpu *vcpu)
 {
 	if (!(pending_irqs(vcpu) & IRQ_PEND_IO_MASK))
 		return;
 	else if (psw_ioint_disabled(vcpu))
-		__set_cpuflag(vcpu, CPUSTAT_IO_INT);
+		kvm_s390_set_cpuflags(vcpu, CPUSTAT_IO_INT);
 	else
 		vcpu->arch.sie_block->lctl |= LCTL_CR6;
 }
@@ -321,7 +316,7 @@ static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu)
 	if (!(pending_irqs(vcpu) & IRQ_PEND_EXT_MASK))
 		return;
 	if (psw_extint_disabled(vcpu))
-		__set_cpuflag(vcpu, CPUSTAT_EXT_INT);
+		kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT);
 	else
 		vcpu->arch.sie_block->lctl |= LCTL_CR0;
 }
@@ -339,7 +334,7 @@ static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu)
 static void set_intercept_indicators_stop(struct kvm_vcpu *vcpu)
 {
 	if (kvm_s390_is_stop_irq_pending(vcpu))
-		__set_cpuflag(vcpu, CPUSTAT_STOP_INT);
+		kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOP_INT);
 }
 
 /* Set interception request for non-deliverable interrupts */
@@ -1227,7 +1222,7 @@ static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
 
 	li->irq.ext = irq->u.ext;
 	set_bit(IRQ_PEND_PFAULT_INIT, &li->pending_irqs);
-	__set_cpuflag(vcpu, CPUSTAT_EXT_INT);
+	kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT);
 	return 0;
 }
 
@@ -1252,7 +1247,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
 	if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs))
 		return -EBUSY;
 	*extcall = irq->u.extcall;
-	__set_cpuflag(vcpu, CPUSTAT_EXT_INT);
+	kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT);
 	return 0;
 }
 
@@ -1296,7 +1291,7 @@ static int __inject_sigp_stop(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
 	if (test_and_set_bit(IRQ_PEND_SIGP_STOP, &li->pending_irqs))
 		return -EBUSY;
 	stop->flags = irq->u.stop.flags;
-	__set_cpuflag(vcpu, CPUSTAT_STOP_INT);
+	kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOP_INT);
 	return 0;
 }
 
@@ -1328,7 +1323,7 @@ static int __inject_sigp_emergency(struct kvm_vcpu *vcpu,
 
 	set_bit(irq->u.emerg.code, li->sigp_emerg_pending);
 	set_bit(IRQ_PEND_EXT_EMERGENCY, &li->pending_irqs);
-	__set_cpuflag(vcpu, CPUSTAT_EXT_INT);
+	kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT);
 	return 0;
 }
 
@@ -1372,7 +1367,7 @@ static int __inject_ckc(struct kvm_vcpu *vcpu)
 				   0, 0);
 
 	set_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs);
-	__set_cpuflag(vcpu, CPUSTAT_EXT_INT);
+	kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT);
 	return 0;
 }
 
@@ -1385,7 +1380,7 @@ static int __inject_cpu_timer(struct kvm_vcpu *vcpu)
 				   0, 0);
 
 	set_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs);
-	__set_cpuflag(vcpu, CPUSTAT_EXT_INT);
+	kvm_s390_set_cpuflags(vcpu, CPUSTAT_EXT_INT);
 	return 0;
 }
 
@@ -1568,13 +1563,13 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type)
 	/* make the VCPU drop out of the SIE, or wake it up if sleeping */
 	switch (type) {
 	case KVM_S390_MCHK:
-		__set_cpuflag(dst_vcpu, CPUSTAT_STOP_INT);
+		kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_STOP_INT);
 		break;
 	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
-		__set_cpuflag(dst_vcpu, CPUSTAT_IO_INT);
+		kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT);
 		break;
 	default:
-		__set_cpuflag(dst_vcpu, CPUSTAT_EXT_INT);
+		kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_EXT_INT);
 		break;
 	}
 	kvm_s390_vcpu_wakeup(dst_vcpu);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 8877116f01595..a9f2c8d1833dd 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -47,6 +47,11 @@ do { \
 	  d_args); \
 } while (0)
 
+static inline void kvm_s390_set_cpuflags(struct kvm_vcpu *vcpu, u32 flags)
+{
+	atomic_or(flags, &vcpu->arch.sie_block->cpuflags);
+}
+
 static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
 {
 	return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_STOPPED;

From ef8f4f49fcfa56f7399db9886fcbb89f9f92a340 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 23 Jan 2018 18:05:29 +0100
Subject: [PATCH 200/676] KVM: s390: reuse kvm_s390_set_cpuflags()

Use it in all places where we set cpuflags.

Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180123170531.13687-3-david@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/interrupt.c |  4 ++--
 arch/s390/kvm/kvm-s390.c  | 17 ++++++++---------
 arch/s390/kvm/vsie.c      |  2 +-
 3 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 96ea3b80b67a4..404a127b1921a 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -101,7 +101,7 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id)
 		/* another external call is pending */
 		return -EBUSY;
 	}
-	atomic_or(CPUSTAT_ECALL_PEND, &vcpu->arch.sie_block->cpuflags);
+	kvm_s390_set_cpuflags(vcpu, CPUSTAT_ECALL_PEND);
 	return 0;
 }
 
@@ -277,7 +277,7 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
 
 static void __set_cpu_idle(struct kvm_vcpu *vcpu)
 {
-	atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+	kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT);
 	set_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask);
 }
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 8a3a1d50ef99d..9dc996acb52cf 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2329,7 +2329,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 
 	gmap_enable(vcpu->arch.enabled_gmap);
-	atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
+	kvm_s390_set_cpuflags(vcpu, CPUSTAT_RUNNING);
 	if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
 		__start_cpu_timer_accounting(vcpu);
 	vcpu->cpu = cpu;
@@ -2436,9 +2436,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 						    CPUSTAT_STOPPED);
 
 	if (test_kvm_facility(vcpu->kvm, 78))
-		atomic_or(CPUSTAT_GED2, &vcpu->arch.sie_block->cpuflags);
+		kvm_s390_set_cpuflags(vcpu, CPUSTAT_GED2);
 	else if (test_kvm_facility(vcpu->kvm, 8))
-		atomic_or(CPUSTAT_GED, &vcpu->arch.sie_block->cpuflags);
+		kvm_s390_set_cpuflags(vcpu, CPUSTAT_GED);
 
 	kvm_s390_vcpu_setup_model(vcpu);
 
@@ -2475,7 +2475,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb;
 
 	if (sclp.has_kss)
-		atomic_or(CPUSTAT_KSS, &vcpu->arch.sie_block->cpuflags);
+		kvm_s390_set_cpuflags(vcpu, CPUSTAT_KSS);
 	else
 		vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
 
@@ -2578,7 +2578,7 @@ static void kvm_s390_vcpu_request_handled(struct kvm_vcpu *vcpu)
  * return immediately. */
 void exit_sie(struct kvm_vcpu *vcpu)
 {
-	atomic_or(CPUSTAT_STOP_INT, &vcpu->arch.sie_block->cpuflags);
+	kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOP_INT);
 	while (vcpu->arch.sie_block->prog0c & PROG_IN_SIE)
 		cpu_relax();
 }
@@ -2822,7 +2822,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 	if (dbg->control & KVM_GUESTDBG_ENABLE) {
 		vcpu->guest_debug = dbg->control;
 		/* enforce guest PER */
-		atomic_or(CPUSTAT_P, &vcpu->arch.sie_block->cpuflags);
+		kvm_s390_set_cpuflags(vcpu, CPUSTAT_P);
 
 		if (dbg->control & KVM_GUESTDBG_USE_HW_BP)
 			rc = kvm_s390_import_bp_data(vcpu, dbg);
@@ -2911,8 +2911,7 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
 	if (kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu)) {
 		if (!ibs_enabled(vcpu)) {
 			trace_kvm_s390_enable_disable_ibs(vcpu->vcpu_id, 1);
-			atomic_or(CPUSTAT_IBS,
-					&vcpu->arch.sie_block->cpuflags);
+			kvm_s390_set_cpuflags(vcpu, CPUSTAT_IBS);
 		}
 		goto retry;
 	}
@@ -3591,7 +3590,7 @@ void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu)
 	/* SIGP STOP and SIGP STOP AND STORE STATUS has been fully processed */
 	kvm_s390_clear_stop_irq(vcpu);
 
-	atomic_or(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
+	kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOPPED);
 	__disable_ibs_on_vcpu(vcpu);
 
 	for (i = 0; i < online_vcpus; i++) {
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 6090f4235cfc0..87b3416390636 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -913,7 +913,7 @@ static void register_shadow_scb(struct kvm_vcpu *vcpu,
 	 * External calls have to lead to a kick of the vcpu and
 	 * therefore the vsie -> Simulate Wait state.
 	 */
-	atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+	kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT);
 	/*
 	 * We have to adjust the g3 epoch by the g2 epoch. The epoch will
 	 * automatically be adjusted on tod clock changes via kvm_sync_clock.

From 9daecfc66015530ee5d2d84cce5d341f0fffd0ab Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 23 Jan 2018 18:05:30 +0100
Subject: [PATCH 201/676] KVM: s390: introduce and use
 kvm_s390_clear_cpuflags()

Use it just like kvm_s390_set_cpuflags().

Suggested-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180123170531.13687-4-david@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/interrupt.c |  8 ++++----
 arch/s390/kvm/kvm-s390.c  | 11 +++++------
 arch/s390/kvm/kvm-s390.h  |  5 +++++
 arch/s390/kvm/priv.c      |  2 +-
 arch/s390/kvm/vsie.c      |  2 +-
 5 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 404a127b1921a..8687aed9a268f 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -111,7 +111,7 @@ static void sca_clear_ext_call(struct kvm_vcpu *vcpu)
 
 	if (!kvm_s390_use_sca_entries())
 		return;
-	atomic_andnot(CPUSTAT_ECALL_PEND, &vcpu->arch.sie_block->cpuflags);
+	kvm_s390_clear_cpuflags(vcpu, CPUSTAT_ECALL_PEND);
 	read_lock(&vcpu->kvm->arch.sca_lock);
 	if (vcpu->kvm->arch.use_esca) {
 		struct esca_block *sca = vcpu->kvm->arch.sca;
@@ -283,14 +283,14 @@ static void __set_cpu_idle(struct kvm_vcpu *vcpu)
 
 static void __unset_cpu_idle(struct kvm_vcpu *vcpu)
 {
-	atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+	kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT);
 	clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.float_int.idle_mask);
 }
 
 static void __reset_intercept_indicators(struct kvm_vcpu *vcpu)
 {
-	atomic_andnot(CPUSTAT_IO_INT | CPUSTAT_EXT_INT | CPUSTAT_STOP_INT,
-		    &vcpu->arch.sie_block->cpuflags);
+	kvm_s390_clear_cpuflags(vcpu, CPUSTAT_IO_INT | CPUSTAT_EXT_INT |
+				      CPUSTAT_STOP_INT);
 	vcpu->arch.sie_block->lctl = 0x0000;
 	vcpu->arch.sie_block->ictl &= ~(ICTL_LPSW | ICTL_STCTL | ICTL_PINT);
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 9dc996acb52cf..3eb97c4212cf6 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2340,7 +2340,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	vcpu->cpu = -1;
 	if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
 		__stop_cpu_timer_accounting(vcpu);
-	atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
+	kvm_s390_clear_cpuflags(vcpu, CPUSTAT_RUNNING);
 	vcpu->arch.enabled_gmap = gmap_get_enabled();
 	gmap_disable(vcpu->arch.enabled_gmap);
 
@@ -2827,14 +2827,14 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 		if (dbg->control & KVM_GUESTDBG_USE_HW_BP)
 			rc = kvm_s390_import_bp_data(vcpu, dbg);
 	} else {
-		atomic_andnot(CPUSTAT_P, &vcpu->arch.sie_block->cpuflags);
+		kvm_s390_clear_cpuflags(vcpu, CPUSTAT_P);
 		vcpu->arch.guestdbg.last_bp = 0;
 	}
 
 	if (rc) {
 		vcpu->guest_debug = 0;
 		kvm_s390_clear_bp_data(vcpu);
-		atomic_andnot(CPUSTAT_P, &vcpu->arch.sie_block->cpuflags);
+		kvm_s390_clear_cpuflags(vcpu, CPUSTAT_P);
 	}
 
 	return rc;
@@ -2919,8 +2919,7 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
 	if (kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu)) {
 		if (ibs_enabled(vcpu)) {
 			trace_kvm_s390_enable_disable_ibs(vcpu->vcpu_id, 0);
-			atomic_andnot(CPUSTAT_IBS,
-					  &vcpu->arch.sie_block->cpuflags);
+			kvm_s390_clear_cpuflags(vcpu, CPUSTAT_IBS);
 		}
 		goto retry;
 	}
@@ -3564,7 +3563,7 @@ void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
 		__disable_ibs_on_all_vcpus(vcpu->kvm);
 	}
 
-	atomic_andnot(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
+	kvm_s390_clear_cpuflags(vcpu, CPUSTAT_STOPPED);
 	/*
 	 * Another VCPU might have used IBS while we were offline.
 	 * Let's play safe and flush the VCPU at startup.
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index a9f2c8d1833dd..f727313a706f9 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -52,6 +52,11 @@ static inline void kvm_s390_set_cpuflags(struct kvm_vcpu *vcpu, u32 flags)
 	atomic_or(flags, &vcpu->arch.sie_block->cpuflags);
 }
 
+static inline void kvm_s390_clear_cpuflags(struct kvm_vcpu *vcpu, u32 flags)
+{
+	atomic_andnot(flags, &vcpu->arch.sie_block->cpuflags);
+}
+
 static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
 {
 	return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_STOPPED;
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index dea0217922d6f..8205223f73322 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -216,7 +216,7 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu)
 	VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc);
 	if (!rc) {
 		if (atomic_read(&sie_block->cpuflags) & CPUSTAT_KSS)
-			atomic_andnot(CPUSTAT_KSS, &sie_block->cpuflags);
+			kvm_s390_clear_cpuflags(vcpu, CPUSTAT_KSS);
 		else
 			sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE |
 					     ICTL_RRBE);
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 87b3416390636..6d494ed5907ee 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -935,7 +935,7 @@ static void register_shadow_scb(struct kvm_vcpu *vcpu,
  */
 static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
 {
-	atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+	kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT);
 	WRITE_ONCE(vcpu->arch.vsie_block, NULL);
 }
 

From 8d5fb0dc4ec069ea02395593e9b6b2b39a92457e Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 23 Jan 2018 18:05:31 +0100
Subject: [PATCH 202/676] KVM: s390: introduce and use kvm_s390_test_cpuflags()

Use it just like kvm_s390_set_cpuflags() and kvm_s390_clear_cpuflags().

Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20180123170531.13687-5-david@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/interrupt.c |  2 +-
 arch/s390/kvm/kvm-s390.c  |  2 +-
 arch/s390/kvm/kvm-s390.h  |  7 ++++++-
 arch/s390/kvm/priv.c      |  4 ++--
 arch/s390/kvm/sigp.c      | 14 +++++---------
 5 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 8687aed9a268f..4b483b48436a9 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -36,7 +36,7 @@ static int sca_ext_call_pending(struct kvm_vcpu *vcpu, int *src_id)
 {
 	int c, scn;
 
-	if (!(atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_ECALL_PEND))
+	if (!kvm_s390_test_cpuflags(vcpu, CPUSTAT_ECALL_PEND))
 		return 0;
 
 	BUG_ON(!kvm_s390_use_sca_entries());
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 3eb97c4212cf6..3dd209de2e0ca 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2875,7 +2875,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 
 static bool ibs_enabled(struct kvm_vcpu *vcpu)
 {
-	return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_IBS;
+	return kvm_s390_test_cpuflags(vcpu, CPUSTAT_IBS);
 }
 
 static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index f727313a706f9..f110fa96807e8 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -57,9 +57,14 @@ static inline void kvm_s390_clear_cpuflags(struct kvm_vcpu *vcpu, u32 flags)
 	atomic_andnot(flags, &vcpu->arch.sie_block->cpuflags);
 }
 
+static inline bool kvm_s390_test_cpuflags(struct kvm_vcpu *vcpu, u32 flags)
+{
+	return (atomic_read(&vcpu->arch.sie_block->cpuflags) & flags) == flags;
+}
+
 static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
 {
-	return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_STOPPED;
+	return kvm_s390_test_cpuflags(vcpu, CPUSTAT_STOPPED);
 }
 
 static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 8205223f73322..125a7ff98e2ad 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -209,13 +209,13 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu)
 
 	trace_kvm_s390_skey_related_inst(vcpu);
 	if (!(sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)) &&
-	    !(atomic_read(&sie_block->cpuflags) & CPUSTAT_KSS))
+	    !kvm_s390_test_cpuflags(vcpu, CPUSTAT_KSS))
 		return rc;
 
 	rc = s390_enable_skey();
 	VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc);
 	if (!rc) {
-		if (atomic_read(&sie_block->cpuflags) & CPUSTAT_KSS)
+		if (kvm_s390_test_cpuflags(vcpu, CPUSTAT_KSS))
 			kvm_s390_clear_cpuflags(vcpu, CPUSTAT_KSS);
 		else
 			sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE |
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 5cafd1e2651bc..683036c1c92a8 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -20,19 +20,18 @@
 static int __sigp_sense(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu,
 			u64 *reg)
 {
-	int cpuflags;
+	const bool stopped = kvm_s390_test_cpuflags(dst_vcpu, CPUSTAT_STOPPED);
 	int rc;
 	int ext_call_pending;
 
-	cpuflags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags);
 	ext_call_pending = kvm_s390_ext_call_pending(dst_vcpu);
-	if (!(cpuflags & CPUSTAT_STOPPED) && !ext_call_pending)
+	if (!stopped && !ext_call_pending)
 		rc = SIGP_CC_ORDER_CODE_ACCEPTED;
 	else {
 		*reg &= 0xffffffff00000000UL;
 		if (ext_call_pending)
 			*reg |= SIGP_STATUS_EXT_CALL_PENDING;
-		if (cpuflags & CPUSTAT_STOPPED)
+		if (stopped)
 			*reg |= SIGP_STATUS_STOPPED;
 		rc = SIGP_CC_STATUS_STORED;
 	}
@@ -205,11 +204,9 @@ static int __sigp_store_status_at_addr(struct kvm_vcpu *vcpu,
 				       struct kvm_vcpu *dst_vcpu,
 				       u32 addr, u64 *reg)
 {
-	int flags;
 	int rc;
 
-	flags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags);
-	if (!(flags & CPUSTAT_STOPPED)) {
+	if (!kvm_s390_test_cpuflags(dst_vcpu, CPUSTAT_STOPPED)) {
 		*reg &= 0xffffffff00000000UL;
 		*reg |= SIGP_STATUS_INCORRECT_STATE;
 		return SIGP_CC_STATUS_STORED;
@@ -236,8 +233,7 @@ static int __sigp_sense_running(struct kvm_vcpu *vcpu,
 		return SIGP_CC_STATUS_STORED;
 	}
 
-	if (atomic_read(&dst_vcpu->arch.sie_block->cpuflags) &
-	    CPUSTAT_RUNNING) {
+	if (kvm_s390_test_cpuflags(dst_vcpu, CPUSTAT_RUNNING)) {
 		/* running */
 		rc = SIGP_CC_ORDER_CODE_ACCEPTED;
 	} else {

From c7901a6ebee4b624971361bbd93f21ab0b359786 Mon Sep 17 00:00:00 2001
From: Michael Mueller <mimu@linux.vnet.ibm.com>
Date: Thu, 29 Jun 2017 18:39:27 +0200
Subject: [PATCH 203/676] KVM: s390: reverse bit ordering of irqs in pending
 mask

This patch prepares a simplification of bit operations between the irq
pending mask for emulated interrupts and the Interruption Pending Mask
(IPM) which is part of the Guest Interruption State Area (GISA), a feature
that allows interrupt delivery to guests by means of the SIE instruction.

Without that change, a bit-wise *or* operation on parts of these two masks
would either require a look-up table of size 256 bytes to map the IPM
to the emulated irq pending mask bit orientation (all bits mirrored at half
byte) or a sequence of up to 8 condidional branches to perform tests of
single bit positions. Both options are to be rejected either by performance
or space utilization reasons.

Beyond that this change will be transparent.

Signed-off-by: Michael Mueller <mimu@linux.vnet.ibm.com>
Reviewed-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/kvm_host.h | 54 ++++++++++++++++----------------
 arch/s390/kvm/interrupt.c        | 12 +++----
 2 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 029e8144c6ecc..d2351d63afa3b 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -422,35 +422,35 @@ struct kvm_vcpu_stat {
 #define PGM_PER				0x80
 #define PGM_CRYPTO_OPERATION		0x119
 
-/* irq types in order of priority */
+/* irq types in ascend order of priorities */
 enum irq_types {
-	IRQ_PEND_MCHK_EX = 0,
-	IRQ_PEND_SVC,
-	IRQ_PEND_PROG,
-	IRQ_PEND_MCHK_REP,
-	IRQ_PEND_EXT_IRQ_KEY,
-	IRQ_PEND_EXT_MALFUNC,
-	IRQ_PEND_EXT_EMERGENCY,
-	IRQ_PEND_EXT_EXTERNAL,
-	IRQ_PEND_EXT_CLOCK_COMP,
-	IRQ_PEND_EXT_CPU_TIMER,
-	IRQ_PEND_EXT_TIMING,
-	IRQ_PEND_EXT_SERVICE,
-	IRQ_PEND_EXT_HOST,
-	IRQ_PEND_PFAULT_INIT,
-	IRQ_PEND_PFAULT_DONE,
-	IRQ_PEND_VIRTIO,
-	IRQ_PEND_IO_ISC_0,
-	IRQ_PEND_IO_ISC_1,
-	IRQ_PEND_IO_ISC_2,
-	IRQ_PEND_IO_ISC_3,
-	IRQ_PEND_IO_ISC_4,
-	IRQ_PEND_IO_ISC_5,
-	IRQ_PEND_IO_ISC_6,
-	IRQ_PEND_IO_ISC_7,
-	IRQ_PEND_SIGP_STOP,
+	IRQ_PEND_SET_PREFIX = 0,
 	IRQ_PEND_RESTART,
-	IRQ_PEND_SET_PREFIX,
+	IRQ_PEND_SIGP_STOP,
+	IRQ_PEND_IO_ISC_7,
+	IRQ_PEND_IO_ISC_6,
+	IRQ_PEND_IO_ISC_5,
+	IRQ_PEND_IO_ISC_4,
+	IRQ_PEND_IO_ISC_3,
+	IRQ_PEND_IO_ISC_2,
+	IRQ_PEND_IO_ISC_1,
+	IRQ_PEND_IO_ISC_0,
+	IRQ_PEND_VIRTIO,
+	IRQ_PEND_PFAULT_DONE,
+	IRQ_PEND_PFAULT_INIT,
+	IRQ_PEND_EXT_HOST,
+	IRQ_PEND_EXT_SERVICE,
+	IRQ_PEND_EXT_TIMING,
+	IRQ_PEND_EXT_CPU_TIMER,
+	IRQ_PEND_EXT_CLOCK_COMP,
+	IRQ_PEND_EXT_EXTERNAL,
+	IRQ_PEND_EXT_EMERGENCY,
+	IRQ_PEND_EXT_MALFUNC,
+	IRQ_PEND_EXT_IRQ_KEY,
+	IRQ_PEND_MCHK_REP,
+	IRQ_PEND_PROG,
+	IRQ_PEND_SVC,
+	IRQ_PEND_MCHK_EX,
 	IRQ_PEND_COUNT
 };
 
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 4b483b48436a9..b731d4fede837 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -189,8 +189,8 @@ static int cpu_timer_irq_pending(struct kvm_vcpu *vcpu)
 
 static inline int is_ioirq(unsigned long irq_type)
 {
-	return ((irq_type >= IRQ_PEND_IO_ISC_0) &&
-		(irq_type <= IRQ_PEND_IO_ISC_7));
+	return ((irq_type >= IRQ_PEND_IO_ISC_7) &&
+		(irq_type <= IRQ_PEND_IO_ISC_0));
 }
 
 static uint64_t isc_to_isc_bits(int isc)
@@ -211,12 +211,12 @@ static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu)
 
 static inline int isc_to_irq_type(unsigned long isc)
 {
-	return IRQ_PEND_IO_ISC_0 + isc;
+	return IRQ_PEND_IO_ISC_0 - isc;
 }
 
 static inline int irq_type_to_isc(unsigned long irq_type)
 {
-	return irq_type - IRQ_PEND_IO_ISC_0;
+	return IRQ_PEND_IO_ISC_0 - irq_type;
 }
 
 static unsigned long disable_iscs(struct kvm_vcpu *vcpu,
@@ -1149,8 +1149,8 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
 		set_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs);
 
 	while ((irqs = deliverable_irqs(vcpu)) && !rc) {
-		/* bits are in the order of interrupt priority */
-		irq_type = find_first_bit(&irqs, IRQ_PEND_COUNT);
+		/* bits are in the reverse order of interrupt priority */
+		irq_type = find_last_bit(&irqs, IRQ_PEND_COUNT);
 		if (is_ioirq(irq_type)) {
 			rc = __deliver_io(vcpu, irq_type);
 		} else {

From 19114beb73f774e466d9e39b8e8b961812c9f881 Mon Sep 17 00:00:00 2001
From: Michael Mueller <mimu@linux.vnet.ibm.com>
Date: Tue, 30 May 2017 14:26:02 +0200
Subject: [PATCH 204/676] KVM: s390: define GISA format-0 data structure

In preperation to support pass-through adapter interrupts, the Guest
Interruption State Area (GISA) and the Adapter Interruption Virtualization
(AIV) features will be introduced here.

This patch introduces format-0 GISA (that is defines the struct describing
the GISA, allocates storage for it, and introduces fields for the
GISA address in kvm_s390_sie_block and kvm_s390_vsie).

As the GISA requires storage below 2GB, it is put in sie_page2, which is
already allocated in ZONE_DMA. In addition, The GISA requires alignment to
its integral boundary. This is already naturally aligned via the
padding in the sie_page2.

Signed-off-by: Michael Mueller <mimu@linux.vnet.ibm.com>
Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Reviewed-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/kvm_host.h | 23 +++++++++++++++++++----
 arch/s390/kvm/kvm-s390.c         |  1 +
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index d2351d63afa3b..4f89b2783512e 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -183,6 +183,7 @@ struct kvm_s390_sie_block {
 #define ECA_IB		0x40000000
 #define ECA_SIGPI	0x10000000
 #define ECA_MVPGI	0x01000000
+#define ECA_AIV		0x00200000
 #define ECA_VX		0x00020000
 #define ECA_PROTEXCI	0x00002000
 #define ECA_SII		0x00000001
@@ -227,7 +228,8 @@ struct kvm_s390_sie_block {
 	__u8    epdx;			/* 0x0069 */
 	__u8    reserved6a[2];		/* 0x006a */
 	__u32	todpr;			/* 0x006c */
-	__u8	reserved70[16];		/* 0x0070 */
+	__u32	gd;			/* 0x0070 */
+	__u8	reserved74[12];		/* 0x0074 */
 	__u64	mso;			/* 0x0080 */
 	__u64	msl;			/* 0x0088 */
 	psw_t	gpsw;			/* 0x0090 */
@@ -716,14 +718,27 @@ struct kvm_s390_crypto_cb {
 	struct kvm_s390_apcb1 apcb1;		/* 0x0080 */
 };
 
+struct kvm_s390_gisa {
+	u32 next_alert;
+	u8 ipm;
+	u8 reserved01;
+	u8 : 6;
+	u8 g : 1;
+	u8 c : 1;
+	u8 iam;
+	u8 reserved02[4];
+	u32 airq_count;
+};
+
 /*
- * sie_page2 has to be allocated as DMA because fac_list and crycb need
- * 31bit addresses in the sie control block.
+ * sie_page2 has to be allocated as DMA because fac_list, crycb and
+ * gisa need 31bit addresses in the sie control block.
  */
 struct sie_page2 {
 	__u64 fac_list[S390_ARCH_FAC_LIST_SIZE_U64];	/* 0x0000 */
 	struct kvm_s390_crypto_cb crycb;		/* 0x0800 */
-	u8 reserved900[0x1000 - 0x900];			/* 0x0900 */
+	struct kvm_s390_gisa gisa;			/* 0x0900 */
+	u8 reserved910[0x1000 - 0x910];			/* 0x0910 */
 };
 
 struct kvm_s390_vsie {
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 3dd209de2e0ca..2fbdb16010894 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1928,6 +1928,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	if (!kvm->arch.dbf)
 		goto out_err;
 
+	BUILD_BUG_ON(sizeof(struct sie_page2) != 4096);
 	kvm->arch.sie_page2 =
 	     (struct sie_page2 *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
 	if (!kvm->arch.sie_page2)

From f3ec471a9832f8365836d70db4d52db9cc4dd6b1 Mon Sep 17 00:00:00 2001
From: Jens Freimann <jfrei@linux.vnet.ibm.com>
Date: Thu, 16 Mar 2017 17:45:40 +0100
Subject: [PATCH 205/676] s390/bitops: add test_and_clear_bit_inv()

This patch adds a MSB0 bit numbering version of test_and_clear_bit().

Signed-off-by: Jens Freimann <jfrei@linux.vnet.ibm.com>
Signed-off-by: Michael Mueller <mimu@linux.vnet.ibm.com>
Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Reviewed-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/bitops.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h
index 31e400c1a1f35..86e5b2fdee3c8 100644
--- a/arch/s390/include/asm/bitops.h
+++ b/arch/s390/include/asm/bitops.h
@@ -261,6 +261,11 @@ static inline void clear_bit_inv(unsigned long nr, volatile unsigned long *ptr)
 	return clear_bit(nr ^ (BITS_PER_LONG - 1), ptr);
 }
 
+static inline int test_and_clear_bit_inv(unsigned long nr, volatile unsigned long *ptr)
+{
+	return test_and_clear_bit(nr ^ (BITS_PER_LONG - 1), ptr);
+}
+
 static inline void __set_bit_inv(unsigned long nr, volatile unsigned long *ptr)
 {
 	return __set_bit(nr ^ (BITS_PER_LONG - 1), ptr);

From d77e64141e322a3202de71a4afa7956c98d2a302 Mon Sep 17 00:00:00 2001
From: Michael Mueller <mimu@linux.vnet.ibm.com>
Date: Mon, 12 Jun 2017 12:37:57 +0200
Subject: [PATCH 206/676] KVM: s390: implement GISA IPM related primitives

The patch implements routines to access the GISA to test and modify
its Interruption Pending Mask (IPM) from the host side.

Signed-off-by: Michael Mueller <mimu@linux.vnet.ibm.com>
Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Reviewed-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/interrupt.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index b731d4fede837..8985ce51f687a 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -203,6 +203,34 @@ static inline u8 int_word_to_isc(u32 int_word)
 	return (int_word & 0x38000000) >> 27;
 }
 
+/*
+ * To use atomic bitmap functions, we have to provide a bitmap address
+ * that is u64 aligned. However, the ipm might be u32 aligned.
+ * Therefore, we logically start the bitmap at the very beginning of the
+ * struct and fixup the bit number.
+ */
+#define IPM_BIT_OFFSET (offsetof(struct kvm_s390_gisa, ipm) * BITS_PER_BYTE)
+
+static inline void kvm_s390_gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
+{
+	set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa);
+}
+
+static inline u8 kvm_s390_gisa_get_ipm(struct kvm_s390_gisa *gisa)
+{
+	return READ_ONCE(gisa->ipm);
+}
+
+static inline void kvm_s390_gisa_clear_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
+{
+	clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa);
+}
+
+static inline int kvm_s390_gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc)
+{
+	return test_and_clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa);
+}
+
 static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu)
 {
 	return vcpu->kvm->arch.float_int.pending_irqs |

From 72b523a30d9b9c7f749c3209e13de983aeb919c1 Mon Sep 17 00:00:00 2001
From: Michael Mueller <mimu@linux.vnet.ibm.com>
Date: Mon, 10 Jul 2017 11:03:42 +0200
Subject: [PATCH 207/676] s390/css: indicate the availability of the AIV
 facility

The patch adds an indication for the presence Adapter Interruption
Virtualization facility (AIV) of the general channel subsystem
characteristics.

Signed-off-by: Michael Mueller <mimu@linux.vnet.ibm.com>
Reviewed-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Cornelia Huck <cohuck@redhat.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
[change wording]
---
 arch/s390/include/asm/css_chars.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/css_chars.h b/arch/s390/include/asm/css_chars.h
index a478eb61aaf7f..fb56fa3283a2c 100644
--- a/arch/s390/include/asm/css_chars.h
+++ b/arch/s390/include/asm/css_chars.h
@@ -20,7 +20,9 @@ struct css_general_char {
 	u32 aif_tdd : 1; /* bit 56 */
 	u32 : 1;
 	u32 qebsm : 1;	 /* bit 58 */
-	u32 : 8;
+	u32 : 2;
+	u32 aiv : 1;     /* bit 61 */
+	u32 : 5;
 	u32 aif_osa : 1; /* bit 67 */
 	u32 : 12;
 	u32 eadm_rf : 1; /* bit 80 */

From d7c5cb0105ddeff56694f4c6222ee7221824bad3 Mon Sep 17 00:00:00 2001
From: Michael Mueller <mimu@linux.vnet.ibm.com>
Date: Mon, 12 Jun 2017 14:15:19 +0200
Subject: [PATCH 208/676] KVM: s390: exploit GISA and AIV for emulated
 interrupts

The adapter interruption virtualization (AIV) facility is an
optional facility that comes with functionality expected to increase
the performance of adapter interrupt handling for both emulated and
passed-through adapter interrupts. With AIV, adapter interrupts can be
delivered to the guest without exiting SIE.

This patch provides some preparations for using AIV for emulated adapter
interrupts (including virtio) if it's available. When using AIV, the
interrupts are delivered at the so called GISA by setting the bit
corresponding to its Interruption Subclass (ISC) in the Interruption
Pending Mask (IPM) instead of inserting a node into the floating interrupt
list.

To keep the change reasonably small, the handling of this new state is
deferred in get_all_floating_irqs and handle_tpi. This patch concentrates
on the code handling enqueuement of emulated adapter interrupts, and their
delivery to the guest.

Note that care is still required for adapter interrupts using AIV,
because there is no guarantee that AIV is going to deliver the adapter
interrupts pending at the GISA (consider all vcpus idle). When delivering
GISA adapter interrupts by the host (usual mechanism) special attention
is required to honor interrupt priorities.

Empirical results show that the time window between making an interrupt
pending at the GISA and doing kvm_s390_deliver_pending_interrupts is
sufficient for a guest with at least moderate cpu activity to get adapter
interrupts delivered within the SIE, and potentially save some SIE exits
(if not other deliverable interrupts).

The code will be activated with a follow-up patch.

Signed-off-by: Michael Mueller <mimu@linux.vnet.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/kvm_host.h |  1 +
 arch/s390/kvm/interrupt.c        | 93 +++++++++++++++++++++++++-------
 arch/s390/kvm/kvm-s390.c         |  8 +++
 arch/s390/kvm/kvm-s390.h         |  3 ++
 4 files changed, 87 insertions(+), 18 deletions(-)

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 4f89b2783512e..76d6f0ef3dbdc 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -785,6 +785,7 @@ struct kvm_arch{
 	struct kvm_s390_migration_state *migration_state;
 	/* subset of available cpu features enabled by user space */
 	DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+	struct kvm_s390_gisa *gisa;
 };
 
 #define KVM_HVA_ERR_BAD		(-1UL)
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 8985ce51f687a..f293c956e6dbb 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -234,7 +234,8 @@ static inline int kvm_s390_gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gis
 static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu)
 {
 	return vcpu->kvm->arch.float_int.pending_irqs |
-	       vcpu->arch.local_int.pending_irqs;
+		vcpu->arch.local_int.pending_irqs |
+		kvm_s390_gisa_get_ipm(vcpu->kvm->arch.gisa) << IRQ_PEND_IO_ISC_7;
 }
 
 static inline int isc_to_irq_type(unsigned long isc)
@@ -919,18 +920,38 @@ static int __must_check __deliver_virtio(struct kvm_vcpu *vcpu)
 	return rc ? -EFAULT : 0;
 }
 
+static int __do_deliver_io(struct kvm_vcpu *vcpu, struct kvm_s390_io_info *io)
+{
+	int rc;
+
+	rc  = put_guest_lc(vcpu, io->subchannel_id, (u16 *)__LC_SUBCHANNEL_ID);
+	rc |= put_guest_lc(vcpu, io->subchannel_nr, (u16 *)__LC_SUBCHANNEL_NR);
+	rc |= put_guest_lc(vcpu, io->io_int_parm, (u32 *)__LC_IO_INT_PARM);
+	rc |= put_guest_lc(vcpu, io->io_int_word, (u32 *)__LC_IO_INT_WORD);
+	rc |= write_guest_lc(vcpu, __LC_IO_OLD_PSW,
+			     &vcpu->arch.sie_block->gpsw,
+			     sizeof(psw_t));
+	rc |= read_guest_lc(vcpu, __LC_IO_NEW_PSW,
+			    &vcpu->arch.sie_block->gpsw,
+			    sizeof(psw_t));
+	return rc ? -EFAULT : 0;
+}
+
 static int __must_check __deliver_io(struct kvm_vcpu *vcpu,
 				     unsigned long irq_type)
 {
 	struct list_head *isc_list;
 	struct kvm_s390_float_interrupt *fi;
 	struct kvm_s390_interrupt_info *inti = NULL;
+	struct kvm_s390_io_info io;
+	u32 isc;
 	int rc = 0;
 
 	fi = &vcpu->kvm->arch.float_int;
 
 	spin_lock(&fi->lock);
-	isc_list = &fi->lists[irq_type_to_isc(irq_type)];
+	isc = irq_type_to_isc(irq_type);
+	isc_list = &fi->lists[isc];
 	inti = list_first_entry_or_null(isc_list,
 					struct kvm_s390_interrupt_info,
 					list);
@@ -958,24 +979,31 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu,
 	spin_unlock(&fi->lock);
 
 	if (inti) {
-		rc  = put_guest_lc(vcpu, inti->io.subchannel_id,
-				(u16 *)__LC_SUBCHANNEL_ID);
-		rc |= put_guest_lc(vcpu, inti->io.subchannel_nr,
-				(u16 *)__LC_SUBCHANNEL_NR);
-		rc |= put_guest_lc(vcpu, inti->io.io_int_parm,
-				(u32 *)__LC_IO_INT_PARM);
-		rc |= put_guest_lc(vcpu, inti->io.io_int_word,
-				(u32 *)__LC_IO_INT_WORD);
-		rc |= write_guest_lc(vcpu, __LC_IO_OLD_PSW,
-				&vcpu->arch.sie_block->gpsw,
-				sizeof(psw_t));
-		rc |= read_guest_lc(vcpu, __LC_IO_NEW_PSW,
-				&vcpu->arch.sie_block->gpsw,
-				sizeof(psw_t));
+		rc = __do_deliver_io(vcpu, &(inti->io));
 		kfree(inti);
+		goto out;
 	}
 
-	return rc ? -EFAULT : 0;
+	if (vcpu->kvm->arch.gisa &&
+	    kvm_s390_gisa_tac_ipm_gisc(vcpu->kvm->arch.gisa, isc)) {
+		/*
+		 * in case an adapter interrupt was not delivered
+		 * in SIE context KVM will handle the delivery
+		 */
+		VCPU_EVENT(vcpu, 4, "%s isc %u", "deliver: I/O (AI/gisa)", isc);
+		memset(&io, 0, sizeof(io));
+		io.io_int_word = (isc << 27) | 0x80000000;
+		vcpu->stat.deliver_io_int++;
+		trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
+			KVM_S390_INT_IO(1, 0, 0, 0),
+			((__u32)io.subchannel_id << 16) |
+			io.subchannel_nr,
+			((__u64)io.io_int_parm << 32) |
+			io.io_int_word);
+		rc = __do_deliver_io(vcpu, &io);
+	}
+out:
+	return rc;
 }
 
 typedef int (*deliver_irq_t)(struct kvm_vcpu *vcpu);
@@ -1539,6 +1567,15 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
 	struct list_head *list;
 	int isc;
 
+	isc = int_word_to_isc(inti->io.io_int_word);
+
+	if (kvm->arch.gisa && inti->type & KVM_S390_INT_IO_AI_MASK) {
+		VM_EVENT(kvm, 4, "%s isc %1u", "inject: I/O (AI/gisa)", isc);
+		kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc);
+		kfree(inti);
+		return 0;
+	}
+
 	fi = &kvm->arch.float_int;
 	spin_lock(&fi->lock);
 	if (fi->counters[FIRQ_CNTR_IO] >= KVM_S390_MAX_FLOAT_IRQS) {
@@ -1554,7 +1591,6 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
 			inti->io.subchannel_id >> 8,
 			inti->io.subchannel_id >> 1 & 0x3,
 			inti->io.subchannel_nr);
-	isc = int_word_to_isc(inti->io.io_int_word);
 	list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc];
 	list_add_tail(&inti->list, list);
 	set_bit(isc_to_irq_type(isc), &fi->pending_irqs);
@@ -2705,3 +2741,24 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len)
 
 	return n;
 }
+
+void kvm_s390_gisa_clear(struct kvm *kvm)
+{
+	if (kvm->arch.gisa) {
+		memset(kvm->arch.gisa, 0, sizeof(struct kvm_s390_gisa));
+		kvm->arch.gisa->next_alert = (u32)(u64)kvm->arch.gisa;
+		VM_EVENT(kvm, 3, "gisa 0x%pK cleared", kvm->arch.gisa);
+	}
+}
+
+void kvm_s390_gisa_init(struct kvm *kvm)
+{
+	/* not implemented yet */
+}
+
+void kvm_s390_gisa_destroy(struct kvm *kvm)
+{
+	if (!kvm->arch.gisa)
+		return;
+	kvm->arch.gisa = NULL;
+}
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 2fbdb16010894..2c5e25b394352 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1999,6 +1999,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	spin_lock_init(&kvm->arch.start_stop_lock);
 	kvm_s390_vsie_init(kvm);
+	kvm_s390_gisa_init(kvm);
 	KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
 
 	return 0;
@@ -2061,6 +2062,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_free_vcpus(kvm);
 	sca_dispose(kvm);
 	debug_unregister(kvm->arch.dbf);
+	kvm_s390_gisa_destroy(kvm);
 	free_page((unsigned long)kvm->arch.sie_page2);
 	if (!kvm_is_ucontrol(kvm))
 		gmap_remove(kvm->arch.gmap);
@@ -2471,6 +2473,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	if (test_kvm_facility(vcpu->kvm, 139))
 		vcpu->arch.sie_block->ecd |= ECD_MEF;
 
+	if (vcpu->arch.sie_block->gd) {
+		vcpu->arch.sie_block->eca |= ECA_AIV;
+		VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u",
+			   vcpu->arch.sie_block->gd & 0x3, vcpu->vcpu_id);
+	}
 	vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx)
 					| SDNXC;
 	vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb;
@@ -2523,6 +2530,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 
 	vcpu->arch.sie_block->icpua = id;
 	spin_lock_init(&vcpu->arch.local_int.lock);
+	vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa;
 	seqcount_init(&vcpu->arch.cputm_seqcount);
 
 	rc = kvm_vcpu_init(vcpu, kvm, id);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index f110fa96807e8..bd31b37b0e6f8 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -382,6 +382,9 @@ int kvm_s390_set_irq_state(struct kvm_vcpu *vcpu,
 			   void __user *buf, int len);
 int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu,
 			   __u8 __user *buf, int len);
+void kvm_s390_gisa_init(struct kvm *kvm);
+void kvm_s390_gisa_clear(struct kvm *kvm);
+void kvm_s390_gisa_destroy(struct kvm *kvm);
 
 /* implemented in guestdbg.c */
 void kvm_s390_backup_guest_per_regs(struct kvm_vcpu *vcpu);

From 2496c8e7fe9270bde5e125729c193120e7fa8c67 Mon Sep 17 00:00:00 2001
From: Michael Mueller <mimu@linux.vnet.ibm.com>
Date: Thu, 31 Aug 2017 11:10:28 +0200
Subject: [PATCH 209/676] KVM: s390: abstract adapter interruption word
 generation from ISC

The function isc_to_int_word() allows the generation of interruption
words for adapter interrupts.

Signed-off-by: Michael Mueller <mimu@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/interrupt.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index f293c956e6dbb..a0ded3a23a5e0 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -198,6 +198,11 @@ static uint64_t isc_to_isc_bits(int isc)
 	return (0x80 >> isc) << 24;
 }
 
+static inline u32 isc_to_int_word(u8 isc)
+{
+	return ((u32)isc << 27) | 0x80000000;
+}
+
 static inline u8 int_word_to_isc(u32 int_word)
 {
 	return (int_word & 0x38000000) >> 27;
@@ -992,7 +997,7 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu,
 		 */
 		VCPU_EVENT(vcpu, 4, "%s isc %u", "deliver: I/O (AI/gisa)", isc);
 		memset(&io, 0, sizeof(io));
-		io.io_int_word = (isc << 27) | 0x80000000;
+		io.io_int_word = isc_to_int_word(isc);
 		vcpu->stat.deliver_io_int++;
 		trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
 			KVM_S390_INT_IO(1, 0, 0, 0),
@@ -2299,7 +2304,7 @@ static int kvm_s390_inject_airq(struct kvm *kvm,
 	struct kvm_s390_interrupt s390int = {
 		.type = KVM_S390_INT_IO(1, 0, 0, 0),
 		.parm = 0,
-		.parm64 = (adapter->isc << 27) | 0x80000000,
+		.parm64 = isc_to_int_word(adapter->isc),
 	};
 	int ret = 0;
 

From 24160af6cb289ace9bde980b33d11713c8fc8192 Mon Sep 17 00:00:00 2001
From: Michael Mueller <mimu@linux.vnet.ibm.com>
Date: Wed, 14 Jun 2017 13:21:32 +0200
Subject: [PATCH 210/676] KVM: s390: add GISA interrupts to FLIC ioctl
 interface

Pending interrupts marked in the GISA IPM are required to
become part of the answer of ioctl KVM_DEV_FLIC_GET_ALL_IRQS.

The ioctl KVM_DEV_FLIC_ENQUEUE is already capable to enqueue
adapter interrupts when a GISA is present.

With ioctl KVM_DEV_FLIC_CLEAR_IRQS the GISA IPM wil be cleared
now as well.

Signed-off-by: Michael Mueller <mimu@linux.vnet.ibm.com>
Reviewed-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/interrupt.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index a0ded3a23a5e0..dd4c50b82a532 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -1879,6 +1879,7 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm)
 	for (i = 0; i < FIRQ_MAX_COUNT; i++)
 		fi->counters[i] = 0;
 	spin_unlock(&fi->lock);
+	kvm_s390_gisa_clear(kvm);
 };
 
 static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len)
@@ -1906,6 +1907,22 @@ static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len)
 
 	max_irqs = len / sizeof(struct kvm_s390_irq);
 
+	if (kvm->arch.gisa &&
+	    kvm_s390_gisa_get_ipm(kvm->arch.gisa)) {
+		for (i = 0; i <= MAX_ISC; i++) {
+			if (n == max_irqs) {
+				/* signal userspace to try again */
+				ret = -ENOMEM;
+				goto out_nolock;
+			}
+			if (kvm_s390_gisa_tac_ipm_gisc(kvm->arch.gisa, i)) {
+				irq = (struct kvm_s390_irq *) &buf[n];
+				irq->type = KVM_S390_INT_IO(1, 0, 0, 0);
+				irq->u.io.io_int_word = isc_to_int_word(i);
+				n++;
+			}
+		}
+	}
 	fi = &kvm->arch.float_int;
 	spin_lock(&fi->lock);
 	for (i = 0; i < FIRQ_LIST_COUNT; i++) {
@@ -1944,6 +1961,7 @@ static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len)
 
 out:
 	spin_unlock(&fi->lock);
+out_nolock:
 	if (!ret && n > 0) {
 		if (copy_to_user(usrbuf, buf, sizeof(struct kvm_s390_irq) * n))
 			ret = -EFAULT;

From 4b35f65e67ee0e5bc4f394efa14a9fa3917cd2c1 Mon Sep 17 00:00:00 2001
From: Michael Mueller <mimu@linux.vnet.ibm.com>
Date: Fri, 7 Jul 2017 15:27:31 +0200
Subject: [PATCH 211/676] KVM: s390: make kvm_s390_get_io_int() aware of GISA

The function returns a pending I/O interrupt with the highest
priority defined by its ISC.

Together with AIV activation, pending adapter interrupts are
managed by the GISA IPM. Thus kvm_s390_get_io_int() needs to
inspect the IPM as well when the interrupt with the highest
priority has to be identified.

In case classic and adapter interrupts with the same ISC are
pending, the classic interrupt will be returned first.

Signed-off-by: Michael Mueller <mimu@linux.vnet.ibm.com>
Reviewed-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/interrupt.c | 74 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 70 insertions(+), 4 deletions(-)

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index dd4c50b82a532..f2dc86884ea42 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -1471,20 +1471,86 @@ static struct kvm_s390_interrupt_info *get_io_int(struct kvm *kvm,
 	return NULL;
 }
 
+static struct kvm_s390_interrupt_info *get_top_io_int(struct kvm *kvm,
+						      u64 isc_mask, u32 schid)
+{
+	struct kvm_s390_interrupt_info *inti = NULL;
+	int isc;
+
+	for (isc = 0; isc <= MAX_ISC && !inti; isc++) {
+		if (isc_mask & isc_to_isc_bits(isc))
+			inti = get_io_int(kvm, isc, schid);
+	}
+	return inti;
+}
+
+static int get_top_gisa_isc(struct kvm *kvm, u64 isc_mask, u32 schid)
+{
+	unsigned long active_mask;
+	int isc;
+
+	if (schid)
+		goto out;
+	if (!kvm->arch.gisa)
+		goto out;
+
+	active_mask = (isc_mask & kvm_s390_gisa_get_ipm(kvm->arch.gisa) << 24) << 32;
+	while (active_mask) {
+		isc = __fls(active_mask) ^ (BITS_PER_LONG - 1);
+		if (kvm_s390_gisa_tac_ipm_gisc(kvm->arch.gisa, isc))
+			return isc;
+		clear_bit_inv(isc, &active_mask);
+	}
+out:
+	return -EINVAL;
+}
+
 /*
  * Dequeue and return an I/O interrupt matching any of the interruption
  * subclasses as designated by the isc mask in cr6 and the schid (if != 0).
+ * Take into account the interrupts pending in the interrupt list and in GISA.
+ *
+ * Note that for a guest that does not enable I/O interrupts
+ * but relies on TPI, a flood of classic interrupts may starve
+ * out adapter interrupts on the same isc. Linux does not do
+ * that, and it is possible to work around the issue by configuring
+ * different iscs for classic and adapter interrupts in the guest,
+ * but we may want to revisit this in the future.
  */
 struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
 						    u64 isc_mask, u32 schid)
 {
-	struct kvm_s390_interrupt_info *inti = NULL;
+	struct kvm_s390_interrupt_info *inti, *tmp_inti;
 	int isc;
 
-	for (isc = 0; isc <= MAX_ISC && !inti; isc++) {
-		if (isc_mask & isc_to_isc_bits(isc))
-			inti = get_io_int(kvm, isc, schid);
+	inti = get_top_io_int(kvm, isc_mask, schid);
+
+	isc = get_top_gisa_isc(kvm, isc_mask, schid);
+	if (isc < 0)
+		/* no AI in GISA */
+		goto out;
+
+	if (!inti)
+		/* AI in GISA but no classical IO int */
+		goto gisa_out;
+
+	/* both types of interrupts present */
+	if (int_word_to_isc(inti->io.io_int_word) <= isc) {
+		/* classical IO int with higher priority */
+		kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc);
+		goto out;
 	}
+gisa_out:
+	tmp_inti = kzalloc(sizeof(*inti), GFP_KERNEL);
+	if (tmp_inti) {
+		tmp_inti->type = KVM_S390_INT_IO(1, 0, 0, 0);
+		tmp_inti->io.io_int_word = isc_to_int_word(isc);
+		if (inti)
+			kvm_s390_reinject_io_int(kvm, inti);
+		inti = tmp_inti;
+	} else
+		kvm_s390_gisa_set_ipm_gisc(kvm->arch.gisa, isc);
+out:
 	return inti;
 }
 

From f180bfdae024b34e71e89dcc82b037dd97f74c3a Mon Sep 17 00:00:00 2001
From: Michael Mueller <mimu@linux.vnet.ibm.com>
Date: Fri, 23 Jun 2017 14:46:21 +0200
Subject: [PATCH 212/676] KVM: s390: activate GISA for emulated interrupts

If the AIV facility is available, a GISA will be used to manage emulated
adapter interrupts.

Signed-off-by: Michael Mueller <mimu@linux.vnet.ibm.com>
Reviewed-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/interrupt.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index f2dc86884ea42..488ecc7ea2a18 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -2842,7 +2842,13 @@ void kvm_s390_gisa_clear(struct kvm *kvm)
 
 void kvm_s390_gisa_init(struct kvm *kvm)
 {
-	/* not implemented yet */
+	if (!css_general_characteristics.aiv)
+		kvm->arch.gisa = NULL;
+	else {
+		kvm->arch.gisa = &kvm->arch.sie_page2->gisa;
+		VM_EVENT(kvm, 3, "gisa 0x%pK initialized", kvm->arch.gisa);
+		kvm_s390_gisa_clear(kvm);
+	}
 }
 
 void kvm_s390_gisa_destroy(struct kvm *kvm)

From 9e73ea7056bd5f7e1b6e66e3d503478bb5160f07 Mon Sep 17 00:00:00 2001
From: Michael Mueller <mimu@linux.vnet.ibm.com>
Date: Mon, 12 Jun 2017 13:49:28 +0200
Subject: [PATCH 213/676] s390/sclp: expose the GISA format facility

The GISA format facility is required by the host to be able to process
a format-1 GISA. If not available, the used GISA format will be format-0.
All format-1 related extension will not be available in this case.

Signed-off-by: Michael Mueller <mimu@linux.vnet.ibm.com>
Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Reviewed-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/sclp.h   | 1 +
 drivers/s390/char/sclp_early.c | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index d3c1a8a2e3ad4..3cae9168f63c4 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -77,6 +77,7 @@ struct sclp_info {
 	unsigned char has_ibs : 1;
 	unsigned char has_skey : 1;
 	unsigned char has_kss : 1;
+	unsigned char has_gisaf : 1;
 	unsigned int ibc;
 	unsigned int mtid;
 	unsigned int mtid_cp;
diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c
index d06bc5674e5f9..6b1891539c840 100644
--- a/drivers/s390/char/sclp_early.c
+++ b/drivers/s390/char/sclp_early.c
@@ -49,7 +49,7 @@ struct read_info_sccb {
 	u8	_pad_112[116 - 112];	/* 112-115 */
 	u8	fac116;			/* 116 */
 	u8	fac117;			/* 117 */
-	u8	_pad_118;		/* 118 */
+	u8	fac118;			/* 118 */
 	u8	fac119;			/* 119 */
 	u16	hcpua;			/* 120-121 */
 	u8	_pad_122[124 - 122];	/* 122-123 */
@@ -100,6 +100,7 @@ static void __init sclp_early_facilities_detect(struct read_info_sccb *sccb)
 	sclp.has_esca = !!(sccb->fac116 & 0x08);
 	sclp.has_pfmfi = !!(sccb->fac117 & 0x40);
 	sclp.has_ibs = !!(sccb->fac117 & 0x20);
+	sclp.has_gisaf = !!(sccb->fac118 & 0x08);
 	sclp.has_hvs = !!(sccb->fac119 & 0x80);
 	sclp.has_kss = !!(sccb->fac98 & 0x01);
 	if (sccb->fac85 & 0x02)

From 4b9f952577fb40875a2a163d80515a8daa0d6bef Mon Sep 17 00:00:00 2001
From: Michael Mueller <mimu@linux.vnet.ibm.com>
Date: Fri, 23 Jun 2017 13:51:25 +0200
Subject: [PATCH 214/676] KVM: s390: introduce the format-1 GISA

The patch modifies the previously defined GISA data structure to be
able to store two GISA formats, format-0 and format-1. Additionally,
it verifies the availability of the GISA format facility and enables
the use of a format-1 GISA in the SIE control block accordingly.

A format-1 can do everything that format-0 can and we will need it
for real HW passthrough. As there are systems with only format-0
we keep both variants.

Signed-off-by: Michael Mueller <mimu@linux.vnet.ibm.com>
Reviewed-by: Pierre Morel <pmorel@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/include/asm/kvm_host.h | 44 ++++++++++++++++++++++++--------
 arch/s390/kvm/interrupt.c        |  4 +--
 arch/s390/kvm/kvm-s390.c         |  2 ++
 3 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 76d6f0ef3dbdc..59dd46adf0e86 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -228,6 +228,7 @@ struct kvm_s390_sie_block {
 	__u8    epdx;			/* 0x0069 */
 	__u8    reserved6a[2];		/* 0x006a */
 	__u32	todpr;			/* 0x006c */
+#define GISA_FORMAT1 0x00000001
 	__u32	gd;			/* 0x0070 */
 	__u8	reserved74[12];		/* 0x0074 */
 	__u64	mso;			/* 0x0080 */
@@ -719,15 +720,38 @@ struct kvm_s390_crypto_cb {
 };
 
 struct kvm_s390_gisa {
-	u32 next_alert;
-	u8 ipm;
-	u8 reserved01;
-	u8 : 6;
-	u8 g : 1;
-	u8 c : 1;
-	u8 iam;
-	u8 reserved02[4];
-	u32 airq_count;
+	union {
+		struct { /* common to all formats */
+			u32 next_alert;
+			u8  ipm;
+			u8  reserved01[2];
+			u8  iam;
+		};
+		struct { /* format 0 */
+			u32 next_alert;
+			u8  ipm;
+			u8  reserved01;
+			u8  : 6;
+			u8  g : 1;
+			u8  c : 1;
+			u8  iam;
+			u8  reserved02[4];
+			u32 airq_count;
+		} g0;
+		struct { /* format 1 */
+			u32 next_alert;
+			u8  ipm;
+			u8  simm;
+			u8  nimm;
+			u8  iam;
+			u8  aism[8];
+			u8  : 6;
+			u8  g : 1;
+			u8  c : 1;
+			u8  reserved03[11];
+			u32 airq_count;
+		} g1;
+	};
 };
 
 /*
@@ -738,7 +762,7 @@ struct sie_page2 {
 	__u64 fac_list[S390_ARCH_FAC_LIST_SIZE_U64];	/* 0x0000 */
 	struct kvm_s390_crypto_cb crycb;		/* 0x0800 */
 	struct kvm_s390_gisa gisa;			/* 0x0900 */
-	u8 reserved910[0x1000 - 0x910];			/* 0x0910 */
+	u8 reserved920[0x1000 - 0x920];			/* 0x0920 */
 };
 
 struct kvm_s390_vsie {
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 488ecc7ea2a18..aabf46f5f883d 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -2842,9 +2842,7 @@ void kvm_s390_gisa_clear(struct kvm *kvm)
 
 void kvm_s390_gisa_init(struct kvm *kvm)
 {
-	if (!css_general_characteristics.aiv)
-		kvm->arch.gisa = NULL;
-	else {
+	if (css_general_characteristics.aiv) {
 		kvm->arch.gisa = &kvm->arch.sie_page2->gisa;
 		VM_EVENT(kvm, 3, "gisa 0x%pK initialized", kvm->arch.gisa);
 		kvm_s390_gisa_clear(kvm);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 2c5e25b394352..f85a405e2595a 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2531,6 +2531,8 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 	vcpu->arch.sie_block->icpua = id;
 	spin_lock_init(&vcpu->arch.local_int.lock);
 	vcpu->arch.sie_block->gd = (u32)(u64)kvm->arch.gisa;
+	if (vcpu->arch.sie_block->gd && sclp.has_gisaf)
+		vcpu->arch.sie_block->gd |= GISA_FORMAT1;
 	seqcount_init(&vcpu->arch.cputm_seqcount);
 
 	rc = kvm_vcpu_init(vcpu, kvm, id);

From a9e6d44ddeccd3522670e641f1ed9b068e746ff7 Mon Sep 17 00:00:00 2001
From: Sven Joachim <svenjoac@gmx.de>
Date: Fri, 26 Jan 2018 10:38:01 +0100
Subject: [PATCH 215/676] ssb: Do not disable PCI host on non-Mips

After upgrading an old laptop to 4.15-rc9, I found that the eth0 and
wlan0 interfaces had disappeared.  It turns out that the b43 and b44
drivers require SSB_PCIHOST_POSSIBLE which depends on
PCI_DRIVERS_LEGACY, a config option that only exists on Mips.

Fixes: 58eae1416b80 ("ssb: Disable PCI host for PCI_DRIVERS_GENERIC")
Cc: stable@vger.org
Signed-off-by: Sven Joachim <svenjoac@gmx.de>
Reviewed-by: James Hogan <jhogan@kernel.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/ssb/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/ssb/Kconfig b/drivers/ssb/Kconfig
index 71c73766ee22e..65af12c3bdb2d 100644
--- a/drivers/ssb/Kconfig
+++ b/drivers/ssb/Kconfig
@@ -32,7 +32,7 @@ config SSB_BLOCKIO
 
 config SSB_PCIHOST_POSSIBLE
 	bool
-	depends on SSB && (PCI = y || PCI = SSB) && PCI_DRIVERS_LEGACY
+	depends on SSB && (PCI = y || PCI = SSB) && (PCI_DRIVERS_LEGACY || !MIPS)
 	default y
 
 config SSB_PCIHOST

From 89a8f6d4904c8cf3ff8fee9fdaff392a6bbb8bf6 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Wed, 24 Jan 2018 14:23:31 +0100
Subject: [PATCH 216/676] x86/hyperv: Check for required priviliges in
 hyperv_init()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In hyperv_init() its presumed that it always has access to VP index and
hypercall MSRs while according to the specification it should be checked if
it's allowed to access the corresponding MSRs before accessing them.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: kvm@vger.kernel.org
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: "Michael Kelley (EOSG)" <Michael.H.Kelley@microsoft.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: devel@linuxdriverproject.org
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Cathy Avery <cavery@redhat.com>
Cc: Mohammed Gamal <mmorsy@redhat.com>
Link: https://lkml.kernel.org/r/20180124132337.30138-2-vkuznets@redhat.com
---
 arch/x86/hyperv/hv_init.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 189a398290dbb..21f9d53d9f004 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -110,12 +110,19 @@ static int hv_cpu_init(unsigned int cpu)
  */
 void hyperv_init(void)
 {
-	u64 guest_id;
+	u64 guest_id, required_msrs;
 	union hv_x64_msr_hypercall_contents hypercall_msr;
 
 	if (x86_hyper_type != X86_HYPER_MS_HYPERV)
 		return;
 
+	/* Absolutely required MSRs */
+	required_msrs = HV_X64_MSR_HYPERCALL_AVAILABLE |
+		HV_X64_MSR_VP_INDEX_AVAILABLE;
+
+	if ((ms_hyperv.features & required_msrs) != required_msrs)
+		return;
+
 	/* Allocate percpu VP index */
 	hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index),
 				    GFP_KERNEL);

From e2768eaa1ca4fbb7b778da5615cce3dd310352e6 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Wed, 24 Jan 2018 14:23:32 +0100
Subject: [PATCH 217/676] x86/hyperv: Add a function to read both TSC and TSC
 page value simulateneously
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is going to be used from KVM code where both TSC and TSC page value
are needed.

Nothing is supposed to use the function when Hyper-V code is compiled out,
just BUG().

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: kvm@vger.kernel.org
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: "Michael Kelley (EOSG)" <Michael.H.Kelley@microsoft.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: devel@linuxdriverproject.org
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Cathy Avery <cavery@redhat.com>
Cc: Mohammed Gamal <mmorsy@redhat.com>
Link: https://lkml.kernel.org/r/20180124132337.30138-3-vkuznets@redhat.com
---
 arch/x86/hyperv/hv_init.c       |  1 +
 arch/x86/include/asm/mshyperv.h | 23 +++++++++++++++++++----
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 21f9d53d9f004..1a6c63f721bc0 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -37,6 +37,7 @@ struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
 {
 	return tsc_pg;
 }
+EXPORT_SYMBOL_GPL(hv_get_tsc_page);
 
 static u64 read_hv_clock_tsc(struct clocksource *arg)
 {
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 8bf450b13d9f5..6b1d4ea78270f 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -325,9 +325,10 @@ static inline void hyperv_setup_mmu_ops(void) {}
 
 #ifdef CONFIG_HYPERV_TSCPAGE
 struct ms_hyperv_tsc_page *hv_get_tsc_page(void);
-static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
+static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
+				       u64 *cur_tsc)
 {
-	u64 scale, offset, cur_tsc;
+	u64 scale, offset;
 	u32 sequence;
 
 	/*
@@ -358,7 +359,7 @@ static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
 
 		scale = READ_ONCE(tsc_pg->tsc_scale);
 		offset = READ_ONCE(tsc_pg->tsc_offset);
-		cur_tsc = rdtsc_ordered();
+		*cur_tsc = rdtsc_ordered();
 
 		/*
 		 * Make sure we read sequence after we read all other values
@@ -368,7 +369,14 @@ static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
 
 	} while (READ_ONCE(tsc_pg->tsc_sequence) != sequence);
 
-	return mul_u64_u64_shr(cur_tsc, scale, 64) + offset;
+	return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset;
+}
+
+static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
+{
+	u64 cur_tsc;
+
+	return hv_read_tsc_page_tsc(tsc_pg, &cur_tsc);
 }
 
 #else
@@ -376,5 +384,12 @@ static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
 {
 	return NULL;
 }
+
+static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
+				       u64 *cur_tsc)
+{
+	BUG();
+	return U64_MAX;
+}
 #endif
 #endif

From 93286261de1b46339aa27cd4c639b21778f6cade Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Wed, 24 Jan 2018 14:23:33 +0100
Subject: [PATCH 218/676] x86/hyperv: Reenlightenment notifications support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hyper-V supports Live Migration notification. This is supposed to be used
in conjunction with TSC emulation: when a VM is migrated to a host with
different TSC frequency for some short period the host emulates the
accesses to TSC and sends an interrupt to notify about the event. When the
guest is done updating everything it can disable TSC emulation and
everything will start working fast again.

These notifications weren't required until now as Hyper-V guests are not
supposed to use TSC as a clocksource: in Linux the TSC is even marked as
unstable on boot. Guests normally use 'tsc page' clocksource and host
updates its values on migrations automatically.

Things change when with nested virtualization: even when the PV
clocksources (kvm-clock or tsc page) are passed through to the nested
guests the TSC frequency and frequency changes need to be know..

Hyper-V Top Level Functional Specification (as of v5.0b) wrongly specifies
EAX:BIT(12) of CPUID:0x40000009 as the feature identification bit. The
right one to check is EAX:BIT(13) of CPUID:0x40000003. I was assured that
the fix in on the way.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: kvm@vger.kernel.org
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: "Michael Kelley (EOSG)" <Michael.H.Kelley@microsoft.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: devel@linuxdriverproject.org
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Cathy Avery <cavery@redhat.com>
Cc: Mohammed Gamal <mmorsy@redhat.com>
Link: https://lkml.kernel.org/r/20180124132337.30138-4-vkuznets@redhat.com
---
 arch/x86/entry/entry_32.S          |  3 +
 arch/x86/entry/entry_64.S          |  3 +
 arch/x86/hyperv/hv_init.c          | 89 ++++++++++++++++++++++++++++++
 arch/x86/include/asm/irq_vectors.h |  7 ++-
 arch/x86/include/asm/mshyperv.h    |  9 +++
 arch/x86/include/uapi/asm/hyperv.h | 27 +++++++++
 arch/x86/kernel/cpu/mshyperv.c     |  6 ++
 7 files changed, 143 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 2a35b1e0fb902..7a796eeddf99c 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -895,6 +895,9 @@ BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
 BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
 		 hyperv_vector_handler)
 
+BUILD_INTERRUPT3(hyperv_reenlightenment_vector, HYPERV_REENLIGHTENMENT_VECTOR,
+		 hyperv_reenlightenment_intr)
+
 #endif /* CONFIG_HYPERV */
 
 ENTRY(page_fault)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index a835704951629..553aa49909ce9 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1245,6 +1245,9 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
 #if IS_ENABLED(CONFIG_HYPERV)
 apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
 	hyperv_callback_vector hyperv_vector_handler
+
+apicinterrupt3 HYPERV_REENLIGHTENMENT_VECTOR \
+	hyperv_reenlightenment_vector hyperv_reenlightenment_intr
 #endif /* CONFIG_HYPERV */
 
 idtentry debug			do_debug		has_error_code=0	paranoid=1 shift_ist=DEBUG_STACK
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 1a6c63f721bc0..712ac40081f7e 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -18,6 +18,8 @@
  */
 
 #include <linux/types.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
 #include <asm/hypervisor.h>
 #include <asm/hyperv.h>
 #include <asm/mshyperv.h>
@@ -102,6 +104,93 @@ static int hv_cpu_init(unsigned int cpu)
 	return 0;
 }
 
+static void (*hv_reenlightenment_cb)(void);
+
+static void hv_reenlightenment_notify(struct work_struct *dummy)
+{
+	struct hv_tsc_emulation_status emu_status;
+
+	rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status);
+
+	/* Don't issue the callback if TSC accesses are not emulated */
+	if (hv_reenlightenment_cb && emu_status.inprogress)
+		hv_reenlightenment_cb();
+}
+static DECLARE_DELAYED_WORK(hv_reenlightenment_work, hv_reenlightenment_notify);
+
+void hyperv_stop_tsc_emulation(void)
+{
+	u64 freq;
+	struct hv_tsc_emulation_status emu_status;
+
+	rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status);
+	emu_status.inprogress = 0;
+	wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status);
+
+	rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq);
+	tsc_khz = div64_u64(freq, 1000);
+}
+EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation);
+
+static inline bool hv_reenlightenment_available(void)
+{
+	/*
+	 * Check for required features and priviliges to make TSC frequency
+	 * change notifications work.
+	 */
+	return ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
+		ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE &&
+		ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT;
+}
+
+__visible void __irq_entry hyperv_reenlightenment_intr(struct pt_regs *regs)
+{
+	entering_ack_irq();
+
+	schedule_delayed_work(&hv_reenlightenment_work, HZ/10);
+
+	exiting_irq();
+}
+
+void set_hv_tscchange_cb(void (*cb)(void))
+{
+	struct hv_reenlightenment_control re_ctrl = {
+		.vector = HYPERV_REENLIGHTENMENT_VECTOR,
+		.enabled = 1,
+		.target_vp = hv_vp_index[smp_processor_id()]
+	};
+	struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1};
+
+	if (!hv_reenlightenment_available()) {
+		pr_warn("Hyper-V: reenlightenment support is unavailable\n");
+		return;
+	}
+
+	hv_reenlightenment_cb = cb;
+
+	/* Make sure callback is registered before we write to MSRs */
+	wmb();
+
+	wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
+	wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl));
+}
+EXPORT_SYMBOL_GPL(set_hv_tscchange_cb);
+
+void clear_hv_tscchange_cb(void)
+{
+	struct hv_reenlightenment_control re_ctrl;
+
+	if (!hv_reenlightenment_available())
+		return;
+
+	rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl);
+	re_ctrl.enabled = 0;
+	wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl);
+
+	hv_reenlightenment_cb = NULL;
+}
+EXPORT_SYMBOL_GPL(clear_hv_tscchange_cb);
+
 /*
  * This function is to be invoked early in the boot sequence after the
  * hypervisor has been detected.
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 67421f649cfa1..e71c1120426bb 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -103,7 +103,12 @@
 #endif
 
 #define MANAGED_IRQ_SHUTDOWN_VECTOR	0xef
-#define LOCAL_TIMER_VECTOR		0xee
+
+#if IS_ENABLED(CONFIG_HYPERV)
+#define HYPERV_REENLIGHTENMENT_VECTOR	0xee
+#endif
+
+#define LOCAL_TIMER_VECTOR		0xed
 
 #define NR_VECTORS			 256
 
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 6b1d4ea78270f..1790002a2052d 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -160,6 +160,7 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type)
 #define hv_set_synint_state(int_num, val) wrmsrl(int_num, val)
 
 void hyperv_callback_vector(void);
+void hyperv_reenlightenment_vector(void);
 #ifdef CONFIG_TRACING
 #define trace_hyperv_callback_vector hyperv_callback_vector
 #endif
@@ -316,11 +317,19 @@ void hyper_alloc_mmu(void);
 void hyperv_report_panic(struct pt_regs *regs, long err);
 bool hv_is_hypercall_page_setup(void);
 void hyperv_cleanup(void);
+
+void hyperv_reenlightenment_intr(struct pt_regs *regs);
+void set_hv_tscchange_cb(void (*cb)(void));
+void clear_hv_tscchange_cb(void);
+void hyperv_stop_tsc_emulation(void);
 #else /* CONFIG_HYPERV */
 static inline void hyperv_init(void) {}
 static inline bool hv_is_hypercall_page_setup(void) { return false; }
 static inline void hyperv_cleanup(void) {}
 static inline void hyperv_setup_mmu_ops(void) {}
+static inline void set_hv_tscchange_cb(void (*cb)(void)) {}
+static inline void clear_hv_tscchange_cb(void) {}
+static inline void hyperv_stop_tsc_emulation(void) {};
 #endif /* CONFIG_HYPERV */
 
 #ifdef CONFIG_HYPERV_TSCPAGE
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index 1a5bfead93b41..197c2e6c73765 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -40,6 +40,9 @@
  */
 #define HV_X64_ACCESS_FREQUENCY_MSRS		(1 << 11)
 
+/* AccessReenlightenmentControls privilege */
+#define HV_X64_ACCESS_REENLIGHTENMENT		BIT(13)
+
 /*
  * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
  * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
@@ -234,6 +237,30 @@
 #define HV_X64_MSR_CRASH_PARAMS		\
 		(1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0))
 
+/* TSC emulation after migration */
+#define HV_X64_MSR_REENLIGHTENMENT_CONTROL	0x40000106
+
+struct hv_reenlightenment_control {
+	u64 vector:8;
+	u64 reserved1:8;
+	u64 enabled:1;
+	u64 reserved2:15;
+	u64 target_vp:32;
+};
+
+#define HV_X64_MSR_TSC_EMULATION_CONTROL	0x40000107
+#define HV_X64_MSR_TSC_EMULATION_STATUS		0x40000108
+
+struct hv_tsc_emulation_control {
+	u64 enabled:1;
+	u64 reserved:63;
+};
+
+struct hv_tsc_emulation_status {
+	u64 inprogress:1;
+	u64 reserved:63;
+};
+
 #define HV_X64_MSR_HYPERCALL_ENABLE		0x00000001
 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT	12
 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK	\
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 85eb5fc180c81..9340f41ce8d3d 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -251,6 +251,12 @@ static void __init ms_hyperv_init_platform(void)
 	hyperv_setup_mmu_ops();
 	/* Setup the IDT for hypervisor callback */
 	alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
+
+	/* Setup the IDT for reenlightenment notifications */
+	if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT)
+		alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR,
+				hyperv_reenlightenment_vector);
+
 #endif
 }
 

From e7c4e36c447daca2b7df49024f6bf230871cb155 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Wed, 24 Jan 2018 14:23:34 +0100
Subject: [PATCH 219/676] x86/hyperv: Redirect reenlightment notifications on
 CPU offlining
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It is very unlikely for CPUs to get offlined when running on Hyper-V as
there is a protection in the vmbus module which prevents it when the guest
has any VMBus devices assigned. This, however, may change in future if an
option to reassign an already active channel will be added. It is also
possible to run without any Hyper-V devices or to have a CPU with no
assigned channels.

Reassign reenlightenment notifications to some other active CPU when the
CPU which is assigned to them goes offline.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: kvm@vger.kernel.org
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: "Michael Kelley (EOSG)" <Michael.H.Kelley@microsoft.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: devel@linuxdriverproject.org
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Cathy Avery <cavery@redhat.com>
Cc: Mohammed Gamal <mmorsy@redhat.com>
Link: https://lkml.kernel.org/r/20180124132337.30138-5-vkuznets@redhat.com
---
 arch/x86/hyperv/hv_init.c | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 712ac40081f7e..e4377e2f2a10b 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -191,6 +191,26 @@ void clear_hv_tscchange_cb(void)
 }
 EXPORT_SYMBOL_GPL(clear_hv_tscchange_cb);
 
+static int hv_cpu_die(unsigned int cpu)
+{
+	struct hv_reenlightenment_control re_ctrl;
+	unsigned int new_cpu;
+
+	if (hv_reenlightenment_cb == NULL)
+		return 0;
+
+	rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
+	if (re_ctrl.target_vp == hv_vp_index[cpu]) {
+		/* Reassign to some other online CPU */
+		new_cpu = cpumask_any_but(cpu_online_mask, cpu);
+
+		re_ctrl.target_vp = hv_vp_index[new_cpu];
+		wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
+	}
+
+	return 0;
+}
+
 /*
  * This function is to be invoked early in the boot sequence after the
  * hypervisor has been detected.
@@ -220,7 +240,7 @@ void hyperv_init(void)
 		return;
 
 	if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online",
-			      hv_cpu_init, NULL) < 0)
+			      hv_cpu_init, hv_cpu_die) < 0)
 		goto free_vp_index;
 
 	/*

From 51d4e5daa32808df4d50db511d167fde19fa114e Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Wed, 24 Jan 2018 14:23:35 +0100
Subject: [PATCH 220/676] x86/irq: Count Hyper-V reenlightenment interrupts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hyper-V reenlightenment interrupts arrive when the VM is migrated, While
they are not interesting in general it's important when L2 nested guests
are running.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: kvm@vger.kernel.org
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: "Michael Kelley (EOSG)" <Michael.H.Kelley@microsoft.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: devel@linuxdriverproject.org
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Cathy Avery <cavery@redhat.com>
Cc: Mohammed Gamal <mmorsy@redhat.com>
Link: https://lkml.kernel.org/r/20180124132337.30138-6-vkuznets@redhat.com
---
 arch/x86/hyperv/hv_init.c      | 2 ++
 arch/x86/include/asm/hardirq.h | 3 +++
 arch/x86/kernel/irq.c          | 9 +++++++++
 3 files changed, 14 insertions(+)

diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index e4377e2f2a10b..a3adece392f14 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -147,6 +147,8 @@ __visible void __irq_entry hyperv_reenlightenment_intr(struct pt_regs *regs)
 {
 	entering_ack_irq();
 
+	inc_irq_stat(irq_hv_reenlightenment_count);
+
 	schedule_delayed_work(&hv_reenlightenment_work, HZ/10);
 
 	exiting_irq();
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 51cc979dd3642..7c341a74ec8c4 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -38,6 +38,9 @@ typedef struct {
 #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
 	unsigned int irq_hv_callback_count;
 #endif
+#if IS_ENABLED(CONFIG_HYPERV)
+	unsigned int irq_hv_reenlightenment_count;
+#endif
 } ____cacheline_aligned irq_cpustat_t;
 
 DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 68e1867cca804..45fb4d2565f80 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -141,6 +141,15 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 				   irq_stats(j)->irq_hv_callback_count);
 		seq_puts(p, "  Hypervisor callback interrupts\n");
 	}
+#endif
+#if IS_ENABLED(CONFIG_HYPERV)
+	if (test_bit(HYPERV_REENLIGHTENMENT_VECTOR, system_vectors)) {
+		seq_printf(p, "%*s: ", prec, "HRE");
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ",
+				   irq_stats(j)->irq_hv_reenlightenment_count);
+		seq_puts(p, "  Hyper-V reenlightenment interrupts\n");
+	}
 #endif
 	seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
 #if defined(CONFIG_X86_IO_APIC)

From b0c39dc68e3b3d22bf9d2984f62f6c86788a49e7 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Wed, 24 Jan 2018 14:23:36 +0100
Subject: [PATCH 221/676] x86/kvm: Pass stable clocksource to guests when
 running nested on Hyper-V
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, KVM is able to work in 'masterclock' mode passing
PVCLOCK_TSC_STABLE_BIT to guests when the clocksource which is used on the
host is TSC.

When running nested on Hyper-V the guest normally uses a different one: TSC
page which is resistant to TSC frequency changes on events like L1
migration. Add support for it in KVM.

The only non-trivial change is in vgettsc(): when updating the gtod copy
both the clock readout and tsc value have to be updated now.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: kvm@vger.kernel.org
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: "Michael Kelley (EOSG)" <Michael.H.Kelley@microsoft.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: devel@linuxdriverproject.org
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Cathy Avery <cavery@redhat.com>
Cc: Mohammed Gamal <mmorsy@redhat.com>
Link: https://lkml.kernel.org/r/20180124132337.30138-7-vkuznets@redhat.com
---
 arch/x86/kvm/x86.c | 93 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 68 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c53298dfbf50a..b1ce368a07af3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -67,6 +67,7 @@
 #include <asm/pvclock.h>
 #include <asm/div64.h>
 #include <asm/irq_remapping.h>
+#include <asm/mshyperv.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -1377,6 +1378,11 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
 	return tsc;
 }
 
+static inline int gtod_is_based_on_tsc(int mode)
+{
+	return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK;
+}
+
 static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_X86_64
@@ -1396,7 +1402,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
 	 * perform request to enable masterclock.
 	 */
 	if (ka->use_master_clock ||
-	    (gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched))
+	    (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
 		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 
 	trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
@@ -1459,6 +1465,19 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 	vcpu->arch.tsc_offset = offset;
 }
 
+static inline bool kvm_check_tsc_unstable(void)
+{
+#ifdef CONFIG_X86_64
+	/*
+	 * TSC is marked unstable when we're running on Hyper-V,
+	 * 'TSC page' clocksource is good.
+	 */
+	if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK)
+		return false;
+#endif
+	return check_tsc_unstable();
+}
+
 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 {
 	struct kvm *kvm = vcpu->kvm;
@@ -1504,7 +1523,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
          */
 	if (synchronizing &&
 	    vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
-		if (!check_tsc_unstable()) {
+		if (!kvm_check_tsc_unstable()) {
 			offset = kvm->arch.cur_tsc_offset;
 			pr_debug("kvm: matched tsc offset for %llu\n", data);
 		} else {
@@ -1604,18 +1623,43 @@ static u64 read_tsc(void)
 	return last;
 }
 
-static inline u64 vgettsc(u64 *cycle_now)
+static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
 {
 	long v;
 	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+	u64 tsc_pg_val;
+
+	switch (gtod->clock.vclock_mode) {
+	case VCLOCK_HVCLOCK:
+		tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
+						  tsc_timestamp);
+		if (tsc_pg_val != U64_MAX) {
+			/* TSC page valid */
+			*mode = VCLOCK_HVCLOCK;
+			v = (tsc_pg_val - gtod->clock.cycle_last) &
+				gtod->clock.mask;
+		} else {
+			/* TSC page invalid */
+			*mode = VCLOCK_NONE;
+		}
+		break;
+	case VCLOCK_TSC:
+		*mode = VCLOCK_TSC;
+		*tsc_timestamp = read_tsc();
+		v = (*tsc_timestamp - gtod->clock.cycle_last) &
+			gtod->clock.mask;
+		break;
+	default:
+		*mode = VCLOCK_NONE;
+	}
 
-	*cycle_now = read_tsc();
+	if (*mode == VCLOCK_NONE)
+		*tsc_timestamp = v = 0;
 
-	v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
 	return v * gtod->clock.mult;
 }
 
-static int do_monotonic_boot(s64 *t, u64 *cycle_now)
+static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
 {
 	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
 	unsigned long seq;
@@ -1624,9 +1668,8 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now)
 
 	do {
 		seq = read_seqcount_begin(&gtod->seq);
-		mode = gtod->clock.vclock_mode;
 		ns = gtod->nsec_base;
-		ns += vgettsc(cycle_now);
+		ns += vgettsc(tsc_timestamp, &mode);
 		ns >>= gtod->clock.shift;
 		ns += gtod->boot_ns;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
@@ -1635,7 +1678,7 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now)
 	return mode;
 }
 
-static int do_realtime(struct timespec *ts, u64 *cycle_now)
+static int do_realtime(struct timespec *ts, u64 *tsc_timestamp)
 {
 	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
 	unsigned long seq;
@@ -1644,10 +1687,9 @@ static int do_realtime(struct timespec *ts, u64 *cycle_now)
 
 	do {
 		seq = read_seqcount_begin(&gtod->seq);
-		mode = gtod->clock.vclock_mode;
 		ts->tv_sec = gtod->wall_time_sec;
 		ns = gtod->nsec_base;
-		ns += vgettsc(cycle_now);
+		ns += vgettsc(tsc_timestamp, &mode);
 		ns >>= gtod->clock.shift;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 
@@ -1657,25 +1699,26 @@ static int do_realtime(struct timespec *ts, u64 *cycle_now)
 	return mode;
 }
 
-/* returns true if host is using tsc clocksource */
-static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now)
+/* returns true if host is using TSC based clocksource */
+static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
 {
 	/* checked again under seqlock below */
-	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
+	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
 		return false;
 
-	return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
+	return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
+						      tsc_timestamp));
 }
 
-/* returns true if host is using tsc clocksource */
+/* returns true if host is using TSC based clocksource */
 static bool kvm_get_walltime_and_clockread(struct timespec *ts,
-					   u64 *cycle_now)
+					   u64 *tsc_timestamp)
 {
 	/* checked again under seqlock below */
-	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
+	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
 		return false;
 
-	return do_realtime(ts, cycle_now) == VCLOCK_TSC;
+	return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
 }
 #endif
 
@@ -2869,13 +2912,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 	}
 
-	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
+	if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
 		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
 				rdtsc() - vcpu->arch.last_host_tsc;
 		if (tsc_delta < 0)
 			mark_tsc_unstable("KVM discovered backwards TSC");
 
-		if (check_tsc_unstable()) {
+		if (kvm_check_tsc_unstable()) {
 			u64 offset = kvm_compute_tsc_offset(vcpu,
 						vcpu->arch.last_guest_tsc);
 			kvm_vcpu_write_tsc_offset(vcpu, offset);
@@ -6110,9 +6153,9 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
 	update_pvclock_gtod(tk);
 
 	/* disable master clock if host does not trust, or does not
-	 * use, TSC clocksource
+	 * use, TSC based clocksource.
 	 */
-	if (gtod->clock.vclock_mode != VCLOCK_TSC &&
+	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
 	    atomic_read(&kvm_guest_has_master_clock) != 0)
 		queue_work(system_long_wq, &pvclock_gtod_work);
 
@@ -7767,7 +7810,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 {
 	struct kvm_vcpu *vcpu;
 
-	if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
+	if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
 		printk_once(KERN_WARNING
 		"kvm: SMP vm created on host with unstable TSC; "
 		"guest TSC will not be reliable\n");
@@ -7924,7 +7967,7 @@ int kvm_arch_hardware_enable(void)
 		return ret;
 
 	local_tsc = rdtsc();
-	stable = !check_tsc_unstable();
+	stable = !kvm_check_tsc_unstable();
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		kvm_for_each_vcpu(i, vcpu, kvm) {
 			if (!stable && vcpu->cpu == smp_processor_id())

From 0092e4346f49558e5fe5a927c6d78d401dc4ed73 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Wed, 24 Jan 2018 14:23:37 +0100
Subject: [PATCH 222/676] x86/kvm: Support Hyper-V reenlightenment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When running nested KVM on Hyper-V guests its required to update
masterclocks for all guests when L1 migrates to a host with different TSC
frequency.

Implement the procedure in the following way:
  - Pause all guests.
  - Tell the host (Hyper-V) to stop emulating TSC accesses.
  - Update the gtod copy, recompute clocks.
  - Unpause all guests.

This is somewhat similar to cpufreq but there are two important differences:
 - TSC emulation can only be disabled globally (on all CPUs)
 - The new TSC frequency is not known until emulation is turned off so
   there is no way to 'prepare' for the event upfront.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: kvm@vger.kernel.org
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: "Michael Kelley (EOSG)" <Michael.H.Kelley@microsoft.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: devel@linuxdriverproject.org
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Cathy Avery <cavery@redhat.com>
Cc: Mohammed Gamal <mmorsy@redhat.com>
Link: https://lkml.kernel.org/r/20180124132337.30138-8-vkuznets@redhat.com
---
 arch/x86/kvm/x86.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b1ce368a07af3..879a999874016 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -68,6 +68,7 @@
 #include <asm/div64.h>
 #include <asm/irq_remapping.h>
 #include <asm/mshyperv.h>
+#include <asm/hypervisor.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -5932,6 +5933,43 @@ static void tsc_khz_changed(void *data)
 	__this_cpu_write(cpu_tsc_khz, khz);
 }
 
+static void kvm_hyperv_tsc_notifier(void)
+{
+#ifdef CONFIG_X86_64
+	struct kvm *kvm;
+	struct kvm_vcpu *vcpu;
+	int cpu;
+
+	spin_lock(&kvm_lock);
+	list_for_each_entry(kvm, &vm_list, vm_list)
+		kvm_make_mclock_inprogress_request(kvm);
+
+	hyperv_stop_tsc_emulation();
+
+	/* TSC frequency always matches when on Hyper-V */
+	for_each_present_cpu(cpu)
+		per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+	kvm_max_guest_tsc_khz = tsc_khz;
+
+	list_for_each_entry(kvm, &vm_list, vm_list) {
+		struct kvm_arch *ka = &kvm->arch;
+
+		spin_lock(&ka->pvclock_gtod_sync_lock);
+
+		pvclock_update_vm_gtod_copy(kvm);
+
+		kvm_for_each_vcpu(cpu, vcpu, kvm)
+			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+
+		kvm_for_each_vcpu(cpu, vcpu, kvm)
+			kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
+
+		spin_unlock(&ka->pvclock_gtod_sync_lock);
+	}
+	spin_unlock(&kvm_lock);
+#endif
+}
+
 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
 				     void *data)
 {
@@ -6217,6 +6255,9 @@ int kvm_arch_init(void *opaque)
 	kvm_lapic_init();
 #ifdef CONFIG_X86_64
 	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
+
+	if (x86_hyper_type == X86_HYPER_MS_HYPERV)
+		set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
 #endif
 
 	return 0;
@@ -6229,6 +6270,10 @@ int kvm_arch_init(void *opaque)
 
 void kvm_arch_exit(void)
 {
+#ifdef CONFIG_X86_64
+	if (x86_hyper_type == X86_HYPER_MS_HYPERV)
+		clear_hv_tscchange_cb();
+#endif
 	kvm_lapic_exit();
 	perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
 

From 2fc616c06e125baed9e8d0cd166a4e58468939c8 Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cohuck@redhat.com>
Date: Fri, 12 Jan 2018 15:53:34 +0100
Subject: [PATCH 223/676] MAINTAINERS: add David as a reviewer for KVM/s390

Acked-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 82ad0eabce4f3..cd7a416c516b2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7693,6 +7693,7 @@ F:	arch/powerpc/kernel/kvm*
 KERNEL VIRTUAL MACHINE for s390 (KVM/s390)
 M:	Christian Borntraeger <borntraeger@de.ibm.com>
 M:	Cornelia Huck <cohuck@redhat.com>
+R:	David Hildenbrand <david@redhat.com>
 L:	linux-s390@vger.kernel.org
 W:	http://www.ibm.com/developerworks/linux/linux390/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git

From 78269f3418e5c25e1426a21378952f6f7b78ad1f Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cohuck@redhat.com>
Date: Fri, 12 Jan 2018 15:53:34 +0100
Subject: [PATCH 224/676] MAINTAINERS: add Halil as additional vfio-ccw
 maintainer

Acked-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Halil Pasic <pasic@linux.vnet.ibm.com>
Acked-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index cd7a416c516b2..990ac01c319da 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11863,6 +11863,7 @@ F:	drivers/pci/hotplug/s390_pci_hpc.c
 S390 VFIO-CCW DRIVER
 M:	Cornelia Huck <cohuck@redhat.com>
 M:	Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
+M:	Halil Pasic <pasic@linux.vnet.ibm.com>
 L:	linux-s390@vger.kernel.org
 L:	kvm@vger.kernel.org
 S:	Supported

From cd74ff94524e138e72e9feb5bf5b30ce28ba2311 Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cohuck@redhat.com>
Date: Fri, 12 Jan 2018 15:53:34 +0100
Subject: [PATCH 225/676] MAINTAINERS: update KVM/s390 maintainers

As I have neither too much time nor access to the architecture
documentation anymore, let's switch my status from maintainer to
reviewer. Janosch will step in as second maintainer.

Acked-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
---
 MAINTAINERS | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 990ac01c319da..71781ad422f0a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7692,8 +7692,9 @@ F:	arch/powerpc/kernel/kvm*
 
 KERNEL VIRTUAL MACHINE for s390 (KVM/s390)
 M:	Christian Borntraeger <borntraeger@de.ibm.com>
-M:	Cornelia Huck <cohuck@redhat.com>
+M:	Janosch Frank <frankja@linux.vnet.ibm.com>
 R:	David Hildenbrand <david@redhat.com>
+R:	Cornelia Huck <cohuck@redhat.com>
 L:	linux-s390@vger.kernel.org
 W:	http://www.ibm.com/developerworks/linux/linux390/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git

From 13e59ece5b30f39e4e1e1fac2b2ddc7ed527f3cc Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Thu, 25 Jan 2018 14:20:19 +0100
Subject: [PATCH 226/676] KVM: arm/arm64: Fix incorrect timer_is_pending logic

After the recently introduced support for level-triggered mapped
interrupt, I accidentally left the VCPU thread busily going back and
forward between the guest and the hypervisor whenever the guest was
blocking, because I would always incorrectly report that a timer
interrupt was pending.

This is because the timer->irq.level field is not valid for mapped
interrupts, where we offload the level state to the hardware, and as a
result this field is always true.

Luckily the problem can be relatively easily solved by not checking the
cached signal state of either timer in kvm_timer_should_fire() but
instead compute the timer state on the fly, which we do already if the
cached signal state wasn't high.  In fact, the only reason for checking
the cached signal state was a tiny optimization which would only be
potentially faster when the polling loop detects a pending timer
interrupt, which is quite unlikely.

Instead of duplicating the logic from kvm_arch_timer_handler(), we
enlighten kvm_timer_should_fire() to report something valid when the
timer state is loaded onto the hardware.  We can then call this from
kvm_arch_timer_handler() as well and avoid the call to
__timer_snapshot_state() in kvm_arch_timer_get_input_level().

Reported-by: Tomasz Nowicki <tn@semihalf.com>
Tested-by: Tomasz Nowicki <tn@semihalf.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 virt/kvm/arm/arch_timer.c | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index cfcd0323deabe..63cf828f3c4f8 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -97,10 +97,9 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
 		pr_warn_once("Spurious arch timer IRQ on non-VCPU thread\n");
 		return IRQ_NONE;
 	}
-	vtimer = vcpu_vtimer(vcpu);
 
-	vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
-	if (kvm_timer_irq_can_fire(vtimer))
+	vtimer = vcpu_vtimer(vcpu);
+	if (kvm_timer_should_fire(vtimer))
 		kvm_timer_update_irq(vcpu, true, vtimer);
 
 	if (static_branch_unlikely(&userspace_irqchip_in_use) &&
@@ -230,6 +229,16 @@ static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
 {
 	u64 cval, now;
 
+	if (timer_ctx->loaded) {
+		u32 cnt_ctl;
+
+		/* Only the virtual timer can be loaded so far */
+		cnt_ctl = read_sysreg_el0(cntv_ctl);
+		return  (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) &&
+		        (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) &&
+		       !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK);
+	}
+
 	if (!kvm_timer_irq_can_fire(timer_ctx))
 		return false;
 
@@ -244,15 +253,7 @@ bool kvm_timer_is_pending(struct kvm_vcpu *vcpu)
 	struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
 	struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
 
-	if (vtimer->irq.level || ptimer->irq.level)
-		return true;
-
-	/*
-	 * When this is called from withing the wait loop of kvm_vcpu_block(),
-	 * the software view of the timer state is up to date (timer->loaded
-	 * is false), and so we can simply check if the timer should fire now.
-	 */
-	if (!vtimer->loaded && kvm_timer_should_fire(vtimer))
+	if (kvm_timer_should_fire(vtimer))
 		return true;
 
 	return kvm_timer_should_fire(ptimer);
@@ -270,9 +271,9 @@ void kvm_timer_update_run(struct kvm_vcpu *vcpu)
 	/* Populate the device bitmap with the timer states */
 	regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER |
 				    KVM_ARM_DEV_EL1_PTIMER);
-	if (vtimer->irq.level)
+	if (kvm_timer_should_fire(vtimer))
 		regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER;
-	if (ptimer->irq.level)
+	if (kvm_timer_should_fire(ptimer))
 		regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER;
 }
 
@@ -507,8 +508,8 @@ bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
 	vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER;
 	plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER;
 
-	return vtimer->irq.level != vlevel ||
-	       ptimer->irq.level != plevel;
+	return kvm_timer_should_fire(vtimer) != vlevel ||
+	       kvm_timer_should_fire(ptimer) != plevel;
 }
 
 void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
@@ -801,9 +802,6 @@ bool kvm_arch_timer_get_input_level(int vintid)
 	else
 		BUG(); /* We only map the vtimer so far */
 
-	if (timer->loaded)
-		__timer_snapshot_state(timer);
-
 	return kvm_timer_should_fire(timer);
 }
 

From f1d7231cede93a42d75b6d3c5ca599e94a273e89 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Thu, 25 Jan 2018 18:32:29 +0100
Subject: [PATCH 227/676] KVM: arm/arm64: Fix userspace_irqchip_in_use counting

We were not decrementing the static key count in the right location.
kvm_arch_vcpu_destroy() is only called to clean up after a failed
VCPU create attempt, whereas kvm_arch_vcpu_free() is called on teardown
of the VM as well.  Move the static key decrement call to
kvm_arch_vcpu_free().

Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 virt/kvm/arm/arm.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 639dca0c05609..04ee7a3278706 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -295,6 +295,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 
 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 {
+	if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
+		static_branch_dec(&userspace_irqchip_in_use);
+
 	kvm_mmu_free_memory_caches(vcpu);
 	kvm_timer_vcpu_terminate(vcpu);
 	kvm_pmu_vcpu_destroy(vcpu);
@@ -304,8 +307,6 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
-	if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
-		static_branch_dec(&userspace_irqchip_in_use);
 	kvm_arch_vcpu_free(vcpu);
 }
 

From cd15d2050c044ca9525ba165e9073ac8e036b8d0 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Fri, 26 Jan 2018 16:20:22 +0100
Subject: [PATCH 228/676] KVM: arm/arm64: Fixup userspace irqchip static key
 optimization

When I introduced a static key to avoid work in the critical path for
userspace irqchips which is very rarely used, I accidentally messed up
my logic and used && where I should have used ||, because the point was
to short-circuit the evaluation in case userspace irqchips weren't even
in use.

This fixes an issue when running in-kernel irqchip VMs alongside
userspace irqchip VMs.

Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Fixes: c44c232ee2d3 ("KVM: arm/arm64: Avoid work when userspace iqchips are not used")
Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
---
 virt/kvm/arm/arch_timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 63cf828f3c4f8..fb6bd9b9845ea 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -286,7 +286,7 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
 	trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq,
 				   timer_ctx->irq.level);
 
-	if (!static_branch_unlikely(&userspace_irqchip_in_use) &&
+	if (!static_branch_unlikely(&userspace_irqchip_in_use) ||
 	    likely(irqchip_in_kernel(vcpu->kvm))) {
 		ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
 					  timer_ctx->irq.irq,

From 5fa4ec9cb2e6679e2f828033726f758ea314b9c5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 31 Jan 2018 09:41:40 +0100
Subject: [PATCH 229/676] x86/kvm: Make it compile on 32bit and with
 HYPYERVISOR_GUEST=n
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The reenlightment support for hyperv slapped a direct reference to
x86_hyper_type into the kvm code which results in the following build
failure when CONFIG_HYPERVISOR_GUEST=n:

arch/x86/kvm/x86.c:6259:6: error: ‘x86_hyper_type’ undeclared (first use in this function)
arch/x86/kvm/x86.c:6259:6: note: each undeclared identifier is reported only once for each function it appears in

Use the proper helper function to cure that.

The 32bit compile fails because of:

arch/x86/kvm/x86.c:5936:13: warning: ‘kvm_hyperv_tsc_notifier’ defined but not used [-Wunused-function]

which is a real trainwreck engineering artwork. The callsite is wrapped
into #ifdef CONFIG_X86_64, but the function itself has the #ifdef inside
the function body. Make the function itself wrapped into the ifdef to cure
that.

Qualiteee....

Fixes: 0092e4346f49 ("x86/kvm: Support Hyper-V reenlightenment")
Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: kvm@vger.kernel.org
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: "Michael Kelley (EOSG)" <Michael.H.Kelley@microsoft.com>
Cc: Roman Kagan <rkagan@virtuozzo.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: devel@linuxdriverproject.org
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Cathy Avery <cavery@redhat.com>
Cc: Mohammed Gamal <mmorsy@redhat.com>
---
 arch/x86/kvm/x86.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 879a999874016..cd3b3bc67c5a6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5933,9 +5933,9 @@ static void tsc_khz_changed(void *data)
 	__this_cpu_write(cpu_tsc_khz, khz);
 }
 
+#ifdef CONFIG_X86_64
 static void kvm_hyperv_tsc_notifier(void)
 {
-#ifdef CONFIG_X86_64
 	struct kvm *kvm;
 	struct kvm_vcpu *vcpu;
 	int cpu;
@@ -5967,8 +5967,8 @@ static void kvm_hyperv_tsc_notifier(void)
 		spin_unlock(&ka->pvclock_gtod_sync_lock);
 	}
 	spin_unlock(&kvm_lock);
-#endif
 }
+#endif
 
 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
 				     void *data)
@@ -6256,7 +6256,7 @@ int kvm_arch_init(void *opaque)
 #ifdef CONFIG_X86_64
 	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
 
-	if (x86_hyper_type == X86_HYPER_MS_HYPERV)
+	if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
 		set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
 #endif
 
@@ -6271,7 +6271,7 @@ int kvm_arch_init(void *opaque)
 void kvm_arch_exit(void)
 {
 #ifdef CONFIG_X86_64
-	if (x86_hyper_type == X86_HYPER_MS_HYPERV)
+	if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
 		clear_hv_tscchange_cb();
 #endif
 	kvm_lapic_exit();

From a340b3e229b24a56f1c7f5826b15a3af0f4b13e5 Mon Sep 17 00:00:00 2001
From: KarimAllah Ahmed <karahmed@amazon.de>
Date: Wed, 17 Jan 2018 19:18:56 +0100
Subject: [PATCH 230/676] kvm: Map PFN-type memory regions as writable (if
 possible)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For EPT-violations that are triggered by a read, the pages are also mapped with
write permissions (if their memory region is also writable). That would avoid
getting yet another fault on the same page when a write occurs.

This optimization only happens when you have a "struct page" backing the memory
region. So also enable it for memory regions that do not have a "struct page".

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 virt/kvm/kvm_main.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b4414842b0239..8af42eab126dc 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1428,7 +1428,8 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
 
 static int hva_to_pfn_remapped(struct vm_area_struct *vma,
 			       unsigned long addr, bool *async,
-			       bool write_fault, kvm_pfn_t *p_pfn)
+			       bool write_fault, bool *writable,
+			       kvm_pfn_t *p_pfn)
 {
 	unsigned long pfn;
 	int r;
@@ -1454,6 +1455,8 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
 
 	}
 
+	if (writable)
+		*writable = true;
 
 	/*
 	 * Get a reference here because callers of *hva_to_pfn* and
@@ -1519,7 +1522,7 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 	if (vma == NULL)
 		pfn = KVM_PFN_ERR_FAULT;
 	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
-		r = hva_to_pfn_remapped(vma, addr, async, write_fault, &pfn);
+		r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
 		if (r == -EAGAIN)
 			goto retry;
 		if (r < 0)

From e46b469278a59781f9b25ff608af84892963821b Mon Sep 17 00:00:00 2001
From: Masatake YAMATO <yamato@redhat.com>
Date: Sat, 20 Jan 2018 04:04:22 +0900
Subject: [PATCH 231/676] kvm: embed vcpu id to dentry of vcpu anon inode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All d-entries for vcpu have the same, "anon_inode:kvm-vcpu". That means
it is impossible to know the mapping between fds for vcpu and vcpu
from userland.

    # LC_ALL=C ls -l /proc/617/fd | grep vcpu
    lrwx------. 1 qemu qemu 64 Jan  7 16:50 18 -> anon_inode:kvm-vcpu
    lrwx------. 1 qemu qemu 64 Jan  7 16:50 19 -> anon_inode:kvm-vcpu

It is also impossible to know the mapping between vma for kvm_run
structure and vcpu from userland.

    # LC_ALL=C grep vcpu /proc/617/maps
    7f9d842d0000-7f9d842d3000 rw-s 00000000 00:0d 20393                      anon_inode:kvm-vcpu
    7f9d842d3000-7f9d842d6000 rw-s 00000000 00:0d 20393                      anon_inode:kvm-vcpu

This change adds vcpu id to d-entries for vcpu. With this change
you can get the following output:

    # LC_ALL=C ls -l /proc/617/fd | grep vcpu
    lrwx------. 1 qemu qemu 64 Jan  7 16:50 18 -> anon_inode:kvm-vcpu:0
    lrwx------. 1 qemu qemu 64 Jan  7 16:50 19 -> anon_inode:kvm-vcpu:1

    # LC_ALL=C grep vcpu /proc/617/maps
    7f9d842d0000-7f9d842d3000 rw-s 00000000 00:0d 20393                      anon_inode:kvm-vcpu:0
    7f9d842d3000-7f9d842d6000 rw-s 00000000 00:0d 20393                      anon_inode:kvm-vcpu:1

With the mappings known from the output, a tool like strace can report more details
of qemu-kvm process activities. Here is the strace output of my local prototype:

    # ./strace -KK -f -p 617 2>&1 | grep 'KVM_RUN\| K'
    ...
    [pid   664] ioctl(18, KVM_RUN, 0)       = 0 (KVM_EXIT_MMIO)
     K ready_for_interrupt_injection=1, if_flag=0, flags=0, cr8=0000000000000000, apic_base=0x000000fee00d00
     K phys_addr=0, len=1634035803, [33, 0, 0, 0, 0, 0, 0, 0], is_write=112
    [pid   664] ioctl(18, KVM_RUN, 0)       = 0 (KVM_EXIT_MMIO)
     K ready_for_interrupt_injection=1, if_flag=1, flags=0, cr8=0000000000000000, apic_base=0x000000fee00d00
     K phys_addr=0, len=1634035803, [33, 0, 0, 0, 0, 0, 0, 0], is_write=112
    ...

Signed-off-by: Masatake YAMATO <yamato@redhat.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 virt/kvm/kvm_main.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8af42eab126dc..8a937b7cde357 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2415,7 +2415,10 @@ static struct file_operations kvm_vcpu_fops = {
  */
 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
 {
-	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
+	char name[8 + 1 + ITOA_MAX_LEN + 1];
+
+	snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
+	return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
 }
 
 static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)

From d391f1207067268261add0485f0f34503539c5b0 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Thu, 25 Jan 2018 16:37:07 +0100
Subject: [PATCH 232/676] x86/kvm/vmx: do not use vm-exit instruction length
 for fast MMIO when running nested
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I was investigating an issue with seabios >= 1.10 which stopped working
for nested KVM on Hyper-V. The problem appears to be in
handle_ept_violation() function: when we do fast mmio we need to skip
the instruction so we do kvm_skip_emulated_instruction(). This, however,
depends on VM_EXIT_INSTRUCTION_LEN field being set correctly in VMCS.
However, this is not the case.

Intel's manual doesn't mandate VM_EXIT_INSTRUCTION_LEN to be set when
EPT MISCONFIG occurs. While on real hardware it was observed to be set,
some hypervisors follow the spec and don't set it; we end up advancing
IP with some random value.

I checked with Microsoft and they confirmed they don't fill
VM_EXIT_INSTRUCTION_LEN on EPT MISCONFIG.

Fix the issue by doing instruction skip through emulator when running
nested.

Fixes: 68c3b4d1676d870f0453c31d5a52e7e65c7448ae
Suggested-by: Radim Krčmář <rkrcmar@redhat.com>
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 16 +++++++++++++++-
 arch/x86/kvm/x86.c |  3 ++-
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1e2ca9e8662ff..438802d0b01d8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6577,7 +6577,21 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 	if (!is_guest_mode(vcpu) &&
 	    !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
 		trace_kvm_fast_mmio(gpa);
-		return kvm_skip_emulated_instruction(vcpu);
+		/*
+		 * Doing kvm_skip_emulated_instruction() depends on undefined
+		 * behavior: Intel's manual doesn't mandate
+		 * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG
+		 * occurs and while on real hardware it was observed to be set,
+		 * other hypervisors (namely Hyper-V) don't set it, we end up
+		 * advancing IP with some random value. Disable fast mmio when
+		 * running nested and keep it for real hardware in hope that
+		 * VM_EXIT_INSTRUCTION_LEN will always be set correctly.
+		 */
+		if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
+			return kvm_skip_emulated_instruction(vcpu);
+		else
+			return x86_emulate_instruction(vcpu, gpa, EMULTYPE_SKIP,
+						       NULL, 0) == EMULATE_DONE;
 	}
 
 	ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0204b2b8a293f..01571102b50c9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5773,7 +5773,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 		 * handle watchpoints yet, those would be handled in
 		 * the emulate_ops.
 		 */
-		if (kvm_vcpu_check_breakpoint(vcpu, &r))
+		if (!(emulation_type & EMULTYPE_SKIP) &&
+		    kvm_vcpu_check_breakpoint(vcpu, &r))
 			return r;
 
 		ctxt->interruptibility = 0;

From 806793f5f76eb9c000ed05e307a8f7e9450b3291 Mon Sep 17 00:00:00 2001
From: Stanislav Lanci <pixo@polepetko.eu>
Date: Mon, 29 Jan 2018 11:39:44 -0500
Subject: [PATCH 233/676] KVM: x86: AMD Processor Topology Information
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch allow to enable x86 feature TOPOEXT. This is needed to provide
information about SMT on AMD Zen CPUs to the guest.

Signed-off-by: Stanislav Lanci <pixo@polepetko.eu>
Tested-by: Nick Sarnie <commendsarnex@gmail.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Babu Moger <babu.moger@amd.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/cpuid.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index ac0041c2f5afe..20e491b94f44e 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -371,7 +371,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
 		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
 		F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
-		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
+		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
+		F(TOPOEXT);
 
 	/* cpuid 0xC0000001.edx */
 	const u32 kvm_cpuid_C000_0001_edx_x86_features =

From 87cedc6be55954c6efd6eca2e694132513f65a2a Mon Sep 17 00:00:00 2001
From: "Longpeng(Mike)" <longpeng2@huawei.com>
Date: Fri, 26 Jan 2018 17:34:08 +0800
Subject: [PATCH 234/676] kvm: x86: remove efer_reload entry in kvm_vcpu_stat
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The efer_reload is never used since
commit 26bb0981b3ff ("KVM: VMX: Use shared msr infrastructure"),
so remove it.

Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 -
 arch/x86/kvm/x86.c              | 1 -
 2 files changed, 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ea7e40e9c1f0f..dd6f57a54a262 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -895,7 +895,6 @@ struct kvm_vcpu_stat {
 	u64 request_irq_exits;
 	u64 irq_exits;
 	u64 host_state_reload;
-	u64 efer_reload;
 	u64 fpu_reload;
 	u64 insn_emulation;
 	u64 insn_emulation_fail;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 01571102b50c9..c13cd14c47809 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -177,7 +177,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "request_irq", VCPU_STAT(request_irq_exits) },
 	{ "irq_exits", VCPU_STAT(irq_exits) },
 	{ "host_state_reload", VCPU_STAT(host_state_reload) },
-	{ "efer_reload", VCPU_STAT(efer_reload) },
 	{ "fpu_reload", VCPU_STAT(fpu_reload) },
 	{ "insn_emulation", VCPU_STAT(insn_emulation) },
 	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },

From 36ee41d161c67a6fcf696d4817a0da31f778938c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 30 Jan 2018 10:51:32 +1100
Subject: [PATCH 235/676] KVM: PPC: Book3S HV: Drop locks before reading guest
 memory

Running with CONFIG_DEBUG_ATOMIC_SLEEP reveals that HV KVM tries to
read guest memory, in order to emulate guest instructions, while
preempt is disabled and a vcore lock is held.  This occurs in
kvmppc_handle_exit_hv(), called from post_guest_process(), when
emulating guest doorbell instructions on POWER9 systems, and also
when checking whether we have hit a hypervisor breakpoint.
Reading guest memory can cause a page fault and thus cause the
task to sleep, so we need to avoid reading guest memory while
holding a spinlock or when preempt is disabled.

To fix this, we move the preempt_enable() in kvmppc_run_core() to
before the loop that calls post_guest_process() for each vcore that
has just run, and we drop and re-take the vcore lock around the calls
to kvmppc_emulate_debug_inst() and kvmppc_emulate_doorbell_instr().

Dropping the lock is safe with respect to the iteration over the
runnable vcpus in post_guest_process(); for_each_runnable_thread
is actually safe to use locklessly.  It is possible for a vcpu
to become runnable and add itself to the runnable_threads array
(code near the beginning of kvmppc_run_vcpu()) and then get included
in the iteration in post_guest_process despite the fact that it
has not just run.  This is benign because vcpu->arch.trap and
vcpu->arch.ceded will be zero.

Cc: stable@vger.kernel.org # v4.13+
Fixes: 579006944e0d ("KVM: PPC: Book3S HV: Virtualize doorbell facility on POWER9")
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_hv.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e5f81fc108e09..aa6130b56b5ee 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1008,8 +1008,6 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm_vcpu *tvcpu;
 
-	if (!cpu_has_feature(CPU_FTR_ARCH_300))
-		return EMULATE_FAIL;
 	if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst) != EMULATE_DONE)
 		return RESUME_GUEST;
 	if (get_op(inst) != 31)
@@ -1059,6 +1057,7 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
 	return RESUME_GUEST;
 }
 
+/* Called with vcpu->arch.vcore->lock held */
 static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				 struct task_struct *tsk)
 {
@@ -1179,7 +1178,10 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				swab32(vcpu->arch.emul_inst) :
 				vcpu->arch.emul_inst;
 		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
+			/* Need vcore unlocked to call kvmppc_get_last_inst */
+			spin_unlock(&vcpu->arch.vcore->lock);
 			r = kvmppc_emulate_debug_inst(run, vcpu);
+			spin_lock(&vcpu->arch.vcore->lock);
 		} else {
 			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
 			r = RESUME_GUEST;
@@ -1194,8 +1196,13 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	 */
 	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
 		r = EMULATE_FAIL;
-		if ((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG)
+		if (((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) &&
+		    cpu_has_feature(CPU_FTR_ARCH_300)) {
+			/* Need vcore unlocked to call kvmppc_get_last_inst */
+			spin_unlock(&vcpu->arch.vcore->lock);
 			r = kvmppc_emulate_doorbell_instr(vcpu);
+			spin_lock(&vcpu->arch.vcore->lock);
+		}
 		if (r == EMULATE_FAIL) {
 			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
 			r = RESUME_GUEST;
@@ -2946,13 +2953,14 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	/* make sure updates to secondary vcpu structs are visible now */
 	smp_mb();
 
+	preempt_enable();
+
 	for (sub = 0; sub < core_info.n_subcores; ++sub) {
 		pvc = core_info.vc[sub];
 		post_guest_process(pvc, pvc == vc);
 	}
 
 	spin_lock(&vc->lock);
-	preempt_enable();
 
  out:
 	vc->vcore_state = VCORE_INACTIVE;

From 07ae5389e98c53bb9e9f308fce9c903bc3ee7720 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Wed, 31 Jan 2018 22:24:58 +0100
Subject: [PATCH 236/676] KVM: PPC: Book3S PR: Fix svcpu copying with
 preemption enabled

When copying between the vcpu and svcpu, we may get scheduled away onto
a different host CPU which in turn means our svcpu pointer may change.

That means we need to atomically copy to and from the svcpu with preemption
disabled, so that all code around it always sees a coherent state.

Reported-by: Simon Guo <wei.guo.simon@gmail.com>
Fixes: 3d3319b45eea ("KVM: PPC: Book3S: PR: Enable interrupts earlier")
Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/kvm_book3s.h |  6 ++----
 arch/powerpc/kvm/book3s_interrupts.S  |  4 +---
 arch/powerpc/kvm/book3s_pr.c          | 20 +++++++++-----------
 3 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 9a667007bff81..376ae803b69c6 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -249,10 +249,8 @@ extern int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd);
 extern void kvmppc_pr_init_default_hcalls(struct kvm *kvm);
 extern int kvmppc_hcall_impl_pr(unsigned long cmd);
 extern int kvmppc_hcall_impl_hv_realmode(unsigned long cmd);
-extern void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu,
-				 struct kvm_vcpu *vcpu);
-extern void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu,
-				   struct kvmppc_book3s_shadow_vcpu *svcpu);
+extern void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu);
+extern void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu);
 extern int kvm_irq_bypass;
 
 static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S
index 901e6fe00c39c..c18e845019ec5 100644
--- a/arch/powerpc/kvm/book3s_interrupts.S
+++ b/arch/powerpc/kvm/book3s_interrupts.S
@@ -96,7 +96,7 @@ kvm_start_entry:
 
 kvm_start_lightweight:
 	/* Copy registers into shadow vcpu so we can access them in real mode */
-	GET_SHADOW_VCPU(r3)
+	mr	r3, r4
 	bl	FUNC(kvmppc_copy_to_svcpu)
 	nop
 	REST_GPR(4, r1)
@@ -165,9 +165,7 @@ after_sprg3_load:
 	stw	r12, VCPU_TRAP(r3)
 
 	/* Transfer reg values from shadow vcpu back to vcpu struct */
-	/* On 64-bit, interrupts are still off at this point */
 
-	GET_SHADOW_VCPU(r4)
 	bl	FUNC(kvmppc_copy_from_svcpu)
 	nop
 
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index d0dc8624198f8..d6df74a2f7449 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -120,7 +120,7 @@ static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu)
 #ifdef CONFIG_PPC_BOOK3S_64
 	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
 	if (svcpu->in_use) {
-		kvmppc_copy_from_svcpu(vcpu, svcpu);
+		kvmppc_copy_from_svcpu(vcpu);
 	}
 	memcpy(to_book3s(vcpu)->slb_shadow, svcpu->slb, sizeof(svcpu->slb));
 	to_book3s(vcpu)->slb_shadow_max = svcpu->slb_max;
@@ -142,9 +142,10 @@ static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu)
 }
 
 /* Copy data needed by real-mode code from vcpu to shadow vcpu */
-void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu,
-			  struct kvm_vcpu *vcpu)
+void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu)
 {
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+
 	svcpu->gpr[0] = vcpu->arch.gpr[0];
 	svcpu->gpr[1] = vcpu->arch.gpr[1];
 	svcpu->gpr[2] = vcpu->arch.gpr[2];
@@ -176,17 +177,14 @@ void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu,
 	if (cpu_has_feature(CPU_FTR_ARCH_207S))
 		vcpu->arch.entry_ic = mfspr(SPRN_IC);
 	svcpu->in_use = true;
+
+	svcpu_put(svcpu);
 }
 
 /* Copy data touched by real-mode code from shadow vcpu back to vcpu */
-void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu,
-			    struct kvmppc_book3s_shadow_vcpu *svcpu)
+void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu)
 {
-	/*
-	 * vcpu_put would just call us again because in_use hasn't
-	 * been updated yet.
-	 */
-	preempt_disable();
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
 
 	/*
 	 * Maybe we were already preempted and synced the svcpu from
@@ -232,7 +230,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu,
 	svcpu->in_use = false;
 
 out:
-	preempt_enable();
+	svcpu_put(svcpu);
 }
 
 static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu)

From d71ef28636e435079028c1ed255fa92d8ff6ed76 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 27 Jan 2018 16:02:03 +0100
Subject: [PATCH 237/676] mt76: implement AP_LINK_PS

With software A-MPDU reordering in place, frames that notify mac80211 of
powersave changes are reordered as well, which can cause connection
stalls. Fix this by implementing powersave state processing in the
driver.

Fixes: aee5b8cf2477 ("mt76: implement A-MPDU rx reordering in the driver code")
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mac80211.c | 52 ++++++++++++++++++-
 drivers/net/wireless/mediatek/mt76/mt76.h     | 10 ++++
 drivers/net/wireless/mediatek/mt76/mt76x2.h   |  2 +
 .../net/wireless/mediatek/mt76/mt76x2_init.c  |  1 +
 .../net/wireless/mediatek/mt76/mt76x2_main.c  | 28 +++++-----
 5 files changed, 78 insertions(+), 15 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index 5fcb2deb89a24..85f8d324ebf82 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -276,6 +276,7 @@ int mt76_register_device(struct mt76_dev *dev, bool vht,
 	ieee80211_hw_set(hw, TX_AMSDU);
 	ieee80211_hw_set(hw, TX_FRAG_LIST);
 	ieee80211_hw_set(hw, MFP_CAPABLE);
+	ieee80211_hw_set(hw, AP_LINK_PS);
 
 	wiphy->flags |= WIPHY_FLAG_IBSS_RSN;
 
@@ -470,6 +471,53 @@ mt76_check_ccmp_pn(struct sk_buff *skb)
 	return 0;
 }
 
+static void
+mt76_check_ps(struct mt76_dev *dev, struct sk_buff *skb)
+{
+	struct mt76_rx_status *status = (struct mt76_rx_status *) skb->cb;
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+	struct ieee80211_sta *sta;
+	struct mt76_wcid *wcid = status->wcid;
+	bool ps;
+
+	if (!wcid || !wcid->sta)
+		return;
+
+	sta = container_of((void *) wcid, struct ieee80211_sta, drv_priv);
+
+	if (!test_bit(MT_WCID_FLAG_CHECK_PS, &wcid->flags))
+		return;
+
+	if (ieee80211_is_pspoll(hdr->frame_control)) {
+		ieee80211_sta_pspoll(sta);
+		return;
+	}
+
+	if (ieee80211_has_morefrags(hdr->frame_control) ||
+		!(ieee80211_is_mgmt(hdr->frame_control) ||
+		  ieee80211_is_data(hdr->frame_control)))
+		return;
+
+	ps = ieee80211_has_pm(hdr->frame_control);
+
+	if (ps && (ieee80211_is_data_qos(hdr->frame_control) ||
+		   ieee80211_is_qos_nullfunc(hdr->frame_control)))
+		ieee80211_sta_uapsd_trigger(sta, status->tid);
+
+	if (!!test_bit(MT_WCID_FLAG_PS, &wcid->flags) == ps)
+		return;
+
+	if (ps) {
+		set_bit(MT_WCID_FLAG_PS, &wcid->flags);
+		mt76_stop_tx_queues(dev, sta, true);
+	} else {
+		clear_bit(MT_WCID_FLAG_PS, &wcid->flags);
+	}
+
+	ieee80211_sta_ps_transition(sta, ps);
+	dev->drv->sta_ps(dev, sta, ps);
+}
+
 void mt76_rx_complete(struct mt76_dev *dev, struct sk_buff_head *frames,
 		      int queue)
 {
@@ -498,8 +546,10 @@ void mt76_rx_poll_complete(struct mt76_dev *dev, enum mt76_rxq_id q)
 
 	__skb_queue_head_init(&frames);
 
-	while ((skb = __skb_dequeue(&dev->rx_skb[q])) != NULL)
+	while ((skb = __skb_dequeue(&dev->rx_skb[q])) != NULL) {
+		mt76_check_ps(dev, skb);
 		mt76_rx_aggr_reorder(skb, &frames);
+	}
 
 	mt76_rx_complete(dev, &frames, q);
 }
diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
index 129015c9d1169..d2ce15093eddd 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76.h
@@ -121,11 +121,18 @@ struct mt76_queue_ops {
 	void (*kick)(struct mt76_dev *dev, struct mt76_queue *q);
 };
 
+enum mt76_wcid_flags {
+	MT_WCID_FLAG_CHECK_PS,
+	MT_WCID_FLAG_PS,
+};
+
 struct mt76_wcid {
 	struct mt76_rx_tid __rcu *aggr[IEEE80211_NUM_TIDS];
 
 	struct work_struct aggr_work;
 
+	unsigned long flags;
+
 	u8 idx;
 	u8 hw_key_idx;
 
@@ -206,6 +213,9 @@ struct mt76_driver_ops {
 		       struct sk_buff *skb);
 
 	void (*rx_poll_complete)(struct mt76_dev *dev, enum mt76_rxq_id q);
+
+	void (*sta_ps)(struct mt76_dev *dev, struct ieee80211_sta *sta,
+		       bool ps);
 };
 
 struct mt76_channel_state {
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2.h b/drivers/net/wireless/mediatek/mt76/mt76x2.h
index 17df17afd9bf9..e62131b881020 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2.h
@@ -218,6 +218,8 @@ void mt76x2_rx_poll_complete(struct mt76_dev *mdev, enum mt76_rxq_id q);
 void mt76x2_queue_rx_skb(struct mt76_dev *mdev, enum mt76_rxq_id q,
 			 struct sk_buff *skb);
 
+void mt76x2_sta_ps(struct mt76_dev *dev, struct ieee80211_sta *sta, bool ps);
+
 void mt76x2_update_channel(struct mt76_dev *mdev);
 
 s8 mt76x2_tx_get_max_txpwr_adj(struct mt76x2_dev *dev,
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_init.c b/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
index 1b00ae4465a27..9dbf94947324e 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_init.c
@@ -630,6 +630,7 @@ struct mt76x2_dev *mt76x2_alloc_device(struct device *pdev)
 		.tx_complete_skb = mt76x2_tx_complete_skb,
 		.rx_skb = mt76x2_queue_rx_skb,
 		.rx_poll_complete = mt76x2_rx_poll_complete,
+		.sta_ps = mt76x2_sta_ps,
 	};
 	struct ieee80211_hw *hw;
 	struct mt76x2_dev *dev;
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_main.c b/drivers/net/wireless/mediatek/mt76/mt76x2_main.c
index bf26284b9989d..205043b470b20 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_main.c
@@ -282,6 +282,9 @@ mt76x2_sta_add(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	for (i = 0; i < ARRAY_SIZE(sta->txq); i++)
 		mt76x2_txq_init(dev, sta->txq[i]);
 
+	if (vif->type == NL80211_IFTYPE_AP)
+		set_bit(MT_WCID_FLAG_CHECK_PS, &msta->wcid.flags);
+
 	rcu_assign_pointer(dev->wcid[idx], &msta->wcid);
 
 out:
@@ -311,23 +314,14 @@ mt76x2_sta_remove(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	return 0;
 }
 
-static void
-mt76x2_sta_notify(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
-		  enum sta_notify_cmd cmd, struct ieee80211_sta *sta)
+void
+mt76x2_sta_ps(struct mt76_dev *mdev, struct ieee80211_sta *sta, bool ps)
 {
 	struct mt76x2_sta *msta = (struct mt76x2_sta *) sta->drv_priv;
-	struct mt76x2_dev *dev = hw->priv;
+	struct mt76x2_dev *dev = container_of(mdev, struct mt76x2_dev, mt76);
 	int idx = msta->wcid.idx;
 
-	switch (cmd) {
-	case STA_NOTIFY_SLEEP:
-		mt76x2_mac_wcid_set_drop(dev, idx, true);
-		mt76_stop_tx_queues(&dev->mt76, sta, true);
-		break;
-	case STA_NOTIFY_AWAKE:
-		mt76x2_mac_wcid_set_drop(dev, idx, false);
-		break;
-	}
+	mt76x2_mac_wcid_set_drop(dev, idx, ps);
 }
 
 static int
@@ -549,6 +543,12 @@ static void mt76x2_set_coverage_class(struct ieee80211_hw *hw,
 	mutex_unlock(&dev->mutex);
 }
 
+static int
+mt76x2_set_tim(struct ieee80211_hw *hw, struct ieee80211_sta *sta, bool set)
+{
+	return 0;
+}
+
 const struct ieee80211_ops mt76x2_ops = {
 	.tx = mt76x2_tx,
 	.start = mt76x2_start,
@@ -560,7 +560,6 @@ const struct ieee80211_ops mt76x2_ops = {
 	.bss_info_changed = mt76x2_bss_info_changed,
 	.sta_add = mt76x2_sta_add,
 	.sta_remove = mt76x2_sta_remove,
-	.sta_notify = mt76x2_sta_notify,
 	.set_key = mt76x2_set_key,
 	.conf_tx = mt76x2_conf_tx,
 	.sw_scan_start = mt76x2_sw_scan,
@@ -573,5 +572,6 @@ const struct ieee80211_ops mt76x2_ops = {
 	.release_buffered_frames = mt76_release_buffered_frames,
 	.set_coverage_class = mt76x2_set_coverage_class,
 	.get_survey = mt76_get_survey,
+	.set_tim = mt76x2_set_tim,
 };
 

From 17cf68b702a60aee61432d59098b1ba6ceab2f98 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 27 Jan 2018 16:02:04 +0100
Subject: [PATCH 238/676] mt76: implement processing of BlockAckReq frames

Avoids timeouts on reordered A-MPDU rx frames

Fixes: aee5b8cf2477 ("mt76: implement A-MPDU rx reordering in the driver code")
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/agg-rx.c | 34 ++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/agg-rx.c b/drivers/net/wireless/mediatek/mt76/agg-rx.c
index 8027bb7c03c22..e9784b50e2af3 100644
--- a/drivers/net/wireless/mediatek/mt76/agg-rx.c
+++ b/drivers/net/wireless/mediatek/mt76/agg-rx.c
@@ -113,6 +113,33 @@ mt76_rx_aggr_reorder_work(struct work_struct *work)
 	local_bh_enable();
 }
 
+static void
+mt76_rx_aggr_check_ctl(struct sk_buff *skb, struct sk_buff_head *frames)
+{
+	struct mt76_rx_status *status = (struct mt76_rx_status *) skb->cb;
+	struct ieee80211_bar *bar = (struct ieee80211_bar *) skb->data;
+	struct mt76_wcid *wcid = status->wcid;
+	struct mt76_rx_tid *tid;
+	u16 seqno;
+
+	if (!ieee80211_is_ctl(bar->frame_control))
+		return;
+
+	if (!ieee80211_is_back_req(bar->frame_control))
+		return;
+
+	status->tid = le16_to_cpu(bar->control) >> 12;
+	seqno = le16_to_cpu(bar->start_seq_num) >> 4;
+	tid = rcu_dereference(wcid->aggr[status->tid]);
+	if (!tid)
+		return;
+
+	spin_lock_bh(&tid->lock);
+	mt76_rx_aggr_release_frames(tid, frames, seqno);
+	mt76_rx_aggr_release_head(tid, frames);
+	spin_unlock_bh(&tid->lock);
+}
+
 void mt76_rx_aggr_reorder(struct sk_buff *skb, struct sk_buff_head *frames)
 {
 	struct mt76_rx_status *status = (struct mt76_rx_status *) skb->cb;
@@ -126,9 +153,14 @@ void mt76_rx_aggr_reorder(struct sk_buff *skb, struct sk_buff_head *frames)
 	__skb_queue_tail(frames, skb);
 
 	sta = wcid_to_sta(wcid);
-	if (!sta || !status->aggr)
+	if (!sta)
 		return;
 
+	if (!status->aggr) {
+		mt76_rx_aggr_check_ctl(skb, frames);
+		return;
+	}
+
 	tid = rcu_dereference(wcid->aggr[status->tid]);
 	if (!tid)
 		return;

From fb208dc73ff1667191ba26d87610bba983ea1535 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 27 Jan 2018 16:02:05 +0100
Subject: [PATCH 239/676] mt76: avoid re-queueing A-MPDU rx reorder work if no
 frames are pending

Fixes: aee5b8cf2477 ("mt76: implement A-MPDU rx reordering in the driver code")
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/agg-rx.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/agg-rx.c b/drivers/net/wireless/mediatek/mt76/agg-rx.c
index e9784b50e2af3..fcb208d1f2762 100644
--- a/drivers/net/wireless/mediatek/mt76/agg-rx.c
+++ b/drivers/net/wireless/mediatek/mt76/agg-rx.c
@@ -98,6 +98,7 @@ mt76_rx_aggr_reorder_work(struct work_struct *work)
 					       reorder_work.work);
 	struct mt76_dev *dev = tid->dev;
 	struct sk_buff_head frames;
+	int nframes;
 
 	__skb_queue_head_init(&frames);
 
@@ -105,9 +106,12 @@ mt76_rx_aggr_reorder_work(struct work_struct *work)
 
 	spin_lock(&tid->lock);
 	mt76_rx_aggr_check_release(tid, &frames);
+	nframes = tid->nframes;
 	spin_unlock(&tid->lock);
 
-	ieee80211_queue_delayed_work(tid->dev->hw, &tid->reorder_work, REORDER_TIMEOUT);
+	if (nframes)
+		ieee80211_queue_delayed_work(tid->dev->hw, &tid->reorder_work,
+					     REORDER_TIMEOUT);
 	mt76_rx_complete(dev, &frames, -1);
 
 	local_bh_enable();

From cbbde7e8d98542dc1777ac42ec2a08875aea26ee Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 27 Jan 2018 16:02:06 +0100
Subject: [PATCH 240/676] mt76: do not set status->aggr for NULL data frames

Avoids data connection stalls when the client toggles powersave mode

Fixes: aee5b8cf2477 ("mt76: implement A-MPDU rx reordering in the driver code")
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/mediatek/mt76/mt76x2_mac.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2_mac.c b/drivers/net/wireless/mediatek/mt76/mt76x2_mac.c
index 6c30b5eaa9ca5..7ea3d841918e9 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2_mac.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2_mac.c
@@ -341,7 +341,7 @@ int mt76x2_mac_process_rx(struct mt76x2_dev *dev, struct sk_buff *skb,
 
 	mt76x2_remove_hdr_pad(skb, pad_len);
 
-	if (rxinfo & MT_RXINFO_BA)
+	if ((rxinfo & MT_RXINFO_BA) && !(rxinfo & MT_RXINFO_NULL))
 		status->aggr = true;
 
 	if (WARN_ON_ONCE(len > skb->len))

From 63e2480c86bd610c1d52fdbb76dc6ebea08e518d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 1 Feb 2018 10:02:53 -0500
Subject: [PATCH 241/676] smc: missing poll annotations

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/smc/af_smc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 3583c8ab1baee..ba4b84debc5a0 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1141,7 +1141,7 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 static __poll_t smc_accept_poll(struct sock *parent)
 {
 	struct smc_sock *isk = smc_sk(parent);
-	int mask = 0;
+	__poll_t mask = 0;
 
 	spin_lock(&isk->accept_q_lock);
 	if (!list_empty(&isk->accept_q))

From 0148a635ce40d65653bfda469fae8e4b8360baf3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 1 Feb 2018 10:07:32 -0500
Subject: [PATCH 242/676] xen: fix poll misannotation

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/xen/pvcalls-front.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/xen/pvcalls-front.h b/drivers/xen/pvcalls-front.h
index 3332978f4fcd3..f694ad77379fc 100644
--- a/drivers/xen/pvcalls-front.h
+++ b/drivers/xen/pvcalls-front.h
@@ -20,7 +20,7 @@ int pvcalls_front_recvmsg(struct socket *sock,
 			  struct msghdr *msg,
 			  size_t len,
 			  int flags);
-unsigned int pvcalls_front_poll(struct file *file,
+__poll_t pvcalls_front_poll(struct file *file,
 				struct socket *sock,
 				poll_table *wait);
 int pvcalls_front_release(struct socket *sock);

From cfe39442ab8ce9670b4ddd04291b8cddb9cb1129 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 1 Feb 2018 12:14:57 -0500
Subject: [PATCH 243/676] use linux/poll.h instead of asm/poll.h

The only place that has any business including asm/poll.h
is linux/poll.h.  Fortunately, asm/poll.h had only been
included in 3 places beyond that one, and all of them
are trivial to switch to using linux/poll.h.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/coda/psdev.c   | 2 +-
 fs/debugfs/file.c | 2 +-
 fs/fcntl.c        | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 49d3c6fda89a0..80b9b84391a9d 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -39,7 +39,7 @@
 #include <linux/device.h>
 #include <linux/pid_namespace.h>
 #include <asm/io.h>
-#include <asm/poll.h>
+#include <linux/poll.h>
 #include <linux/uaccess.h>
 
 #include <linux/coda.h>
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 6fdbf21be3184..20bb73a931dd2 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -18,7 +18,7 @@
 #include <linux/slab.h>
 #include <linux/atomic.h>
 #include <linux/device.h>
-#include <asm/poll.h>
+#include <linux/poll.h>
 
 #include "internal.h"
 
diff --git a/fs/fcntl.c b/fs/fcntl.c
index c7b9e09481078..ea3629c15c442 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -26,7 +26,7 @@
 #include <linux/shmem_fs.h>
 #include <linux/compat.h>
 
-#include <asm/poll.h>
+#include <linux/poll.h>
 #include <asm/siginfo.h>
 #include <linux/uaccess.h>
 

From 65aaf87b3aa2d049c6b9fd85221858a895df3393 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 1 Feb 2018 11:00:50 -0500
Subject: [PATCH 244/676] add EPOLLNVAL, annotate EPOLL... and
 event_poll->event

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/uapi/linux/eventpoll.h | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index 63e21be30f150..bf48e71f26341 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -28,20 +28,21 @@
 #define EPOLL_CTL_MOD 3
 
 /* Epoll event masks */
-#define EPOLLIN		0x00000001
-#define EPOLLPRI	0x00000002
-#define EPOLLOUT	0x00000004
-#define EPOLLERR	0x00000008
-#define EPOLLHUP	0x00000010
-#define EPOLLRDNORM	0x00000040
-#define EPOLLRDBAND	0x00000080
-#define EPOLLWRNORM	0x00000100
-#define EPOLLWRBAND	0x00000200
-#define EPOLLMSG	0x00000400
-#define EPOLLRDHUP	0x00002000
+#define EPOLLIN		(__force __poll_t)0x00000001
+#define EPOLLPRI	(__force __poll_t)0x00000002
+#define EPOLLOUT	(__force __poll_t)0x00000004
+#define EPOLLERR	(__force __poll_t)0x00000008
+#define EPOLLHUP	(__force __poll_t)0x00000010
+#define EPOLLNVAL	(__force __poll_t)0x00000020
+#define EPOLLRDNORM	(__force __poll_t)0x00000040
+#define EPOLLRDBAND	(__force __poll_t)0x00000080
+#define EPOLLWRNORM	(__force __poll_t)0x00000100
+#define EPOLLWRBAND	(__force __poll_t)0x00000200
+#define EPOLLMSG	(__force __poll_t)0x00000400
+#define EPOLLRDHUP	(__force __poll_t)0x00002000
 
 /* Set exclusive wakeup mode for the target file descriptor */
-#define EPOLLEXCLUSIVE (1U << 28)
+#define EPOLLEXCLUSIVE (__force __poll_t)(1U << 28)
 
 /*
  * Request the handling of system wakeup events so as to prevent system suspends
@@ -53,13 +54,13 @@
  *
  * Requires CAP_BLOCK_SUSPEND
  */
-#define EPOLLWAKEUP (1U << 29)
+#define EPOLLWAKEUP (__force __poll_t)(1U << 29)
 
 /* Set the One Shot behaviour for the target file descriptor */
-#define EPOLLONESHOT (1U << 30)
+#define EPOLLONESHOT (__force __poll_t)(1U << 30)
 
 /* Set the Edge Triggered behaviour for the target file descriptor */
-#define EPOLLET (1U << 31)
+#define EPOLLET (__force __poll_t)(1U << 31)
 
 /* 
  * On x86-64 make the 64bit structure have the same alignment as the
@@ -74,7 +75,7 @@
 #endif
 
 struct epoll_event {
-	__u32 events;
+	__poll_t events;
 	__u64 data;
 } EPOLL_PACKED;
 

From e78cd95bebd94ede71285722f5bf2464bfa4c599 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 1 Feb 2018 11:01:35 -0500
Subject: [PATCH 245/676] preparation to switching ->poll() to returning
 EPOLL...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/poll.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/poll.h b/include/linux/poll.h
index 04781a7533262..d23104b329317 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -11,6 +11,7 @@
 #include <linux/sysctl.h>
 #include <linux/uaccess.h>
 #include <uapi/linux/poll.h>
+#include <uapi/linux/eventpoll.h>
 
 extern struct ctl_table epoll_table[]; /* for sysctl */
 /* ~832 bytes of stack space used max in sys_select/sys_poll before allocating
@@ -22,7 +23,7 @@ extern struct ctl_table epoll_table[]; /* for sysctl */
 #define WQUEUES_STACK_ALLOC	(MAX_STACK_ALLOC - FRONTEND_STACK_ALLOC)
 #define N_INLINE_POLL_ENTRIES	(WQUEUES_STACK_ALLOC / sizeof(struct poll_table_entry))
 
-#define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
+#define DEFAULT_POLLMASK (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM)
 
 struct poll_table_struct;
 

From d7ebbe46f445d8c64fa37812818597dc466d74bb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 1 Feb 2018 15:21:13 -0500
Subject: [PATCH 246/676] ep_send_events_proc(): return result via esed->res

preparations for not mixing __poll_t and int in ep_scan_ready_list()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/eventpoll.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 42e35a6977c9e..87452875eaec5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -260,6 +260,7 @@ struct ep_pqueue {
 struct ep_send_events_data {
 	int maxevents;
 	struct epoll_event __user *events;
+	int res;
 };
 
 /*
@@ -1616,7 +1617,6 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 			       void *priv)
 {
 	struct ep_send_events_data *esed = priv;
-	int eventcnt;
 	unsigned int revents;
 	struct epitem *epi;
 	struct epoll_event __user *uevent;
@@ -1630,8 +1630,8 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 	 * Items cannot vanish during the loop because ep_scan_ready_list() is
 	 * holding "mtx" during this call.
 	 */
-	for (eventcnt = 0, uevent = esed->events;
-	     !list_empty(head) && eventcnt < esed->maxevents;) {
+	for (esed->res = 0, uevent = esed->events;
+	     !list_empty(head) && esed->res < esed->maxevents;) {
 		epi = list_first_entry(head, struct epitem, rdllink);
 
 		/*
@@ -1665,9 +1665,11 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 			    __put_user(epi->event.data, &uevent->data)) {
 				list_add(&epi->rdllink, head);
 				ep_pm_stay_awake(epi);
-				return eventcnt ? eventcnt : -EFAULT;
+				if (!esed->res)
+					esed->res = -EFAULT;
+				return 0;
 			}
-			eventcnt++;
+			esed->res++;
 			uevent++;
 			if (epi->event.events & EPOLLONESHOT)
 				epi->event.events &= EP_PRIVATE_BITS;
@@ -1689,7 +1691,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 		}
 	}
 
-	return eventcnt;
+	return 0;
 }
 
 static int ep_send_events(struct eventpoll *ep,
@@ -1700,7 +1702,8 @@ static int ep_send_events(struct eventpoll *ep,
 	esed.maxevents = maxevents;
 	esed.events = events;
 
-	return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
+	ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
+	return esed.res;
 }
 
 static inline struct timespec64 ep_set_mstimeout(long ms)

From d85e2aa2e34dac793e70b900d865f48c69ecbc27 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 1 Feb 2018 15:24:58 -0500
Subject: [PATCH 247/676] annotate ep_scan_ready_list()

make it always return __poll_t and have its callbacks do the same

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/eventpoll.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 87452875eaec5..d1a490c7e6c3c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -661,12 +661,13 @@ static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
  *
  * Returns: The same integer error code returned by the @sproc callback.
  */
-static int ep_scan_ready_list(struct eventpoll *ep,
-			      int (*sproc)(struct eventpoll *,
+static __poll_t ep_scan_ready_list(struct eventpoll *ep,
+			      __poll_t (*sproc)(struct eventpoll *,
 					   struct list_head *, void *),
 			      void *priv, int depth, bool ep_locked)
 {
-	int error, pwake = 0;
+	__poll_t res;
+	int pwake = 0;
 	unsigned long flags;
 	struct epitem *epi, *nepi;
 	LIST_HEAD(txlist);
@@ -695,7 +696,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 	/*
 	 * Now call the callback function.
 	 */
-	error = (*sproc)(ep, &txlist, priv);
+	res = (*sproc)(ep, &txlist, priv);
 
 	spin_lock_irqsave(&ep->lock, flags);
 	/*
@@ -748,7 +749,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 	if (pwake)
 		ep_poll_safewake(&ep->poll_wait);
 
-	return error;
+	return res;
 }
 
 static void epi_rcu_free(struct rcu_head *head)
@@ -865,7 +866,7 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
+static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 			       void *priv);
 static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
 				 poll_table *pt);
@@ -875,7 +876,7 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
  * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
  * is correctly annotated.
  */
-static unsigned int ep_item_poll(const struct epitem *epi, poll_table *pt,
+static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
 				 int depth)
 {
 	struct eventpoll *ep;
@@ -895,7 +896,7 @@ static unsigned int ep_item_poll(const struct epitem *epi, poll_table *pt,
 				  locked) & epi->event.events;
 }
 
-static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
+static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 			       void *priv)
 {
 	struct epitem *epi, *tmp;
@@ -1415,7 +1416,8 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi)
 static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 		     struct file *tfile, int fd, int full_check)
 {
-	int error, revents, pwake = 0;
+	int error, pwake = 0;
+	__poll_t revents;
 	unsigned long flags;
 	long user_watches;
 	struct epitem *epi;
@@ -1613,11 +1615,11 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
 	return 0;
 }
 
-static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
+static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 			       void *priv)
 {
 	struct ep_send_events_data *esed = priv;
-	unsigned int revents;
+	__poll_t revents;
 	struct epitem *epi;
 	struct epoll_event __user *uevent;
 	struct wakeup_source *ws;

From 366b77ae43c5a3bf1a367f15ec8bc16e05035f14 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Tue, 23 Jan 2018 13:58:05 +0100
Subject: [PATCH 248/676] s390/eadm: fix CONFIG_BLOCK include dependency

Commit 2a842acab109 ("block: introduce new block status code type")
added blk_status_t usage to the eadm subchannel driver. However
blk_status_t is unknown when included via <linux/blkdev.h> for CONFIG_BLOCK=n.

Only include <linux/blk_types.h> since this is the only dependency eadm has.

This fixes build failures like below:
In file included from drivers/s390/cio/eadm_sch.c:24:0:
./arch/s390/include/asm/eadm.h:111:4: error: unknown type name 'blk_status_t'; did you mean 'si_status'?
    blk_status_t error);

Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/eadm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/eadm.h b/arch/s390/include/asm/eadm.h
index eb5323161f11e..bb63b2afdf6fc 100644
--- a/arch/s390/include/asm/eadm.h
+++ b/arch/s390/include/asm/eadm.h
@@ -4,7 +4,7 @@
 
 #include <linux/types.h>
 #include <linux/device.h>
-#include <linux/blkdev.h>
+#include <linux/blk_types.h>
 
 struct arqb {
 	u64 data;

From f3ea8419b9f64a27c279bcd38afc920b0fcaef1e Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cohuck@redhat.com>
Date: Tue, 23 Jan 2018 10:46:33 +0100
Subject: [PATCH 249/676] s390/cmf: fix kerneldoc

Make sure we use proper Return sections, and make the output
for cmf_enable() less odd.

Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/cio/cmf.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c
index 5e495c62cfa77..8af4948dae806 100644
--- a/drivers/s390/cio/cmf.c
+++ b/drivers/s390/cio/cmf.c
@@ -1118,9 +1118,10 @@ int ccw_set_cmf(struct ccw_device *cdev, int enable)
  * enable_cmf() - switch on the channel measurement for a specific device
  *  @cdev:	The ccw device to be enabled
  *
- *  Returns %0 for success or a negative error value.
- *  Note: If this is called on a device for which channel measurement is already
- *	  enabled a reset of the measurement data is triggered.
+ *  Enable channel measurements for @cdev. If this is called on a device
+ *  for which channel measurement is already enabled a reset of the
+ *  measurement data is triggered.
+ *  Returns: %0 for success or a negative error value.
  *  Context:
  *    non-atomic
  */
@@ -1160,7 +1161,7 @@ int enable_cmf(struct ccw_device *cdev)
  * __disable_cmf() - switch off the channel measurement for a specific device
  *  @cdev:	The ccw device to be disabled
  *
- *  Returns %0 for success or a negative error value.
+ *  Returns: %0 for success or a negative error value.
  *
  *  Context:
  *    non-atomic, device_lock() held.
@@ -1184,7 +1185,7 @@ int __disable_cmf(struct ccw_device *cdev)
  * disable_cmf() - switch off the channel measurement for a specific device
  *  @cdev:	The ccw device to be disabled
  *
- *  Returns %0 for success or a negative error value.
+ *  Returns: %0 for success or a negative error value.
  *
  *  Context:
  *    non-atomic
@@ -1205,7 +1206,7 @@ int disable_cmf(struct ccw_device *cdev)
  * @cdev:	the channel to be read
  * @index:	the index of the value to be read
  *
- * Returns the value read or %0 if the value cannot be read.
+ * Returns: The value read or %0 if the value cannot be read.
  *
  *  Context:
  *    any
@@ -1220,7 +1221,7 @@ u64 cmf_read(struct ccw_device *cdev, int index)
  * @cdev:	the channel to be read
  * @data:	a pointer to a data block that will be filled
  *
- * Returns %0 on success, a negative error value otherwise.
+ * Returns: %0 on success, a negative error value otherwise.
  *
  *  Context:
  *    any

From 7ddd091341336f8f809efe4049e2d8fbe5366328 Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cohuck@redhat.com>
Date: Tue, 11 Jul 2017 15:44:09 +0200
Subject: [PATCH 250/676] s390/docs: mention subchannel types

Since the original inception of the s390-drivers document, the
common I/O layer has grown support for more types of subchannels.
Give at least a pointer for the various types.

Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 Documentation/driver-api/s390-drivers.rst | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/Documentation/driver-api/s390-drivers.rst b/Documentation/driver-api/s390-drivers.rst
index ecf8851d35651..42350f21357df 100644
--- a/Documentation/driver-api/s390-drivers.rst
+++ b/Documentation/driver-api/s390-drivers.rst
@@ -22,9 +22,28 @@ While most I/O devices on a s390 system are typically driven through the
 channel I/O mechanism described here, there are various other methods
 (like the diag interface). These are out of the scope of this document.
 
+The s390 common I/O layer also provides access to some devices that are
+not strictly considered I/O devices. They are considered here as well,
+although they are not the focus of this document.
+
 Some additional information can also be found in the kernel source under
 Documentation/s390/driver-model.txt.
 
+The css bus
+===========
+
+The css bus contains the subchannels available on the system. They fall
+into several categories:
+
+* Standard I/O subchannels, for use by the system. They have a child
+  device on the ccw bus and are described below.
+* I/O subchannels bound to the vfio-ccw driver. See
+  Documentation/s390/vfio-ccw.txt.
+* Message subchannels. No Linux driver currently exists.
+* CHSC subchannels (at most one). The chsc subchannel driver can be used
+  to send asynchronous chsc commands.
+* eADM subchannels. Used for talking to storage class memory.
+
 The ccw bus
 ===========
 

From 0f0929bc5ca321ea9bdcf5b9be9246dd0dc3f1bf Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cohuck@redhat.com>
Date: Tue, 11 Jul 2017 16:04:45 +0200
Subject: [PATCH 251/676] s390/docs: reword airq section

Also mention the iv helpers as well.

Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 Documentation/driver-api/s390-drivers.rst | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/Documentation/driver-api/s390-drivers.rst b/Documentation/driver-api/s390-drivers.rst
index 42350f21357df..30e6aa7e160b7 100644
--- a/Documentation/driver-api/s390-drivers.rst
+++ b/Documentation/driver-api/s390-drivers.rst
@@ -121,10 +121,15 @@ ccw group devices
 Generic interfaces
 ==================
 
-Some interfaces are available to other drivers that do not necessarily
-have anything to do with the busses described above, but still are
-indirectly using basic infrastructure in the common I/O layer. One
-example is the support for adapter interrupts.
+The following section contains interfaces in use not only by drivers
+dealing with ccw devices, but drivers for various other s390 hardware
+as well.
+
+Adapter interrupts
+------------------
+
+The common I/O layer provides helper functions for dealing with adapter
+interrupts and interrupt vectors.
 
 .. kernel-doc:: drivers/s390/cio/airq.c
    :export:

From 5260b0f50cf371d7a0920fd163571adfd80e2109 Mon Sep 17 00:00:00 2001
From: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Date: Wed, 24 Jan 2018 16:19:53 +0100
Subject: [PATCH 252/676] s390/sysinfo: add and display licensed internal code
 identifier

With z14, the store system information instruction provides an
licensed internal code identifier.  Display it in /proc/sysinfo.

For more information, see the z/Architecture Principles of Operation.
(SA22-7832-11).

Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/sysinfo.h | 3 ++-
 arch/s390/kernel/sysinfo.c      | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h
index 25057c118d563..fe7b3f8f07913 100644
--- a/arch/s390/include/asm/sysinfo.h
+++ b/arch/s390/include/asm/sysinfo.h
@@ -21,7 +21,8 @@ struct sysinfo_1_1_1 {
 	unsigned char :8;
 	unsigned char ccr;
 	unsigned char cai;
-	char reserved_0[28];
+	char reserved_0[20];
+	unsigned long lic;
 	char manufacturer[16];
 	char type[4];
 	char reserved_1[12];
diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c
index a441cba8d165c..fc7e04c2195bb 100644
--- a/arch/s390/kernel/sysinfo.c
+++ b/arch/s390/kernel/sysinfo.c
@@ -89,6 +89,8 @@ static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info)
 	EBCASC(info->model_temp_cap, sizeof(info->model_temp_cap));
 	seq_printf(m, "Manufacturer:         %-16.16s\n", info->manufacturer);
 	seq_printf(m, "Type:                 %-4.4s\n", info->type);
+	if (info->lic)
+		seq_printf(m, "LIC Identifier:       %016lx\n", info->lic);
 	/*
 	 * Sigh: the model field has been renamed with System z9
 	 * to model_capacity and a new model field has been added

From df2f815a7df7edb5335a3bdeee6a8f9f6f9c35c4 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 22 Jan 2018 10:26:20 +0100
Subject: [PATCH 253/676] s390/runtime instrumentation: provide uapi header
 file

The man page for the s390_runtime_instr() system call refers to a
header file that was never exported. Therefore it's time to finally
provide the header file, and move large parts of the runtime
instrumentation header file to a new uapi header file.

Reported-by: Eugene Syromyatnikov <evgsyr@gmail.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/runtime_instr.h      | 67 +-------------------
 arch/s390/include/uapi/asm/runtime_instr.h | 74 ++++++++++++++++++++++
 2 files changed, 75 insertions(+), 66 deletions(-)
 create mode 100644 arch/s390/include/uapi/asm/runtime_instr.h

diff --git a/arch/s390/include/asm/runtime_instr.h b/arch/s390/include/asm/runtime_instr.h
index 6b1540337ed6c..0e1605538cd47 100644
--- a/arch/s390/include/asm/runtime_instr.h
+++ b/arch/s390/include/asm/runtime_instr.h
@@ -2,75 +2,10 @@
 #ifndef _RUNTIME_INSTR_H
 #define _RUNTIME_INSTR_H
 
-#define S390_RUNTIME_INSTR_START	0x1
-#define S390_RUNTIME_INSTR_STOP		0x2
-
-struct runtime_instr_cb {
-	__u64 rca;
-	__u64 roa;
-	__u64 rla;
-
-	__u32 v			: 1;
-	__u32 s			: 1;
-	__u32 k			: 1;
-	__u32 h			: 1;
-	__u32 a			: 1;
-	__u32 reserved1		: 3;
-	__u32 ps		: 1;
-	__u32 qs		: 1;
-	__u32 pc		: 1;
-	__u32 qc		: 1;
-	__u32 reserved2		: 1;
-	__u32 g			: 1;
-	__u32 u			: 1;
-	__u32 l			: 1;
-	__u32 key		: 4;
-	__u32 reserved3		: 8;
-	__u32 t			: 1;
-	__u32 rgs		: 3;
-
-	__u32 m			: 4;
-	__u32 n			: 1;
-	__u32 mae		: 1;
-	__u32 reserved4		: 2;
-	__u32 c			: 1;
-	__u32 r			: 1;
-	__u32 b			: 1;
-	__u32 j			: 1;
-	__u32 e			: 1;
-	__u32 x			: 1;
-	__u32 reserved5		: 2;
-	__u32 bpxn		: 1;
-	__u32 bpxt		: 1;
-	__u32 bpti		: 1;
-	__u32 bpni		: 1;
-	__u32 reserved6		: 2;
-
-	__u32 d			: 1;
-	__u32 f			: 1;
-	__u32 ic		: 4;
-	__u32 dc		: 4;
-
-	__u64 reserved7;
-	__u64 sf;
-	__u64 rsic;
-	__u64 reserved8;
-} __packed __aligned(8);
+#include <uapi/asm/runtime_instr.h>
 
 extern struct runtime_instr_cb runtime_instr_empty_cb;
 
-static inline void load_runtime_instr_cb(struct runtime_instr_cb *cb)
-{
-	asm volatile(".insn	rsy,0xeb0000000060,0,0,%0"	/* LRIC */
-		: : "Q" (*cb));
-}
-
-static inline void store_runtime_instr_cb(struct runtime_instr_cb *cb)
-{
-	asm volatile(".insn	rsy,0xeb0000000061,0,0,%0"	/* STRIC */
-		: "=Q" (*cb) : : "cc");
-}
-
 static inline void save_ri_cb(struct runtime_instr_cb *cb_prev)
 {
 	if (cb_prev)
diff --git a/arch/s390/include/uapi/asm/runtime_instr.h b/arch/s390/include/uapi/asm/runtime_instr.h
new file mode 100644
index 0000000000000..45c9ec984e6bd
--- /dev/null
+++ b/arch/s390/include/uapi/asm/runtime_instr.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _S390_UAPI_RUNTIME_INSTR_H
+#define _S390_UAPI_RUNTIME_INSTR_H
+
+#include <linux/types.h>
+
+#define S390_RUNTIME_INSTR_START	0x1
+#define S390_RUNTIME_INSTR_STOP		0x2
+
+struct runtime_instr_cb {
+	__u64 rca;
+	__u64 roa;
+	__u64 rla;
+
+	__u32 v			: 1;
+	__u32 s			: 1;
+	__u32 k			: 1;
+	__u32 h			: 1;
+	__u32 a			: 1;
+	__u32 reserved1		: 3;
+	__u32 ps		: 1;
+	__u32 qs		: 1;
+	__u32 pc		: 1;
+	__u32 qc		: 1;
+	__u32 reserved2		: 1;
+	__u32 g			: 1;
+	__u32 u			: 1;
+	__u32 l			: 1;
+	__u32 key		: 4;
+	__u32 reserved3		: 8;
+	__u32 t			: 1;
+	__u32 rgs		: 3;
+
+	__u32 m			: 4;
+	__u32 n			: 1;
+	__u32 mae		: 1;
+	__u32 reserved4		: 2;
+	__u32 c			: 1;
+	__u32 r			: 1;
+	__u32 b			: 1;
+	__u32 j			: 1;
+	__u32 e			: 1;
+	__u32 x			: 1;
+	__u32 reserved5		: 2;
+	__u32 bpxn		: 1;
+	__u32 bpxt		: 1;
+	__u32 bpti		: 1;
+	__u32 bpni		: 1;
+	__u32 reserved6		: 2;
+
+	__u32 d			: 1;
+	__u32 f			: 1;
+	__u32 ic		: 4;
+	__u32 dc		: 4;
+
+	__u64 reserved7;
+	__u64 sf;
+	__u64 rsic;
+	__u64 reserved8;
+} __packed __aligned(8);
+
+static inline void load_runtime_instr_cb(struct runtime_instr_cb *cb)
+{
+	asm volatile(".insn	rsy,0xeb0000000060,0,0,%0"	/* LRIC */
+		: : "Q" (*cb));
+}
+
+static inline void store_runtime_instr_cb(struct runtime_instr_cb *cb)
+{
+	asm volatile(".insn	rsy,0xeb0000000061,0,0,%0"	/* STRIC */
+		: "=Q" (*cb) : : "cc");
+}
+
+#endif /* _S390_UAPI_RUNTIME_INSTR_H */

From 0e1647b39025517e5bfdfd45d6033ab7a1c9c8b9 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Thu, 25 Jan 2018 14:30:32 +0900
Subject: [PATCH 254/676] s390/kprobes: Fix %p uses in error messages

Remove %p because the kprobe will be dumped in dump_kprobe().

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/kprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index 943d13e90c980..60f60afa645c1 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -281,7 +281,7 @@ static void kprobe_reenter_check(struct kprobe_ctlblk *kcb, struct kprobe *p)
 		 * is a BUG. The code path resides in the .kprobes.text
 		 * section and is executed with interrupts disabled.
 		 */
-		printk(KERN_EMERG "Invalid kprobe detected at %p.\n", p->addr);
+		pr_err("Invalid kprobe detected.\n");
 		dump_kprobe(p);
 		BUG();
 	}

From 81507f38d427ebed3291a3c153a798937c998140 Mon Sep 17 00:00:00 2001
From: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Date: Wed, 24 Jan 2018 16:25:23 +0100
Subject: [PATCH 255/676] s390/cpum_cf: correct counter number of
 LAST_HOST_TRANSLATIONS

Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/perf_cpum_cf_events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c
index 94f90cefbffcb..c5bc3f209652e 100644
--- a/arch/s390/kernel/perf_cpum_cf_events.c
+++ b/arch/s390/kernel/perf_cpum_cf_events.c
@@ -226,7 +226,7 @@ CPUMF_EVENT_ATTR(cf_z14, L1I_OFFDRAWER_L4_SOURCED_WRITES, 0x00af);
 CPUMF_EVENT_ATTR(cf_z14, BCD_DFP_EXECUTION_SLOTS, 0x00e0);
 CPUMF_EVENT_ATTR(cf_z14, VX_BCD_EXECUTION_SLOTS, 0x00e1);
 CPUMF_EVENT_ATTR(cf_z14, DECIMAL_INSTRUCTIONS, 0x00e2);
-CPUMF_EVENT_ATTR(cf_z14, LAST_HOST_TRANSLATIONS, 0x00e9);
+CPUMF_EVENT_ATTR(cf_z14, LAST_HOST_TRANSLATIONS, 0x00e8);
 CPUMF_EVENT_ATTR(cf_z14, TX_NC_TABORT, 0x00f3);
 CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_NO_SPECIAL, 0x00f4);
 CPUMF_EVENT_ATTR(cf_z14, TX_C_TABORT_SPECIAL, 0x00f5);

From 0537250fdc6c876ed4cbbe874c739aebef493ee2 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@kernel.org>
Date: Tue, 30 Jan 2018 11:30:11 -0800
Subject: [PATCH 256/676] netfilter: x_tables: make allocation less aggressive

syzbot has noticed that xt_alloc_table_info can allocate a lot of memory.
This is an admin only interface but an admin in a namespace is sufficient
as well.  eacd86ca3b03 ("net/netfilter/x_tables.c: use kvmalloc() in
xt_alloc_table_info()") has changed the opencoded kmalloc->vmalloc
fallback into kvmalloc.  It has dropped __GFP_NORETRY on the way because
vmalloc has simply never fully supported __GFP_NORETRY semantic.  This is
still the case because e.g.  page tables backing the vmalloc area are
hardcoded GFP_KERNEL.

Revert back to __GFP_NORETRY as a poors man defence against excessively
large allocation request here.  We will not rule out the OOM killer
completely but __GFP_NORETRY should at least stop the large request in
most cases.

[akpm@linux-foundation.org: coding-style fixes]
Fixes: eacd86ca3b03 ("net/netfilter/x_tables.c: use kvmalloc() in xt_alloc_tableLink: http://lkml.kernel.org/r/20180130140104.GE21609@dhcp22.suse.cz
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 8fa4d37141a71..2f685ee1f9c87 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1008,7 +1008,12 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
 	if ((size >> PAGE_SHIFT) + 2 > totalram_pages)
 		return NULL;
 
-	info = kvmalloc(sz, GFP_KERNEL);
+	/* __GFP_NORETRY is not fully supported by kvmalloc but it should
+	 * work reasonably well if sz is too large and bail out rather
+	 * than shoot all processes down before realizing there is nothing
+	 * more to reclaim.
+	 */
+	info = kvmalloc(sz, GFP_KERNEL | __GFP_NORETRY);
 	if (!info)
 		return NULL;
 

From ea23d5e3bf340e413b8e05c13da233c99c64142b Mon Sep 17 00:00:00 2001
From: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Date: Wed, 31 Jan 2018 04:50:01 -0700
Subject: [PATCH 257/676] netfilter: ipv6: nf_defrag: Kill frag queue on
 RFC2460 failure

Failures were seen in ICMPv6 fragmentation timeout tests if they were
run after the RFC2460 failure tests. Kernel was not sending out the
ICMPv6 fragment reassembly time exceeded packet after the fragmentation
reassembly timeout of 1 minute had elapsed.

This happened because the frag queue was not released if an error in
IPv6 fragmentation header was detected by RFC2460.

Fixes: 83f1999caeb1 ("netfilter: ipv6: nf_defrag: Pass on packets to stack per RFC2460")
Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index ce53dcfda88a1..b84ce3e6d728a 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -264,6 +264,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 			 * this case. -DaveM
 			 */
 			pr_debug("end of fragment not rounded to 8 bytes.\n");
+			inet_frag_kill(&fq->q, &nf_frags);
 			return -EPROTO;
 		}
 		if (end > fq->q.len) {

From 6be3bcd75afb673a37a82e18ba46d50430f172c1 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 31 Jan 2018 18:13:39 +0100
Subject: [PATCH 258/676] netfilter: flowtable infrastructure depends on
 NETFILTER_INGRESS

config NF_FLOW_TABLE depends on NETFILTER_INGRESS. If users forget to
enable this toggle, flowtable registration fails with EOPNOTSUPP.

Moreover, turn 'select NF_FLOW_TABLE' in every flowtable family flavour
into dependency instead, otherwise this new dependency on
NETFILTER_INGRESS causes a warning. This also allows us to remove the
explicit dependency between family flowtables <-> NF_TABLES and
NF_CONNTRACK, given they depend on the NF_FLOW_TABLE core that already
expresses the general dependencies for this new infrastructure.

Moreover, NF_FLOW_TABLE_INET depends on NF_FLOW_TABLE_IPV4 and
NF_FLOWTABLE_IPV6, which already depends on NF_FLOW_TABLE. So we can get
rid of direct dependency with NF_FLOW_TABLE.

In general, let's avoid 'select', it just makes things more complicated.

Reported-by: John Crispin <john@phrozen.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/Kconfig | 3 +--
 net/ipv6/netfilter/Kconfig | 3 +--
 net/netfilter/Kconfig      | 8 +++++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 5f52236780b4c..dfe6fa4ea5540 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -80,8 +80,7 @@ endif # NF_TABLES
 
 config NF_FLOW_TABLE_IPV4
 	tristate "Netfilter flow table IPv4 module"
-	depends on NF_CONNTRACK && NF_TABLES
-	select NF_FLOW_TABLE
+	depends on NF_FLOW_TABLE
 	help
 	  This option adds the flow table IPv4 support.
 
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 4a634b7a2c801..d395d1590699d 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -73,8 +73,7 @@ endif # NF_TABLES
 
 config NF_FLOW_TABLE_IPV6
 	tristate "Netfilter flow table IPv6 module"
-	depends on NF_CONNTRACK && NF_TABLES
-	select NF_FLOW_TABLE
+	depends on NF_FLOW_TABLE
 	help
 	  This option adds the flow table IPv6 support.
 
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 9019fa98003db..d3220b43c832f 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -666,8 +666,8 @@ endif # NF_TABLES
 
 config NF_FLOW_TABLE_INET
 	tristate "Netfilter flow table mixed IPv4/IPv6 module"
-	depends on NF_FLOW_TABLE_IPV4 && NF_FLOW_TABLE_IPV6
-	select NF_FLOW_TABLE
+	depends on NF_FLOW_TABLE_IPV4
+	depends on NF_FLOW_TABLE_IPV6
 	help
           This option adds the flow table mixed IPv4/IPv6 support.
 
@@ -675,7 +675,9 @@ config NF_FLOW_TABLE_INET
 
 config NF_FLOW_TABLE
 	tristate "Netfilter flow table module"
-	depends on NF_CONNTRACK && NF_TABLES
+	depends on NETFILTER_INGRESS
+	depends on NF_CONNTRACK
+	depends on NF_TABLES
 	help
 	  This option adds the flow table core infrastructure.
 

From e9829ac4e5fdf3978d761f2f1b13f19a468cac97 Mon Sep 17 00:00:00 2001
From: Ulf Magnusson <ulfalizer@gmail.com>
Date: Wed, 31 Jan 2018 10:34:21 +0100
Subject: [PATCH 259/676] video: fbdev: kconfig: Remove blank help text

Blank help texts are probably either a typo, a Kconfig misunderstanding,
or some kind of half-committing to adding a help text (in which case a
TODO comment would be clearer, if the help text really can't be added
right away).

Best to remove them, IMO.

Signed-off-by: Ulf Magnusson <ulfalizer@gmail.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 drivers/video/fbdev/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig
index 2566cfbdebfbd..f9058c44917d8 100644
--- a/drivers/video/fbdev/Kconfig
+++ b/drivers/video/fbdev/Kconfig
@@ -1156,7 +1156,6 @@ config FB_I810_I2C
 	bool "Enable DDC Support"
 	depends on FB_I810 && FB_I810_GTF
 	select FB_DDC
-	help
 
 config FB_LE80578
 	tristate "Intel LE80578 (Vermilion) support"

From 7a6b7908f119ff4d72220a1bfa980daf2abf4530 Mon Sep 17 00:00:00 2001
From: Ulf Magnusson <ulfalizer@gmail.com>
Date: Wed, 31 Jan 2018 10:34:22 +0100
Subject: [PATCH 260/676] mmc: kconfig: Remove blank help text

Blank help texts are probably either a typo, a Kconfig misunderstanding,
or some kind of half-committing to adding a help text (in which case a
TODO comment would be clearer, if the help text really can't be added
right away).

Best to remove them, IMO.

Signed-off-by: Ulf Magnusson <ulfalizer@gmail.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 drivers/mmc/host/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig
index 67bd3344dd03a..24dee8eb5270c 100644
--- a/drivers/mmc/host/Kconfig
+++ b/drivers/mmc/host/Kconfig
@@ -874,7 +874,6 @@ config MMC_CQHCI
 config MMC_TOSHIBA_PCI
 	tristate "Toshiba Type A SD/MMC Card Interface Driver"
 	depends on PCI
-	help
 
 config MMC_BCM2835
 	tristate "Broadcom BCM2835 SDHOST MMC Controller support"

From 1cab370d02dfadcc4b3cc769bd8546a1d58874ca Mon Sep 17 00:00:00 2001
From: Ulf Magnusson <ulfalizer@gmail.com>
Date: Wed, 31 Jan 2018 10:34:23 +0100
Subject: [PATCH 261/676] Staging: rtl8192u: kconfig: Remove blank help text

Blank help texts are probably either a typo, a Kconfig misunderstanding,
or some kind of half-committing to adding a help text (in which case a
TODO comment would be clearer, if the help text really can't be added
right away).

Best to remove them, IMO.

Signed-off-by: Ulf Magnusson <ulfalizer@gmail.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 drivers/staging/rtl8192u/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/staging/rtl8192u/Kconfig b/drivers/staging/rtl8192u/Kconfig
index 3ee9d0d00fb67..97df6507a4851 100644
--- a/drivers/staging/rtl8192u/Kconfig
+++ b/drivers/staging/rtl8192u/Kconfig
@@ -5,4 +5,3 @@ config RTL8192U
 	select WIRELESS_EXT
 	select WEXT_PRIV
 	select CRYPTO
-	---help---

From 3d77e390a585dfa3ba80f0555247508640a5bbca Mon Sep 17 00:00:00 2001
From: Ulf Magnusson <ulfalizer@gmail.com>
Date: Wed, 31 Jan 2018 10:34:24 +0100
Subject: [PATCH 262/676] Staging: rtl8192e: kconfig: Remove blank help text

Blank help texts are probably either a typo, a Kconfig misunderstanding,
or some kind of half-committing to adding a help text (in which case a
TODO comment would be clearer, if the help text really can't be added
right away).

Best to remove them, IMO.

Signed-off-by: Ulf Magnusson <ulfalizer@gmail.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 drivers/staging/rtl8192e/rtl8192e/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/staging/rtl8192e/rtl8192e/Kconfig b/drivers/staging/rtl8192e/rtl8192e/Kconfig
index 282e293da18f0..7ac42a590e21b 100644
--- a/drivers/staging/rtl8192e/rtl8192e/Kconfig
+++ b/drivers/staging/rtl8192e/rtl8192e/Kconfig
@@ -6,4 +6,3 @@ config RTL8192E
 	select WEXT_PRIV
 	select CRYPTO
 	select FW_LOADER
-	---help---

From e0371b8be7b058e6418dbf38d7185887db1f8885 Mon Sep 17 00:00:00 2001
From: Ulf Magnusson <ulfalizer@gmail.com>
Date: Wed, 31 Jan 2018 10:34:25 +0100
Subject: [PATCH 263/676] lib/Kconfig.debug: Remove blank help text

Blank help texts are probably either a typo, a Kconfig misunderstanding,
or some kind of half-committing to adding a help text (in which case a
TODO comment would be clearer, if the help text really can't be added
right away).

Best to remove them, IMO.

Signed-off-by: Ulf Magnusson <ulfalizer@gmail.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 lib/Kconfig.debug | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 64d7c19d3167c..3f307692d5af6 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -351,7 +351,6 @@ config SECTION_MISMATCH_WARN_ONLY
 #
 config ARCH_WANT_FRAME_POINTERS
 	bool
-	help
 
 config FRAME_POINTER
 	bool "Compile the kernel with frame pointers"

From edcaa93680ba1a38f397d69f7a69aeeddfa28a15 Mon Sep 17 00:00:00 2001
From: Ulf Magnusson <ulfalizer@gmail.com>
Date: Wed, 31 Jan 2018 10:34:26 +0100
Subject: [PATCH 264/676] MIPS: BCM63XX: kconfig: Remove blank help text

Blank help texts are probably either a typo, a Kconfig misunderstanding,
or some kind of half-committing to adding a help text (in which case a
TODO comment would be clearer, if the help text really can't be added
right away).

Best to remove them, IMO.

Signed-off-by: Ulf Magnusson <ulfalizer@gmail.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 arch/mips/bcm63xx/boards/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/mips/bcm63xx/boards/Kconfig b/arch/mips/bcm63xx/boards/Kconfig
index 6ff0a74810818..f60d96610ace9 100644
--- a/arch/mips/bcm63xx/boards/Kconfig
+++ b/arch/mips/bcm63xx/boards/Kconfig
@@ -7,6 +7,5 @@ choice
 config BOARD_BCM963XX
        bool "Generic Broadcom 963xx boards"
 	select SSB
-       help
 
 endchoice

From af99adfb09dd6c41b45c24070ccf83e69dd469be Mon Sep 17 00:00:00 2001
From: Ulf Magnusson <ulfalizer@gmail.com>
Date: Wed, 31 Jan 2018 10:34:27 +0100
Subject: [PATCH 265/676] MIPS: kconfig: Remove blank help text

Blank help texts are probably either a typo, a Kconfig misunderstanding,
or some kind of half-committing to adding a help text (in which case a
TODO comment would be clearer, if the help text really can't be added
right away).

Best to remove them, IMO.

Signed-off-by: Ulf Magnusson <ulfalizer@gmail.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 arch/mips/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index ab98569994f0f..57cd591e7b285 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2326,7 +2326,6 @@ config MIPS_VPE_LOADER_TOM
 config MIPS_VPE_APSP_API
 	bool "Enable support for AP/SP API (RTLX)"
 	depends on MIPS_VPE_LOADER
-	help
 
 config MIPS_VPE_APSP_API_CMP
 	bool

From ca961dde8b51f3c6f1f275f4704d2827446dc7e3 Mon Sep 17 00:00:00 2001
From: Ulf Magnusson <ulfalizer@gmail.com>
Date: Wed, 31 Jan 2018 10:34:28 +0100
Subject: [PATCH 266/676] arm: vt8500: kconfig: Remove blank help text

Blank help texts are probably either a typo, a Kconfig misunderstanding,
or some kind of half-committing to adding a help text (in which case a
TODO comment would be clearer, if the help text really can't be added
right away).

Best to remove them, IMO.

Signed-off-by: Ulf Magnusson <ulfalizer@gmail.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 arch/arm/mach-vt8500/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/arm/mach-vt8500/Kconfig b/arch/arm/mach-vt8500/Kconfig
index 1156a585dafc9..8841199058ea8 100644
--- a/arch/arm/mach-vt8500/Kconfig
+++ b/arch/arm/mach-vt8500/Kconfig
@@ -13,7 +13,6 @@ config ARCH_WM8505
  	depends on ARCH_MULTI_V5
  	select ARCH_VT8500
  	select CPU_ARM926T
- 	help
 
 config ARCH_WM8750
 	bool "WonderMedia WM8750"

From e1916031369a60c935c6957a47618d304a9df9de Mon Sep 17 00:00:00 2001
From: Ulf Magnusson <ulfalizer@gmail.com>
Date: Wed, 31 Jan 2018 10:34:29 +0100
Subject: [PATCH 267/676] nios2: kconfig: Remove blank help text

Blank help texts are probably either a typo, a Kconfig misunderstanding,
or some kind of half-committing to adding a help text (in which case a
TODO comment would be clearer, if the help text really can't be added
right away).

Best to remove them, IMO.

Signed-off-by: Ulf Magnusson <ulfalizer@gmail.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Ley Foon Tan <ley.foon.tan@intel.com>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 arch/nios2/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index 60fae03dac79a..3d4ec88f1db1c 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -152,7 +152,6 @@ menu "Advanced setup"
 
 config ADVANCED_OPTIONS
 	bool "Prompt for advanced kernel configuration options"
-	help
 
 comment "Default settings for advanced configuration options are used"
 	depends on !ADVANCED_OPTIONS

From 1b9eda2e4892cb373c7d4cf23439f357c8739e27 Mon Sep 17 00:00:00 2001
From: Ulf Magnusson <ulfalizer@gmail.com>
Date: Wed, 31 Jan 2018 10:34:30 +0100
Subject: [PATCH 268/676] kconfig: Warn if help text is blank

Blank help texts are probably either a typo, a Kconfig misunderstanding,
or some kind of half-committing to adding a help text (in which case a
TODO comment would be clearer, if the help text really can't be added
right away).

Best to flag them, IMO.

Example warning:

	drivers/mmc/host/Kconfig:877: warning: 'MMC_TOSHIBA_PCI' defined with blank help text

Signed-off-by: Ulf Magnusson <ulfalizer@gmail.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 scripts/kconfig/zconf.y | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/scripts/kconfig/zconf.y b/scripts/kconfig/zconf.y
index 21ce883e5d9ed..4be98050b961f 100644
--- a/scripts/kconfig/zconf.y
+++ b/scripts/kconfig/zconf.y
@@ -436,6 +436,12 @@ help: help_start T_HELPTEXT
 		zconfprint("warning: '%s' defined with more than one help text -- only the last one will be used",
 			   current_entry->sym->name ?: "<choice>");
 	}
+
+	/* Is the help text empty or all whitespace? */
+	if ($2[strspn($2, " \f\n\r\t\v")] == '\0')
+		zconfprint("warning: '%s' defined with blank help text",
+			   current_entry->sym->name ?: "<choice>");
+
 	current_entry->help = $2;
 };
 

From e89166990f11c3f21e1649d760dd35f9e410321c Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Thu, 25 Jan 2018 11:02:50 -0700
Subject: [PATCH 269/676] Btrfs: fix deadlock in run_delalloc_nocow

@cur_offset is not set back to what it should be (@cow_start) if
btrfs_next_leaf() returns something wrong, and the range [cow_start,
cur_offset) remains locked forever.

cc: <stable@vger.kernel.org>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c5f31817778bc..a68a4acd16e51 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1334,8 +1334,11 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 		leaf = path->nodes[0];
 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 			ret = btrfs_next_leaf(root, path);
-			if (ret < 0)
+			if (ret < 0) {
+				if (cow_start != (u64)-1)
+					cur_offset = cow_start;
 				goto error;
+			}
 			if (ret > 0)
 				break;
 			leaf = path->nodes[0];

From 1846430c24d66e85cc58286b3319c82cd54debb2 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Thu, 25 Jan 2018 11:02:51 -0700
Subject: [PATCH 270/676] Btrfs: fix crash due to not cleaning up tree log
 block's dirty bits

In cases that the whole fs flips into readonly status due to failures in
critical sections, then log tree's blocks are still dirty, and this leads
to a crash during umount time, the crash is about use-after-free,

umount
 -> close_ctree
    -> stop workers
    -> iput(btree_inode)
       -> iput_final
          -> write_inode_now
	     -> ...
	       -> queue job on stop'd workers

cc: <stable@vger.kernel.org> v3.12+
Fixes: 681ae50917df ("Btrfs: cleanup reserved space when freeing tree log on error")
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ee1aaed1330e8..1920c2149f883 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2471,6 +2471,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 					clean_tree_block(fs_info, next);
 					btrfs_wait_tree_block_writeback(next);
 					btrfs_tree_unlock(next);
+				} else {
+					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
+						clear_extent_buffer_dirty(next);
 				}
 
 				WARN_ON(root_owner !=
@@ -2551,6 +2554,9 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 					clean_tree_block(fs_info, next);
 					btrfs_wait_tree_block_writeback(next);
 					btrfs_tree_unlock(next);
+				} else {
+					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
+						clear_extent_buffer_dirty(next);
 				}
 
 				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
@@ -2629,6 +2635,9 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 				clean_tree_block(fs_info, next);
 				btrfs_wait_tree_block_writeback(next);
 				btrfs_tree_unlock(next);
+			} else {
+				if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
+					clear_extent_buffer_dirty(next);
 			}
 
 			WARN_ON(log->root_key.objectid !=

From 55237a5f2431a72435e3ed39e4306e973c0446b7 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Thu, 25 Jan 2018 11:02:52 -0700
Subject: [PATCH 271/676] Btrfs: fix extent state leak from tree log

It's possible that btrfs_sync_log() bails out after one of the two
btrfs_write_marked_extents() which convert extent state's state bit into
EXTENT_NEED_WAIT from EXTENT_DIRTY/EXTENT_NEW, however only EXTENT_DIRTY
and EXTENT_NEW are searched by free_log_tree() so that those extent states
with EXTENT_NEED_WAIT lead to memory leak.

cc: <stable@vger.kernel.org>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1920c2149f883..79af4ae042ae7 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3026,13 +3026,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
 
 	while (1) {
 		ret = find_first_extent_bit(&log->dirty_log_pages,
-				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
+				0, &start, &end,
+				EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT,
 				NULL);
 		if (ret)
 			break;
 
 		clear_extent_bits(&log->dirty_log_pages, start, end,
-				  EXTENT_DIRTY | EXTENT_NEW);
+				  EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
 	}
 
 	/*

From e8f1bc1493855e32b7a2a019decc3c353d94daf6 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Thu, 25 Jan 2018 11:02:53 -0700
Subject: [PATCH 272/676] Btrfs: fix btrfs_evict_inode to handle abnormal
 inodes correctly

This regression is introduced in
commit 3d48d9810de4 ("btrfs: Handle uninitialised inode eviction").

There are two problems,

a) it is ->destroy_inode() that does the final free on inode, not
   ->evict_inode(),
b) clear_inode() must be called before ->evict_inode() returns.

This could end up hitting BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
in evict() because I_CLEAR is set in clear_inode().

Fixes: commit 3d48d9810de4 ("btrfs: Handle uninitialised inode eviction")
Cc: <stable@vger.kernel.org> # v4.7-rc6+
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a68a4acd16e51..44a152d8f32f0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5281,7 +5281,7 @@ void btrfs_evict_inode(struct inode *inode)
 	trace_btrfs_inode_evict(inode);
 
 	if (!root) {
-		kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+		clear_inode(inode);
 		return;
 	}
 

From 1a932ef4e47984dee227834667b5ff5a334e4805 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Thu, 25 Jan 2018 11:02:54 -0700
Subject: [PATCH 273/676] Btrfs: fix use-after-free on root->orphan_block_rsv

I got these from running generic/475,

WARNING: CPU: 0 PID: 26384 at fs/btrfs/inode.c:3326 btrfs_orphan_commit_root+0x1ac/0x2b0 [btrfs]
BUG: unable to handle kernel NULL pointer dereference at 0000000000000010
IP: btrfs_block_rsv_release+0x1c/0x70 [btrfs]
Call Trace:
  btrfs_orphan_release_metadata+0x9f/0x200 [btrfs]
  btrfs_orphan_del+0x10d/0x170 [btrfs]
  btrfs_setattr+0x500/0x640 [btrfs]
  notify_change+0x7ae/0x870
  do_truncate+0xca/0x130
  vfs_truncate+0x2ee/0x3d0
  do_sys_truncate+0xaf/0xf0
  SyS_truncate+0xe/0x10
  entry_SYSCALL_64_fastpath+0x1f/0x96

The race is between btrfs_orphan_commit_root and btrfs_orphan_del,
        t1                                        t2
btrfs_orphan_commit_root                     btrfs_orphan_del
   spin_lock
   check (&root->orphan_inodes)
   root->orphan_block_rsv = NULL;
   spin_unlock
                                             atomic_dec(&root->orphan_inodes);
                                             access root->orphan_block_rsv

Accessing root->orphan_block_rsv must be done before decreasing
root->orphan_inodes.

cc: <stable@vger.kernel.org> v3.12+
Fixes: 703c88e03524 ("Btrfs: fix tracking of orphan inode count")
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 44a152d8f32f0..29b491328f4ee 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3387,6 +3387,11 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
 		ret = btrfs_orphan_reserve_metadata(trans, inode);
 		ASSERT(!ret);
 		if (ret) {
+			/*
+			 * dec doesn't need spin_lock as ->orphan_block_rsv
+			 * would be released only if ->orphan_inodes is
+			 * zero.
+			 */
 			atomic_dec(&root->orphan_inodes);
 			clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
 				  &inode->runtime_flags);
@@ -3401,12 +3406,17 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
 	if (insert >= 1) {
 		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
 		if (ret) {
-			atomic_dec(&root->orphan_inodes);
 			if (reserve) {
 				clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
 					  &inode->runtime_flags);
 				btrfs_orphan_release_metadata(inode);
 			}
+			/*
+			 * btrfs_orphan_commit_root may race with us and set
+			 * ->orphan_block_rsv to zero, in order to avoid that,
+			 * decrease ->orphan_inodes after everything is done.
+			 */
+			atomic_dec(&root->orphan_inodes);
 			if (ret != -EEXIST) {
 				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
 					  &inode->runtime_flags);
@@ -3438,28 +3448,26 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_root *root = inode->root;
 	int delete_item = 0;
-	int release_rsv = 0;
 	int ret = 0;
 
-	spin_lock(&root->orphan_lock);
 	if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
 			       &inode->runtime_flags))
 		delete_item = 1;
 
+	if (delete_item && trans)
+		ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
+
 	if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
 			       &inode->runtime_flags))
-		release_rsv = 1;
-	spin_unlock(&root->orphan_lock);
+		btrfs_orphan_release_metadata(inode);
 
-	if (delete_item) {
+	/*
+	 * btrfs_orphan_commit_root may race with us and set ->orphan_block_rsv
+	 * to zero, in order to avoid that, decrease ->orphan_inodes after
+	 * everything is done.
+	 */
+	if (delete_item)
 		atomic_dec(&root->orphan_inodes);
-		if (trans)
-			ret = btrfs_del_orphan_item(trans, root,
-						    btrfs_ino(inode));
-	}
-
-	if (release_rsv)
-		btrfs_orphan_release_metadata(inode);
 
 	return ret;
 }

From 900c9981680067573671ecc5cbfa7c5770be3a40 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Thu, 25 Jan 2018 11:02:56 -0700
Subject: [PATCH 274/676] Btrfs: fix unexpected -EEXIST when creating new inode

The highest objectid, which is assigned to new inode, is decided at
the time of initializing fs roots.  However, in cases where log replay
gets processed, the btree which fs root owns might be changed, so we
have to search it again for the highest objectid, otherwise creating
new inode would end up with -EEXIST.

cc: <stable@vger.kernel.org> v4.4-rc6+
Fixes: f32e48e92596 ("Btrfs: Initialize btrfs_root->highest_objectid when loading tree root and subvolume roots")
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 79af4ae042ae7..61f20c367aafc 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -28,6 +28,7 @@
 #include "hash.h"
 #include "compression.h"
 #include "qgroup.h"
+#include "inode-map.h"
 
 /* magic values for the inode_only field in btrfs_log_inode:
  *
@@ -5685,6 +5686,23 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
 						      path);
 		}
 
+		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
+			struct btrfs_root *root = wc.replay_dest;
+
+			btrfs_release_path(path);
+
+			/*
+			 * We have just replayed everything, and the highest
+			 * objectid of fs roots probably has changed in case
+			 * some inode_item's got replayed.
+			 *
+			 * root->objectid_mutex is not acquired as log replay
+			 * could only happen during mount.
+			 */
+			ret = btrfs_find_highest_objectid(root,
+						  &root->highest_objectid);
+		}
+
 		key.offset = found_key.offset - 1;
 		wc.replay_dest->log_root = NULL;
 		free_extent_buffer(log->node);

From 952bd3db0dada9994fa7edd891178075abcc045d Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Mon, 29 Jan 2018 15:53:01 +0200
Subject: [PATCH 275/676] btrfs: Ignore errors from
 btrfs_qgroup_trace_extent_post

Running generic/019 with qgroups on the scratch device enabled is almost
guaranteed to trigger the BUG_ON in btrfs_free_tree_block. It's supposed
to trigger only on -ENOMEM, in reality, however, it's possible to get
-EIO from btrfs_qgroup_trace_extent_post. This function just finds the
roots of the extent being tracked and sets the qrecord->old_roots list.
If this operation fails nothing critical happens except the quota
accounting can be considered wrong. In such case just set the
INCONSISTENT flag for the quota and print a warning, rather than killing
off the system. Additionally, it's possible to trigger a BUG_ON in
btrfs_truncate_inode_items as well.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
[ error message adjustments ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/delayed-ref.c | 3 ++-
 fs/btrfs/qgroup.c      | 9 +++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index a1a40cf382e39..7ab5e0128f0ce 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -821,7 +821,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	spin_unlock(&delayed_refs->lock);
 
 	if (qrecord_inserted)
-		return btrfs_qgroup_trace_extent_post(fs_info, record);
+		btrfs_qgroup_trace_extent_post(fs_info, record);
+
 	return 0;
 
 free_head_ref:
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9e61dd624f7b5..aa259d6986e1c 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1442,8 +1442,13 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
 	int ret;
 
 	ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false);
-	if (ret < 0)
-		return ret;
+	if (ret < 0) {
+		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+		btrfs_warn(fs_info,
+"error accounting new delayed refs extent (err code: %d), quota inconsistent",
+			ret);
+		return 0;
+	}
 
 	/*
 	 * Here we don't need to get the lock of

From c8195a7b1ad5648857ce20ba24f384faed8512bc Mon Sep 17 00:00:00 2001
From: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Date: Tue, 23 Jan 2018 22:22:09 -0500
Subject: [PATCH 276/676] btrfs: remove spurious WARN_ON(ref->count < 0) in
 find_parent_nodes

Until v4.14, this warning was very infrequent:

	WARNING: CPU: 3 PID: 18172 at fs/btrfs/backref.c:1391 find_parent_nodes+0xc41/0x14e0
	Modules linked in: [...]
	CPU: 3 PID: 18172 Comm: bees Tainted: G      D W    L  4.11.9-zb64+ #1
	Hardware name: System manufacturer System Product Name/M5A78L-M/USB3, BIOS 2101    12/02/2014
	Call Trace:
	 dump_stack+0x85/0xc2
	 __warn+0xd1/0xf0
	 warn_slowpath_null+0x1d/0x20
	 find_parent_nodes+0xc41/0x14e0
	 __btrfs_find_all_roots+0xad/0x120
	 ? extent_same_check_offsets+0x70/0x70
	 iterate_extent_inodes+0x168/0x300
	 iterate_inodes_from_logical+0x87/0xb0
	 ? iterate_inodes_from_logical+0x87/0xb0
	 ? extent_same_check_offsets+0x70/0x70
	 btrfs_ioctl+0x8ac/0x2820
	 ? lock_acquire+0xc2/0x200
	 do_vfs_ioctl+0x91/0x700
	 ? __fget+0x112/0x200
	 SyS_ioctl+0x79/0x90
	 entry_SYSCALL_64_fastpath+0x23/0xc6
	 ? trace_hardirqs_off_caller+0x1f/0x140

Starting with v4.14 (specifically 86d5f9944252 ("btrfs: convert prelimary
reference tracking to use rbtrees")) the WARN_ON occurs three orders of
magnitude more frequently--almost once per second while running workloads
like bees.

Replace the WARN_ON() with a comment rationale for its removal.
The rationale is paraphrased from an explanation by Edmund Nadolski
<enadolski@suse.de> on the linux-btrfs mailing list.

Fixes: 8da6d5815c59 ("Btrfs: added btrfs_find_all_roots()")
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: Lu Fengqi <lufq.fnst@cn.fujitsu.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index e4054e533f6d4..f94b2d8c744a1 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1264,7 +1264,16 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 	while (node) {
 		ref = rb_entry(node, struct prelim_ref, rbnode);
 		node = rb_next(&ref->rbnode);
-		WARN_ON(ref->count < 0);
+		/*
+		 * ref->count < 0 can happen here if there are delayed
+		 * refs with a node->action of BTRFS_DROP_DELAYED_REF.
+		 * prelim_ref_insert() relies on this when merging
+		 * identical refs to keep the overall count correct.
+		 * prelim_ref_insert() will merge only those refs
+		 * which compare identically.  Any refs having
+		 * e.g. different offsets would not be merged,
+		 * and would retain their original ref->count < 0.
+		 */
 		if (roots && ref->count && ref->root_id && ref->parent == 0) {
 			if (sc && sc->root_objectid &&
 			    ref->root_id != sc->root_objectid) {

From 627e08738e4315458c5df06358ce7a65cfdd635d Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 30 Jan 2018 18:40:22 +0000
Subject: [PATCH 277/676] Btrfs: fix null pointer dereference when replacing
 missing device

When we are replacing a missing device we mount the filesystem with the
degraded mode option in which case we are allowed to have a btrfs device
structure without a backing device member (its bdev member is NULL) and
therefore we can't dereference that member. Commit 38b5f68e9811
("btrfs: drop btrfs_device::can_discard to query directly") started to
dereference that member when discarding extents, resulting in a null
pointer dereference:

 [ 3145.322257] BTRFS warning (device sdf): devid 2 uuid 4d922414-58eb-4880-8fed-9c3840f6c5d5 is missing
 [ 3145.364116] BTRFS info (device sdf): dev_replace from <missing disk> (devid 2) to /dev/sdg started
 [ 3145.413489] BUG: unable to handle kernel NULL pointer dereference at 00000000000000e0
 [ 3145.415085] IP: btrfs_discard_extent+0x6a/0xf8 [btrfs]
 [ 3145.415085] PGD 0 P4D 0
 [ 3145.415085] Oops: 0000 [#1] PREEMPT SMP PTI
 [ 3145.415085] Modules linked in: ppdev ghash_clmulni_intel pcbc aesni_intel aes_x86_64 crypto_simd cryptd glue_helper evdev psmouse parport_pc serio_raw i2c_piix4 i2
 [ 3145.415085] CPU: 0 PID: 11989 Comm: btrfs Tainted: G        W        4.15.0-rc9-btrfs-next-55+ #1
 [ 3145.415085] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014
 [ 3145.415085] RIP: 0010:btrfs_discard_extent+0x6a/0xf8 [btrfs]
 [ 3145.415085] RSP: 0018:ffffc90004813c60 EFLAGS: 00010293
 [ 3145.415085] RAX: ffff88020d39cc00 RBX: ffff88020c4ea2a0 RCX: 0000000000000002
 [ 3145.415085] RDX: 0000000000000000 RSI: ffff88020c4ea240 RDI: 0000000000000000
 [ 3145.415085] RBP: 0000000000000000 R08: 0000000000004000 R09: 0000000000000000
 [ 3145.415085] R10: ffffc90004813ae8 R11: 0000000000000000 R12: 0000000000000000
 [ 3145.415085] R13: ffff88020c418000 R14: 0000000000000000 R15: 0000000000000000
 [ 3145.415085] FS:  00007f565681f8c0(0000) GS:ffff88023fc00000(0000) knlGS:0000000000000000
 [ 3145.415085] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 [ 3145.415085] CR2: 00000000000000e0 CR3: 000000020d208006 CR4: 00000000001606f0
 [ 3145.415085] Call Trace:
 [ 3145.415085]  btrfs_finish_extent_commit+0x9a/0x1be [btrfs]
 [ 3145.415085]  btrfs_commit_transaction+0x649/0x7a0 [btrfs]
 [ 3145.415085]  ? start_transaction+0x2b0/0x3b3 [btrfs]
 [ 3145.415085]  btrfs_dev_replace_start+0x274/0x30c [btrfs]
 [ 3145.415085]  btrfs_dev_replace_by_ioctl+0x45/0x59 [btrfs]
 [ 3145.415085]  btrfs_ioctl+0x1a91/0x1d62 [btrfs]
 [ 3145.415085]  ? lock_acquire+0x16a/0x1af
 [ 3145.415085]  ? vfs_ioctl+0x1b/0x28
 [ 3145.415085]  ? trace_hardirqs_on_caller+0x14c/0x1a6
 [ 3145.415085]  vfs_ioctl+0x1b/0x28
 [ 3145.415085]  do_vfs_ioctl+0x5a9/0x5e0
 [ 3145.415085]  ? _raw_spin_unlock_irq+0x34/0x46
 [ 3145.415085]  ? entry_SYSCALL_64_fastpath+0x5/0x8b
 [ 3145.415085]  ? trace_hardirqs_on_caller+0x14c/0x1a6
 [ 3145.415085]  SyS_ioctl+0x52/0x76
 [ 3145.415085]  entry_SYSCALL_64_fastpath+0x1e/0x8b
 [ 3145.415085] RIP: 0033:0x7f56558b3c47
 [ 3145.415085] RSP: 002b:00007ffdcfac4c58 EFLAGS: 00000202
 [ 3145.415085] Code: be 02 00 00 00 4c 89 ef e8 b9 e7 03 00 85 c0 89 c5 75 75 48 8b 44 24 08 45 31 f6 48 8d 58 60 eb 52 48 8b 03 48 8b b8 a0 00 00 00 <48> 8b 87 e0 00
 [ 3145.415085] RIP: btrfs_discard_extent+0x6a/0xf8 [btrfs] RSP: ffffc90004813c60
 [ 3145.415085] CR2: 00000000000000e0
 [ 3145.458185] ---[ end trace 06302e7ac31902bf ]---

This is trivially reproduced by running the test btrfs/027 from fstests
like this:

  $ MOUNT_OPTIONS="-o discard" ./check btrfs/027

Fix this by skipping devices without a backing device before attempting
to discard.

Fixes: 38b5f68e9811 ("btrfs: drop btrfs_device::can_discard to query directly")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 05751a677da4f..c1618ab9fecfb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2147,6 +2147,10 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
 			u64 bytes;
 			struct request_queue *req_q;
 
+			if (!stripe->dev->bdev) {
+				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
+				continue;
+			}
 			req_q = bdev_get_queue(stripe->dev->bdev);
 			if (!blk_queue_discard(req_q))
 				continue;

From 8dbfb2bf1bb3848a8069164e205635b2675c24fe Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 20 Dec 2017 16:24:27 -0800
Subject: [PATCH 278/676] KVM: x86: don't forget vcpu_put() in
 kvm_arch_vcpu_ioctl_set_sregs()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Due to a bad merge resolution between commit f29810335965 ("KVM/x86:
Check input paging mode when cs.l is set") and commit b4ef9d4e8cb8
("KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_set_sregs"),
there is a case in kvm_arch_vcpu_ioctl_set_sregs() where vcpu_put() is
not called after vcpu_get().  Fix it.

Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0e27ee573bd52..07d1c7f66d977 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7706,7 +7706,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 		goto out;
 
 	if (kvm_valid_sregs(vcpu, sregs))
-		return -EINVAL;
+		goto out;
 
 	apic_base_msr.data = sregs->apic_base;
 	apic_base_msr.host_initiated = true;

From ba7cd5d95f25cc6005f687dabdb4e7a6063adda9 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Wed, 31 Jan 2018 15:02:47 -0800
Subject: [PATCH 279/676] netfilter: xt_cgroup: initialize info->priv in
 cgroup_mt_check_v1()

xt_cgroup_info_v1->priv is an internal pointer only used for kernel,
we should not trust what user-space provides.

Reported-by: <syzbot+4fbcfcc0d2e6592bd641@syzkaller.appspotmail.com>
Fixes: c38c4597e4bf ("netfilter: implement xt_cgroup cgroup2 path match")
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_cgroup.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index 1db1ce59079fb..891f4e7e8ea7f 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -52,6 +52,7 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par)
 		return -EINVAL;
 	}
 
+	info->priv = NULL;
 	if (info->has_path) {
 		cgrp = cgroup_get_from_path(info->path);
 		if (IS_ERR(cgrp)) {

From c7f0030b5b67866c588845abde7bf011de25b98a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 1 Feb 2018 18:49:00 +0100
Subject: [PATCH 280/676] netfilter: nft_flow_offload: wait for garbage
 collector to run after cleanup

If netdevice goes down, then flowtable entries are scheduled to be
removed. Wait for garbage collector to have a chance to run so it can
delete them from the hashtable.

The flush call might sleep, so hold the nfnl mutex from
nft_flow_table_iterate() instead of rcu read side lock. The use of the
nfnl mutex is also implicitly fixing races between updates via nfnetlink
and netdevice event.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c    | 8 ++++----
 net/netfilter/nft_flow_offload.c | 1 +
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 0791813a1e7d3..07dd1fac78a88 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5006,13 +5006,13 @@ void nft_flow_table_iterate(struct net *net,
 	struct nft_flowtable *flowtable;
 	const struct nft_table *table;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(table, &net->nft.tables, list) {
-		list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_for_each_entry(table, &net->nft.tables, list) {
+		list_for_each_entry(flowtable, &table->flowtables, list) {
 			iter(&flowtable->data, data);
 		}
 	}
-	rcu_read_unlock();
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
 }
 EXPORT_SYMBOL_GPL(nft_flow_table_iterate);
 
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index 4503b8dcf9c04..1739ff8ca21fb 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -208,6 +208,7 @@ static void nft_flow_offload_iterate_cleanup(struct nf_flowtable *flowtable,
 					     void *data)
 {
 	nf_flow_table_iterate(flowtable, flow_offload_iterate_cleanup, data);
+	flush_delayed_work(&flowtable->gc_work);
 }
 
 static int flow_offload_netdev_event(struct notifier_block *this,

From 992cfc7c5d105094da7c21c9c74d97ac26bb1e56 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 1 Feb 2018 18:49:01 +0100
Subject: [PATCH 281/676] netfilter: nft_flow_offload: no need to flush entries
 on module removal

nft_flow_offload module removal does not require to flush existing
flowtables, it is valid to remove this module while keeping flowtables
around.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_flow_offload.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index 1739ff8ca21fb..e5c45c7ac02a1 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -247,14 +247,8 @@ static int __init nft_flow_offload_module_init(void)
 
 static void __exit nft_flow_offload_module_exit(void)
 {
-	struct net *net;
-
 	nft_unregister_expr(&nft_flow_offload_type);
 	unregister_netdevice_notifier(&flow_offload_netdev_notifier);
-	rtnl_lock();
-	for_each_net(net)
-		nft_flow_table_iterate(net, nft_flow_offload_iterate_cleanup, NULL);
-	rtnl_unlock();
 }
 
 module_init(nft_flow_offload_module_init);

From 1179e2c27efe21167ec9d882b14becefba2ee990 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 31 Jan 2018 12:34:05 -0500
Subject: [PATCH 282/676] xprtrdma: Fix calculation of ri_max_send_sges

Commit 16f906d66cd7 ("xprtrdma: Reduce required number of send
SGEs") introduced the rpcrdma_ia::ri_max_send_sges field. This fixes
a problem where xprtrdma would not work if the device's max_sge
capability was small (low single digits).

At least RPCRDMA_MIN_SEND_SGES are needed for the inline parts of
each RPC. ri_max_send_sges is set to this value:

  ia->ri_max_send_sges = max_sge - RPCRDMA_MIN_SEND_SGES;

Then when marshaling each RPC, rpcrdma_args_inline uses that value
to determine whether the device has enough Send SGEs to convey an
NFS WRITE payload inline, or whether instead a Read chunk is
required.

More recently, commit ae72950abf99 ("xprtrdma: Add data structure to
manage RDMA Send arguments") used the ri_max_send_sges value to
calculate the size of an array, but that commit erroneously assumed
ri_max_send_sges contains a value similar to the device's max_sge,
and not one that was reduced by the minimum SGE count.

This assumption results in the calculated size of the sendctx's
Send SGE array to be too small. When the array is used to marshal
an RPC, the code can write Send SGEs into the following sendctx
element in that array, corrupting it. When the device's max_sge is
large, this issue is entirely harmless; but it results in an oops
in the provider's post_send method, if dev.attrs.max_sge is small.

So let's straighten this out: ri_max_send_sges will now contain a
value with the same meaning as dev.attrs.max_sge, which makes
the code easier to understand, and enables rpcrdma_sendctx_create
to calculate the size of the SGE array correctly.

Reported-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Fixes: 16f906d66cd7 ("xprtrdma: Reduce required number of send SGEs")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Cc: stable@vger.kernel.org # v4.10+
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 2 +-
 net/sunrpc/xprtrdma/verbs.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 162e5dd82466b..f0855a959a278 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -143,7 +143,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
 	if (xdr->page_len) {
 		remaining = xdr->page_len;
 		offset = offset_in_page(xdr->page_base);
-		count = 0;
+		count = RPCRDMA_MIN_SEND_SGES;
 		while (remaining) {
 			remaining -= min_t(unsigned int,
 					   PAGE_SIZE - offset, remaining);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index f4eb63e8e6892..bb56b9d849c4b 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -505,7 +505,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 		pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge);
 		return -ENOMEM;
 	}
-	ia->ri_max_send_sges = max_sge - RPCRDMA_MIN_SEND_SGES;
+	ia->ri_max_send_sges = max_sge;
 
 	if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
 		dprintk("RPC:       %s: insufficient wqe's available\n",

From e89e8d8fcdc6751e86ccad794b052fe67e6ad619 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 31 Jan 2018 12:34:13 -0500
Subject: [PATCH 283/676] xprtrdma: Fix BUG after a device removal

Michal Kalderon reports a BUG that occurs just after device removal:

[  169.112490] rpcrdma: removing device qedr0 for 192.168.110.146:20049
[  169.143909] BUG: unable to handle kernel NULL pointer dereference at 0000000000000010
[  169.181837] IP: rpcrdma_dma_unmap_regbuf+0xa/0x60 [rpcrdma]

The RPC/RDMA client transport attempts to allocate some resources
on demand. Registered buffers are one such resource. These are
allocated (or re-allocated) by xprt_rdma_allocate to hold RPC Call
and Reply messages. A hardware resource is associated with each of
these buffers, as they can be used for a Send or Receive Work
Request.

If a device is removed from under an NFS/RDMA mount, the transport
layer is responsible for releasing all hardware resources before
the device can be finally unplugged. A BUG results when the NFS
mount hasn't yet seen much activity: the transport tries to release
resources that haven't yet been allocated.

rpcrdma_free_regbuf() already checks for this case, so just move
that check to cover the DEVICE_REMOVAL case as well.

Reported-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Fixes: bebd031866ca ("xprtrdma: Support unplugging an HCA ...")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Cc: stable@vger.kernel.org # v4.12+
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/verbs.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index bb56b9d849c4b..e6f84a6434a04 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1502,6 +1502,9 @@ __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
 static void
 rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb)
 {
+	if (!rb)
+		return;
+
 	if (!rpcrdma_regbuf_is_mapped(rb))
 		return;
 
@@ -1517,9 +1520,6 @@ rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb)
 void
 rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb)
 {
-	if (!rb)
-		return;
-
 	rpcrdma_dma_unmap_regbuf(rb);
 	kfree(rb);
 }

From 97e45dd6e82a556e64a81824bb87c2b9bf98ddd3 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Tue, 23 Jan 2018 09:59:20 -0600
Subject: [PATCH 284/676] ACPI / video: Use true for boolean value

Assign true or false to boolean variables instead of an integer value.

This issue was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_video.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c
index f53ccc6802381..76fb96966f7b1 100644
--- a/drivers/acpi/acpi_video.c
+++ b/drivers/acpi/acpi_video.c
@@ -53,7 +53,7 @@ MODULE_AUTHOR("Bruno Ducrot");
 MODULE_DESCRIPTION("ACPI Video Driver");
 MODULE_LICENSE("GPL");
 
-static bool brightness_switch_enabled = 1;
+static bool brightness_switch_enabled = true;
 module_param(brightness_switch_enabled, bool, 0644);
 
 /*

From 0eedae8e5f983aba6fb5c29c8699ecf03c7e50e0 Mon Sep 17 00:00:00 2001
From: Anuj Mittal <anuj.mittal@intel.com>
Date: Wed, 24 Jan 2018 15:42:29 -0800
Subject: [PATCH 285/676] ACPICA: Linux: add support for X32 ABI compilation

X32 follows ILP32 model. Check for ILP32 as well when checking for
x86_64 to ensure the defines are correct for X32 ABI.

Signed-off-by: Anuj Mittal <anuj.mittal@intel.com>
Signed-off-by: Erik Schmauss <erik.schmauss@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/platform/aclinux.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
index 1b473efd9eb60..97e088276c6d2 100644
--- a/include/acpi/platform/aclinux.h
+++ b/include/acpi/platform/aclinux.h
@@ -206,7 +206,7 @@
 #define ACPI_FLUSH_CPU_CACHE()
 #define ACPI_CAST_PTHREAD_T(pthread) ((acpi_thread_id) (pthread))
 
-#if defined(__ia64__)    || defined(__x86_64__) ||\
+#if defined(__ia64__)    || (defined(__x86_64__) && !defined(__ILP32__)) ||\
 	defined(__aarch64__) || defined(__PPC64__) ||\
 	defined(__s390x__)
 #define ACPI_MACHINE_WIDTH          64

From 761f0b82393353507930b6721ae4311a9df2ca36 Mon Sep 17 00:00:00 2001
From: Jung-uk Kim <jkim@FreeBSD.org>
Date: Wed, 24 Jan 2018 15:42:30 -0800
Subject: [PATCH 286/676] ACPICA: Avoid NULL pointer arithmetic

We should not assume NULL is defined as "(void *)0" because NULL is
an implementation-defined macro.  Especially, Clang 6 complains about
it, i.e., "arithmetic on a null pointer treated as a cast from integer
to pointer is a GNU extension".

Signed-off-by: Jung-uk Kim <jkim@free_BSD.org>
Signed-off-by: Erik Schmauss <erik.schmauss@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actypes.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h
index 31f1be74dd165..f3999ae7a43f1 100644
--- a/include/acpi/actypes.h
+++ b/include/acpi/actypes.h
@@ -485,7 +485,7 @@ typedef u8 acpi_owner_id;
 /*
  * Constants with special meanings
  */
-#define ACPI_ROOT_OBJECT                ACPI_ADD_PTR (acpi_handle, NULL, ACPI_MAX_PTR)
+#define ACPI_ROOT_OBJECT                ACPI_ADD_PTR (acpi_handle, (void *) 0, ACPI_MAX_PTR)
 #define ACPI_WAIT_FOREVER               0xFFFF	/* u16, as per ACPI spec */
 #define ACPI_DO_NOT_WAIT                0
 
@@ -536,9 +536,9 @@ typedef u64 acpi_integer;
 
 /* Pointer/Integer type conversions */
 
-#define ACPI_TO_POINTER(i)              ACPI_ADD_PTR (void, (void *) NULL,(acpi_size) i)
-#define ACPI_TO_INTEGER(p)              ACPI_PTR_DIFF (p, (void *) NULL)
-#define ACPI_OFFSET(d, f)               ACPI_PTR_DIFF (&(((d *) 0)->f), (void *) NULL)
+#define ACPI_TO_POINTER(i)              ACPI_ADD_PTR (void, (void *) 0, (acpi_size) (i))
+#define ACPI_TO_INTEGER(p)              ACPI_PTR_DIFF (p, (void *) 0)
+#define ACPI_OFFSET(d, f)               ACPI_PTR_DIFF (&(((d *) 0)->f), (void *) 0)
 #define ACPI_PHYSADDR_TO_PTR(i)         ACPI_TO_POINTER(i)
 #define ACPI_PTR_TO_PHYSADDR(i)         ACPI_TO_INTEGER(i)
 

From 9cf02a0a66da50a990604533fa7d2dbf74aae054 Mon Sep 17 00:00:00 2001
From: Jung-uk Kim <jkim@FreeBSD.org>
Date: Wed, 24 Jan 2018 15:42:31 -0800
Subject: [PATCH 287/676] ACPICA: Prefer ACPI_TO_POINTER() over ACPI_ADD_PTR()

This is more easy to read.

Signed-off-by: Jung-uk Kim <jkim@free_BSD.org>
Signed-off-by: Erik Schmauss <erik.schmauss@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actypes.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h
index f3999ae7a43f1..0b6b55f1e597f 100644
--- a/include/acpi/actypes.h
+++ b/include/acpi/actypes.h
@@ -485,7 +485,7 @@ typedef u8 acpi_owner_id;
 /*
  * Constants with special meanings
  */
-#define ACPI_ROOT_OBJECT                ACPI_ADD_PTR (acpi_handle, (void *) 0, ACPI_MAX_PTR)
+#define ACPI_ROOT_OBJECT                ((acpi_handle) ACPI_TO_POINTER (ACPI_MAX_PTR))
 #define ACPI_WAIT_FOREVER               0xFFFF	/* u16, as per ACPI spec */
 #define ACPI_DO_NOT_WAIT                0
 

From 0649a0989fcb02ecaf7b0c15ae6a5051637c8f34 Mon Sep 17 00:00:00 2001
From: Jung-uk Kim <jkim@FreeBSD.org>
Date: Wed, 24 Jan 2018 15:42:32 -0800
Subject: [PATCH 288/676] ACPICA: Add a missing pair of parentheses

Signed-off-by: Jung-uk Kim <jkim@free_BSD.org>
Signed-off-by: Erik Schmauss <erik.schmauss@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actypes.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h
index 0b6b55f1e597f..4fed3310235a5 100644
--- a/include/acpi/actypes.h
+++ b/include/acpi/actypes.h
@@ -532,7 +532,7 @@ typedef u64 acpi_integer;
 #define ACPI_CAST_INDIRECT_PTR(t, p)    ((t **) (acpi_uintptr_t) (p))
 #define ACPI_ADD_PTR(t, a, b)           ACPI_CAST_PTR (t, (ACPI_CAST_PTR (u8, (a)) + (acpi_size)(b)))
 #define ACPI_SUB_PTR(t, a, b)           ACPI_CAST_PTR (t, (ACPI_CAST_PTR (u8, (a)) - (acpi_size)(b)))
-#define ACPI_PTR_DIFF(a, b)             (acpi_size) (ACPI_CAST_PTR (u8, (a)) - ACPI_CAST_PTR (u8, (b)))
+#define ACPI_PTR_DIFF(a, b)             ((acpi_size) (ACPI_CAST_PTR (u8, (a)) - ACPI_CAST_PTR (u8, (b))))
 
 /* Pointer/Integer type conversions */
 

From 5a98231f554779fb1649f477b70bed1fd17222f2 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Fri, 26 Jan 2018 16:02:56 +0100
Subject: [PATCH 289/676] ACPI: export acpi_bus_get_status_handle()

Some modular drivers need this, export it.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 4d0979e02a287..d44d2c287b918 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -108,6 +108,7 @@ acpi_status acpi_bus_get_status_handle(acpi_handle handle,
 	}
 	return status;
 }
+EXPORT_SYMBOL_GPL(acpi_bus_get_status_handle);
 
 int acpi_bus_get_status(struct acpi_device *device)
 {

From b0fd6772cfe10cd1edd65a376522e430e5f1bc58 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Fri, 26 Jan 2018 16:02:57 +0100
Subject: [PATCH 290/676] PCI: acpiphp_ibm: prepare for acpi_get_object_info()
 no longer returning status

acpi_get_object_info() is intended for early probe usage and as such should
not call any methods which may rely on OpRegions, but it used to also call
_STA to get the status, which on some systems does rely on OpRegions, this
behavior and the acpi_device_info.current_status member are being removed.

This commit prepares the acpiphp_ibm code for this by having it get the
status itself using acpi_bus_get_status_handle(). Note no error handling is
necessary on any errors acpi_bus_get_status_handle() leaves the value of
the passed in current_status at its 0 initialization value.

Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/pci/hotplug/acpiphp_ibm.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/hotplug/acpiphp_ibm.c b/drivers/pci/hotplug/acpiphp_ibm.c
index 984c7e8cec5ae..8472c4a27f700 100644
--- a/drivers/pci/hotplug/acpiphp_ibm.c
+++ b/drivers/pci/hotplug/acpiphp_ibm.c
@@ -399,6 +399,7 @@ static acpi_status __init ibm_find_acpi_device(acpi_handle handle,
 		u32 lvl, void *context, void **rv)
 {
 	acpi_handle *phandle = (acpi_handle *)context;
+	unsigned long long current_status = 0;
 	acpi_status status;
 	struct acpi_device_info *info;
 	int retval = 0;
@@ -410,7 +411,9 @@ static acpi_status __init ibm_find_acpi_device(acpi_handle handle,
 		return retval;
 	}
 
-	if (info->current_status && (info->valid & ACPI_VALID_HID) &&
+	acpi_bus_get_status_handle(handle, &current_status);
+
+	if (current_status && (info->valid & ACPI_VALID_HID) &&
 			(!strcmp(info->hardware_id.string, IBM_HARDWARE_ID1) ||
 			 !strcmp(info->hardware_id.string, IBM_HARDWARE_ID2))) {
 		pr_debug("found hardware: %s, handle: %p\n",

From 54ddce7062242036402242242c07c60c0b505f84 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Fri, 26 Jan 2018 16:02:58 +0100
Subject: [PATCH 291/676] ACPI / bus: Do not call _STA on battery devices with
 unmet dependencies

The battery code uses acpi_device->dep_unmet to check for unmet deps and
if there are unmet deps it does not bind to the device to avoid errors
about missing OpRegions when calling ACPI methods on the device.

The missing OpRegions when there are unmet deps problem also applies to
the _STA method of some battery devices and calling it too early results
in errors like these:

[    0.123579] ACPI Error: No handler for Region [ECRM] (00000000ba9edc4c)
               [GenericSerialBus] (20170831/evregion-166)
[    0.123601] ACPI Error: Region GenericSerialBus (ID=9) has no handler
               (20170831/exfldio-299)
[    0.123618] ACPI Error: Method parse/execution failed
               \_SB.I2C1.BAT1._STA, AE_NOT_EXIST (20170831/psparse-550)

This commit fixes these errors happening when acpi_get_bus_status gets
called by checking dep_unmet for battery devices and reporting a status
of 0 until all dependencies are met.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index d44d2c287b918..433bb5a3f3277 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -120,6 +120,12 @@ int acpi_bus_get_status(struct acpi_device *device)
 		return 0;
 	}
 
+	/* Battery devices must have their deps met before calling _STA */
+	if (acpi_device_is_battery(device) && device->dep_unmet) {
+		acpi_set_device_status(device, 0);
+		return 0;
+	}
+
 	status = acpi_bus_get_status_handle(device->handle, &sta);
 	if (ACPI_FAILURE(status))
 		return -ENODEV;

From 63347db0affadcbccd5613116ea8431c70139b3e Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Fri, 26 Jan 2018 16:02:59 +0100
Subject: [PATCH 292/676] ACPI / scan: Use acpi_bus_get_status() to initialize
 ACPI_TYPE_DEVICE devs

The acpi_get_bus_status wrapper for acpi_bus_get_status_handle has some
code to handle certain device quirks, in some cases we also need this
quirk handling for the initial _STA call.

Specifically on some devices calling _STA before all _DEP dependencies
are met results in errors like these:

[    0.123579] ACPI Error: No handler for Region [ECRM] (00000000ba9edc4c)
               [GenericSerialBus] (20170831/evregion-166)
[    0.123601] ACPI Error: Region GenericSerialBus (ID=9) has no handler
               (20170831/exfldio-299)
[    0.123618] ACPI Error: Method parse/execution failed
               \_SB.I2C1.BAT1._STA, AE_NOT_EXIST (20170831/psparse-550)

acpi_get_bus_status already has code to avoid this, so by using it we
also silence these errors from the initial _STA call.

Note that in order for the acpi_get_bus_status handling for this to work,
we initialize dep_unmet to 1 until acpi_device_dep_initialize gets called,
this means that battery devices will be instantiated with an initial
status of 0. This is not a problem, acpi_bus_attach will get called soon
after the instantiation anyways and it will update the status as first
point of order.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/scan.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index b0fe5272c76aa..8e63d937babb0 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1565,6 +1565,8 @@ void acpi_init_device_object(struct acpi_device *device, acpi_handle handle,
 	device_initialize(&device->dev);
 	dev_set_uevent_suppress(&device->dev, true);
 	acpi_init_coherency(device);
+	/* Assume there are unmet deps until acpi_device_dep_initialize() runs */
+	device->dep_unmet = 1;
 }
 
 void acpi_device_add_finalize(struct acpi_device *device)
@@ -1588,6 +1590,14 @@ static int acpi_add_single_object(struct acpi_device **child,
 	}
 
 	acpi_init_device_object(device, handle, type, sta);
+	/*
+	 * For ACPI_BUS_TYPE_DEVICE getting the status is delayed till here so
+	 * that we can call acpi_bus_get_status() and use its quirk handling.
+	 * Note this must be done before the get power-/wakeup_dev-flags calls.
+	 */
+	if (type == ACPI_BUS_TYPE_DEVICE)
+		acpi_bus_get_status(device);
+
 	acpi_bus_get_power_flags(device);
 	acpi_bus_get_wakeup_device_flags(device);
 
@@ -1660,9 +1670,11 @@ static int acpi_bus_type_and_status(acpi_handle handle, int *type,
 			return -ENODEV;
 
 		*type = ACPI_BUS_TYPE_DEVICE;
-		status = acpi_bus_get_status_handle(handle, sta);
-		if (ACPI_FAILURE(status))
-			*sta = 0;
+		/*
+		 * acpi_add_single_object updates this once we've an acpi_device
+		 * so that acpi_bus_get_status' quirk handling can be used.
+		 */
+		*sta = 0;
 		break;
 	case ACPI_TYPE_PROCESSOR:
 		*type = ACPI_BUS_TYPE_PROCESSOR;
@@ -1760,6 +1772,8 @@ static void acpi_device_dep_initialize(struct acpi_device *adev)
 	acpi_status status;
 	int i;
 
+	adev->dep_unmet = 0;
+
 	if (!acpi_has_method(adev->handle, "_DEP"))
 		return;
 

From ba1edb9a5125a617d612f98eead14b9b84e75c3a Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Mon, 29 Jan 2018 10:26:46 +0800
Subject: [PATCH 293/676] ACPI: processor_perflib: Do not send _PPC change
 notification if not ready

The following warning was triggered after resumed from S3 -
if all the nonboot CPUs were put offline before suspend:

[ 1840.329515] unchecked MSR access error: RDMSR from 0x771 at rIP: 0xffffffff86061e3a (native_read_msr+0xa/0x30)
[ 1840.329516] Call Trace:
[ 1840.329521]  __rdmsr_on_cpu+0x33/0x50
[ 1840.329525]  generic_exec_single+0x81/0xb0
[ 1840.329527]  smp_call_function_single+0xd2/0x100
[ 1840.329530]  ? acpi_ds_result_pop+0xdd/0xf2
[ 1840.329532]  ? acpi_ds_create_operand+0x215/0x23c
[ 1840.329534]  rdmsrl_on_cpu+0x57/0x80
[ 1840.329536]  ? cpumask_next+0x1b/0x20
[ 1840.329538]  ? rdmsrl_on_cpu+0x57/0x80
[ 1840.329541]  intel_pstate_update_perf_limits+0xf3/0x220
[ 1840.329544]  ? notifier_call_chain+0x4a/0x70
[ 1840.329546]  intel_pstate_set_policy+0x4e/0x150
[ 1840.329548]  cpufreq_set_policy+0xcd/0x2f0
[ 1840.329550]  cpufreq_update_policy+0xb2/0x130
[ 1840.329552]  ? cpufreq_update_policy+0x130/0x130
[ 1840.329556]  acpi_processor_ppc_has_changed+0x65/0x80
[ 1840.329558]  acpi_processor_notify+0x80/0x100
[ 1840.329561]  acpi_ev_notify_dispatch+0x44/0x5c
[ 1840.329563]  acpi_os_execute_deferred+0x14/0x20
[ 1840.329565]  process_one_work+0x193/0x3c0
[ 1840.329567]  worker_thread+0x35/0x3b0
[ 1840.329569]  kthread+0x125/0x140
[ 1840.329571]  ? process_one_work+0x3c0/0x3c0
[ 1840.329572]  ? kthread_park+0x60/0x60
[ 1840.329575]  ? do_syscall_64+0x67/0x180
[ 1840.329577]  ret_from_fork+0x25/0x30
[ 1840.329585] unchecked MSR access error: WRMSR to 0x774 (tried to write 0x0000000000000000) at rIP: 0xffffffff86061f78 (native_write_msr+0x8/0x30)
[ 1840.329586] Call Trace:
[ 1840.329587]  __wrmsr_on_cpu+0x37/0x40
[ 1840.329589]  generic_exec_single+0x81/0xb0
[ 1840.329592]  smp_call_function_single+0xd2/0x100
[ 1840.329594]  ? acpi_ds_create_operand+0x215/0x23c
[ 1840.329595]  ? cpumask_next+0x1b/0x20
[ 1840.329597]  wrmsrl_on_cpu+0x57/0x70
[ 1840.329598]  ? rdmsrl_on_cpu+0x57/0x80
[ 1840.329599]  ? wrmsrl_on_cpu+0x57/0x70
[ 1840.329602]  intel_pstate_hwp_set+0xd3/0x150
[ 1840.329604]  intel_pstate_set_policy+0x119/0x150
[ 1840.329606]  cpufreq_set_policy+0xcd/0x2f0
[ 1840.329607]  cpufreq_update_policy+0xb2/0x130
[ 1840.329610]  ? cpufreq_update_policy+0x130/0x130
[ 1840.329613]  acpi_processor_ppc_has_changed+0x65/0x80
[ 1840.329615]  acpi_processor_notify+0x80/0x100
[ 1840.329617]  acpi_ev_notify_dispatch+0x44/0x5c
[ 1840.329619]  acpi_os_execute_deferred+0x14/0x20
[ 1840.329620]  process_one_work+0x193/0x3c0
[ 1840.329622]  worker_thread+0x35/0x3b0
[ 1840.329624]  kthread+0x125/0x140
[ 1840.329625]  ? process_one_work+0x3c0/0x3c0
[ 1840.329626]  ? kthread_park+0x60/0x60
[ 1840.329628]  ? do_syscall_64+0x67/0x180
[ 1840.329631]  ret_from_fork+0x25/0x30

This is because if there's only one online CPU, the MSR_PM_ENABLE
(package wide)can not be enabled after resumed, due to
intel_pstate_hwp_enable() will only be invoked on AP's online
process after resumed - if there's no AP online, the HWP remains
disabled after resumed (BIOS has disabled it in S3). Then if
there comes a _PPC change notification which touches HWP register
during this stage, the warning is triggered.

Since we don't call acpi_processor_register_performance() when
HWP is enabled, the pr->performance will be NULL. When this is
NULL we don't need to do _PPC change notification.

Reported-by: Doug Smythies <dsmythies@telus.net>
Suggested-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Yu Chen <yu.c.chen@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/processor_perflib.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
index 18b72eec35076..c7cf48ad5cb9d 100644
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -159,7 +159,7 @@ void acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag)
 {
 	int ret;
 
-	if (ignore_ppc) {
+	if (ignore_ppc || !pr->performance) {
 		/*
 		 * Only when it is notification event, the _OST object
 		 * will be evaluated. Otherwise it is skipped.

From 4446823e2573e97b26f0534ff8db9e9c4cb65840 Mon Sep 17 00:00:00 2001
From: Kai Heng Feng <kai.heng.feng@canonical.com>
Date: Mon, 29 Jan 2018 13:40:36 +0800
Subject: [PATCH 294/676] ACPI / battery: Add quirk for Asus UX360UA and
 UX410UAK

Same issue as other Asus laptops, ACPI incorrectly reports discharging
when battery is full and AC is plugged.

Use the same battery quirk can workaround the issue.

Link: https://bugs.launchpad.net/bugs/1661876
Link: https://bugs.launchpad.net/bugs/1745032
Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/battery.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c
index 19bc440820e64..7128488a3a728 100644
--- a/drivers/acpi/battery.c
+++ b/drivers/acpi/battery.c
@@ -1209,6 +1209,22 @@ static const struct dmi_system_id bat_dmi_table[] __initconst = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "UX305LA"),
 		},
 	},
+	{
+		.callback = battery_full_discharging_quirk,
+		.ident = "ASUS UX360UA",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "UX360UA"),
+		},
+	},
+	{
+		.callback = battery_full_discharging_quirk,
+		.ident = "ASUS UX410UAK",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "UX410UAK"),
+		},
+	},
 	{},
 };
 

From 248e8841b4fbb07048f1eb4f63b74bd3c4cf91f2 Mon Sep 17 00:00:00 2001
From: Yazen Ghannam <yazen.ghannam@amd.com>
Date: Mon, 29 Jan 2018 14:22:49 -0600
Subject: [PATCH 295/676] ACPI / processor: Set default C1 idle state
 description

The ACPI idle driver will default to ACPI_CSTATE_HALT for C1 if a _CST
object for C1 is not defined. However, the description will not be set,
so users will see "<null>" when reading the description from sysfs.

Set the C1 state description when defaulting to ACPI_CSTATE_HALT.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/processor_idle.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index d50a7b6ccddd9..226dc5665d810 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -291,6 +291,9 @@ static int acpi_processor_get_power_info_default(struct acpi_processor *pr)
 		pr->power.states[ACPI_STATE_C1].type = ACPI_STATE_C1;
 		pr->power.states[ACPI_STATE_C1].valid = 1;
 		pr->power.states[ACPI_STATE_C1].entry_method = ACPI_CSTATE_HALT;
+
+		snprintf(pr->power.states[ACPI_STATE_C1].desc,
+			 ACPI_CX_DESC_LEN, "ACPI HLT");
 	}
 	/* the C0 state only exists as a filler in our array */
 	pr->power.states[ACPI_STATE_C0].valid = 1;

From 09584b406742413ac4c8d7e030374d4daa045b69 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Fri, 2 Feb 2018 22:37:15 -0800
Subject: [PATCH 296/676] bpf: fix selftests/bpf test_kmod.sh failure when
 CONFIG_BPF_JIT_ALWAYS_ON=y

With CONFIG_BPF_JIT_ALWAYS_ON is defined in the config file,
tools/testing/selftests/bpf/test_kmod.sh failed like below:
  [root@localhost bpf]# ./test_kmod.sh
  sysctl: setting key "net.core.bpf_jit_enable": Invalid argument
  [ JIT enabled:0 hardened:0 ]
  [  132.175681] test_bpf: #297 BPF_MAXINSNS: Jump, gap, jump, ... FAIL to prog_create err=-524 len=4096
  [  132.458834] test_bpf: Summary: 348 PASSED, 1 FAILED, [340/340 JIT'ed]
  [ JIT enabled:1 hardened:0 ]
  [  133.456025] test_bpf: #297 BPF_MAXINSNS: Jump, gap, jump, ... FAIL to prog_create err=-524 len=4096
  [  133.730935] test_bpf: Summary: 348 PASSED, 1 FAILED, [340/340 JIT'ed]
  [ JIT enabled:1 hardened:1 ]
  [  134.769730] test_bpf: #297 BPF_MAXINSNS: Jump, gap, jump, ... FAIL to prog_create err=-524 len=4096
  [  135.050864] test_bpf: Summary: 348 PASSED, 1 FAILED, [340/340 JIT'ed]
  [ JIT enabled:1 hardened:2 ]
  [  136.442882] test_bpf: #297 BPF_MAXINSNS: Jump, gap, jump, ... FAIL to prog_create err=-524 len=4096
  [  136.821810] test_bpf: Summary: 348 PASSED, 1 FAILED, [340/340 JIT'ed]
  [root@localhost bpf]#

The test_kmod.sh load/remove test_bpf.ko multiple times with different
settings for sysctl net.core.bpf_jit_{enable,harden}. The failed test #297
of test_bpf.ko is designed such that JIT always fails.

Commit 290af86629b2 (bpf: introduce BPF_JIT_ALWAYS_ON config)
introduced the following tightening logic:
    ...
        if (!bpf_prog_is_dev_bound(fp->aux)) {
                fp = bpf_int_jit_compile(fp);
    #ifdef CONFIG_BPF_JIT_ALWAYS_ON
                if (!fp->jited) {
                        *err = -ENOTSUPP;
                        return fp;
                }
    #endif
    ...
With this logic, Test #297 always gets return value -ENOTSUPP
when CONFIG_BPF_JIT_ALWAYS_ON is defined, causing the test failure.

This patch fixed the failure by marking Test #297 as expected failure
when CONFIG_BPF_JIT_ALWAYS_ON is defined.

Fixes: 290af86629b2 (bpf: introduce BPF_JIT_ALWAYS_ON config)
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 lib/test_bpf.c | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 4cd9ea9b3449f..b4e22345963f3 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -83,6 +83,7 @@ struct bpf_test {
 		__u32 result;
 	} test[MAX_SUBTESTS];
 	int (*fill_helper)(struct bpf_test *self);
+	int expected_errcode; /* used when FLAG_EXPECTED_FAIL is set in the aux */
 	__u8 frag_data[MAX_DATA];
 	int stack_depth; /* for eBPF only, since tests don't call verifier */
 };
@@ -2026,7 +2027,9 @@ static struct bpf_test tests[] = {
 		},
 		CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL,
 		{ },
-		{ }
+		{ },
+		.fill_helper = NULL,
+		.expected_errcode = -EINVAL,
 	},
 	{
 		"check: div_k_0",
@@ -2036,7 +2039,9 @@ static struct bpf_test tests[] = {
 		},
 		CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL,
 		{ },
-		{ }
+		{ },
+		.fill_helper = NULL,
+		.expected_errcode = -EINVAL,
 	},
 	{
 		"check: unknown insn",
@@ -2047,7 +2052,9 @@ static struct bpf_test tests[] = {
 		},
 		CLASSIC | FLAG_EXPECTED_FAIL,
 		{ },
-		{ }
+		{ },
+		.fill_helper = NULL,
+		.expected_errcode = -EINVAL,
 	},
 	{
 		"check: out of range spill/fill",
@@ -2057,7 +2064,9 @@ static struct bpf_test tests[] = {
 		},
 		CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL,
 		{ },
-		{ }
+		{ },
+		.fill_helper = NULL,
+		.expected_errcode = -EINVAL,
 	},
 	{
 		"JUMPS + HOLES",
@@ -2149,6 +2158,8 @@ static struct bpf_test tests[] = {
 		CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL,
 		{ },
 		{ },
+		.fill_helper = NULL,
+		.expected_errcode = -EINVAL,
 	},
 	{
 		"check: LDX + RET X",
@@ -2159,6 +2170,8 @@ static struct bpf_test tests[] = {
 		CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL,
 		{ },
 		{ },
+		.fill_helper = NULL,
+		.expected_errcode = -EINVAL,
 	},
 	{	/* Mainly checking JIT here. */
 		"M[]: alt STX + LDX",
@@ -2333,6 +2346,8 @@ static struct bpf_test tests[] = {
 		CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL,
 		{ },
 		{ },
+		.fill_helper = NULL,
+		.expected_errcode = -EINVAL,
 	},
 	{	/* Passes checker but fails during runtime. */
 		"LD [SKF_AD_OFF-1]",
@@ -5395,6 +5410,7 @@ static struct bpf_test tests[] = {
 		{ },
 		{ },
 		.fill_helper = bpf_fill_maxinsns4,
+		.expected_errcode = -EINVAL,
 	},
 	{	/* Mainly checking JIT here. */
 		"BPF_MAXINSNS: Very long jump",
@@ -5450,10 +5466,15 @@ static struct bpf_test tests[] = {
 	{
 		"BPF_MAXINSNS: Jump, gap, jump, ...",
 		{ },
+#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+		CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL,
+#else
 		CLASSIC | FLAG_NO_DATA,
+#endif
 		{ },
 		{ { 0, 0xababcbac } },
 		.fill_helper = bpf_fill_maxinsns11,
+		.expected_errcode = -ENOTSUPP,
 	},
 	{
 		"BPF_MAXINSNS: ld_abs+get_processor_id",
@@ -6344,7 +6365,7 @@ static struct bpf_prog *generate_filter(int which, int *err)
 
 		*err = bpf_prog_create(&fp, &fprog);
 		if (tests[which].aux & FLAG_EXPECTED_FAIL) {
-			if (*err == -EINVAL) {
+			if (*err == tests[which].expected_errcode) {
 				pr_cont("PASS\n");
 				/* Verifier rejected filter as expected. */
 				*err = 0;

From 6587553031d33f3abbf33b7431a66f0c881d0def Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Sat, 27 Jan 2018 12:40:13 +0100
Subject: [PATCH 297/676] s390/runtime_instrumentation: re-add signum system
 call parameter

Add the signum system call parameter for documentation purposes only,
and without checking if the passed value is a valid real-time signal.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/runtime_instr.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/s390/kernel/runtime_instr.c b/arch/s390/kernel/runtime_instr.c
index 09f5bf0d5c0c8..125c7f6e87150 100644
--- a/arch/s390/kernel/runtime_instr.c
+++ b/arch/s390/kernel/runtime_instr.c
@@ -18,6 +18,8 @@
 #include <asm/cpu_mf.h>
 #include <asm/irq.h>
 
+#include "entry.h"
+
 /* empty control block to disable RI by loading it */
 struct runtime_instr_cb runtime_instr_empty_cb;
 
@@ -59,7 +61,13 @@ static void init_runtime_instr_cb(struct runtime_instr_cb *cb)
 	cb->v = 1;
 }
 
-SYSCALL_DEFINE1(s390_runtime_instr, int, command)
+/*
+ * The signum argument is unused. In older kernels it was used to
+ * specify a real-time signal. For backwards compatibility user space
+ * should pass a valid real-time signal number (the signum argument
+ * was checked in older kernels).
+ */
+SYSCALL_DEFINE2(s390_runtime_instr, int, command, int, signum)
 {
 	struct runtime_instr_cb *cb;
 

From 364e3f90f8e9198681ccccad4bea2f681a8ffba2 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Mon, 29 Jan 2018 12:55:29 +0100
Subject: [PATCH 298/676] s390/cio: fix kernel-doc usage

Fix the kernel-doc usage in cio to get rid of (W=1) build warnings like:
drivers/s390/cio/cio.c:1068: warning: No description found for parameter 'sch'

Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/cio/chp.c         | 10 +++++-----
 drivers/s390/cio/cio.c         |  2 +-
 drivers/s390/cio/itcw.c        |  2 +-
 drivers/s390/cio/qdio_main.c   |  4 +---
 drivers/s390/cio/vfio_ccw_cp.c |  2 +-
 5 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c
index 5c94a3aec4dd2..f95b452b8bbcc 100644
--- a/drivers/s390/cio/chp.c
+++ b/drivers/s390/cio/chp.c
@@ -412,7 +412,7 @@ static void chp_release(struct device *dev)
 
 /**
  * chp_update_desc - update channel-path description
- * @chp - channel-path
+ * @chp: channel-path
  *
  * Update the channel-path description of the specified channel-path
  * including channel measurement related information.
@@ -438,7 +438,7 @@ int chp_update_desc(struct channel_path *chp)
 
 /**
  * chp_new - register a new channel-path
- * @chpid - channel-path ID
+ * @chpid: channel-path ID
  *
  * Create and register data structure representing new channel-path. Return
  * zero on success, non-zero otherwise.
@@ -730,8 +730,8 @@ static void cfg_func(struct work_struct *work)
 
 /**
  * chp_cfg_schedule - schedule chpid configuration request
- * @chpid - channel-path ID
- * @configure - Non-zero for configure, zero for deconfigure
+ * @chpid: channel-path ID
+ * @configure: Non-zero for configure, zero for deconfigure
  *
  * Schedule a channel-path configuration/deconfiguration request.
  */
@@ -747,7 +747,7 @@ void chp_cfg_schedule(struct chp_id chpid, int configure)
 
 /**
  * chp_cfg_cancel_deconfigure - cancel chpid deconfiguration request
- * @chpid - channel-path ID
+ * @chpid: channel-path ID
  *
  * Cancel an active channel-path deconfiguration request if it has not yet
  * been performed.
diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c
index 987bf9a8c9f72..6886b3d34cf84 100644
--- a/drivers/s390/cio/cio.c
+++ b/drivers/s390/cio/cio.c
@@ -1059,7 +1059,7 @@ EXPORT_SYMBOL_GPL(cio_tm_start_key);
 
 /**
  * cio_tm_intrg - perform interrogate function
- * @sch - subchannel on which to perform the interrogate function
+ * @sch: subchannel on which to perform the interrogate function
  *
  * If the specified subchannel is running in transport-mode, perform the
  * interrogate function. Return zero on success, non-zero otherwie.
diff --git a/drivers/s390/cio/itcw.c b/drivers/s390/cio/itcw.c
index deaf59f93326f..19e46363348cc 100644
--- a/drivers/s390/cio/itcw.c
+++ b/drivers/s390/cio/itcw.c
@@ -15,7 +15,7 @@
 #include <asm/fcx.h>
 #include <asm/itcw.h>
 
-/**
+/*
  * struct itcw - incremental tcw helper data type
  *
  * This structure serves as a handle for the incremental construction of a
diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c
index 95b0efe28afb5..d5b02de02a3af 100644
--- a/drivers/s390/cio/qdio_main.c
+++ b/drivers/s390/cio/qdio_main.c
@@ -72,6 +72,7 @@ static inline int do_siga_input(unsigned long schid, unsigned int mask,
  * @mask: which output queues to process
  * @bb: busy bit indicator, set only if SIGA-w/wt could not access a buffer
  * @fc: function code to perform
+ * @aob: asynchronous operation block
  *
  * Returns condition code.
  * Note: For IQDC unicast queues only the highest priority queue is processed.
@@ -1761,9 +1762,6 @@ EXPORT_SYMBOL(qdio_stop_irq);
  * @response:		Response code will be stored at this address
  * @cb: 		Callback function will be executed for each element
  *			of the address list
- * @priv:		Pointer passed from the caller to qdio_pnso_brinfo()
- * @type:		Type of the address entry passed to the callback
- * @entry:		Entry containg the address of the specified type
  * @priv:		Pointer to pass to the callback function.
  *
  * Performs "Store-network-bridging-information list" operation and calls
diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c
index d9a2fffd034be..2c7550797ec2f 100644
--- a/drivers/s390/cio/vfio_ccw_cp.c
+++ b/drivers/s390/cio/vfio_ccw_cp.c
@@ -835,7 +835,7 @@ void cp_update_scsw(struct channel_program *cp, union scsw *scsw)
 
 /**
  * cp_iova_pinned() - check if an iova is pinned for a ccw chain.
- * @cmd: ccwchain command on which to perform the operation
+ * @cp: channel_program on which to perform the operation
  * @iova: the iova to check
  *
  * If the @iova is currently pinned for the ccw chain, return true;

From 7041d28115e91f2144f811ffe8a195c696b1e1d0 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 16 Jan 2018 13:27:30 +0100
Subject: [PATCH 299/676] s390: scrub registers on kernel entry and KVM exit

Clear all user space registers on entry to the kernel and all KVM guest
registers on KVM guest exit if the register does not contain either a
parameter or a result value.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/entry.S | 47 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 6cd444d25545c..5d87eda605f22 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -248,6 +248,12 @@ ENTRY(sie64a)
 sie_exit:
 	lg	%r14,__SF_EMPTY+8(%r15)		# load guest register save area
 	stmg	%r0,%r13,0(%r14)		# save guest gprs 0-13
+	xgr	%r0,%r0				# clear guest registers to
+	xgr	%r1,%r1				# prevent speculative use
+	xgr	%r2,%r2
+	xgr	%r3,%r3
+	xgr	%r4,%r4
+	xgr	%r5,%r5
 	lmg	%r6,%r14,__SF_GPRS(%r15)	# restore kernel registers
 	lg	%r2,__SF_EMPTY+16(%r15)		# return exit reason code
 	br	%r14
@@ -282,6 +288,8 @@ ENTRY(system_call)
 .Lsysc_vtime:
 	UPDATE_VTIME %r8,%r9,__LC_SYNC_ENTER_TIMER
 	stmg	%r0,%r7,__PT_R0(%r11)
+	# clear user controlled register to prevent speculative use
+	xgr	%r0,%r0
 	mvc	__PT_R8(64,%r11),__LC_SAVE_AREA_SYNC
 	mvc	__PT_PSW(16,%r11),__LC_SVC_OLD_PSW
 	mvc	__PT_INT_CODE(4,%r11),__LC_SVC_ILC
@@ -561,6 +569,15 @@ ENTRY(pgm_check_handler)
 4:	lgr	%r13,%r11
 	la	%r11,STACK_FRAME_OVERHEAD(%r15)
 	stmg	%r0,%r7,__PT_R0(%r11)
+	# clear user controlled registers to prevent speculative use
+	xgr	%r0,%r0
+	xgr	%r1,%r1
+	xgr	%r2,%r2
+	xgr	%r3,%r3
+	xgr	%r4,%r4
+	xgr	%r5,%r5
+	xgr	%r6,%r6
+	xgr	%r7,%r7
 	mvc	__PT_R8(64,%r11),__LC_SAVE_AREA_SYNC
 	stmg	%r8,%r9,__PT_PSW(%r11)
 	mvc	__PT_INT_CODE(4,%r11),__LC_PGM_ILC
@@ -626,6 +643,16 @@ ENTRY(io_int_handler)
 	lmg	%r8,%r9,__LC_IO_OLD_PSW
 	SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER
 	stmg	%r0,%r7,__PT_R0(%r11)
+	# clear user controlled registers to prevent speculative use
+	xgr	%r0,%r0
+	xgr	%r1,%r1
+	xgr	%r2,%r2
+	xgr	%r3,%r3
+	xgr	%r4,%r4
+	xgr	%r5,%r5
+	xgr	%r6,%r6
+	xgr	%r7,%r7
+	xgr	%r10,%r10
 	mvc	__PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC
 	stmg	%r8,%r9,__PT_PSW(%r11)
 	mvc	__PT_INT_CODE(12,%r11),__LC_SUBCHANNEL_ID
@@ -839,6 +866,16 @@ ENTRY(ext_int_handler)
 	lmg	%r8,%r9,__LC_EXT_OLD_PSW
 	SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER
 	stmg	%r0,%r7,__PT_R0(%r11)
+	# clear user controlled registers to prevent speculative use
+	xgr	%r0,%r0
+	xgr	%r1,%r1
+	xgr	%r2,%r2
+	xgr	%r3,%r3
+	xgr	%r4,%r4
+	xgr	%r5,%r5
+	xgr	%r6,%r6
+	xgr	%r7,%r7
+	xgr	%r10,%r10
 	mvc	__PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC
 	stmg	%r8,%r9,__PT_PSW(%r11)
 	lghi	%r1,__LC_EXT_PARAMS2
@@ -1046,6 +1083,16 @@ ENTRY(mcck_int_handler)
 .Lmcck_skip:
 	lghi	%r14,__LC_GPREGS_SAVE_AREA+64
 	stmg	%r0,%r7,__PT_R0(%r11)
+	# clear user controlled registers to prevent speculative use
+	xgr	%r0,%r0
+	xgr	%r1,%r1
+	xgr	%r2,%r2
+	xgr	%r3,%r3
+	xgr	%r4,%r4
+	xgr	%r5,%r5
+	xgr	%r6,%r6
+	xgr	%r7,%r7
+	xgr	%r10,%r10
 	mvc	__PT_R8(64,%r11),0(%r14)
 	stmg	%r8,%r9,__PT_PSW(%r11)
 	xc	__PT_FLAGS(8,%r11),__PT_FLAGS(%r11)

From e2dd833389cc4069a96b57bdd24227b5f52288f5 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 26 Jan 2018 12:01:55 +0100
Subject: [PATCH 300/676] s390: add optimized array_index_mask_nospec

Add an optimized version of the array_index_mask_nospec function for
s390 based on a compare and a subtract with borrow.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/barrier.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h
index 10432607a5736..f9eddbca79d28 100644
--- a/arch/s390/include/asm/barrier.h
+++ b/arch/s390/include/asm/barrier.h
@@ -49,6 +49,30 @@ do {									\
 #define __smp_mb__before_atomic()	barrier()
 #define __smp_mb__after_atomic()	barrier()
 
+/**
+ * array_index_mask_nospec - generate a mask for array_idx() that is
+ * ~0UL when the bounds check succeeds and 0 otherwise
+ * @index: array element index
+ * @size: number of elements in array
+ */
+#define array_index_mask_nospec array_index_mask_nospec
+static inline unsigned long array_index_mask_nospec(unsigned long index,
+						    unsigned long size)
+{
+	unsigned long mask;
+
+	if (__builtin_constant_p(size) && size > 0) {
+		asm("	clgr	%2,%1\n"
+		    "	slbgr	%0,%0\n"
+		    :"=d" (mask) : "d" (size-1), "d" (index) :"cc");
+		return mask;
+	}
+	asm("	clgr	%1,%2\n"
+	    "	slbgr	%0,%0\n"
+	    :"=d" (mask) : "d" (size), "d" (index) :"cc");
+	return ~mask;
+}
+
 #include <asm-generic/barrier.h>
 
 #endif /* __ASM_BARRIER_H */

From 703cbaa601ff3fb554d1246c336ba727cc083ea0 Mon Sep 17 00:00:00 2001
From: Bo Yan <byan@nvidia.com>
Date: Tue, 23 Jan 2018 13:57:55 -0800
Subject: [PATCH 301/676] cpufreq: Skip cpufreq resume if it's not suspended

cpufreq_resume can be called even without preceding cpufreq_suspend.
This can happen in following scenario:

    suspend_devices_and_enter
       --> dpm_suspend_start
          --> dpm_prepare
              --> device_prepare : this function errors out
          --> dpm_suspend: this is skipped due to dpm_prepare failure
                           this means cpufreq_suspend is skipped over
       --> goto Recover_platform, due to previous error
       --> goto Resume_devices
       --> dpm_resume_end
           --> dpm_resume
               --> cpufreq_resume

In case schedutil is used as frequency governor, cpufreq_resume will
eventually call sugov_start, which does following:

    memset(sg_cpu, 0, sizeof(*sg_cpu));
    ....

This effectively erases function pointer for frequency update, causing
crash later on. The function pointer would have been set correctly if
subsequent cpufreq_add_update_util_hook runs successfully, but that
function returns earlier because cpufreq_suspend was not called:

    if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))
		return;

The fix is to check cpufreq_suspended first, if it's false, that means
cpufreq_suspend was not called in the first place, so do not resume
cpufreq.

Signed-off-by: Bo Yan <byan@nvidia.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
[ rjw: Dropped printing a message ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 421f318c0e669..de33ebf008ada 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1686,6 +1686,9 @@ void cpufreq_resume(void)
 	if (!cpufreq_driver)
 		return;
 
+	if (unlikely(!cpufreq_suspended))
+		return;
+
 	cpufreq_suspended = false;
 
 	if (!has_target() && !cpufreq_driver->resume)

From cf1489984641369611556bf00c48f945c77bcf02 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 16 Jan 2018 07:03:44 +0100
Subject: [PATCH 302/676] s390/alternative: use a copy of the facility bit mask

To be able to switch off specific CPU alternatives with kernel parameters
make a copy of the facility bit mask provided by STFLE and use the copy
for the decision to apply an alternative.

Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/facility.h | 18 ++++++++++++++++++
 arch/s390/include/asm/lowcore.h  |  3 ++-
 arch/s390/kernel/alternative.c   |  3 ++-
 arch/s390/kernel/early.c         |  3 +++
 arch/s390/kernel/setup.c         |  4 +++-
 arch/s390/kernel/smp.c           |  4 +++-
 6 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/arch/s390/include/asm/facility.h b/arch/s390/include/asm/facility.h
index fbe0c4be3cd8f..99c8ce30b3cd1 100644
--- a/arch/s390/include/asm/facility.h
+++ b/arch/s390/include/asm/facility.h
@@ -15,6 +15,24 @@
 
 #define MAX_FACILITY_BIT (sizeof(((struct lowcore *)0)->stfle_fac_list) * 8)
 
+static inline void __set_facility(unsigned long nr, void *facilities)
+{
+	unsigned char *ptr = (unsigned char *) facilities;
+
+	if (nr >= MAX_FACILITY_BIT)
+		return;
+	ptr[nr >> 3] |= 0x80 >> (nr & 7);
+}
+
+static inline void __clear_facility(unsigned long nr, void *facilities)
+{
+	unsigned char *ptr = (unsigned char *) facilities;
+
+	if (nr >= MAX_FACILITY_BIT)
+		return;
+	ptr[nr >> 3] &= ~(0x80 >> (nr & 7));
+}
+
 static inline int __test_facility(unsigned long nr, void *facilities)
 {
 	unsigned char *ptr;
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index ec6592e8ba36e..c63986aee9427 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -151,7 +151,8 @@ struct lowcore {
 	__u8	pad_0x0e20[0x0f00-0x0e20];	/* 0x0e20 */
 
 	/* Extended facility list */
-	__u64	stfle_fac_list[32];		/* 0x0f00 */
+	__u64	stfle_fac_list[16];		/* 0x0f00 */
+	__u64	alt_stfle_fac_list[16];		/* 0x0f80 */
 	__u8	pad_0x1000[0x11b0-0x1000];	/* 0x1000 */
 
 	/* Pointer to the machine check extended save area */
diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c
index 574e77622c049..1abf4f35d059e 100644
--- a/arch/s390/kernel/alternative.c
+++ b/arch/s390/kernel/alternative.c
@@ -75,7 +75,8 @@ static void __init_or_module __apply_alternatives(struct alt_instr *start,
 		instr = (u8 *)&a->instr_offset + a->instr_offset;
 		replacement = (u8 *)&a->repl_offset + a->repl_offset;
 
-		if (!test_facility(a->facility))
+		if (!__test_facility(a->facility,
+				     S390_lowcore.alt_stfle_fac_list))
 			continue;
 
 		if (unlikely(a->instrlen % 2 || a->replacementlen % 2)) {
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index 497a920475918..510f2183a7e73 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -193,6 +193,9 @@ static noinline __init void setup_facility_list(void)
 {
 	stfle(S390_lowcore.stfle_fac_list,
 	      ARRAY_SIZE(S390_lowcore.stfle_fac_list));
+	memcpy(S390_lowcore.alt_stfle_fac_list,
+	       S390_lowcore.stfle_fac_list,
+	       sizeof(S390_lowcore.alt_stfle_fac_list));
 }
 
 static __init void detect_diag9c(void)
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 793da97f9a6e5..bcd2a4a3937e8 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -340,7 +340,9 @@ static void __init setup_lowcore(void)
 	lc->preempt_count = S390_lowcore.preempt_count;
 	lc->stfl_fac_list = S390_lowcore.stfl_fac_list;
 	memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list,
-	       MAX_FACILITY_BIT/8);
+	       sizeof(lc->stfle_fac_list));
+	memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list,
+	       sizeof(lc->alt_stfle_fac_list));
 	nmi_alloc_boot_cpu(lc);
 	vdso_alloc_boot_cpu(lc);
 	lc->sync_enter_timer = S390_lowcore.sync_enter_timer;
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index a919b2f0141da..fc28c95716473 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -266,7 +266,9 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu)
 	__ctl_store(lc->cregs_save_area, 0, 15);
 	save_access_regs((unsigned int *) lc->access_regs_save_area);
 	memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list,
-	       MAX_FACILITY_BIT/8);
+	       sizeof(lc->stfle_fac_list));
+	memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list,
+	       sizeof(lc->alt_stfle_fac_list));
 	arch_spin_lock_setup(cpu);
 }
 

From d768bd892fc8f066cd3aa000eb1867bcf32db0ee Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 16 Jan 2018 07:11:45 +0100
Subject: [PATCH 303/676] s390: add options to change branch prediction
 behaviour for the kernel

Add the PPA instruction to the system entry and exit path to switch
the kernel to a different branch prediction behaviour. The instructions
are added via CPU alternatives and can be disabled with the "nospec"
or the "nobp=0" kernel parameter. If the default behaviour selected
with CONFIG_KERNEL_NOBP is set to "n" then the "nobp=1" parameter can be
used to enable the changed kernel branch prediction.

Acked-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/Kconfig                 | 17 +++++++++++
 arch/s390/include/asm/processor.h |  1 +
 arch/s390/kernel/alternative.c    | 23 +++++++++++++++
 arch/s390/kernel/early.c          |  2 ++
 arch/s390/kernel/entry.S          | 48 +++++++++++++++++++++++++++++++
 arch/s390/kernel/ipl.c            |  1 +
 arch/s390/kernel/smp.c            |  2 ++
 7 files changed, 94 insertions(+)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 0105ce28e246a..d514e25095c26 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -540,6 +540,23 @@ config ARCH_RANDOM
 
 	  If unsure, say Y.
 
+config KERNEL_NOBP
+	def_bool n
+	prompt "Enable modified branch prediction for the kernel by default"
+	help
+	  If this option is selected the kernel will switch to a modified
+	  branch prediction mode if the firmware interface is available.
+	  The modified branch prediction mode improves the behaviour in
+	  regard to speculative execution.
+
+	  With the option enabled the kernel parameter "nobp=0" or "nospec"
+	  can be used to run the kernel in the normal branch prediction mode.
+
+	  With the option disabled the modified branch prediction mode is
+	  enabled with the "nobp=1" kernel parameter.
+
+	  If unsure, say N.
+
 endmenu
 
 menu "Memory setup"
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index bfbfad4822897..5f37f9ceef5eb 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -91,6 +91,7 @@ void cpu_detect_mhz_feature(void);
 extern const struct seq_operations cpuinfo_op;
 extern int sysctl_ieee_emulation_warnings;
 extern void execve_tail(void);
+extern void __bpon(void);
 
 /*
  * User space process size: 2GB for 31 bit, 4TB or 8PT for 64 bit.
diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c
index 1abf4f35d059e..22476135f7384 100644
--- a/arch/s390/kernel/alternative.c
+++ b/arch/s390/kernel/alternative.c
@@ -15,6 +15,29 @@ static int __init disable_alternative_instructions(char *str)
 
 early_param("noaltinstr", disable_alternative_instructions);
 
+static int __init nobp_setup_early(char *str)
+{
+	bool enabled;
+	int rc;
+
+	rc = kstrtobool(str, &enabled);
+	if (rc)
+		return rc;
+	if (enabled && test_facility(82))
+		__set_facility(82, S390_lowcore.alt_stfle_fac_list);
+	else
+		__clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+	return 0;
+}
+early_param("nobp", nobp_setup_early);
+
+static int __init nospec_setup_early(char *str)
+{
+	__clear_facility(82, S390_lowcore.alt_stfle_fac_list);
+	return 0;
+}
+early_param("nospec", nospec_setup_early);
+
 struct brcl_insn {
 	u16 opc;
 	s32 disp;
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index 510f2183a7e73..ac707a9f729ea 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -196,6 +196,8 @@ static noinline __init void setup_facility_list(void)
 	memcpy(S390_lowcore.alt_stfle_fac_list,
 	       S390_lowcore.stfle_fac_list,
 	       sizeof(S390_lowcore.alt_stfle_fac_list));
+	if (!IS_ENABLED(CONFIG_KERNEL_NOBP))
+		__clear_facility(82, S390_lowcore.alt_stfle_fac_list);
 }
 
 static __init void detect_diag9c(void)
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 5d87eda605f22..e6d7550a3af8b 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -159,6 +159,34 @@ _PIF_WORK	= (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART)
 		tm	off+\addr, \mask
 	.endm
 
+	.macro BPOFF
+	.pushsection .altinstr_replacement, "ax"
+660:	.long	0xb2e8c000
+	.popsection
+661:	.long	0x47000000
+	.pushsection .altinstructions, "a"
+	.long 661b - .
+	.long 660b - .
+	.word 82
+	.byte 4
+	.byte 4
+	.popsection
+	.endm
+
+	.macro BPON
+	.pushsection .altinstr_replacement, "ax"
+662:	.long	0xb2e8d000
+	.popsection
+663:	.long	0x47000000
+	.pushsection .altinstructions, "a"
+	.long 663b - .
+	.long 662b - .
+	.word 82
+	.byte 4
+	.byte 4
+	.popsection
+	.endm
+
 	.section .kprobes.text, "ax"
 .Ldummy:
 	/*
@@ -171,6 +199,11 @@ _PIF_WORK	= (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART)
 	 */
 	nop	0
 
+ENTRY(__bpon)
+	.globl __bpon
+	BPON
+	br	%r14
+
 /*
  * Scheduler resume function, called by switch_to
  *  gpr2 = (task_struct *) prev
@@ -226,8 +259,11 @@ ENTRY(sie64a)
 	jnz	.Lsie_skip
 	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU
 	jo	.Lsie_skip			# exit if fp/vx regs changed
+	BPON
 .Lsie_entry:
 	sie	0(%r14)
+.Lsie_exit:
+	BPOFF
 .Lsie_skip:
 	ni	__SIE_PROG0C+3(%r14),0xfe	# no longer in SIE
 	lctlg	%c1,%c1,__LC_USER_ASCE		# load primary asce
@@ -279,6 +315,7 @@ ENTRY(system_call)
 	stpt	__LC_SYNC_ENTER_TIMER
 .Lsysc_stmg:
 	stmg	%r8,%r15,__LC_SAVE_AREA_SYNC
+	BPOFF
 	lg	%r12,__LC_CURRENT
 	lghi	%r13,__TASK_thread
 	lghi	%r14,_PIF_SYSCALL
@@ -325,6 +362,7 @@ ENTRY(system_call)
 	jnz	.Lsysc_work			# check for work
 	TSTMSK	__LC_CPU_FLAGS,_CIF_WORK
 	jnz	.Lsysc_work
+	BPON
 .Lsysc_restore:
 	lg	%r14,__LC_VDSO_PER_CPU
 	lmg	%r0,%r10,__PT_R0(%r11)
@@ -530,6 +568,7 @@ ENTRY(kernel_thread_starter)
 
 ENTRY(pgm_check_handler)
 	stpt	__LC_SYNC_ENTER_TIMER
+	BPOFF
 	stmg	%r8,%r15,__LC_SAVE_AREA_SYNC
 	lg	%r10,__LC_LAST_BREAK
 	lg	%r12,__LC_CURRENT
@@ -637,6 +676,7 @@ ENTRY(pgm_check_handler)
 ENTRY(io_int_handler)
 	STCK	__LC_INT_CLOCK
 	stpt	__LC_ASYNC_ENTER_TIMER
+	BPOFF
 	stmg	%r8,%r15,__LC_SAVE_AREA_ASYNC
 	lg	%r12,__LC_CURRENT
 	larl	%r13,cleanup_critical
@@ -687,9 +727,13 @@ ENTRY(io_int_handler)
 	lg	%r14,__LC_VDSO_PER_CPU
 	lmg	%r0,%r10,__PT_R0(%r11)
 	mvc	__LC_RETURN_PSW(16),__PT_PSW(%r11)
+	tm	__PT_PSW+1(%r11),0x01	# returning to user ?
+	jno	.Lio_exit_kernel
+	BPON
 .Lio_exit_timer:
 	stpt	__LC_EXIT_TIMER
 	mvc	__VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
+.Lio_exit_kernel:
 	lmg	%r11,%r15,__PT_R11(%r11)
 	lpswe	__LC_RETURN_PSW
 .Lio_done:
@@ -860,6 +904,7 @@ ENTRY(io_int_handler)
 ENTRY(ext_int_handler)
 	STCK	__LC_INT_CLOCK
 	stpt	__LC_ASYNC_ENTER_TIMER
+	BPOFF
 	stmg	%r8,%r15,__LC_SAVE_AREA_ASYNC
 	lg	%r12,__LC_CURRENT
 	larl	%r13,cleanup_critical
@@ -908,6 +953,7 @@ ENTRY(psw_idle)
 .Lpsw_idle_stcctm:
 #endif
 	oi	__LC_CPU_FLAGS+7,_CIF_ENABLED_WAIT
+	BPON
 	STCK	__CLOCK_IDLE_ENTER(%r2)
 	stpt	__TIMER_IDLE_ENTER(%r2)
 .Lpsw_idle_lpsw:
@@ -1008,6 +1054,7 @@ load_fpu_regs:
  */
 ENTRY(mcck_int_handler)
 	STCK	__LC_MCCK_CLOCK
+	BPOFF
 	la	%r1,4095		# validate r1
 	spt	__LC_CPU_TIMER_SAVE_AREA-4095(%r1)	# validate cpu timer
 	sckc	__LC_CLOCK_COMPARATOR			# validate comparator
@@ -1118,6 +1165,7 @@ ENTRY(mcck_int_handler)
 	mvc	__LC_RETURN_MCCK_PSW(16),__PT_PSW(%r11) # move return PSW
 	tm	__LC_RETURN_MCCK_PSW+1,0x01 # returning to user ?
 	jno	0f
+	BPON
 	stpt	__LC_EXIT_TIMER
 	mvc	__VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
 0:	lmg	%r11,%r15,__PT_R11(%r11)
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index da5cc3b469aa1..34477c1aee6df 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -543,6 +543,7 @@ static struct kset *ipl_kset;
 
 static void __ipl_run(void *unused)
 {
+	__bpon();
 	diag308(DIAG308_LOAD_CLEAR, NULL);
 	if (MACHINE_IS_VM)
 		__cpcmd("IPL", NULL, 0, NULL);
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index fc28c95716473..2fd7d609dae08 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -319,6 +319,7 @@ static void pcpu_delegate(struct pcpu *pcpu, void (*func)(void *),
 	mem_assign_absolute(lc->restart_fn, (unsigned long) func);
 	mem_assign_absolute(lc->restart_data, (unsigned long) data);
 	mem_assign_absolute(lc->restart_source, source_cpu);
+	__bpon();
 	asm volatile(
 		"0:	sigp	0,%0,%2	# sigp restart to target cpu\n"
 		"	brc	2,0b	# busy, try again\n"
@@ -903,6 +904,7 @@ void __cpu_die(unsigned int cpu)
 void __noreturn cpu_die(void)
 {
 	idle_task_exit();
+	__bpon();
 	pcpu_sigp_retry(pcpu_devices + smp_processor_id(), SIGP_STOP, 0);
 	for (;;) ;
 }

From 6b73044b2b0081ee3dd1cd6eaab7dee552601efb Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 16 Jan 2018 07:36:46 +0100
Subject: [PATCH 304/676] s390: run user space and KVM guests with modified
 branch prediction

Define TIF_ISOLATE_BP and TIF_ISOLATE_BP_GUEST and add the necessary
plumbing in entry.S to be able to run user space and KVM guests with
limited branch prediction.

To switch a user space process to limited branch prediction the
s390_isolate_bp() function has to be call, and to run a vCPU of a KVM
guest associated with the current task with limited branch prediction
call s390_isolate_bp_guest().

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/processor.h   |  3 ++
 arch/s390/include/asm/thread_info.h |  4 +++
 arch/s390/kernel/entry.S            | 51 ++++++++++++++++++++++++++---
 arch/s390/kernel/processor.c        | 18 ++++++++++
 4 files changed, 71 insertions(+), 5 deletions(-)

diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 5f37f9ceef5eb..7f2953c15c37b 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -378,6 +378,9 @@ extern void memcpy_absolute(void *, void *, size_t);
 	memcpy_absolute(&(dest), &__tmp, sizeof(__tmp));	\
 } while (0)
 
+extern int s390_isolate_bp(void);
+extern int s390_isolate_bp_guest(void);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __ASM_S390_PROCESSOR_H */
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index 25d6ec3aaddda..83ba57533ce6f 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -58,6 +58,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
 #define TIF_GUARDED_STORAGE	4	/* load guarded storage control block */
 #define TIF_PATCH_PENDING	5	/* pending live patching update */
 #define TIF_PGSTE		6	/* New mm's will use 4K page tables */
+#define TIF_ISOLATE_BP		8	/* Run process with isolated BP */
+#define TIF_ISOLATE_BP_GUEST	9	/* Run KVM guests with isolated BP */
 
 #define TIF_31BIT		16	/* 32bit process */
 #define TIF_MEMDIE		17	/* is terminating due to OOM killer */
@@ -78,6 +80,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
 #define _TIF_UPROBE		_BITUL(TIF_UPROBE)
 #define _TIF_GUARDED_STORAGE	_BITUL(TIF_GUARDED_STORAGE)
 #define _TIF_PATCH_PENDING	_BITUL(TIF_PATCH_PENDING)
+#define _TIF_ISOLATE_BP		_BITUL(TIF_ISOLATE_BP)
+#define _TIF_ISOLATE_BP_GUEST	_BITUL(TIF_ISOLATE_BP_GUEST)
 
 #define _TIF_31BIT		_BITUL(TIF_31BIT)
 #define _TIF_SINGLE_STEP	_BITUL(TIF_SINGLE_STEP)
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index e6d7550a3af8b..53145b5d987de 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -107,6 +107,7 @@ _PIF_WORK	= (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART)
 	aghi	%r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
 	j	3f
 1:	UPDATE_VTIME %r14,%r15,\timer
+	BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
 2:	lg	%r15,__LC_ASYNC_STACK	# load async stack
 3:	la	%r11,STACK_FRAME_OVERHEAD(%r15)
 	.endm
@@ -187,6 +188,40 @@ _PIF_WORK	= (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART)
 	.popsection
 	.endm
 
+	.macro BPENTER tif_ptr,tif_mask
+	.pushsection .altinstr_replacement, "ax"
+662:	.word	0xc004, 0x0000, 0x0000	# 6 byte nop
+	.word	0xc004, 0x0000, 0x0000	# 6 byte nop
+	.popsection
+664:	TSTMSK	\tif_ptr,\tif_mask
+	jz	. + 8
+	.long	0xb2e8d000
+	.pushsection .altinstructions, "a"
+	.long 664b - .
+	.long 662b - .
+	.word 82
+	.byte 12
+	.byte 12
+	.popsection
+	.endm
+
+	.macro BPEXIT tif_ptr,tif_mask
+	TSTMSK	\tif_ptr,\tif_mask
+	.pushsection .altinstr_replacement, "ax"
+662:	jnz	. + 8
+	.long	0xb2e8d000
+	.popsection
+664:	jz	. + 8
+	.long	0xb2e8c000
+	.pushsection .altinstructions, "a"
+	.long 664b - .
+	.long 662b - .
+	.word 82
+	.byte 8
+	.byte 8
+	.popsection
+	.endm
+
 	.section .kprobes.text, "ax"
 .Ldummy:
 	/*
@@ -240,9 +275,11 @@ ENTRY(__switch_to)
  */
 ENTRY(sie64a)
 	stmg	%r6,%r14,__SF_GPRS(%r15)	# save kernel registers
+	lg	%r12,__LC_CURRENT
 	stg	%r2,__SF_EMPTY(%r15)		# save control block pointer
 	stg	%r3,__SF_EMPTY+8(%r15)		# save guest register save area
 	xc	__SF_EMPTY+16(8,%r15),__SF_EMPTY+16(%r15) # reason code = 0
+	mvc	__SF_EMPTY+24(8,%r15),__TI_flags(%r12) # copy thread flags
 	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU		# load guest fp/vx registers ?
 	jno	.Lsie_load_guest_gprs
 	brasl	%r14,load_fpu_regs		# load guest fp/vx regs
@@ -259,11 +296,12 @@ ENTRY(sie64a)
 	jnz	.Lsie_skip
 	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU
 	jo	.Lsie_skip			# exit if fp/vx regs changed
-	BPON
+	BPEXIT	__SF_EMPTY+24(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
 .Lsie_entry:
 	sie	0(%r14)
 .Lsie_exit:
 	BPOFF
+	BPENTER	__SF_EMPTY+24(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
 .Lsie_skip:
 	ni	__SIE_PROG0C+3(%r14),0xfe	# no longer in SIE
 	lctlg	%c1,%c1,__LC_USER_ASCE		# load primary asce
@@ -324,6 +362,7 @@ ENTRY(system_call)
 	la	%r11,STACK_FRAME_OVERHEAD(%r15)	# pointer to pt_regs
 .Lsysc_vtime:
 	UPDATE_VTIME %r8,%r9,__LC_SYNC_ENTER_TIMER
+	BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
 	stmg	%r0,%r7,__PT_R0(%r11)
 	# clear user controlled register to prevent speculative use
 	xgr	%r0,%r0
@@ -362,7 +401,7 @@ ENTRY(system_call)
 	jnz	.Lsysc_work			# check for work
 	TSTMSK	__LC_CPU_FLAGS,_CIF_WORK
 	jnz	.Lsysc_work
-	BPON
+	BPEXIT	__TI_flags(%r12),_TIF_ISOLATE_BP
 .Lsysc_restore:
 	lg	%r14,__LC_VDSO_PER_CPU
 	lmg	%r0,%r10,__PT_R0(%r11)
@@ -597,6 +636,7 @@ ENTRY(pgm_check_handler)
 	aghi	%r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
 	j	4f
 2:	UPDATE_VTIME %r14,%r15,__LC_SYNC_ENTER_TIMER
+	BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP
 	lg	%r15,__LC_KERNEL_STACK
 	lgr	%r14,%r12
 	aghi	%r14,__TASK_thread	# pointer to thread_struct
@@ -729,7 +769,7 @@ ENTRY(io_int_handler)
 	mvc	__LC_RETURN_PSW(16),__PT_PSW(%r11)
 	tm	__PT_PSW+1(%r11),0x01	# returning to user ?
 	jno	.Lio_exit_kernel
-	BPON
+	BPEXIT	__TI_flags(%r12),_TIF_ISOLATE_BP
 .Lio_exit_timer:
 	stpt	__LC_EXIT_TIMER
 	mvc	__VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
@@ -1165,7 +1205,7 @@ ENTRY(mcck_int_handler)
 	mvc	__LC_RETURN_MCCK_PSW(16),__PT_PSW(%r11) # move return PSW
 	tm	__LC_RETURN_MCCK_PSW+1,0x01 # returning to user ?
 	jno	0f
-	BPON
+	BPEXIT	__TI_flags(%r12),_TIF_ISOLATE_BP
 	stpt	__LC_EXIT_TIMER
 	mvc	__VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
 0:	lmg	%r11,%r15,__PT_R11(%r11)
@@ -1292,7 +1332,8 @@ cleanup_critical:
 	clg     %r9,BASED(.Lsie_crit_mcck_length)
 	jh      1f
 	oi      __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST
-1:	lg	%r9,__SF_EMPTY(%r15)		# get control block pointer
+1:	BPENTER __SF_EMPTY+24(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
+	lg	%r9,__SF_EMPTY(%r15)		# get control block pointer
 	ni	__SIE_PROG0C+3(%r9),0xfe	# no longer in SIE
 	lctlg	%c1,%c1,__LC_USER_ASCE		# load primary asce
 	larl	%r9,sie_exit			# skip forward to sie_exit
diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c
index 5362fd868d0d4..6fe2e1875058b 100644
--- a/arch/s390/kernel/processor.c
+++ b/arch/s390/kernel/processor.c
@@ -197,3 +197,21 @@ const struct seq_operations cpuinfo_op = {
 	.stop	= c_stop,
 	.show	= show_cpuinfo,
 };
+
+int s390_isolate_bp(void)
+{
+	if (!test_facility(82))
+		return -EOPNOTSUPP;
+	set_thread_flag(TIF_ISOLATE_BP);
+	return 0;
+}
+EXPORT_SYMBOL(s390_isolate_bp);
+
+int s390_isolate_bp_guest(void)
+{
+	if (!test_facility(82))
+		return -EOPNOTSUPP;
+	set_thread_flag(TIF_ISOLATE_BP_GUEST);
+	return 0;
+}
+EXPORT_SYMBOL(s390_isolate_bp_guest);

From fd649f10c3d21ee9d7542c609f29978bdf73ab94 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 30 Jan 2018 16:07:37 +0200
Subject: [PATCH 305/676] btrfs: Fix use-after-free when cleaning up fs_devs
 with a single stale device

Commit 4fde46f0cc71 ("Btrfs: free the stale device") introduced
btrfs_free_stale_device which iterates the device lists for all
registered btrfs filesystems and deletes those devices which aren't
mounted. In a btrfs_devices structure has only 1 device attached to it
and it is unused then btrfs_free_stale_devices will proceed to also free
the btrfs_fs_devices struct itself. Currently this leads to a use after
free since list_for_each_entry will try to perform a check on the
already freed memory to see if it has to terminate the loop.

The fix is to use 'break' when we know we are freeing the current
fs_devs.

Fixes: 4fde46f0cc71 ("Btrfs: free the stale device")
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/volumes.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b5036bd69e6a6..2ceb924ca0d63 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -645,6 +645,7 @@ static void btrfs_free_stale_devices(const char *path,
 				btrfs_sysfs_remove_fsid(fs_devs);
 				list_del(&fs_devs->list);
 				free_fs_devices(fs_devs);
+				break;
 			} else {
 				fs_devs->num_devices--;
 				list_del(&dev->dev_list);

From 9b30889c548a4d45bfe6226e58de32504c1d682f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 5 Feb 2018 10:20:06 -0500
Subject: [PATCH 306/676] SUNRPC: Ensure we always close the socket after a
 connection shuts down

Ensure that we release the TCP socket once it is in the TCP_CLOSE or
TCP_TIME_WAIT state (and only then) so that we don't confuse rkhunter
and its ilk.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/xprtsock.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 18803021f242e..5d0108172ed38 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -807,13 +807,6 @@ static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt)
 	smp_mb__after_atomic();
 }
 
-static void xs_sock_mark_closed(struct rpc_xprt *xprt)
-{
-	xs_sock_reset_connection_flags(xprt);
-	/* Mark transport as closed and wake up all pending tasks */
-	xprt_disconnect_done(xprt);
-}
-
 /**
  * xs_error_report - callback to handle TCP socket state errors
  * @sk: socket
@@ -833,9 +826,6 @@ static void xs_error_report(struct sock *sk)
 	err = -sk->sk_err;
 	if (err == 0)
 		goto out;
-	/* Is this a reset event? */
-	if (sk->sk_state == TCP_CLOSE)
-		xs_sock_mark_closed(xprt);
 	dprintk("RPC:       xs_error_report client %p, error=%d...\n",
 			xprt, -err);
 	trace_rpc_socket_error(xprt, sk->sk_socket, err);
@@ -1655,9 +1645,11 @@ static void xs_tcp_state_change(struct sock *sk)
 		if (test_and_clear_bit(XPRT_SOCK_CONNECTING,
 					&transport->sock_state))
 			xprt_clear_connecting(xprt);
+		clear_bit(XPRT_CLOSING, &xprt->state);
 		if (sk->sk_err)
 			xprt_wake_pending_tasks(xprt, -sk->sk_err);
-		xs_sock_mark_closed(xprt);
+		/* Trigger the socket release */
+		xs_tcp_force_close(xprt);
 	}
  out:
 	read_unlock_bh(&sk->sk_callback_lock);
@@ -2265,14 +2257,19 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt)
 {
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 	struct socket *sock = transport->sock;
+	int skst = transport->inet ? transport->inet->sk_state : TCP_CLOSE;
 
 	if (sock == NULL)
 		return;
-	if (xprt_connected(xprt)) {
+	switch (skst) {
+	default:
 		kernel_sock_shutdown(sock, SHUT_RDWR);
 		trace_rpc_socket_shutdown(xprt, sock);
-	} else
+		break;
+	case TCP_CLOSE:
+	case TCP_TIME_WAIT:
 		xs_reset_transport(transport);
+	}
 }
 
 static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,

From 8e1eb3fa009aa7c0b944b3c8b26b07de0efb3200 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 5 Feb 2018 17:18:05 -0800
Subject: [PATCH 307/676] x86/entry/64: Clear extra registers beyond syscall
 arguments, to reduce speculation attack surface

At entry userspace may have (maliciously) populated the extra registers
outside the syscall calling convention with arbitrary values that could
be useful in a speculative execution (Spectre style) attack.

Clear these registers to minimize the kernel's attack surface.

Note, this only clears the extra registers and not the unused
registers for syscalls less than 6 arguments, since those registers are
likely to be clobbered well before their values could be put to use
under speculation.

Note, Linus found that the XOR instructions can be executed with
minimized cost if interleaved with the PUSH instructions, and Ingo's
analysis found that R10 and R11 should be included in the register
clearing beyond the typical 'extra' syscall calling convention
registers.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Reported-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: <stable@vger.kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/151787988577.7847.16733592218894189003.stgit@dwillia2-desk3.amr.corp.intel.com
[ Made small improvements to the changelog and the code comments. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_64.S | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index c752abe89d807..065a71b908080 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -235,13 +235,26 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
 	pushq	%r8				/* pt_regs->r8 */
 	pushq	%r9				/* pt_regs->r9 */
 	pushq	%r10				/* pt_regs->r10 */
+	/*
+	 * Clear extra registers that a speculation attack might
+	 * otherwise want to exploit. Interleave XOR with PUSH
+	 * for better uop scheduling:
+	 */
+	xorq	%r10, %r10			/* nospec   r10 */
 	pushq	%r11				/* pt_regs->r11 */
+	xorq	%r11, %r11			/* nospec   r11 */
 	pushq	%rbx				/* pt_regs->rbx */
+	xorl	%ebx, %ebx			/* nospec   rbx */
 	pushq	%rbp				/* pt_regs->rbp */
+	xorl	%ebp, %ebp			/* nospec   rbp */
 	pushq	%r12				/* pt_regs->r12 */
+	xorq	%r12, %r12			/* nospec   r12 */
 	pushq	%r13				/* pt_regs->r13 */
+	xorq	%r13, %r13			/* nospec   r13 */
 	pushq	%r14				/* pt_regs->r14 */
+	xorq	%r14, %r14			/* nospec   r14 */
 	pushq	%r15				/* pt_regs->r15 */
+	xorq	%r15, %r15			/* nospec   r15 */
 	UNWIND_HINT_REGS
 
 	TRACE_IRQS_OFF

From 4f277295e54c5b7340e48efea3fc5cc21a2872b7 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Thu, 1 Feb 2018 13:40:19 +0100
Subject: [PATCH 308/676] x86/xen: init %gs very early to avoid page faults
 with stack protector

When running as Xen pv guest %gs is initialized some time after
C code is started. Depending on stack protector usage this might be
too late, resulting in page faults.

So setup %gs and MSR_GS_BASE in assembly code already.

Cc: stable@vger.kernel.org
Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Tested-by: Chris Patterson <cjp256@gmail.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 arch/x86/xen/xen-head.S | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 497cc55a0c16c..96f26e026783d 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -9,7 +9,9 @@
 
 #include <asm/boot.h>
 #include <asm/asm.h>
+#include <asm/msr.h>
 #include <asm/page_types.h>
+#include <asm/percpu.h>
 #include <asm/unwind_hints.h>
 
 #include <xen/interface/elfnote.h>
@@ -35,6 +37,20 @@ ENTRY(startup_xen)
 	mov %_ASM_SI, xen_start_info
 	mov $init_thread_union+THREAD_SIZE, %_ASM_SP
 
+#ifdef CONFIG_X86_64
+	/* Set up %gs.
+	 *
+	 * The base of %gs always points to the bottom of the irqstack
+	 * union.  If the stack protector canary is enabled, it is
+	 * located at %gs:40.  Note that, on SMP, the boot cpu uses
+	 * init data section till per cpu areas are set up.
+	 */
+	movl	$MSR_GS_BASE,%ecx
+	movq	$INIT_PER_CPU_VAR(irq_stack_union),%rax
+	cdq
+	wrmsr
+#endif
+
 	jmp xen_start_kernel
 END(startup_xen)
 	__FINIT

From 3ac7292a25db1c607a50752055a18aba32ac2176 Mon Sep 17 00:00:00 2001
From: Ross Lagerwall <ross.lagerwall@citrix.com>
Date: Thu, 11 Jan 2018 09:36:37 +0000
Subject: [PATCH 309/676] xen/grant-table: Use put_page instead of free_page

The page given to gnttab_end_foreign_access() to free could be a
compound page so use put_page() instead of free_page() since it can
handle both compound and single pages correctly.

This bug was discovered when migrating a Xen VM with several VIFs and
CONFIG_DEBUG_VM enabled. It hits a BUG usually after fewer than 10
iterations. All netfront devices disconnect from the backend during a
suspend/resume and this will call gnttab_end_foreign_access() if a
netfront queue has an outstanding skb. The mismatch between calling
get_page() and free_page() on a compound page causes a reference
counting error which is detected when DEBUG_VM is enabled.

Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 drivers/xen/grant-table.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index f45114fd8e1e7..27be107d64802 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -382,7 +382,7 @@ static void gnttab_handle_deferred(struct timer_list *unused)
 			if (entry->page) {
 				pr_debug("freeing g.e. %#x (pfn %#lx)\n",
 					 entry->ref, page_to_pfn(entry->page));
-				__free_page(entry->page);
+				put_page(entry->page);
 			} else
 				pr_info("freeing g.e. %#x\n", entry->ref);
 			kfree(entry);
@@ -438,7 +438,7 @@ void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
 	if (gnttab_end_foreign_access_ref(ref, readonly)) {
 		put_free_entry(ref);
 		if (page != 0)
-			free_page(page);
+			put_page(virt_to_page(page));
 	} else
 		gnttab_add_deferred(ref, readonly,
 				    page ? virt_to_page(page) : NULL);

From f599c64fdf7d9c108e8717fb04bc41c680120da4 Mon Sep 17 00:00:00 2001
From: Ross Lagerwall <ross.lagerwall@citrix.com>
Date: Thu, 11 Jan 2018 09:36:38 +0000
Subject: [PATCH 310/676] xen-netfront: Fix race between device setup and open

When a netfront device is set up it registers a netdev fairly early on,
before it has set up the queues and is actually usable. A userspace tool
like NetworkManager will immediately try to open it and access its state
as soon as it appears. The bug can be reproduced by hotplugging VIFs
until the VM runs out of grant refs. It registers the netdev but fails
to set up any queues (since there are no more grant refs). In the
meantime, NetworkManager opens the device and the kernel crashes trying
to access the queues (of which there are none).

Fix this in two ways:
* For initial setup, register the netdev much later, after the queues
are setup. This avoids the race entirely.
* During a suspend/resume cycle, the frontend reconnects to the backend
and the queues are recreated. It is possible (though highly unlikely) to
race with something opening the device and accessing the queues after
they have been destroyed but before they have been recreated. Extend the
region covered by the rtnl semaphore to protect against this race. There
is a possibility that we fail to recreate the queues so check for this
in the open function.

Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 drivers/net/xen-netfront.c | 46 ++++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 9bd7ddeeb6a5c..8328d395e3329 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -351,6 +351,9 @@ static int xennet_open(struct net_device *dev)
 	unsigned int i = 0;
 	struct netfront_queue *queue = NULL;
 
+	if (!np->queues)
+		return -ENODEV;
+
 	for (i = 0; i < num_queues; ++i) {
 		queue = &np->queues[i];
 		napi_enable(&queue->napi);
@@ -1358,18 +1361,8 @@ static int netfront_probe(struct xenbus_device *dev,
 #ifdef CONFIG_SYSFS
 	info->netdev->sysfs_groups[0] = &xennet_dev_group;
 #endif
-	err = register_netdev(info->netdev);
-	if (err) {
-		pr_warn("%s: register_netdev err=%d\n", __func__, err);
-		goto fail;
-	}
 
 	return 0;
-
- fail:
-	xennet_free_netdev(netdev);
-	dev_set_drvdata(&dev->dev, NULL);
-	return err;
 }
 
 static void xennet_end_access(int ref, void *page)
@@ -1737,8 +1730,6 @@ static void xennet_destroy_queues(struct netfront_info *info)
 {
 	unsigned int i;
 
-	rtnl_lock();
-
 	for (i = 0; i < info->netdev->real_num_tx_queues; i++) {
 		struct netfront_queue *queue = &info->queues[i];
 
@@ -1747,8 +1738,6 @@ static void xennet_destroy_queues(struct netfront_info *info)
 		netif_napi_del(&queue->napi);
 	}
 
-	rtnl_unlock();
-
 	kfree(info->queues);
 	info->queues = NULL;
 }
@@ -1764,8 +1753,6 @@ static int xennet_create_queues(struct netfront_info *info,
 	if (!info->queues)
 		return -ENOMEM;
 
-	rtnl_lock();
-
 	for (i = 0; i < *num_queues; i++) {
 		struct netfront_queue *queue = &info->queues[i];
 
@@ -1774,7 +1761,7 @@ static int xennet_create_queues(struct netfront_info *info,
 
 		ret = xennet_init_queue(queue);
 		if (ret < 0) {
-			dev_warn(&info->netdev->dev,
+			dev_warn(&info->xbdev->dev,
 				 "only created %d queues\n", i);
 			*num_queues = i;
 			break;
@@ -1788,10 +1775,8 @@ static int xennet_create_queues(struct netfront_info *info,
 
 	netif_set_real_num_tx_queues(info->netdev, *num_queues);
 
-	rtnl_unlock();
-
 	if (*num_queues == 0) {
-		dev_err(&info->netdev->dev, "no queues\n");
+		dev_err(&info->xbdev->dev, "no queues\n");
 		return -EINVAL;
 	}
 	return 0;
@@ -1828,6 +1813,7 @@ static int talk_to_netback(struct xenbus_device *dev,
 		goto out;
 	}
 
+	rtnl_lock();
 	if (info->queues)
 		xennet_destroy_queues(info);
 
@@ -1838,6 +1824,7 @@ static int talk_to_netback(struct xenbus_device *dev,
 		info->queues = NULL;
 		goto out;
 	}
+	rtnl_unlock();
 
 	/* Create shared ring, alloc event channel -- for each queue */
 	for (i = 0; i < num_queues; ++i) {
@@ -1934,8 +1921,10 @@ static int talk_to_netback(struct xenbus_device *dev,
 	xenbus_transaction_end(xbt, 1);
  destroy_ring:
 	xennet_disconnect_backend(info);
+	rtnl_lock();
 	xennet_destroy_queues(info);
  out:
+	rtnl_unlock();
 	device_unregister(&dev->dev);
 	return err;
 }
@@ -1965,6 +1954,15 @@ static int xennet_connect(struct net_device *dev)
 	netdev_update_features(dev);
 	rtnl_unlock();
 
+	if (dev->reg_state == NETREG_UNINITIALIZED) {
+		err = register_netdev(dev);
+		if (err) {
+			pr_warn("%s: register_netdev err=%d\n", __func__, err);
+			device_unregister(&np->xbdev->dev);
+			return err;
+		}
+	}
+
 	/*
 	 * All public and private state should now be sane.  Get
 	 * ready to start sending and receiving packets and give the driver
@@ -2150,10 +2148,14 @@ static int xennet_remove(struct xenbus_device *dev)
 
 	xennet_disconnect_backend(info);
 
-	unregister_netdev(info->netdev);
+	if (info->netdev->reg_state == NETREG_REGISTERED)
+		unregister_netdev(info->netdev);
 
-	if (info->queues)
+	if (info->queues) {
+		rtnl_lock();
 		xennet_destroy_queues(info);
+		rtnl_unlock();
+	}
 	xennet_free_netdev(info->netdev);
 
 	return 0;

From 5355ccbe02da413df22eb05f89ca2da9959f9147 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 15 Jan 2018 17:21:48 +0100
Subject: [PATCH 311/676] x86/cpufeature: Reindent _static_cpu_has()

Because its daft..

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cpufeature.h | 78 +++++++++++++++----------------
 1 file changed, 39 insertions(+), 39 deletions(-)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 70eddb3922ff7..910a30699ffb8 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -148,45 +148,45 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
  */
 static __always_inline __pure bool _static_cpu_has(u16 bit)
 {
-		asm_volatile_goto("1: jmp 6f\n"
-			 "2:\n"
-			 ".skip -(((5f-4f) - (2b-1b)) > 0) * "
-			         "((5f-4f) - (2b-1b)),0x90\n"
-			 "3:\n"
-			 ".section .altinstructions,\"a\"\n"
-			 " .long 1b - .\n"		/* src offset */
-			 " .long 4f - .\n"		/* repl offset */
-			 " .word %P1\n"			/* always replace */
-			 " .byte 3b - 1b\n"		/* src len */
-			 " .byte 5f - 4f\n"		/* repl len */
-			 " .byte 3b - 2b\n"		/* pad len */
-			 ".previous\n"
-			 ".section .altinstr_replacement,\"ax\"\n"
-			 "4: jmp %l[t_no]\n"
-			 "5:\n"
-			 ".previous\n"
-			 ".section .altinstructions,\"a\"\n"
-			 " .long 1b - .\n"		/* src offset */
-			 " .long 0\n"			/* no replacement */
-			 " .word %P0\n"			/* feature bit */
-			 " .byte 3b - 1b\n"		/* src len */
-			 " .byte 0\n"			/* repl len */
-			 " .byte 0\n"			/* pad len */
-			 ".previous\n"
-			 ".section .altinstr_aux,\"ax\"\n"
-			 "6:\n"
-			 " testb %[bitnum],%[cap_byte]\n"
-			 " jnz %l[t_yes]\n"
-			 " jmp %l[t_no]\n"
-			 ".previous\n"
-			 : : "i" (bit), "i" (X86_FEATURE_ALWAYS),
-			     [bitnum] "i" (1 << (bit & 7)),
-			     [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
-			 : : t_yes, t_no);
-	t_yes:
-		return true;
-	t_no:
-		return false;
+	asm_volatile_goto("1: jmp 6f\n"
+		 "2:\n"
+		 ".skip -(((5f-4f) - (2b-1b)) > 0) * "
+			 "((5f-4f) - (2b-1b)),0x90\n"
+		 "3:\n"
+		 ".section .altinstructions,\"a\"\n"
+		 " .long 1b - .\n"		/* src offset */
+		 " .long 4f - .\n"		/* repl offset */
+		 " .word %P1\n"			/* always replace */
+		 " .byte 3b - 1b\n"		/* src len */
+		 " .byte 5f - 4f\n"		/* repl len */
+		 " .byte 3b - 2b\n"		/* pad len */
+		 ".previous\n"
+		 ".section .altinstr_replacement,\"ax\"\n"
+		 "4: jmp %l[t_no]\n"
+		 "5:\n"
+		 ".previous\n"
+		 ".section .altinstructions,\"a\"\n"
+		 " .long 1b - .\n"		/* src offset */
+		 " .long 0\n"			/* no replacement */
+		 " .word %P0\n"			/* feature bit */
+		 " .byte 3b - 1b\n"		/* src len */
+		 " .byte 0\n"			/* repl len */
+		 " .byte 0\n"			/* pad len */
+		 ".previous\n"
+		 ".section .altinstr_aux,\"ax\"\n"
+		 "6:\n"
+		 " testb %[bitnum],%[cap_byte]\n"
+		 " jnz %l[t_yes]\n"
+		 " jmp %l[t_no]\n"
+		 ".previous\n"
+		 : : "i" (bit), "i" (X86_FEATURE_ALWAYS),
+		     [bitnum] "i" (1 << (bit & 7)),
+		     [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
+		 : : t_yes, t_no);
+t_yes:
+	return true;
+t_no:
+	return false;
 }
 
 #define static_cpu_has(bit)					\

From 3197b04bb39b596613ff2f8143c5cd0a6908debf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 16 Jan 2018 09:34:01 +0100
Subject: [PATCH 312/676] x86/cpufeature: Update _static_cpu_has() to use all
 named variables

Because more readable..

Requested-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cpufeature.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 910a30699ffb8..736771c9822ef 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -156,7 +156,7 @@ static __always_inline __pure bool _static_cpu_has(u16 bit)
 		 ".section .altinstructions,\"a\"\n"
 		 " .long 1b - .\n"		/* src offset */
 		 " .long 4f - .\n"		/* repl offset */
-		 " .word %P1\n"			/* always replace */
+		 " .word %P[always]\n"		/* always replace */
 		 " .byte 3b - 1b\n"		/* src len */
 		 " .byte 5f - 4f\n"		/* repl len */
 		 " .byte 3b - 2b\n"		/* pad len */
@@ -168,7 +168,7 @@ static __always_inline __pure bool _static_cpu_has(u16 bit)
 		 ".section .altinstructions,\"a\"\n"
 		 " .long 1b - .\n"		/* src offset */
 		 " .long 0\n"			/* no replacement */
-		 " .word %P0\n"			/* feature bit */
+		 " .word %P[feature]\n"		/* feature bit */
 		 " .byte 3b - 1b\n"		/* src len */
 		 " .byte 0\n"			/* repl len */
 		 " .byte 0\n"			/* pad len */
@@ -179,8 +179,9 @@ static __always_inline __pure bool _static_cpu_has(u16 bit)
 		 " jnz %l[t_yes]\n"
 		 " jmp %l[t_no]\n"
 		 ".previous\n"
-		 : : "i" (bit), "i" (X86_FEATURE_ALWAYS),
-		     [bitnum] "i" (1 << (bit & 7)),
+		 : : [feature]  "i" (bit),
+		     [always]   "i" (X86_FEATURE_ALWAYS),
+		     [bitnum]   "i" (1 << (bit & 7)),
 		     [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
 		 : : t_yes, t_no);
 t_yes:

From da6f8320d58623eae9b6fa2f09b1b4f60a772ce9 Mon Sep 17 00:00:00 2001
From: Bob Moore <robert.moore@intel.com>
Date: Thu, 4 Jan 2018 10:06:38 -0800
Subject: [PATCH 313/676] ACPICA: All acpica: Update copyrights to 2018

including tool signons.

Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Erik Schmauss <erik.schmauss@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpica/acapps.h                             | 4 ++--
 drivers/acpi/acpica/accommon.h                           | 2 +-
 drivers/acpi/acpica/acconvert.h                          | 2 +-
 drivers/acpi/acpica/acdebug.h                            | 2 +-
 drivers/acpi/acpica/acdispat.h                           | 2 +-
 drivers/acpi/acpica/acevents.h                           | 2 +-
 drivers/acpi/acpica/acglobal.h                           | 2 +-
 drivers/acpi/acpica/achware.h                            | 2 +-
 drivers/acpi/acpica/acinterp.h                           | 2 +-
 drivers/acpi/acpica/aclocal.h                            | 2 +-
 drivers/acpi/acpica/acmacros.h                           | 2 +-
 drivers/acpi/acpica/acnamesp.h                           | 2 +-
 drivers/acpi/acpica/acobject.h                           | 2 +-
 drivers/acpi/acpica/acopcode.h                           | 2 +-
 drivers/acpi/acpica/acparser.h                           | 2 +-
 drivers/acpi/acpica/acpredef.h                           | 2 +-
 drivers/acpi/acpica/acresrc.h                            | 2 +-
 drivers/acpi/acpica/acstruct.h                           | 2 +-
 drivers/acpi/acpica/actables.h                           | 2 +-
 drivers/acpi/acpica/acutils.h                            | 2 +-
 drivers/acpi/acpica/amlcode.h                            | 2 +-
 drivers/acpi/acpica/amlresrc.h                           | 2 +-
 drivers/acpi/acpica/dbcmds.c                             | 2 +-
 drivers/acpi/acpica/dbconvert.c                          | 2 +-
 drivers/acpi/acpica/dbdisply.c                           | 2 +-
 drivers/acpi/acpica/dbexec.c                             | 2 +-
 drivers/acpi/acpica/dbfileio.c                           | 2 +-
 drivers/acpi/acpica/dbhistry.c                           | 2 +-
 drivers/acpi/acpica/dbinput.c                            | 2 +-
 drivers/acpi/acpica/dbmethod.c                           | 2 +-
 drivers/acpi/acpica/dbnames.c                            | 2 +-
 drivers/acpi/acpica/dbobject.c                           | 2 +-
 drivers/acpi/acpica/dbstats.c                            | 2 +-
 drivers/acpi/acpica/dbtest.c                             | 2 +-
 drivers/acpi/acpica/dbutils.c                            | 2 +-
 drivers/acpi/acpica/dbxface.c                            | 2 +-
 drivers/acpi/acpica/dsargs.c                             | 2 +-
 drivers/acpi/acpica/dscontrol.c                          | 2 +-
 drivers/acpi/acpica/dsdebug.c                            | 2 +-
 drivers/acpi/acpica/dsfield.c                            | 2 +-
 drivers/acpi/acpica/dsinit.c                             | 2 +-
 drivers/acpi/acpica/dsmethod.c                           | 2 +-
 drivers/acpi/acpica/dsmthdat.c                           | 2 +-
 drivers/acpi/acpica/dsobject.c                           | 2 +-
 drivers/acpi/acpica/dsopcode.c                           | 2 +-
 drivers/acpi/acpica/dspkginit.c                          | 2 +-
 drivers/acpi/acpica/dsutils.c                            | 2 +-
 drivers/acpi/acpica/dswexec.c                            | 2 +-
 drivers/acpi/acpica/dswload.c                            | 2 +-
 drivers/acpi/acpica/dswload2.c                           | 2 +-
 drivers/acpi/acpica/dswscope.c                           | 2 +-
 drivers/acpi/acpica/dswstate.c                           | 2 +-
 drivers/acpi/acpica/evevent.c                            | 2 +-
 drivers/acpi/acpica/evglock.c                            | 2 +-
 drivers/acpi/acpica/evgpe.c                              | 2 +-
 drivers/acpi/acpica/evgpeblk.c                           | 2 +-
 drivers/acpi/acpica/evgpeinit.c                          | 2 +-
 drivers/acpi/acpica/evgpeutil.c                          | 2 +-
 drivers/acpi/acpica/evhandler.c                          | 2 +-
 drivers/acpi/acpica/evmisc.c                             | 2 +-
 drivers/acpi/acpica/evregion.c                           | 2 +-
 drivers/acpi/acpica/evrgnini.c                           | 2 +-
 drivers/acpi/acpica/evsci.c                              | 2 +-
 drivers/acpi/acpica/evxface.c                            | 2 +-
 drivers/acpi/acpica/evxfevnt.c                           | 2 +-
 drivers/acpi/acpica/evxfgpe.c                            | 2 +-
 drivers/acpi/acpica/evxfregn.c                           | 2 +-
 drivers/acpi/acpica/exconcat.c                           | 2 +-
 drivers/acpi/acpica/exconfig.c                           | 2 +-
 drivers/acpi/acpica/exconvrt.c                           | 2 +-
 drivers/acpi/acpica/excreate.c                           | 2 +-
 drivers/acpi/acpica/exdebug.c                            | 2 +-
 drivers/acpi/acpica/exdump.c                             | 2 +-
 drivers/acpi/acpica/exfield.c                            | 2 +-
 drivers/acpi/acpica/exfldio.c                            | 2 +-
 drivers/acpi/acpica/exmisc.c                             | 2 +-
 drivers/acpi/acpica/exmutex.c                            | 2 +-
 drivers/acpi/acpica/exnames.c                            | 2 +-
 drivers/acpi/acpica/exoparg1.c                           | 2 +-
 drivers/acpi/acpica/exoparg2.c                           | 2 +-
 drivers/acpi/acpica/exoparg3.c                           | 2 +-
 drivers/acpi/acpica/exoparg6.c                           | 2 +-
 drivers/acpi/acpica/exprep.c                             | 2 +-
 drivers/acpi/acpica/exregion.c                           | 2 +-
 drivers/acpi/acpica/exresnte.c                           | 2 +-
 drivers/acpi/acpica/exresolv.c                           | 2 +-
 drivers/acpi/acpica/exresop.c                            | 2 +-
 drivers/acpi/acpica/exstore.c                            | 2 +-
 drivers/acpi/acpica/exstoren.c                           | 2 +-
 drivers/acpi/acpica/exstorob.c                           | 2 +-
 drivers/acpi/acpica/exsystem.c                           | 2 +-
 drivers/acpi/acpica/extrace.c                            | 2 +-
 drivers/acpi/acpica/exutils.c                            | 2 +-
 drivers/acpi/acpica/hwacpi.c                             | 2 +-
 drivers/acpi/acpica/hwesleep.c                           | 2 +-
 drivers/acpi/acpica/hwgpe.c                              | 2 +-
 drivers/acpi/acpica/hwpci.c                              | 2 +-
 drivers/acpi/acpica/hwregs.c                             | 2 +-
 drivers/acpi/acpica/hwsleep.c                            | 2 +-
 drivers/acpi/acpica/hwtimer.c                            | 2 +-
 drivers/acpi/acpica/hwvalid.c                            | 2 +-
 drivers/acpi/acpica/hwxface.c                            | 2 +-
 drivers/acpi/acpica/hwxfsleep.c                          | 2 +-
 drivers/acpi/acpica/nsaccess.c                           | 2 +-
 drivers/acpi/acpica/nsalloc.c                            | 2 +-
 drivers/acpi/acpica/nsarguments.c                        | 2 +-
 drivers/acpi/acpica/nsconvert.c                          | 2 +-
 drivers/acpi/acpica/nsdump.c                             | 2 +-
 drivers/acpi/acpica/nsdumpdv.c                           | 2 +-
 drivers/acpi/acpica/nseval.c                             | 2 +-
 drivers/acpi/acpica/nsinit.c                             | 2 +-
 drivers/acpi/acpica/nsload.c                             | 2 +-
 drivers/acpi/acpica/nsnames.c                            | 2 +-
 drivers/acpi/acpica/nsobject.c                           | 2 +-
 drivers/acpi/acpica/nsparse.c                            | 2 +-
 drivers/acpi/acpica/nspredef.c                           | 2 +-
 drivers/acpi/acpica/nsprepkg.c                           | 2 +-
 drivers/acpi/acpica/nsrepair.c                           | 2 +-
 drivers/acpi/acpica/nsrepair2.c                          | 2 +-
 drivers/acpi/acpica/nssearch.c                           | 2 +-
 drivers/acpi/acpica/nsutils.c                            | 2 +-
 drivers/acpi/acpica/nswalk.c                             | 2 +-
 drivers/acpi/acpica/nsxfeval.c                           | 2 +-
 drivers/acpi/acpica/nsxfname.c                           | 2 +-
 drivers/acpi/acpica/nsxfobj.c                            | 2 +-
 drivers/acpi/acpica/psargs.c                             | 2 +-
 drivers/acpi/acpica/psloop.c                             | 2 +-
 drivers/acpi/acpica/psobject.c                           | 2 +-
 drivers/acpi/acpica/psopcode.c                           | 2 +-
 drivers/acpi/acpica/psopinfo.c                           | 2 +-
 drivers/acpi/acpica/psparse.c                            | 2 +-
 drivers/acpi/acpica/psscope.c                            | 2 +-
 drivers/acpi/acpica/pstree.c                             | 2 +-
 drivers/acpi/acpica/psutils.c                            | 2 +-
 drivers/acpi/acpica/pswalk.c                             | 2 +-
 drivers/acpi/acpica/psxface.c                            | 2 +-
 drivers/acpi/acpica/rsaddr.c                             | 2 +-
 drivers/acpi/acpica/rscalc.c                             | 2 +-
 drivers/acpi/acpica/rscreate.c                           | 2 +-
 drivers/acpi/acpica/rsdump.c                             | 2 +-
 drivers/acpi/acpica/rsdumpinfo.c                         | 2 +-
 drivers/acpi/acpica/rsinfo.c                             | 2 +-
 drivers/acpi/acpica/rsio.c                               | 2 +-
 drivers/acpi/acpica/rsirq.c                              | 2 +-
 drivers/acpi/acpica/rslist.c                             | 2 +-
 drivers/acpi/acpica/rsmemory.c                           | 2 +-
 drivers/acpi/acpica/rsmisc.c                             | 2 +-
 drivers/acpi/acpica/rsserial.c                           | 2 +-
 drivers/acpi/acpica/rsutils.c                            | 2 +-
 drivers/acpi/acpica/rsxface.c                            | 2 +-
 drivers/acpi/acpica/tbdata.c                             | 2 +-
 drivers/acpi/acpica/tbfadt.c                             | 2 +-
 drivers/acpi/acpica/tbfind.c                             | 2 +-
 drivers/acpi/acpica/tbinstal.c                           | 2 +-
 drivers/acpi/acpica/tbprint.c                            | 2 +-
 drivers/acpi/acpica/tbutils.c                            | 2 +-
 drivers/acpi/acpica/tbxface.c                            | 2 +-
 drivers/acpi/acpica/tbxfload.c                           | 2 +-
 drivers/acpi/acpica/tbxfroot.c                           | 2 +-
 drivers/acpi/acpica/utaddress.c                          | 2 +-
 drivers/acpi/acpica/utalloc.c                            | 2 +-
 drivers/acpi/acpica/utascii.c                            | 2 +-
 drivers/acpi/acpica/utbuffer.c                           | 2 +-
 drivers/acpi/acpica/utcache.c                            | 2 +-
 drivers/acpi/acpica/utcopy.c                             | 2 +-
 drivers/acpi/acpica/utdebug.c                            | 2 +-
 drivers/acpi/acpica/utdecode.c                           | 2 +-
 drivers/acpi/acpica/utdelete.c                           | 2 +-
 drivers/acpi/acpica/uterror.c                            | 2 +-
 drivers/acpi/acpica/uteval.c                             | 2 +-
 drivers/acpi/acpica/utexcep.c                            | 2 +-
 drivers/acpi/acpica/utglobal.c                           | 2 +-
 drivers/acpi/acpica/uthex.c                              | 2 +-
 drivers/acpi/acpica/utids.c                              | 2 +-
 drivers/acpi/acpica/utinit.c                             | 2 +-
 drivers/acpi/acpica/utlock.c                             | 2 +-
 drivers/acpi/acpica/utmath.c                             | 2 +-
 drivers/acpi/acpica/utmisc.c                             | 2 +-
 drivers/acpi/acpica/utmutex.c                            | 2 +-
 drivers/acpi/acpica/utnonansi.c                          | 2 +-
 drivers/acpi/acpica/utobject.c                           | 2 +-
 drivers/acpi/acpica/utosi.c                              | 2 +-
 drivers/acpi/acpica/utownerid.c                          | 2 +-
 drivers/acpi/acpica/utpredef.c                           | 2 +-
 drivers/acpi/acpica/utprint.c                            | 2 +-
 drivers/acpi/acpica/utresdecode.c                        | 2 +-
 drivers/acpi/acpica/utresrc.c                            | 2 +-
 drivers/acpi/acpica/utstate.c                            | 2 +-
 drivers/acpi/acpica/utstring.c                           | 2 +-
 drivers/acpi/acpica/utstrsuppt.c                         | 2 +-
 drivers/acpi/acpica/utstrtoul64.c                        | 2 +-
 drivers/acpi/acpica/uttrack.c                            | 2 +-
 drivers/acpi/acpica/utuuid.c                             | 2 +-
 drivers/acpi/acpica/utxface.c                            | 2 +-
 drivers/acpi/acpica/utxferror.c                          | 2 +-
 drivers/acpi/acpica/utxfinit.c                           | 2 +-
 drivers/acpi/acpica/utxfmutex.c                          | 2 +-
 include/acpi/acbuffer.h                                  | 2 +-
 include/acpi/acconfig.h                                  | 2 +-
 include/acpi/acexcep.h                                   | 2 +-
 include/acpi/acnames.h                                   | 2 +-
 include/acpi/acoutput.h                                  | 2 +-
 include/acpi/acpi.h                                      | 2 +-
 include/acpi/acpiosxf.h                                  | 2 +-
 include/acpi/acpixf.h                                    | 2 +-
 include/acpi/acrestyp.h                                  | 2 +-
 include/acpi/actbl.h                                     | 2 +-
 include/acpi/actbl1.h                                    | 2 +-
 include/acpi/actbl2.h                                    | 2 +-
 include/acpi/actbl3.h                                    | 2 +-
 include/acpi/actypes.h                                   | 2 +-
 include/acpi/acuuid.h                                    | 2 +-
 include/acpi/platform/acenv.h                            | 2 +-
 include/acpi/platform/acenvex.h                          | 2 +-
 include/acpi/platform/acgcc.h                            | 2 +-
 include/acpi/platform/acgccex.h                          | 2 +-
 include/acpi/platform/acintel.h                          | 2 +-
 include/acpi/platform/aclinux.h                          | 2 +-
 include/acpi/platform/aclinuxex.h                        | 2 +-
 tools/power/acpi/common/cmfsize.c                        | 2 +-
 tools/power/acpi/common/getopt.c                         | 2 +-
 tools/power/acpi/os_specific/service_layers/oslinuxtbl.c | 2 +-
 tools/power/acpi/os_specific/service_layers/osunixdir.c  | 2 +-
 tools/power/acpi/os_specific/service_layers/osunixmap.c  | 2 +-
 tools/power/acpi/os_specific/service_layers/osunixxf.c   | 2 +-
 tools/power/acpi/tools/acpidump/acpidump.h               | 2 +-
 tools/power/acpi/tools/acpidump/apdump.c                 | 2 +-
 tools/power/acpi/tools/acpidump/apfiles.c                | 2 +-
 tools/power/acpi/tools/acpidump/apmain.c                 | 2 +-
 229 files changed, 230 insertions(+), 230 deletions(-)

diff --git a/drivers/acpi/acpica/acapps.h b/drivers/acpi/acpica/acapps.h
index 2243c8164b341..e65478593f9ae 100644
--- a/drivers/acpi/acpica/acapps.h
+++ b/drivers/acpi/acpica/acapps.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -51,7 +51,7 @@
 /* Common info for tool signons */
 
 #define ACPICA_NAME                 "Intel ACPI Component Architecture"
-#define ACPICA_COPYRIGHT            "Copyright (c) 2000 - 2017 Intel Corporation"
+#define ACPICA_COPYRIGHT            "Copyright (c) 2000 - 2018 Intel Corporation"
 
 #if ACPI_MACHINE_WIDTH == 64
 #define ACPI_WIDTH          " (64-bit version)"
diff --git a/drivers/acpi/acpica/accommon.h b/drivers/acpi/acpica/accommon.h
index 49bf47ca54777..c349ffdf55578 100644
--- a/drivers/acpi/acpica/accommon.h
+++ b/drivers/acpi/acpica/accommon.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/acconvert.h b/drivers/acpi/acpica/acconvert.h
index c84223b60b356..ce6e8db83e274 100644
--- a/drivers/acpi/acpica/acconvert.h
+++ b/drivers/acpi/acpica/acconvert.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/acdebug.h b/drivers/acpi/acpica/acdebug.h
index 54b8d9df9423c..8b2cca5a717b7 100644
--- a/drivers/acpi/acpica/acdebug.h
+++ b/drivers/acpi/acpica/acdebug.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/acdispat.h b/drivers/acpi/acpica/acdispat.h
index f8f3a6e74128a..fab590bc5fd3e 100644
--- a/drivers/acpi/acpica/acdispat.h
+++ b/drivers/acpi/acpica/acdispat.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/acevents.h b/drivers/acpi/acpica/acevents.h
index a2adfd42f85cc..1b0269f6ac2dd 100644
--- a/drivers/acpi/acpica/acevents.h
+++ b/drivers/acpi/acpica/acevents.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h
index 45ef3f5dc9ad2..27f322b2fed1a 100644
--- a/drivers/acpi/acpica/acglobal.h
+++ b/drivers/acpi/acpica/acglobal.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/achware.h b/drivers/acpi/acpica/achware.h
index cd722d8edacbb..3569aa3bf5ee8 100644
--- a/drivers/acpi/acpica/achware.h
+++ b/drivers/acpi/acpica/achware.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/acinterp.h b/drivers/acpi/acpica/acinterp.h
index 29555c8789a31..744374ab92853 100644
--- a/drivers/acpi/acpica/acinterp.h
+++ b/drivers/acpi/acpica/acinterp.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/aclocal.h b/drivers/acpi/acpica/aclocal.h
index a56675f0661eb..3ba3ff0f1c042 100644
--- a/drivers/acpi/acpica/aclocal.h
+++ b/drivers/acpi/acpica/aclocal.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/acmacros.h b/drivers/acpi/acpica/acmacros.h
index 128a3d71b5986..6463340c45220 100644
--- a/drivers/acpi/acpica/acmacros.h
+++ b/drivers/acpi/acpica/acmacros.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/acnamesp.h b/drivers/acpi/acpica/acnamesp.h
index 2fb1bb78d85c6..6c8f364fe2fc5 100644
--- a/drivers/acpi/acpica/acnamesp.h
+++ b/drivers/acpi/acpica/acnamesp.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/acobject.h b/drivers/acpi/acpica/acobject.h
index 5226146190bf9..a1f4d3f385c82 100644
--- a/drivers/acpi/acpica/acobject.h
+++ b/drivers/acpi/acpica/acobject.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/acopcode.h b/drivers/acpi/acpica/acopcode.h
index cbd59a302679e..36c2c58259866 100644
--- a/drivers/acpi/acpica/acopcode.h
+++ b/drivers/acpi/acpica/acopcode.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/acparser.h b/drivers/acpi/acpica/acparser.h
index c23c47328060b..e25634951d038 100644
--- a/drivers/acpi/acpica/acparser.h
+++ b/drivers/acpi/acpica/acparser.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/acpredef.h b/drivers/acpi/acpica/acpredef.h
index cdfcad8eb74c8..7c27bcee6ac75 100644
--- a/drivers/acpi/acpica/acpredef.h
+++ b/drivers/acpi/acpica/acpredef.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/acresrc.h b/drivers/acpi/acpica/acresrc.h
index 438f3098a093a..20f36949928ac 100644
--- a/drivers/acpi/acpica/acresrc.h
+++ b/drivers/acpi/acpica/acresrc.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/acstruct.h b/drivers/acpi/acpica/acstruct.h
index 62134bdbeda65..0338ac32f9c6a 100644
--- a/drivers/acpi/acpica/acstruct.h
+++ b/drivers/acpi/acpica/acstruct.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/actables.h b/drivers/acpi/acpica/actables.h
index 84a3ceb6e3842..15b23414245aa 100644
--- a/drivers/acpi/acpica/actables.h
+++ b/drivers/acpi/acpica/actables.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/acutils.h b/drivers/acpi/acpica/acutils.h
index b6b29d7178246..00d21d2f766ea 100644
--- a/drivers/acpi/acpica/acutils.h
+++ b/drivers/acpi/acpica/acutils.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/amlcode.h b/drivers/acpi/acpica/amlcode.h
index f54dc5a34bdc4..b0e9492a6297e 100644
--- a/drivers/acpi/acpica/amlcode.h
+++ b/drivers/acpi/acpica/amlcode.h
@@ -7,7 +7,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/amlresrc.h b/drivers/acpi/acpica/amlresrc.h
index 1236e9a414e4d..b680c229ddd59 100644
--- a/drivers/acpi/acpica/amlresrc.h
+++ b/drivers/acpi/acpica/amlresrc.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dbcmds.c b/drivers/acpi/acpica/dbcmds.c
index 5984b90eb5907..4112c85f2aab3 100644
--- a/drivers/acpi/acpica/dbcmds.c
+++ b/drivers/acpi/acpica/dbcmds.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dbconvert.c b/drivers/acpi/acpica/dbconvert.c
index 32d546f0db2f5..27236a6c51ff3 100644
--- a/drivers/acpi/acpica/dbconvert.c
+++ b/drivers/acpi/acpica/dbconvert.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dbdisply.c b/drivers/acpi/acpica/dbdisply.c
index 5a606eac0c224..7df920cda77dd 100644
--- a/drivers/acpi/acpica/dbdisply.c
+++ b/drivers/acpi/acpica/dbdisply.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dbexec.c b/drivers/acpi/acpica/dbexec.c
index ed088fceb18d3..8ad9e6d9e54b3 100644
--- a/drivers/acpi/acpica/dbexec.c
+++ b/drivers/acpi/acpica/dbexec.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dbfileio.c b/drivers/acpi/acpica/dbfileio.c
index cf9607945704c..084bb332f8e26 100644
--- a/drivers/acpi/acpica/dbfileio.c
+++ b/drivers/acpi/acpica/dbfileio.c
@@ -6,7 +6,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dbhistry.c b/drivers/acpi/acpica/dbhistry.c
index 7d08974c64c2c..55c0f2742339e 100644
--- a/drivers/acpi/acpica/dbhistry.c
+++ b/drivers/acpi/acpica/dbhistry.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dbinput.c b/drivers/acpi/acpica/dbinput.c
index 954ca3b981a73..f7c661e06f379 100644
--- a/drivers/acpi/acpica/dbinput.c
+++ b/drivers/acpi/acpica/dbinput.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dbmethod.c b/drivers/acpi/acpica/dbmethod.c
index df62c9245efc2..2cda0bff6f2cd 100644
--- a/drivers/acpi/acpica/dbmethod.c
+++ b/drivers/acpi/acpica/dbmethod.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dbnames.c b/drivers/acpi/acpica/dbnames.c
index 8c207c7725179..8796fc1e0360e 100644
--- a/drivers/acpi/acpica/dbnames.c
+++ b/drivers/acpi/acpica/dbnames.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dbobject.c b/drivers/acpi/acpica/dbobject.c
index e7b415c20aa88..d2063cbab39af 100644
--- a/drivers/acpi/acpica/dbobject.c
+++ b/drivers/acpi/acpica/dbobject.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dbstats.c b/drivers/acpi/acpica/dbstats.c
index 99fb0160b8fb5..d6aaef54e3693 100644
--- a/drivers/acpi/acpica/dbstats.c
+++ b/drivers/acpi/acpica/dbstats.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dbtest.c b/drivers/acpi/acpica/dbtest.c
index c6bee6143266b..56e446b89d18b 100644
--- a/drivers/acpi/acpica/dbtest.c
+++ b/drivers/acpi/acpica/dbtest.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dbutils.c b/drivers/acpi/acpica/dbutils.c
index bfa972b641719..cd40854ee9be6 100644
--- a/drivers/acpi/acpica/dbutils.c
+++ b/drivers/acpi/acpica/dbutils.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dbxface.c b/drivers/acpi/acpica/dbxface.c
index b6985323e7eb9..77bbfa97cf913 100644
--- a/drivers/acpi/acpica/dbxface.c
+++ b/drivers/acpi/acpica/dbxface.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dsargs.c b/drivers/acpi/acpica/dsargs.c
index 2873455c986d7..04a9f60e7ad10 100644
--- a/drivers/acpi/acpica/dsargs.c
+++ b/drivers/acpi/acpica/dsargs.c
@@ -6,7 +6,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dscontrol.c b/drivers/acpi/acpica/dscontrol.c
index 4b6ebc2a28510..606697e741a51 100644
--- a/drivers/acpi/acpica/dscontrol.c
+++ b/drivers/acpi/acpica/dscontrol.c
@@ -6,7 +6,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dsdebug.c b/drivers/acpi/acpica/dsdebug.c
index d1f457eda9805..14ec52eba4087 100644
--- a/drivers/acpi/acpica/dsdebug.c
+++ b/drivers/acpi/acpica/dsdebug.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dsfield.c b/drivers/acpi/acpica/dsfield.c
index 0cab34a593d50..95ea639a94242 100644
--- a/drivers/acpi/acpica/dsfield.c
+++ b/drivers/acpi/acpica/dsfield.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dsinit.c b/drivers/acpi/acpica/dsinit.c
index b1842dd4edf7e..946ff2e130d93 100644
--- a/drivers/acpi/acpica/dsinit.c
+++ b/drivers/acpi/acpica/dsinit.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dsmethod.c b/drivers/acpi/acpica/dsmethod.c
index d7fc36917c671..b9c460c2d7636 100644
--- a/drivers/acpi/acpica/dsmethod.c
+++ b/drivers/acpi/acpica/dsmethod.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dsmthdat.c b/drivers/acpi/acpica/dsmthdat.c
index 27a7de95f7b0a..157f1645d91a5 100644
--- a/drivers/acpi/acpica/dsmthdat.c
+++ b/drivers/acpi/acpica/dsmthdat.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dsobject.c b/drivers/acpi/acpica/dsobject.c
index b21fe084ffc8c..4fa3400a95ba0 100644
--- a/drivers/acpi/acpica/dsobject.c
+++ b/drivers/acpi/acpica/dsobject.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dsopcode.c b/drivers/acpi/acpica/dsopcode.c
index 0336df7ac47dd..0181cd3177511 100644
--- a/drivers/acpi/acpica/dsopcode.c
+++ b/drivers/acpi/acpica/dsopcode.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dspkginit.c b/drivers/acpi/acpica/dspkginit.c
index 5a602b75084ef..902bee78036c2 100644
--- a/drivers/acpi/acpica/dspkginit.c
+++ b/drivers/acpi/acpica/dspkginit.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dsutils.c b/drivers/acpi/acpica/dsutils.c
index 4c5faf629a831..a4ce0b4a55a61 100644
--- a/drivers/acpi/acpica/dsutils.c
+++ b/drivers/acpi/acpica/dsutils.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dswexec.c b/drivers/acpi/acpica/dswexec.c
index 22f45d0907332..2c07d220a50fe 100644
--- a/drivers/acpi/acpica/dswexec.c
+++ b/drivers/acpi/acpica/dswexec.c
@@ -6,7 +6,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dswload.c b/drivers/acpi/acpica/dswload.c
index 5771e4e4a99af..fa4ef9229e171 100644
--- a/drivers/acpi/acpica/dswload.c
+++ b/drivers/acpi/acpica/dswload.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dswload2.c b/drivers/acpi/acpica/dswload2.c
index b3d0aaec8203a..3b1313ba60d0a 100644
--- a/drivers/acpi/acpica/dswload2.c
+++ b/drivers/acpi/acpica/dswload2.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dswscope.c b/drivers/acpi/acpica/dswscope.c
index 3e081983d2ee6..8b5c3613c0608 100644
--- a/drivers/acpi/acpica/dswscope.c
+++ b/drivers/acpi/acpica/dswscope.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/dswstate.c b/drivers/acpi/acpica/dswstate.c
index da111a1f5bfbc..ee002d17526e0 100644
--- a/drivers/acpi/acpica/dswstate.c
+++ b/drivers/acpi/acpica/dswstate.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/evevent.c b/drivers/acpi/acpica/evevent.c
index d3b6b314fa507..4b2b0b44a16b6 100644
--- a/drivers/acpi/acpica/evevent.c
+++ b/drivers/acpi/acpica/evevent.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/evglock.c b/drivers/acpi/acpica/evglock.c
index 0ce33b0f430c4..012b80de15013 100644
--- a/drivers/acpi/acpica/evglock.c
+++ b/drivers/acpi/acpica/evglock.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/evgpe.c b/drivers/acpi/acpica/evgpe.c
index 263d8fc4a9e2f..410a3907c0518 100644
--- a/drivers/acpi/acpica/evgpe.c
+++ b/drivers/acpi/acpica/evgpe.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/evgpeblk.c b/drivers/acpi/acpica/evgpeblk.c
index 3a3cb8624f419..7ce756cc28abb 100644
--- a/drivers/acpi/acpica/evgpeblk.c
+++ b/drivers/acpi/acpica/evgpeblk.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/evgpeinit.c b/drivers/acpi/acpica/evgpeinit.c
index 8649c6242478e..8ad4816c99500 100644
--- a/drivers/acpi/acpica/evgpeinit.c
+++ b/drivers/acpi/acpica/evgpeinit.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/evgpeutil.c b/drivers/acpi/acpica/evgpeutil.c
index c8adb400330af..729a8960a3af4 100644
--- a/drivers/acpi/acpica/evgpeutil.c
+++ b/drivers/acpi/acpica/evgpeutil.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/evhandler.c b/drivers/acpi/acpica/evhandler.c
index 2db61ef1b4a3b..20fb51c06b8dd 100644
--- a/drivers/acpi/acpica/evhandler.c
+++ b/drivers/acpi/acpica/evhandler.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/evmisc.c b/drivers/acpi/acpica/evmisc.c
index 4f6bb3f016abd..40d0b1f541a04 100644
--- a/drivers/acpi/acpica/evmisc.c
+++ b/drivers/acpi/acpica/evmisc.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/evregion.c b/drivers/acpi/acpica/evregion.c
index bb58419f0d610..de196c8e3f304 100644
--- a/drivers/acpi/acpica/evregion.c
+++ b/drivers/acpi/acpica/evregion.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/evrgnini.c b/drivers/acpi/acpica/evrgnini.c
index 93ec528bcd9a9..4187f563fede2 100644
--- a/drivers/acpi/acpica/evrgnini.c
+++ b/drivers/acpi/acpica/evrgnini.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/evsci.c b/drivers/acpi/acpica/evsci.c
index 8ce73b962006f..d5594f79f877a 100644
--- a/drivers/acpi/acpica/evsci.c
+++ b/drivers/acpi/acpica/evsci.c
@@ -6,7 +6,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/evxface.c b/drivers/acpi/acpica/evxface.c
index dd1b9dd64cef8..9b3c01bf1438e 100644
--- a/drivers/acpi/acpica/evxface.c
+++ b/drivers/acpi/acpica/evxface.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/evxfevnt.c b/drivers/acpi/acpica/evxfevnt.c
index c773ac4892cb7..96c2520f95706 100644
--- a/drivers/acpi/acpica/evxfevnt.c
+++ b/drivers/acpi/acpica/evxfevnt.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/evxfgpe.c b/drivers/acpi/acpica/evxfgpe.c
index 67c7c4ce276c7..cbb1598df9dcf 100644
--- a/drivers/acpi/acpica/evxfgpe.c
+++ b/drivers/acpi/acpica/evxfgpe.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/evxfregn.c b/drivers/acpi/acpica/evxfregn.c
index beba9d56a0d87..705fcd86151ae 100644
--- a/drivers/acpi/acpica/evxfregn.c
+++ b/drivers/acpi/acpica/evxfregn.c
@@ -6,7 +6,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exconcat.c b/drivers/acpi/acpica/exconcat.c
index 59b8de2f07d39..ea20e10dd1f2b 100644
--- a/drivers/acpi/acpica/exconcat.c
+++ b/drivers/acpi/acpica/exconcat.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exconfig.c b/drivers/acpi/acpica/exconfig.c
index 61813bd43f9e4..827f47b726632 100644
--- a/drivers/acpi/acpica/exconfig.c
+++ b/drivers/acpi/acpica/exconfig.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exconvrt.c b/drivers/acpi/acpica/exconvrt.c
index 23ebadb06a95b..9abcc41a573bf 100644
--- a/drivers/acpi/acpica/exconvrt.c
+++ b/drivers/acpi/acpica/exconvrt.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/excreate.c b/drivers/acpi/acpica/excreate.c
index b8adb11f1b075..3dece45dd997e 100644
--- a/drivers/acpi/acpica/excreate.c
+++ b/drivers/acpi/acpica/excreate.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exdebug.c b/drivers/acpi/acpica/exdebug.c
index a8191d2ca5e36..d931a66a16e3c 100644
--- a/drivers/acpi/acpica/exdebug.c
+++ b/drivers/acpi/acpica/exdebug.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exdump.c b/drivers/acpi/acpica/exdump.c
index b2ff61bdb9a8f..4989ce9591aef 100644
--- a/drivers/acpi/acpica/exdump.c
+++ b/drivers/acpi/acpica/exdump.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exfield.c b/drivers/acpi/acpica/exfield.c
index 5fda981f64988..e3b0650e5bb63 100644
--- a/drivers/acpi/acpica/exfield.c
+++ b/drivers/acpi/acpica/exfield.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exfldio.c b/drivers/acpi/acpica/exfldio.c
index a656608dca849..3d0f274be88b2 100644
--- a/drivers/acpi/acpica/exfldio.c
+++ b/drivers/acpi/acpica/exfldio.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exmisc.c b/drivers/acpi/acpica/exmisc.c
index dbad3ebd7df50..1518fcb22ae12 100644
--- a/drivers/acpi/acpica/exmisc.c
+++ b/drivers/acpi/acpica/exmisc.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exmutex.c b/drivers/acpi/acpica/exmutex.c
index ecd95b3f35f19..24c9741dee482 100644
--- a/drivers/acpi/acpica/exmutex.c
+++ b/drivers/acpi/acpica/exmutex.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exnames.c b/drivers/acpi/acpica/exnames.c
index caa5ed1f65eca..c7b249cda5c0e 100644
--- a/drivers/acpi/acpica/exnames.c
+++ b/drivers/acpi/acpica/exnames.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exoparg1.c b/drivers/acpi/acpica/exoparg1.c
index f787651348c11..dae01c93e4800 100644
--- a/drivers/acpi/acpica/exoparg1.c
+++ b/drivers/acpi/acpica/exoparg1.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exoparg2.c b/drivers/acpi/acpica/exoparg2.c
index 57980b7d35940..3cafa1d6f31a5 100644
--- a/drivers/acpi/acpica/exoparg2.c
+++ b/drivers/acpi/acpica/exoparg2.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exoparg3.c b/drivers/acpi/acpica/exoparg3.c
index ce857addc8dbd..f16c655121ffd 100644
--- a/drivers/acpi/acpica/exoparg3.c
+++ b/drivers/acpi/acpica/exoparg3.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exoparg6.c b/drivers/acpi/acpica/exoparg6.c
index 688032b58a213..8b39fffce6dcd 100644
--- a/drivers/acpi/acpica/exoparg6.c
+++ b/drivers/acpi/acpica/exoparg6.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exprep.c b/drivers/acpi/acpica/exprep.c
index 8de060664204e..1d1040f2e3f87 100644
--- a/drivers/acpi/acpica/exprep.c
+++ b/drivers/acpi/acpica/exprep.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exregion.c b/drivers/acpi/acpica/exregion.c
index 7bcc9d809b7e9..387c438aa4853 100644
--- a/drivers/acpi/acpica/exregion.c
+++ b/drivers/acpi/acpica/exregion.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exresnte.c b/drivers/acpi/acpica/exresnte.c
index 91c1de046442f..77fa8d9aa5bf7 100644
--- a/drivers/acpi/acpica/exresnte.c
+++ b/drivers/acpi/acpica/exresnte.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exresolv.c b/drivers/acpi/acpica/exresolv.c
index 5e1854ea85f60..b104bc3ca809b 100644
--- a/drivers/acpi/acpica/exresolv.c
+++ b/drivers/acpi/acpica/exresolv.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exresop.c b/drivers/acpi/acpica/exresop.c
index 1c7c9962b0de7..2643d34f194d9 100644
--- a/drivers/acpi/acpica/exresop.c
+++ b/drivers/acpi/acpica/exresop.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exstore.c b/drivers/acpi/acpica/exstore.c
index bdd43cde8f364..8f106bdcad5fb 100644
--- a/drivers/acpi/acpica/exstore.c
+++ b/drivers/acpi/acpica/exstore.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exstoren.c b/drivers/acpi/acpica/exstoren.c
index 56f59cf5da293..3d458d1996b0f 100644
--- a/drivers/acpi/acpica/exstoren.c
+++ b/drivers/acpi/acpica/exstoren.c
@@ -6,7 +6,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exstorob.c b/drivers/acpi/acpica/exstorob.c
index 4ba7fcbf23b05..905443a3c28f7 100644
--- a/drivers/acpi/acpica/exstorob.c
+++ b/drivers/acpi/acpica/exstorob.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exsystem.c b/drivers/acpi/acpica/exsystem.c
index ad3b610057f3d..420d9b145d2ed 100644
--- a/drivers/acpi/acpica/exsystem.c
+++ b/drivers/acpi/acpica/exsystem.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/extrace.c b/drivers/acpi/acpica/extrace.c
index ae9df8672d9ef..9a67d507a132f 100644
--- a/drivers/acpi/acpica/extrace.c
+++ b/drivers/acpi/acpica/extrace.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/exutils.c b/drivers/acpi/acpica/exutils.c
index 34d608358eaf0..fb80d3f55d63f 100644
--- a/drivers/acpi/acpica/exutils.c
+++ b/drivers/acpi/acpica/exutils.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/hwacpi.c b/drivers/acpi/acpica/hwacpi.c
index fad249e774b41..68e958d4c25f2 100644
--- a/drivers/acpi/acpica/hwacpi.c
+++ b/drivers/acpi/acpica/hwacpi.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/hwesleep.c b/drivers/acpi/acpica/hwesleep.c
index 12626d021a9b5..64855b62a5aeb 100644
--- a/drivers/acpi/acpica/hwesleep.c
+++ b/drivers/acpi/acpica/hwesleep.c
@@ -6,7 +6,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/hwgpe.c b/drivers/acpi/acpica/hwgpe.c
index 09b6822aa5cc2..c1c54af148d0b 100644
--- a/drivers/acpi/acpica/hwgpe.c
+++ b/drivers/acpi/acpica/hwgpe.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/hwpci.c b/drivers/acpi/acpica/hwpci.c
index 283819930be6b..faa2fa45eb1c8 100644
--- a/drivers/acpi/acpica/hwpci.c
+++ b/drivers/acpi/acpica/hwpci.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c
index aa6e000819155..f3e7b7851a3a1 100644
--- a/drivers/acpi/acpica/hwregs.c
+++ b/drivers/acpi/acpica/hwregs.c
@@ -6,7 +6,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c
index 1fe7387a00e67..c85c373ecbc4e 100644
--- a/drivers/acpi/acpica/hwsleep.c
+++ b/drivers/acpi/acpica/hwsleep.c
@@ -6,7 +6,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/hwtimer.c b/drivers/acpi/acpica/hwtimer.c
index 5b4282902a839..511e3b8ffc6d9 100644
--- a/drivers/acpi/acpica/hwtimer.c
+++ b/drivers/acpi/acpica/hwtimer.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/hwvalid.c b/drivers/acpi/acpica/hwvalid.c
index d1679035d5f31..65d82e6add0b1 100644
--- a/drivers/acpi/acpica/hwvalid.c
+++ b/drivers/acpi/acpica/hwvalid.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c
index b3c6e439933c5..d320b129b7d7d 100644
--- a/drivers/acpi/acpica/hwxface.c
+++ b/drivers/acpi/acpica/hwxface.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/hwxfsleep.c b/drivers/acpi/acpica/hwxfsleep.c
index e5c095ca6083a..589c774bbf9a7 100644
--- a/drivers/acpi/acpica/hwxfsleep.c
+++ b/drivers/acpi/acpica/hwxfsleep.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nsaccess.c b/drivers/acpi/acpica/nsaccess.c
index 33e652a12fca5..07f672b5a1d1b 100644
--- a/drivers/acpi/acpica/nsaccess.c
+++ b/drivers/acpi/acpica/nsaccess.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nsalloc.c b/drivers/acpi/acpica/nsalloc.c
index 8ba5b32c9f717..ce57ccf4c1bf1 100644
--- a/drivers/acpi/acpica/nsalloc.c
+++ b/drivers/acpi/acpica/nsalloc.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nsarguments.c b/drivers/acpi/acpica/nsarguments.c
index 67b7370dcae52..ce296ac14cf06 100644
--- a/drivers/acpi/acpica/nsarguments.c
+++ b/drivers/acpi/acpica/nsarguments.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nsconvert.c b/drivers/acpi/acpica/nsconvert.c
index d55dcc82f4348..2f9d5d190fa96 100644
--- a/drivers/acpi/acpica/nsconvert.c
+++ b/drivers/acpi/acpica/nsconvert.c
@@ -6,7 +6,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nsdump.c b/drivers/acpi/acpica/nsdump.c
index 4123b5077a7d4..e2ac16818dc3b 100644
--- a/drivers/acpi/acpica/nsdump.c
+++ b/drivers/acpi/acpica/nsdump.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nsdumpdv.c b/drivers/acpi/acpica/nsdumpdv.c
index 5026594763eaf..09ac00dee4506 100644
--- a/drivers/acpi/acpica/nsdumpdv.c
+++ b/drivers/acpi/acpica/nsdumpdv.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nseval.c b/drivers/acpi/acpica/nseval.c
index d22167cbd0ca6..c2d883b8c45e9 100644
--- a/drivers/acpi/acpica/nseval.c
+++ b/drivers/acpi/acpica/nseval.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nsinit.c b/drivers/acpi/acpica/nsinit.c
index 9c62979497122..c17af4a3ab679 100644
--- a/drivers/acpi/acpica/nsinit.c
+++ b/drivers/acpi/acpica/nsinit.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nsload.c b/drivers/acpi/acpica/nsload.c
index d2915e186ae14..fdfe9309bd330 100644
--- a/drivers/acpi/acpica/nsload.c
+++ b/drivers/acpi/acpica/nsload.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nsnames.c b/drivers/acpi/acpica/nsnames.c
index 22c92d1a24d8f..c686eda7ca66f 100644
--- a/drivers/acpi/acpica/nsnames.c
+++ b/drivers/acpi/acpica/nsnames.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nsobject.c b/drivers/acpi/acpica/nsobject.c
index 707b2aa501e1b..757e44555ec30 100644
--- a/drivers/acpi/acpica/nsobject.c
+++ b/drivers/acpi/acpica/nsobject.c
@@ -6,7 +6,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nsparse.c b/drivers/acpi/acpica/nsparse.c
index 2fc33a5203f40..c5b22ea5b3695 100644
--- a/drivers/acpi/acpica/nsparse.c
+++ b/drivers/acpi/acpica/nsparse.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nspredef.c b/drivers/acpi/acpica/nspredef.c
index 9d14b509529e3..4f1f6d6d9ddff 100644
--- a/drivers/acpi/acpica/nspredef.c
+++ b/drivers/acpi/acpica/nspredef.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nsprepkg.c b/drivers/acpi/acpica/nsprepkg.c
index a8ea8fb1d2994..7805d5ce81272 100644
--- a/drivers/acpi/acpica/nsprepkg.c
+++ b/drivers/acpi/acpica/nsprepkg.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nsrepair.c b/drivers/acpi/acpica/nsrepair.c
index 418ef2ac82abe..7b6b6d281f1cb 100644
--- a/drivers/acpi/acpica/nsrepair.c
+++ b/drivers/acpi/acpica/nsrepair.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nsrepair2.c b/drivers/acpi/acpica/nsrepair2.c
index 06037e0446941..29c3973c78153 100644
--- a/drivers/acpi/acpica/nsrepair2.c
+++ b/drivers/acpi/acpica/nsrepair2.c
@@ -6,7 +6,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nssearch.c b/drivers/acpi/acpica/nssearch.c
index e91dbee9235f3..a469447f5c02d 100644
--- a/drivers/acpi/acpica/nssearch.c
+++ b/drivers/acpi/acpica/nssearch.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nsutils.c b/drivers/acpi/acpica/nsutils.c
index b43fe5fce64ba..0487fdb59b0ee 100644
--- a/drivers/acpi/acpica/nsutils.c
+++ b/drivers/acpi/acpica/nsutils.c
@@ -6,7 +6,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nswalk.c b/drivers/acpi/acpica/nswalk.c
index 6b6e6f498cffb..dd7ae1bc8af8c 100644
--- a/drivers/acpi/acpica/nswalk.c
+++ b/drivers/acpi/acpica/nswalk.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nsxfeval.c b/drivers/acpi/acpica/nsxfeval.c
index 9b51f65823b29..1075bd9541f56 100644
--- a/drivers/acpi/acpica/nsxfeval.c
+++ b/drivers/acpi/acpica/nsxfeval.c
@@ -6,7 +6,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nsxfname.c b/drivers/acpi/acpica/nsxfname.c
index 1069662358050..e9603fc9586ce 100644
--- a/drivers/acpi/acpica/nsxfname.c
+++ b/drivers/acpi/acpica/nsxfname.c
@@ -6,7 +6,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/nsxfobj.c b/drivers/acpi/acpica/nsxfobj.c
index 47f689ec3fcbc..ac1fbf767cac3 100644
--- a/drivers/acpi/acpica/nsxfobj.c
+++ b/drivers/acpi/acpica/nsxfobj.c
@@ -6,7 +6,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/psargs.c b/drivers/acpi/acpica/psargs.c
index 171e2faa7c506..dbc51bc5fdd67 100644
--- a/drivers/acpi/acpica/psargs.c
+++ b/drivers/acpi/acpica/psargs.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/psloop.c b/drivers/acpi/acpica/psloop.c
index bb04dec168ad7..7dca287d7690a 100644
--- a/drivers/acpi/acpica/psloop.c
+++ b/drivers/acpi/acpica/psloop.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/psobject.c b/drivers/acpi/acpica/psobject.c
index c0b179883ff25..b18f1e0489854 100644
--- a/drivers/acpi/acpica/psobject.c
+++ b/drivers/acpi/acpica/psobject.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/psopcode.c b/drivers/acpi/acpica/psopcode.c
index a402ad772a1e5..d31f3eb232254 100644
--- a/drivers/acpi/acpica/psopcode.c
+++ b/drivers/acpi/acpica/psopcode.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/psopinfo.c b/drivers/acpi/acpica/psopinfo.c
index eff22950232b6..1dc1fc79297eb 100644
--- a/drivers/acpi/acpica/psopinfo.c
+++ b/drivers/acpi/acpica/psopinfo.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/psparse.c b/drivers/acpi/acpica/psparse.c
index ac88319dc1114..2474ff9612940 100644
--- a/drivers/acpi/acpica/psparse.c
+++ b/drivers/acpi/acpica/psparse.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/psscope.c b/drivers/acpi/acpica/psscope.c
index 22d7f1d6849b3..f49cdcc65700a 100644
--- a/drivers/acpi/acpica/psscope.c
+++ b/drivers/acpi/acpica/psscope.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/pstree.c b/drivers/acpi/acpica/pstree.c
index c06d6e2fc7a5d..f9fa88c79b32d 100644
--- a/drivers/acpi/acpica/pstree.c
+++ b/drivers/acpi/acpica/pstree.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/psutils.c b/drivers/acpi/acpica/psutils.c
index cd59dfe6a47d9..fe151f42de3a5 100644
--- a/drivers/acpi/acpica/psutils.c
+++ b/drivers/acpi/acpica/psutils.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/pswalk.c b/drivers/acpi/acpica/pswalk.c
index 22a37c82af19a..bc5c779e54e80 100644
--- a/drivers/acpi/acpica/pswalk.c
+++ b/drivers/acpi/acpica/pswalk.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/psxface.c b/drivers/acpi/acpica/psxface.c
index c88a681586bf3..d2270ade5cf82 100644
--- a/drivers/acpi/acpica/psxface.c
+++ b/drivers/acpi/acpica/psxface.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/rsaddr.c b/drivers/acpi/acpica/rsaddr.c
index a131a28bb09d6..213bad89675b7 100644
--- a/drivers/acpi/acpica/rsaddr.c
+++ b/drivers/acpi/acpica/rsaddr.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/rscalc.c b/drivers/acpi/acpica/rscalc.c
index 659fb718504a6..576f7aae162b4 100644
--- a/drivers/acpi/acpica/rscalc.c
+++ b/drivers/acpi/acpica/rscalc.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/rscreate.c b/drivers/acpi/acpica/rscreate.c
index f72ff0b54a639..fe07001ea8651 100644
--- a/drivers/acpi/acpica/rscreate.c
+++ b/drivers/acpi/acpica/rscreate.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/rsdump.c b/drivers/acpi/acpica/rsdump.c
index 55fd1880efbec..bc4c4755aeb9e 100644
--- a/drivers/acpi/acpica/rsdump.c
+++ b/drivers/acpi/acpica/rsdump.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/rsdumpinfo.c b/drivers/acpi/acpica/rsdumpinfo.c
index da150e17795b9..c4a2a08e31acc 100644
--- a/drivers/acpi/acpica/rsdumpinfo.c
+++ b/drivers/acpi/acpica/rsdumpinfo.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/rsinfo.c b/drivers/acpi/acpica/rsinfo.c
index b0e50518d7666..e819bb0f45af5 100644
--- a/drivers/acpi/acpica/rsinfo.c
+++ b/drivers/acpi/acpica/rsinfo.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/rsio.c b/drivers/acpi/acpica/rsio.c
index b7a47fbc519b6..eafd993592f6a 100644
--- a/drivers/acpi/acpica/rsio.c
+++ b/drivers/acpi/acpica/rsio.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/rsirq.c b/drivers/acpi/acpica/rsirq.c
index 092a733c42b80..aabd73298eb8b 100644
--- a/drivers/acpi/acpica/rsirq.c
+++ b/drivers/acpi/acpica/rsirq.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/rslist.c b/drivers/acpi/acpica/rslist.c
index 36a6657dd34d2..11214780ea8fd 100644
--- a/drivers/acpi/acpica/rslist.c
+++ b/drivers/acpi/acpica/rslist.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/rsmemory.c b/drivers/acpi/acpica/rsmemory.c
index 273eecb3001b4..05e375abc6b56 100644
--- a/drivers/acpi/acpica/rsmemory.c
+++ b/drivers/acpi/acpica/rsmemory.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/rsmisc.c b/drivers/acpi/acpica/rsmisc.c
index cc4b5486c4bcc..7b4627181cc6f 100644
--- a/drivers/acpi/acpica/rsmisc.c
+++ b/drivers/acpi/acpica/rsmisc.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/rsserial.c b/drivers/acpi/acpica/rsserial.c
index 14d12d6eb716f..87dac2812072b 100644
--- a/drivers/acpi/acpica/rsserial.c
+++ b/drivers/acpi/acpica/rsserial.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/rsutils.c b/drivers/acpi/acpica/rsutils.c
index b2aeca01204a0..49ff7f851d58c 100644
--- a/drivers/acpi/acpica/rsutils.c
+++ b/drivers/acpi/acpica/rsutils.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/rsxface.c b/drivers/acpi/acpica/rsxface.c
index be65e65e216eb..3b481f0b81c57 100644
--- a/drivers/acpi/acpica/rsxface.c
+++ b/drivers/acpi/acpica/rsxface.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/tbdata.c b/drivers/acpi/acpica/tbdata.c
index b19a2f0ea331d..ec69267f1447a 100644
--- a/drivers/acpi/acpica/tbdata.c
+++ b/drivers/acpi/acpica/tbdata.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/tbfadt.c b/drivers/acpi/acpica/tbfadt.c
index 5f051d82188d1..d1763c5e4e913 100644
--- a/drivers/acpi/acpica/tbfadt.c
+++ b/drivers/acpi/acpica/tbfadt.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/tbfind.c b/drivers/acpi/acpica/tbfind.c
index fea89c8d305cb..999a64a48e1a2 100644
--- a/drivers/acpi/acpica/tbfind.c
+++ b/drivers/acpi/acpica/tbfind.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/tbinstal.c b/drivers/acpi/acpica/tbinstal.c
index 0dfc0ac3c141f..00be16da1ee2c 100644
--- a/drivers/acpi/acpica/tbinstal.c
+++ b/drivers/acpi/acpica/tbinstal.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/tbprint.c b/drivers/acpi/acpica/tbprint.c
index edfd7b10be19f..8cdcdd2c46975 100644
--- a/drivers/acpi/acpica/tbprint.c
+++ b/drivers/acpi/acpica/tbprint.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/tbutils.c b/drivers/acpi/acpica/tbutils.c
index 0c6768d203958..30d40ff8992b4 100644
--- a/drivers/acpi/acpica/tbutils.c
+++ b/drivers/acpi/acpica/tbutils.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/tbxface.c b/drivers/acpi/acpica/tbxface.c
index 5ecb8d2e68347..dca91b6f8cc2b 100644
--- a/drivers/acpi/acpica/tbxface.c
+++ b/drivers/acpi/acpica/tbxface.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/tbxfload.c b/drivers/acpi/acpica/tbxfload.c
index d81f442228b8b..e09b4b26300e7 100644
--- a/drivers/acpi/acpica/tbxfload.c
+++ b/drivers/acpi/acpica/tbxfload.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/tbxfroot.c b/drivers/acpi/acpica/tbxfroot.c
index f9f9a7da2cade..abf3c62e1e800 100644
--- a/drivers/acpi/acpica/tbxfroot.c
+++ b/drivers/acpi/acpica/tbxfroot.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utaddress.c b/drivers/acpi/acpica/utaddress.c
index 26a0633115be3..d8540f380ae5f 100644
--- a/drivers/acpi/acpica/utaddress.c
+++ b/drivers/acpi/acpica/utaddress.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utalloc.c b/drivers/acpi/acpica/utalloc.c
index 5594a359dbf17..12fbaddbfb0db 100644
--- a/drivers/acpi/acpica/utalloc.c
+++ b/drivers/acpi/acpica/utalloc.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utascii.c b/drivers/acpi/acpica/utascii.c
index 909bdb1986515..95565e46a6956 100644
--- a/drivers/acpi/acpica/utascii.c
+++ b/drivers/acpi/acpica/utascii.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utbuffer.c b/drivers/acpi/acpica/utbuffer.c
index f17eaa009dde7..2c5a14c2f46bd 100644
--- a/drivers/acpi/acpica/utbuffer.c
+++ b/drivers/acpi/acpica/utbuffer.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utcache.c b/drivers/acpi/acpica/utcache.c
index 531493306dee9..08e6944404b3e 100644
--- a/drivers/acpi/acpica/utcache.c
+++ b/drivers/acpi/acpica/utcache.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utcopy.c b/drivers/acpi/acpica/utcopy.c
index e9382255d6c6c..01434af99035d 100644
--- a/drivers/acpi/acpica/utcopy.c
+++ b/drivers/acpi/acpica/utcopy.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utdebug.c b/drivers/acpi/acpica/utdebug.c
index cff7154b7feea..2201be1bf4c29 100644
--- a/drivers/acpi/acpica/utdebug.c
+++ b/drivers/acpi/acpica/utdebug.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utdecode.c b/drivers/acpi/acpica/utdecode.c
index 55debbad487dc..1a3f316a18a8e 100644
--- a/drivers/acpi/acpica/utdecode.c
+++ b/drivers/acpi/acpica/utdecode.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utdelete.c b/drivers/acpi/acpica/utdelete.c
index c6eb9fae70f9a..7d8d0208f0a35 100644
--- a/drivers/acpi/acpica/utdelete.c
+++ b/drivers/acpi/acpica/utdelete.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/uterror.c b/drivers/acpi/acpica/uterror.c
index 42388dcb5ccc9..ce5e891291bf8 100644
--- a/drivers/acpi/acpica/uterror.c
+++ b/drivers/acpi/acpica/uterror.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/uteval.c b/drivers/acpi/acpica/uteval.c
index 3fce7519c6902..b8be0b82a130e 100644
--- a/drivers/acpi/acpica/uteval.c
+++ b/drivers/acpi/acpica/uteval.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utexcep.c b/drivers/acpi/acpica/utexcep.c
index eb6dcab33d2f8..e3dbad8b73e56 100644
--- a/drivers/acpi/acpica/utexcep.c
+++ b/drivers/acpi/acpica/utexcep.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utglobal.c b/drivers/acpi/acpica/utglobal.c
index 230a50c82f22c..933595b0e5942 100644
--- a/drivers/acpi/acpica/utglobal.c
+++ b/drivers/acpi/acpica/utglobal.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/uthex.c b/drivers/acpi/acpica/uthex.c
index fb406daf47fac..f5886d557a942 100644
--- a/drivers/acpi/acpica/uthex.c
+++ b/drivers/acpi/acpica/uthex.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utids.c b/drivers/acpi/acpica/utids.c
index a6eb580ee21d7..db3c3c1d33da1 100644
--- a/drivers/acpi/acpica/utids.c
+++ b/drivers/acpi/acpica/utids.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utinit.c b/drivers/acpi/acpica/utinit.c
index 45eeb0dcf283b..a2005b0303474 100644
--- a/drivers/acpi/acpica/utinit.c
+++ b/drivers/acpi/acpica/utinit.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utlock.c b/drivers/acpi/acpica/utlock.c
index db2d9910866ee..0636074a4c23f 100644
--- a/drivers/acpi/acpica/utlock.c
+++ b/drivers/acpi/acpica/utlock.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utmath.c b/drivers/acpi/acpica/utmath.c
index 2055a858e5f59..eddf719904331 100644
--- a/drivers/acpi/acpica/utmath.c
+++ b/drivers/acpi/acpica/utmath.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utmisc.c b/drivers/acpi/acpica/utmisc.c
index 45c78c2adbf0f..a331313ad5fa2 100644
--- a/drivers/acpi/acpica/utmisc.c
+++ b/drivers/acpi/acpica/utmisc.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c
index 524ba931d5e83..6767bd1626f70 100644
--- a/drivers/acpi/acpica/utmutex.c
+++ b/drivers/acpi/acpica/utmutex.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utnonansi.c b/drivers/acpi/acpica/utnonansi.c
index 33a0970646df5..94219610e259e 100644
--- a/drivers/acpi/acpica/utnonansi.c
+++ b/drivers/acpi/acpica/utnonansi.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utobject.c b/drivers/acpi/acpica/utobject.c
index cb3db9fed50d1..375901c0a596e 100644
--- a/drivers/acpi/acpica/utobject.c
+++ b/drivers/acpi/acpica/utobject.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utosi.c b/drivers/acpi/acpica/utosi.c
index f6b8dd24b006c..00ea104f6a0a9 100644
--- a/drivers/acpi/acpica/utosi.c
+++ b/drivers/acpi/acpica/utosi.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utownerid.c b/drivers/acpi/acpica/utownerid.c
index 1b3ee74a87ebf..9923dfa708be8 100644
--- a/drivers/acpi/acpica/utownerid.c
+++ b/drivers/acpi/acpica/utownerid.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utpredef.c b/drivers/acpi/acpica/utpredef.c
index 350709f23e4cf..ae6fef02b692b 100644
--- a/drivers/acpi/acpica/utpredef.c
+++ b/drivers/acpi/acpica/utpredef.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utprint.c b/drivers/acpi/acpica/utprint.c
index c008589b41bd9..ac07700f5b799 100644
--- a/drivers/acpi/acpica/utprint.c
+++ b/drivers/acpi/acpica/utprint.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utresdecode.c b/drivers/acpi/acpica/utresdecode.c
index e15a2538558b3..93fa3450ca884 100644
--- a/drivers/acpi/acpica/utresdecode.c
+++ b/drivers/acpi/acpica/utresdecode.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utresrc.c b/drivers/acpi/acpica/utresrc.c
index f9801d13547fe..4d289d9c734cf 100644
--- a/drivers/acpi/acpica/utresrc.c
+++ b/drivers/acpi/acpica/utresrc.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utstate.c b/drivers/acpi/acpica/utstate.c
index eafabcd2fada3..7750c48739d8c 100644
--- a/drivers/acpi/acpica/utstate.c
+++ b/drivers/acpi/acpica/utstate.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utstring.c b/drivers/acpi/acpica/utstring.c
index 9eacbcb9e4f45..a9507d1976ffa 100644
--- a/drivers/acpi/acpica/utstring.c
+++ b/drivers/acpi/acpica/utstring.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utstrsuppt.c b/drivers/acpi/acpica/utstrsuppt.c
index 97f48d71f9e60..6fc76f0b60e9d 100644
--- a/drivers/acpi/acpica/utstrsuppt.c
+++ b/drivers/acpi/acpica/utstrsuppt.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utstrtoul64.c b/drivers/acpi/acpica/utstrtoul64.c
index e2067dcb93893..9f7cef1de34aa 100644
--- a/drivers/acpi/acpica/utstrtoul64.c
+++ b/drivers/acpi/acpica/utstrtoul64.c
@@ -6,7 +6,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/uttrack.c b/drivers/acpi/acpica/uttrack.c
index 633b4e2c669f6..8cc70ca4e0fb8 100644
--- a/drivers/acpi/acpica/uttrack.c
+++ b/drivers/acpi/acpica/uttrack.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utuuid.c b/drivers/acpi/acpica/utuuid.c
index 5028e06718b1c..95946fdb55d5b 100644
--- a/drivers/acpi/acpica/utuuid.c
+++ b/drivers/acpi/acpica/utuuid.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utxface.c b/drivers/acpi/acpica/utxface.c
index 6b9ba4029f8e9..25ef2ce646036 100644
--- a/drivers/acpi/acpica/utxface.c
+++ b/drivers/acpi/acpica/utxface.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utxferror.c b/drivers/acpi/acpica/utxferror.c
index 9da4f8ef2e77b..a78861ded8940 100644
--- a/drivers/acpi/acpica/utxferror.c
+++ b/drivers/acpi/acpica/utxferror.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utxfinit.c b/drivers/acpi/acpica/utxfinit.c
index 6d5180601cf2a..e727db52a55ed 100644
--- a/drivers/acpi/acpica/utxfinit.c
+++ b/drivers/acpi/acpica/utxfinit.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/drivers/acpi/acpica/utxfmutex.c b/drivers/acpi/acpica/utxfmutex.c
index 0b85f113f7262..764782fcf1bda 100644
--- a/drivers/acpi/acpica/utxfmutex.c
+++ b/drivers/acpi/acpica/utxfmutex.c
@@ -5,7 +5,7 @@
  ******************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/acbuffer.h b/include/acpi/acbuffer.h
index c77b91ff1149c..f2eac81bfbd21 100644
--- a/include/acpi/acbuffer.h
+++ b/include/acpi/acbuffer.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/acconfig.h b/include/acpi/acconfig.h
index ffe364fa4040c..1adcda4bfb541 100644
--- a/include/acpi/acconfig.h
+++ b/include/acpi/acconfig.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/acexcep.h b/include/acpi/acexcep.h
index 3c46f0ef5f7a7..bb3e746dad342 100644
--- a/include/acpi/acexcep.h
+++ b/include/acpi/acexcep.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/acnames.h b/include/acpi/acnames.h
index d8dd3bf51ca73..d497b9bcd84ef 100644
--- a/include/acpi/acnames.h
+++ b/include/acpi/acnames.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/acoutput.h b/include/acpi/acoutput.h
index c2e664e74075c..6eceb69cffc4c 100644
--- a/include/acpi/acoutput.h
+++ b/include/acpi/acoutput.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/acpi.h b/include/acpi/acpi.h
index 0887d7cbb7e14..fae98927052fc 100644
--- a/include/acpi/acpi.h
+++ b/include/acpi/acpi.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/acpiosxf.h b/include/acpi/acpiosxf.h
index d5c0f5153c4e5..d6345e9870a1a 100644
--- a/include/acpi/acpiosxf.h
+++ b/include/acpi/acpiosxf.h
@@ -7,7 +7,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h
index c589c3e12d90b..285d0c602b800 100644
--- a/include/acpi/acpixf.h
+++ b/include/acpi/acpixf.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/acrestyp.h b/include/acpi/acrestyp.h
index 343dbdcef20c6..1becc88da82f8 100644
--- a/include/acpi/acrestyp.h
+++ b/include/acpi/acrestyp.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/actbl.h b/include/acpi/actbl.h
index 89509b86cb54b..17c4d1fdc88d5 100644
--- a/include/acpi/actbl.h
+++ b/include/acpi/actbl.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h
index 4c304bf4d5915..a398d5938f340 100644
--- a/include/acpi/actbl1.h
+++ b/include/acpi/actbl1.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
index 0d60d5df14f8b..366a491f2af03 100644
--- a/include/acpi/actbl2.h
+++ b/include/acpi/actbl2.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/actbl3.h b/include/acpi/actbl3.h
index 5bde2e700530e..ebad40eda9b78 100644
--- a/include/acpi/actbl3.h
+++ b/include/acpi/actbl3.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h
index 4fed3310235a5..310b542abe238 100644
--- a/include/acpi/actypes.h
+++ b/include/acpi/actypes.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/acuuid.h b/include/acpi/acuuid.h
index b1a0a8a64c3dc..f0ba9bca2b12c 100644
--- a/include/acpi/acuuid.h
+++ b/include/acpi/acuuid.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/platform/acenv.h b/include/acpi/platform/acenv.h
index 043fd559de6ed..c33ec562b5853 100644
--- a/include/acpi/platform/acenv.h
+++ b/include/acpi/platform/acenv.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/platform/acenvex.h b/include/acpi/platform/acenvex.h
index 127c848a1ba7c..c4b1b110aeb9b 100644
--- a/include/acpi/platform/acenvex.h
+++ b/include/acpi/platform/acenvex.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/platform/acgcc.h b/include/acpi/platform/acgcc.h
index 9c8f8b79644ed..666256f6c97d3 100644
--- a/include/acpi/platform/acgcc.h
+++ b/include/acpi/platform/acgcc.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/platform/acgccex.h b/include/acpi/platform/acgccex.h
index 4f701b288cece..e7baa58b0e313 100644
--- a/include/acpi/platform/acgccex.h
+++ b/include/acpi/platform/acgccex.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/platform/acintel.h b/include/acpi/platform/acintel.h
index bdb6858e24582..f0c5be8ba4d5b 100644
--- a/include/acpi/platform/acintel.h
+++ b/include/acpi/platform/acintel.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h
index 97e088276c6d2..adee92c38c43d 100644
--- a/include/acpi/platform/aclinux.h
+++ b/include/acpi/platform/aclinux.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/acpi/platform/aclinuxex.h b/include/acpi/platform/aclinuxex.h
index efdff527f8fc7..b066d75a93597 100644
--- a/include/acpi/platform/aclinuxex.h
+++ b/include/acpi/platform/aclinuxex.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/acpi/common/cmfsize.c b/tools/power/acpi/common/cmfsize.c
index 5b38dc2fec4f8..dfa6fee1b915b 100644
--- a/tools/power/acpi/common/cmfsize.c
+++ b/tools/power/acpi/common/cmfsize.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/acpi/common/getopt.c b/tools/power/acpi/common/getopt.c
index 6e78413bb2cbe..f7032c9ab35e1 100644
--- a/tools/power/acpi/common/getopt.c
+++ b/tools/power/acpi/common/getopt.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c b/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c
index 52a39ecf5ca18..e7347edfd4f9f 100644
--- a/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c
+++ b/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/acpi/os_specific/service_layers/osunixdir.c b/tools/power/acpi/os_specific/service_layers/osunixdir.c
index ea14eaeb268f3..9b5e61b636d92 100644
--- a/tools/power/acpi/os_specific/service_layers/osunixdir.c
+++ b/tools/power/acpi/os_specific/service_layers/osunixdir.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/acpi/os_specific/service_layers/osunixmap.c b/tools/power/acpi/os_specific/service_layers/osunixmap.c
index cf9b5a54df921..8b26924e26020 100644
--- a/tools/power/acpi/os_specific/service_layers/osunixmap.c
+++ b/tools/power/acpi/os_specific/service_layers/osunixmap.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/acpi/os_specific/service_layers/osunixxf.c b/tools/power/acpi/os_specific/service_layers/osunixxf.c
index 025c1b07049d5..34c044da433cf 100644
--- a/tools/power/acpi/os_specific/service_layers/osunixxf.c
+++ b/tools/power/acpi/os_specific/service_layers/osunixxf.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/acpi/tools/acpidump/acpidump.h b/tools/power/acpi/tools/acpidump/acpidump.h
index d6aa40fce2b14..3c679607d1c34 100644
--- a/tools/power/acpi/tools/acpidump/acpidump.h
+++ b/tools/power/acpi/tools/acpidump/acpidump.h
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/acpi/tools/acpidump/apdump.c b/tools/power/acpi/tools/acpidump/apdump.c
index 0634449156d82..9ad1712e4cf94 100644
--- a/tools/power/acpi/tools/acpidump/apdump.c
+++ b/tools/power/acpi/tools/acpidump/apdump.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/acpi/tools/acpidump/apfiles.c b/tools/power/acpi/tools/acpidump/apfiles.c
index d686e11936c45..856e1b83407ba 100644
--- a/tools/power/acpi/tools/acpidump/apfiles.c
+++ b/tools/power/acpi/tools/acpidump/apfiles.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/power/acpi/tools/acpidump/apmain.c b/tools/power/acpi/tools/acpidump/apmain.c
index be418fba9441b..f4ef826800cfb 100644
--- a/tools/power/acpi/tools/acpidump/apmain.c
+++ b/tools/power/acpi/tools/acpidump/apmain.c
@@ -5,7 +5,7 @@
  *****************************************************************************/
 
 /*
- * Copyright (C) 2000 - 2017, Intel Corp.
+ * Copyright (C) 2000 - 2018, Intel Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without

From 9481a2c5c1d9920938297e47509716fff43aa601 Mon Sep 17 00:00:00 2001
From: Bob Moore <robert.moore@intel.com>
Date: Wed, 24 Jan 2018 15:42:34 -0800
Subject: [PATCH 314/676] ACPICA: Update version to 20180105

Version 20180105.

Signed-off-by: Bob Moore <robert.moore@intel.com>
Signed-off-by: Erik Schmauss <erik.schmauss@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/acpixf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h
index 285d0c602b800..c2bf1255f5aa0 100644
--- a/include/acpi/acpixf.h
+++ b/include/acpi/acpixf.h
@@ -46,7 +46,7 @@
 
 /* Current ACPICA subsystem version in YYYYMMDD format */
 
-#define ACPI_CA_VERSION                 0x20171215
+#define ACPI_CA_VERSION                 0x20180105
 
 #include <acpi/acconfig.h>
 #include <acpi/actypes.h>

From 7b4eb53d95c30bdc55c44647c6968f24227fd1ba Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 5 Feb 2018 16:25:24 -0800
Subject: [PATCH 315/676] tools/bpf: fix batch-mode test failure of
 test_xdp_redirect.sh

The tests at tools/testing/selftests/bpf can run in patch mode, e.g.,
    make -C tools/testing/selftests/bpf run_tests

With the batch mode, I experimented intermittent test failure of
test_xdp_redirect.sh.
    ....
    selftests: test_xdp_redirect [PASS]
    selftests: test_xdp_redirect.sh [PASS]
    RTNETLINK answers: File exists
    selftests: test_xdp_meta [FAILED]
    selftests: test_xdp_meta.sh [FAIL]
    ....

The following illustrates what caused the failure:
     (1). test_xdp_redirect creates veth pairs (veth1,veth11) and
          (veth2,veth22), and assign veth11 and veth22 to namespace
          ns1 and ns2 respectively.
     (2). at the end of test_xdp_redirect test, ns1 and ns2 are
          deleted. During this process, the deletion of actual
          namespace resources, including deletion of veth1{1} and veth2{2},
          is put into a workqueue to be processed asynchronously.
     (3). test_xdp_meta tries to create veth pair (veth1, veth2).
          The previous veth deletions in step (2) have not finished yet,
          and veth1 or veth2 may be still valid in the kernel, thus
          causing the failure.

The fix is to explicitly delete the veth pair before test_xdp_redirect
exits. Only one end of veth needs deletion as the kernel will delete
the other end automatically. Also test_xdp_meta is also fixed in
similar manner to avoid future potential issues.

Fixes: 996139e801fd ("selftests: bpf: add a test for XDP redirect")
Fixes: 22c8852624fc ("bpf: improve selftests and add tests for meta pointer")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_xdp_meta.sh     | 1 +
 tools/testing/selftests/bpf/test_xdp_redirect.sh | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_xdp_meta.sh b/tools/testing/selftests/bpf/test_xdp_meta.sh
index 307aa856cee3e..637fcf4fe4e30 100755
--- a/tools/testing/selftests/bpf/test_xdp_meta.sh
+++ b/tools/testing/selftests/bpf/test_xdp_meta.sh
@@ -9,6 +9,7 @@ cleanup()
 	fi
 
 	set +e
+	ip link del veth1 2> /dev/null
 	ip netns del ns1 2> /dev/null
 	ip netns del ns2 2> /dev/null
 }
diff --git a/tools/testing/selftests/bpf/test_xdp_redirect.sh b/tools/testing/selftests/bpf/test_xdp_redirect.sh
index 344a3656dea6d..c4b17e08d431b 100755
--- a/tools/testing/selftests/bpf/test_xdp_redirect.sh
+++ b/tools/testing/selftests/bpf/test_xdp_redirect.sh
@@ -19,6 +19,8 @@ cleanup()
 	fi
 
 	set +e
+	ip link del veth1 2> /dev/null
+	ip link del veth2 2> /dev/null
 	ip netns del ns1 2> /dev/null
 	ip netns del ns2 2> /dev/null
 }

From b11a632c442eef34a0afeba61fab923241f317e9 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 5 Feb 2018 10:17:43 -0800
Subject: [PATCH 316/676] net: add a UID to use for ULP socket assignment

Create a UID field and enum that can be used to assign ULPs to
sockets. This saves a set of string comparisons if the ULP id
is known.

For sockmap, which is added in the next patches, a ULP is used to
hook into TCP sockets close state. In this case the ULP being added
is done at map insert time and the ULP is known and done on the kernel
side. In this case the named lookup is not needed. Because we don't
want to expose psock internals to user space socket options a user
visible flag is also added. For TLS this is set for BPF it will be
cleared.

Alos remove pr_notice, user gets an error code back and should check
that rather than rely on logs.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/tcp.h  |  6 +++++
 net/ipv4/tcp_ulp.c | 59 ++++++++++++++++++++++++++++++++++++++++++----
 net/tls/tls_main.c |  2 ++
 3 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 58278669cc557..a58292d31e125 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1983,6 +1983,10 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);
 #define TCP_ULP_MAX		128
 #define TCP_ULP_BUF_MAX		(TCP_ULP_NAME_MAX*TCP_ULP_MAX)
 
+enum {
+	TCP_ULP_TLS,
+};
+
 struct tcp_ulp_ops {
 	struct list_head	list;
 
@@ -1991,7 +1995,9 @@ struct tcp_ulp_ops {
 	/* cleanup ulp */
 	void (*release)(struct sock *sk);
 
+	int		uid;
 	char		name[TCP_ULP_NAME_MAX];
+	bool		user_visible;
 	struct module	*owner;
 };
 int tcp_register_ulp(struct tcp_ulp_ops *type);
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index 6bb9e14c710a7..622caa4039e02 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -29,6 +29,18 @@ static struct tcp_ulp_ops *tcp_ulp_find(const char *name)
 	return NULL;
 }
 
+static struct tcp_ulp_ops *tcp_ulp_find_id(const int ulp)
+{
+	struct tcp_ulp_ops *e;
+
+	list_for_each_entry_rcu(e, &tcp_ulp_list, list) {
+		if (e->uid == ulp)
+			return e;
+	}
+
+	return NULL;
+}
+
 static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
 {
 	const struct tcp_ulp_ops *ulp = NULL;
@@ -51,6 +63,18 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
 	return ulp;
 }
 
+static const struct tcp_ulp_ops *__tcp_ulp_lookup(const int uid)
+{
+	const struct tcp_ulp_ops *ulp;
+
+	rcu_read_lock();
+	ulp = tcp_ulp_find_id(uid);
+	if (!ulp || !try_module_get(ulp->owner))
+		ulp = NULL;
+	rcu_read_unlock();
+	return ulp;
+}
+
 /* Attach new upper layer protocol to the list
  * of available protocols.
  */
@@ -59,13 +83,10 @@ int tcp_register_ulp(struct tcp_ulp_ops *ulp)
 	int ret = 0;
 
 	spin_lock(&tcp_ulp_list_lock);
-	if (tcp_ulp_find(ulp->name)) {
-		pr_notice("%s already registered or non-unique name\n",
-			  ulp->name);
+	if (tcp_ulp_find(ulp->name))
 		ret = -EEXIST;
-	} else {
+	else
 		list_add_tail_rcu(&ulp->list, &tcp_ulp_list);
-	}
 	spin_unlock(&tcp_ulp_list_lock);
 
 	return ret;
@@ -124,6 +145,34 @@ int tcp_set_ulp(struct sock *sk, const char *name)
 	if (!ulp_ops)
 		return -ENOENT;
 
+	if (!ulp_ops->user_visible) {
+		module_put(ulp_ops->owner);
+		return -ENOENT;
+	}
+
+	err = ulp_ops->init(sk);
+	if (err) {
+		module_put(ulp_ops->owner);
+		return err;
+	}
+
+	icsk->icsk_ulp_ops = ulp_ops;
+	return 0;
+}
+
+int tcp_set_ulp_id(struct sock *sk, int ulp)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_ulp_ops *ulp_ops;
+	int err;
+
+	if (icsk->icsk_ulp_ops)
+		return -EEXIST;
+
+	ulp_ops = __tcp_ulp_lookup(ulp);
+	if (!ulp_ops)
+		return -ENOENT;
+
 	err = ulp_ops->init(sk);
 	if (err) {
 		module_put(ulp_ops->owner);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 736719c8314e8..b0d5fcea47e73 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -484,6 +484,8 @@ static int tls_init(struct sock *sk)
 
 static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
 	.name			= "tls",
+	.uid			= TCP_ULP_TLS,
+	.user_visible		= true,
 	.owner			= THIS_MODULE,
 	.init			= tls_init,
 };

From 1aa12bdf1bfb95db7e75bfecf0e39a65f4e8fbf8 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 5 Feb 2018 10:17:49 -0800
Subject: [PATCH 317/676] bpf: sockmap, add sock close() hook to remove socks

The selftests test_maps program was leaving dangling BPF sockmap
programs around because not all psock elements were removed from
the map. The elements in turn hold a reference on the BPF program
they are attached to causing BPF programs to stay open even after
test_maps has completed.

The original intent was that sk_state_change() would be called
when TCP socks went through TCP_CLOSE state. However, because
socks may be in SOCK_DEAD state or the sock may be a listening
socket the event is not always triggered.

To resolve this use the ULP infrastructure and register our own
proto close() handler. This fixes the above case.

Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support")
Reported-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/net/tcp.h    |   2 +
 kernel/bpf/sockmap.c | 168 ++++++++++++++++++++++++++-----------------
 2 files changed, 103 insertions(+), 67 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index a58292d31e125..e3fc667f9ac26 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1985,6 +1985,7 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);
 
 enum {
 	TCP_ULP_TLS,
+	TCP_ULP_BPF,
 };
 
 struct tcp_ulp_ops {
@@ -2003,6 +2004,7 @@ struct tcp_ulp_ops {
 int tcp_register_ulp(struct tcp_ulp_ops *type);
 void tcp_unregister_ulp(struct tcp_ulp_ops *type);
 int tcp_set_ulp(struct sock *sk, const char *name);
+int tcp_set_ulp_id(struct sock *sk, const int ulp);
 void tcp_get_available_ulp(char *buf, size_t len);
 void tcp_cleanup_ulp(struct sock *sk);
 
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 0314d1783d77a..bd4a6d9c67095 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -86,9 +86,10 @@ struct smap_psock {
 	struct work_struct tx_work;
 	struct work_struct gc_work;
 
+	struct proto *sk_proto;
+	void (*save_close)(struct sock *sk, long timeout);
 	void (*save_data_ready)(struct sock *sk);
 	void (*save_write_space)(struct sock *sk);
-	void (*save_state_change)(struct sock *sk);
 };
 
 static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
@@ -96,12 +97,102 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
 	return rcu_dereference_sk_user_data(sk);
 }
 
+static struct proto tcp_bpf_proto;
+static int bpf_tcp_init(struct sock *sk)
+{
+	struct smap_psock *psock;
+
+	rcu_read_lock();
+	psock = smap_psock_sk(sk);
+	if (unlikely(!psock)) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+
+	if (unlikely(psock->sk_proto)) {
+		rcu_read_unlock();
+		return -EBUSY;
+	}
+
+	psock->save_close = sk->sk_prot->close;
+	psock->sk_proto = sk->sk_prot;
+	sk->sk_prot = &tcp_bpf_proto;
+	rcu_read_unlock();
+	return 0;
+}
+
+static void bpf_tcp_release(struct sock *sk)
+{
+	struct smap_psock *psock;
+
+	rcu_read_lock();
+	psock = smap_psock_sk(sk);
+
+	if (likely(psock)) {
+		sk->sk_prot = psock->sk_proto;
+		psock->sk_proto = NULL;
+	}
+	rcu_read_unlock();
+}
+
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
+
+static void bpf_tcp_close(struct sock *sk, long timeout)
+{
+	void (*close_fun)(struct sock *sk, long timeout);
+	struct smap_psock_map_entry *e, *tmp;
+	struct smap_psock *psock;
+	struct sock *osk;
+
+	rcu_read_lock();
+	psock = smap_psock_sk(sk);
+	if (unlikely(!psock)) {
+		rcu_read_unlock();
+		return sk->sk_prot->close(sk, timeout);
+	}
+
+	/* The psock may be destroyed anytime after exiting the RCU critial
+	 * section so by the time we use close_fun the psock may no longer
+	 * be valid. However, bpf_tcp_close is called with the sock lock
+	 * held so the close hook and sk are still valid.
+	 */
+	close_fun = psock->save_close;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+		osk = cmpxchg(e->entry, sk, NULL);
+		if (osk == sk) {
+			list_del(&e->list);
+			smap_release_sock(psock, sk);
+		}
+	}
+	write_unlock_bh(&sk->sk_callback_lock);
+	rcu_read_unlock();
+	close_fun(sk, timeout);
+}
+
 enum __sk_action {
 	__SK_DROP = 0,
 	__SK_PASS,
 	__SK_REDIRECT,
 };
 
+static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
+	.name		= "bpf_tcp",
+	.uid		= TCP_ULP_BPF,
+	.user_visible	= false,
+	.owner		= NULL,
+	.init		= bpf_tcp_init,
+	.release	= bpf_tcp_release,
+};
+
+static int bpf_tcp_ulp_register(void)
+{
+	tcp_bpf_proto = tcp_prot;
+	tcp_bpf_proto.close = bpf_tcp_close;
+	return tcp_register_ulp(&bpf_tcp_ulp_ops);
+}
+
 static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 {
 	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
@@ -166,68 +257,6 @@ static void smap_report_sk_error(struct smap_psock *psock, int err)
 	sk->sk_error_report(sk);
 }
 
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
-
-/* Called with lock_sock(sk) held */
-static void smap_state_change(struct sock *sk)
-{
-	struct smap_psock_map_entry *e, *tmp;
-	struct smap_psock *psock;
-	struct socket_wq *wq;
-	struct sock *osk;
-
-	rcu_read_lock();
-
-	/* Allowing transitions into an established syn_recv states allows
-	 * for early binding sockets to a smap object before the connection
-	 * is established.
-	 */
-	switch (sk->sk_state) {
-	case TCP_SYN_SENT:
-	case TCP_SYN_RECV:
-	case TCP_ESTABLISHED:
-		break;
-	case TCP_CLOSE_WAIT:
-	case TCP_CLOSING:
-	case TCP_LAST_ACK:
-	case TCP_FIN_WAIT1:
-	case TCP_FIN_WAIT2:
-	case TCP_LISTEN:
-		break;
-	case TCP_CLOSE:
-		/* Only release if the map entry is in fact the sock in
-		 * question. There is a case where the operator deletes
-		 * the sock from the map, but the TCP sock is closed before
-		 * the psock is detached. Use cmpxchg to verify correct
-		 * sock is removed.
-		 */
-		psock = smap_psock_sk(sk);
-		if (unlikely(!psock))
-			break;
-		write_lock_bh(&sk->sk_callback_lock);
-		list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-			osk = cmpxchg(e->entry, sk, NULL);
-			if (osk == sk) {
-				list_del(&e->list);
-				smap_release_sock(psock, sk);
-			}
-		}
-		write_unlock_bh(&sk->sk_callback_lock);
-		break;
-	default:
-		psock = smap_psock_sk(sk);
-		if (unlikely(!psock))
-			break;
-		smap_report_sk_error(psock, EPIPE);
-		break;
-	}
-
-	wq = rcu_dereference(sk->sk_wq);
-	if (skwq_has_sleeper(wq))
-		wake_up_interruptible_all(&wq->wait);
-	rcu_read_unlock();
-}
-
 static void smap_read_sock_strparser(struct strparser *strp,
 				     struct sk_buff *skb)
 {
@@ -322,10 +351,8 @@ static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
 		return;
 	sk->sk_data_ready = psock->save_data_ready;
 	sk->sk_write_space = psock->save_write_space;
-	sk->sk_state_change = psock->save_state_change;
 	psock->save_data_ready = NULL;
 	psock->save_write_space = NULL;
-	psock->save_state_change = NULL;
 	strp_stop(&psock->strp);
 	psock->strp_enabled = false;
 }
@@ -350,6 +377,7 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
 	if (psock->refcnt)
 		return;
 
+	tcp_cleanup_ulp(sock);
 	smap_stop_sock(psock, sock);
 	clear_bit(SMAP_TX_RUNNING, &psock->state);
 	rcu_assign_sk_user_data(sock, NULL);
@@ -427,10 +455,8 @@ static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
 		return;
 	psock->save_data_ready = sk->sk_data_ready;
 	psock->save_write_space = sk->sk_write_space;
-	psock->save_state_change = sk->sk_state_change;
 	sk->sk_data_ready = smap_data_ready;
 	sk->sk_write_space = smap_write_space;
-	sk->sk_state_change = smap_state_change;
 	psock->strp_enabled = true;
 }
 
@@ -509,6 +535,10 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	if (attr->value_size > KMALLOC_MAX_SIZE)
 		return ERR_PTR(-E2BIG);
 
+	err = bpf_tcp_ulp_register();
+	if (err && err != -EEXIST)
+		return ERR_PTR(err);
+
 	stab = kzalloc(sizeof(*stab), GFP_USER);
 	if (!stab)
 		return ERR_PTR(-ENOMEM);
@@ -754,6 +784,10 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 			goto out_progs;
 		}
 
+		err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
+		if (err)
+			goto out_progs;
+
 		set_bit(SMAP_TX_RUNNING, &psock->state);
 	}
 

From 3d9e952697de89b53227f06d4241f275eb99cfc4 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 5 Feb 2018 10:17:54 -0800
Subject: [PATCH 318/676] bpf: sockmap, fix leaking maps with attached but not
 detached progs

When a program is attached to a map we increment the program refcnt
to ensure that the program is not removed while it is potentially
being referenced from sockmap side. However, if this same program
also references the map (this is a reasonably common pattern in
my programs) then the verifier will also increment the maps refcnt
from the verifier. This is to ensure the map doesn't get garbage
collected while the program has a reference to it.

So we are left in a state where the map holds the refcnt on the
program stopping it from being removed and releasing the map refcnt.
And vice versa the program holds a refcnt on the map stopping it
from releasing the refcnt on the prog.

All this is fine as long as users detach the program while the
map fd is still around. But, if the user omits this detach command
we are left with a dangling map we can no longer release.

To resolve this when the map fd is released decrement the program
references and remove any reference from the map to the program.
This fixes the issue with possibly dangling map and creates a
user side API constraint. That is, the map fd must be held open
for programs to be attached to a map.

Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index bd4a6d9c67095..48c33417d13c0 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -620,11 +620,6 @@ static void sock_map_free(struct bpf_map *map)
 	}
 	rcu_read_unlock();
 
-	if (stab->bpf_verdict)
-		bpf_prog_put(stab->bpf_verdict);
-	if (stab->bpf_parse)
-		bpf_prog_put(stab->bpf_parse);
-
 	sock_map_remove_complete(stab);
 }
 
@@ -900,6 +895,19 @@ static int sock_map_update_elem(struct bpf_map *map,
 	return err;
 }
 
+static void sock_map_release(struct bpf_map *map, struct file *map_file)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct bpf_prog *orig;
+
+	orig = xchg(&stab->bpf_parse, NULL);
+	if (orig)
+		bpf_prog_put(orig);
+	orig = xchg(&stab->bpf_verdict, NULL);
+	if (orig)
+		bpf_prog_put(orig);
+}
+
 const struct bpf_map_ops sock_map_ops = {
 	.map_alloc = sock_map_alloc,
 	.map_free = sock_map_free,
@@ -907,6 +915,7 @@ const struct bpf_map_ops sock_map_ops = {
 	.map_get_next_key = sock_map_get_next_key,
 	.map_update_elem = sock_map_update_elem,
 	.map_delete_elem = sock_map_delete_elem,
+	.map_release = sock_map_release,
 };
 
 BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,

From 3ac6d8c787b835b997eb23e43e09aa0895ef7d58 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 5 Feb 2018 17:18:11 -0800
Subject: [PATCH 319/676] x86/entry/64: Clear registers for
 exceptions/interrupts, to reduce speculation attack surface

Clear the 'extra' registers on entering the 64-bit kernel for exceptions
and interrupts. The common registers are not cleared since they are
likely clobbered well before they can be exploited in a speculative
execution attack.

Originally-From: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: <stable@vger.kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/151787989146.7847.15749181712358213254.stgit@dwillia2-desk3.amr.corp.intel.com
[ Made small improvements to the changelog and the code comments. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/calling.h  | 19 +++++++++++++++++++
 arch/x86/entry/entry_64.S |  6 +++++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 3f48f695d5e6a..f4b129d4af42f 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -147,6 +147,25 @@ For 32-bit we have the following conventions - kernel is built with
 	UNWIND_HINT_REGS offset=\offset
 	.endm
 
+	/*
+	 * Sanitize registers of values that a speculation attack
+	 * might otherwise want to exploit. The lower registers are
+	 * likely clobbered well before they could be put to use in
+	 * a speculative execution gadget:
+	 */
+	.macro CLEAR_REGS_NOSPEC
+	xorl %ebp, %ebp
+	xorl %ebx, %ebx
+	xorq %r8, %r8
+	xorq %r9, %r9
+	xorq %r10, %r10
+	xorq %r11, %r11
+	xorq %r12, %r12
+	xorq %r13, %r13
+	xorq %r14, %r14
+	xorq %r15, %r15
+	.endm
+
 	.macro POP_EXTRA_REGS
 	popq %r15
 	popq %r14
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 065a71b908080..9e48002b953b1 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -575,6 +575,7 @@ END(irq_entries_start)
 	ALLOC_PT_GPREGS_ON_STACK
 	SAVE_C_REGS
 	SAVE_EXTRA_REGS
+	CLEAR_REGS_NOSPEC
 	ENCODE_FRAME_POINTER
 
 	testb	$3, CS(%rsp)
@@ -1133,6 +1134,7 @@ ENTRY(xen_failsafe_callback)
 	ALLOC_PT_GPREGS_ON_STACK
 	SAVE_C_REGS
 	SAVE_EXTRA_REGS
+	CLEAR_REGS_NOSPEC
 	ENCODE_FRAME_POINTER
 	jmp	error_exit
 END(xen_failsafe_callback)
@@ -1178,6 +1180,7 @@ ENTRY(paranoid_entry)
 	cld
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
+	CLEAR_REGS_NOSPEC
 	ENCODE_FRAME_POINTER 8
 	movl	$1, %ebx
 	movl	$MSR_GS_BASE, %ecx
@@ -1230,8 +1233,8 @@ ENTRY(error_entry)
 	cld
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
+	CLEAR_REGS_NOSPEC
 	ENCODE_FRAME_POINTER 8
-	xorl	%ebx, %ebx
 	testb	$3, CS+8(%rsp)
 	jz	.Lerror_kernelspace
 
@@ -1428,6 +1431,7 @@ ENTRY(nmi)
 	pushq	%r14		/* pt_regs->r14 */
 	pushq	%r15		/* pt_regs->r15 */
 	UNWIND_HINT_REGS
+	CLEAR_REGS_NOSPEC
 	ENCODE_FRAME_POINTER
 
 	/*

From 6b8cf5cc9965673951f1ab3f0e3cf23d06e3e2ee Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 5 Feb 2018 17:18:17 -0800
Subject: [PATCH 320/676] x86/entry/64/compat: Clear registers for compat
 syscalls, to reduce speculation attack surface

At entry userspace may have populated registers with values that could
otherwise be useful in a speculative execution attack. Clear them to
minimize the kernel's attack surface.

Originally-From: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: <stable@vger.kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/151787989697.7847.4083702787288600552.stgit@dwillia2-desk3.amr.corp.intel.com
[ Made small improvements to the changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_64_compat.S | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 98d5358e4041a..fd65e016e4133 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -85,15 +85,25 @@ ENTRY(entry_SYSENTER_compat)
 	pushq	%rcx			/* pt_regs->cx */
 	pushq	$-ENOSYS		/* pt_regs->ax */
 	pushq   $0			/* pt_regs->r8  = 0 */
+	xorq	%r8, %r8		/* nospec   r8 */
 	pushq   $0			/* pt_regs->r9  = 0 */
+	xorq	%r9, %r9		/* nospec   r9 */
 	pushq   $0			/* pt_regs->r10 = 0 */
+	xorq	%r10, %r10		/* nospec   r10 */
 	pushq   $0			/* pt_regs->r11 = 0 */
+	xorq	%r11, %r11		/* nospec   r11 */
 	pushq   %rbx                    /* pt_regs->rbx */
+	xorl	%ebx, %ebx		/* nospec   rbx */
 	pushq   %rbp                    /* pt_regs->rbp (will be overwritten) */
+	xorl	%ebp, %ebp		/* nospec   rbp */
 	pushq   $0			/* pt_regs->r12 = 0 */
+	xorq	%r12, %r12		/* nospec   r12 */
 	pushq   $0			/* pt_regs->r13 = 0 */
+	xorq	%r13, %r13		/* nospec   r13 */
 	pushq   $0			/* pt_regs->r14 = 0 */
+	xorq	%r14, %r14		/* nospec   r14 */
 	pushq   $0			/* pt_regs->r15 = 0 */
+	xorq	%r15, %r15		/* nospec   r15 */
 	cld
 
 	/*
@@ -214,15 +224,25 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
 	pushq	%rbp			/* pt_regs->cx (stashed in bp) */
 	pushq	$-ENOSYS		/* pt_regs->ax */
 	pushq   $0			/* pt_regs->r8  = 0 */
+	xorq	%r8, %r8		/* nospec   r8 */
 	pushq   $0			/* pt_regs->r9  = 0 */
+	xorq	%r9, %r9		/* nospec   r9 */
 	pushq   $0			/* pt_regs->r10 = 0 */
+	xorq	%r10, %r10		/* nospec   r10 */
 	pushq   $0			/* pt_regs->r11 = 0 */
+	xorq	%r11, %r11		/* nospec   r11 */
 	pushq   %rbx                    /* pt_regs->rbx */
+	xorl	%ebx, %ebx		/* nospec   rbx */
 	pushq   %rbp                    /* pt_regs->rbp (will be overwritten) */
+	xorl	%ebp, %ebp		/* nospec   rbp */
 	pushq   $0			/* pt_regs->r12 = 0 */
+	xorq	%r12, %r12		/* nospec   r12 */
 	pushq   $0			/* pt_regs->r13 = 0 */
+	xorq	%r13, %r13		/* nospec   r13 */
 	pushq   $0			/* pt_regs->r14 = 0 */
+	xorq	%r14, %r14		/* nospec   r14 */
 	pushq   $0			/* pt_regs->r15 = 0 */
+	xorq	%r15, %r15		/* nospec   r15 */
 
 	/*
 	 * User mode is traced as though IRQs are on, and SYSENTER
@@ -338,15 +358,25 @@ ENTRY(entry_INT80_compat)
 	pushq	%rcx			/* pt_regs->cx */
 	pushq	$-ENOSYS		/* pt_regs->ax */
 	pushq   $0			/* pt_regs->r8  = 0 */
+	xorq	%r8, %r8		/* nospec   r8 */
 	pushq   $0			/* pt_regs->r9  = 0 */
+	xorq	%r9, %r9		/* nospec   r9 */
 	pushq   $0			/* pt_regs->r10 = 0 */
+	xorq	%r10, %r10		/* nospec   r10 */
 	pushq   $0			/* pt_regs->r11 = 0 */
+	xorq	%r11, %r11		/* nospec   r11 */
 	pushq   %rbx                    /* pt_regs->rbx */
+	xorl	%ebx, %ebx		/* nospec   rbx */
 	pushq   %rbp                    /* pt_regs->rbp */
+	xorl	%ebp, %ebp		/* nospec   rbp */
 	pushq   %r12                    /* pt_regs->r12 */
+	xorq	%r12, %r12		/* nospec   r12 */
 	pushq   %r13                    /* pt_regs->r13 */
+	xorq	%r13, %r13		/* nospec   r13 */
 	pushq   %r14                    /* pt_regs->r14 */
+	xorq	%r14, %r14		/* nospec   r14 */
 	pushq   %r15                    /* pt_regs->r15 */
+	xorq	%r15, %r15		/* nospec   r15 */
 	cld
 
 	/*

From 761191258839b7d922b83064c4251b8bd4e2e7c2 Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <sstabellini@kernel.org>
Date: Fri, 2 Feb 2018 17:34:09 -0800
Subject: [PATCH 321/676] pvcalls-back: do not return error on inet_accept
 EAGAIN

When the client sends a regular blocking accept request, the backend is
expected to return only when the accept is completed, simulating a
blocking behavior, or return an error.

Specifically, on EAGAIN from inet_accept, the backend shouldn't return
"EAGAIN" to the client. Instead, it should simply continue the wait.
Otherwise, the client will send another accept request, which will cause
another EAGAIN to be sent back, which is a waste of resources and not
conforming to the expected behavior. Change the behavior by turning the
"goto error" into a return.

Signed-off-by: Stefano Stabellini <stefano@aporeto.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 drivers/xen/pvcalls-back.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/xen/pvcalls-back.c b/drivers/xen/pvcalls-back.c
index c7822d8078b92..156e5aea36db9 100644
--- a/drivers/xen/pvcalls-back.c
+++ b/drivers/xen/pvcalls-back.c
@@ -548,7 +548,7 @@ static void __pvcalls_back_accept(struct work_struct *work)
 	ret = inet_accept(mappass->sock, sock, O_NONBLOCK, true);
 	if (ret == -EAGAIN) {
 		sock_release(sock);
-		goto out_error;
+		return;
 	}
 
 	map = pvcalls_new_active_socket(fedata,

From 36904703aeeeb6cd31993f1353c8325006229f9a Mon Sep 17 00:00:00 2001
From: Kai Heng Feng <kai.heng.feng@canonical.com>
Date: Mon, 5 Feb 2018 13:19:24 +0800
Subject: [PATCH 322/676] ACPI / bus: Parse tables as term_list for Dell XPS
 9570 and Precision M5530

The i2c touchpad on Dell XPS 9570 and Precision M5530 doesn't work out
of box.

The touchpad relies on its _INI method to update its _HID value from
XXXX0000 to SYNA2393.

Also, the _STA relies on value of I2CN to report correct status.

Set acpi_gbl_parse_table_as_term_list so the value of I2CN can be
correctly set up, and _INI can get run. The ACPI table in this machine
is designed to get parsed this way.

Also, change the quirk table to a more generic name.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=198515
Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c | 38 +++++++++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 433bb5a3f3277..5c00e5e18da8c 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -66,10 +66,37 @@ static int set_copy_dsdt(const struct dmi_system_id *id)
 	return 0;
 }
 #endif
+static int set_gbl_term_list(const struct dmi_system_id *id)
+{
+	acpi_gbl_parse_table_as_term_list = 1;
+	return 0;
+}
 
-static const struct dmi_system_id dsdt_dmi_table[] __initconst = {
+static const struct dmi_system_id acpi_quirks_dmi_table[] __initconst = {
+	/*
+	 * Touchpad on Dell XPS 9570/Precision M5530 doesn't work under I2C
+	 * mode.
+	 * https://bugzilla.kernel.org/show_bug.cgi?id=198515
+	 */
+	{
+		.callback = set_gbl_term_list,
+		.ident = "Dell Precision M5530",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Precision M5530"),
+		},
+	},
+	{
+		.callback = set_gbl_term_list,
+		.ident = "Dell XPS 15 9570",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "XPS 15 9570"),
+		},
+	},
 	/*
 	 * Invoke DSDT corruption work-around on all Toshiba Satellite.
+	 * DSDT will be copied to memory.
 	 * https://bugzilla.kernel.org/show_bug.cgi?id=14679
 	 */
 	{
@@ -83,7 +110,7 @@ static const struct dmi_system_id dsdt_dmi_table[] __initconst = {
 	{}
 };
 #else
-static const struct dmi_system_id dsdt_dmi_table[] __initconst = {
+static const struct dmi_system_id acpi_quirks_dmi_table[] __initconst = {
 	{}
 };
 #endif
@@ -1008,11 +1035,8 @@ void __init acpi_early_init(void)
 
 	acpi_permanent_mmap = true;
 
-	/*
-	 * If the machine falls into the DMI check table,
-	 * DSDT will be copied to memory
-	 */
-	dmi_check_system(dsdt_dmi_table);
+	/* Check machine-specific quirks */
+	dmi_check_system(acpi_quirks_dmi_table);
 
 	status = acpi_reallocate_root_table();
 	if (ACPI_FAILURE(status)) {

From f859422075165e32c00c8d75d63f300015cc07ae Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 6 Feb 2018 18:55:12 +0100
Subject: [PATCH 323/676] x86: PM: Make APM idle driver initialize polling
 state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update the APM driver overlooked by commit 1b39e3f813b4 (cpuidle: Make
drivers initialize polling state) to initialize the polling state like
the other cpuidle drivers modified by that commit to prevent cpuidle
from crashing.

Fixes: 1b39e3f813b4 (cpuidle: Make drivers initialize polling state)
Reported-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Tested-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: 4.14+ <stable@vger.kernel.org> # 4.14+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/x86/kernel/apm_32.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index e4b0d92b3ae06..2a7fd56e67b36 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -2389,6 +2389,7 @@ static int __init apm_init(void)
 	if (HZ != 100)
 		idle_period = (idle_period * HZ) / 100;
 	if (idle_threshold < 100) {
+		cpuidle_poll_state_init(&apm_idle_driver);
 		if (!cpuidle_register_driver(&apm_idle_driver))
 			if (cpuidle_register_device(&apm_cpuidle_device))
 				cpuidle_unregister_driver(&apm_idle_driver);

From 5235553d821433e1f4fa720fd025d2c4b7ee9994 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 5 Feb 2018 13:16:56 -0700
Subject: [PATCH 324/676] blk-wbt: account flush requests correctly

Mikulas reported a workload that saw bad performance, and figured
out what it was due to various other types of requests being
accounted as reads. Flush requests, for instance. Due to the
high latency of those, we heavily throttle the writes to keep
the latencies in balance. But they really should be accounted
as writes.

Fix this by checking the exact type of the request. If it's a
read, account as a read, if it's a write or a flush, account
as a write. Any other request we disregard. Previously everything
would have been mistakenly accounted as reads.

Reported-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: stable@vger.kernel.org # v4.12+
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-wbt.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index ae8de9780085a..f92fc84b5e2c4 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -697,7 +697,15 @@ u64 wbt_default_latency_nsec(struct request_queue *q)
 
 static int wbt_data_dir(const struct request *rq)
 {
-	return rq_data_dir(rq);
+	const int op = req_op(rq);
+
+	if (op == REQ_OP_READ)
+		return READ;
+	else if (op == REQ_OP_WRITE || op == REQ_OP_FLUSH)
+		return WRITE;
+
+	/* don't account */
+	return -1;
 }
 
 int wbt_init(struct request_queue *q)

From 30abb3a67f4b2aa160feeb3c0b771f730cbcca67 Mon Sep 17 00:00:00 2001
From: Howard McLauchlan <hmclauchlan@fb.com>
Date: Tue, 6 Feb 2018 14:05:39 -0800
Subject: [PATCH 325/676] block: Add should_fail_bio() for bpf error injection

The classic error injection mechanism, should_fail_request() does not
support use cases where more information is required (from the entire
struct bio, for example).

To that end, this patch introduces should_fail_bio(), which calls
should_fail_request() under the hood but provides a convenient
place for kprobes to hook into if they require the entire struct bio.
This patch also replaces some existing calls to should_fail_request()
with should_fail_bio() with no degradation in performance.

Signed-off-by: Howard McLauchlan <hmclauchlan@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d0d104268f1a9..2d1a7bbe06343 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -34,6 +34,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/blk-cgroup.h>
 #include <linux/debugfs.h>
+#include <linux/bpf.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
@@ -2083,6 +2084,14 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
 	return false;
 }
 
+static noinline int should_fail_bio(struct bio *bio)
+{
+	if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
+		return -EIO;
+	return 0;
+}
+ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
+
 /*
  * Remap block n of partition p to block n+start(p) of the disk.
  */
@@ -2174,7 +2183,7 @@ generic_make_request_checks(struct bio *bio)
 	if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
 		goto not_supported;
 
-	if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
+	if (should_fail_bio(bio))
 		goto end_io;
 
 	if (!bio->bi_partno) {

From 7dc68e98757a8eccf8ca7a53a29b896f1eef1f76 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Mon, 5 Feb 2018 14:41:45 -0800
Subject: [PATCH 326/676] netfilter: xt_RATEEST: acquire xt_rateest_mutex for
 hash insert

rateest_hash is supposed to be protected by xt_rateest_mutex,
and, as suggested by Eric, lookup and insert should be atomic,
so we should acquire the xt_rateest_mutex once for both.

So introduce a non-locking helper for internal use and keep the
locking one for external.

Reported-by: <syzbot+5cb189720978275e4c75@syzkaller.appspotmail.com>
Fixes: 5859034d7eb8 ("[NETFILTER]: x_tables: add RATEEST target")
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Reviewed-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_RATEEST.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 498b54fd04d7f..141c295191f65 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -39,23 +39,31 @@ static void xt_rateest_hash_insert(struct xt_rateest *est)
 	hlist_add_head(&est->list, &rateest_hash[h]);
 }
 
-struct xt_rateest *xt_rateest_lookup(const char *name)
+static struct xt_rateest *__xt_rateest_lookup(const char *name)
 {
 	struct xt_rateest *est;
 	unsigned int h;
 
 	h = xt_rateest_hash(name);
-	mutex_lock(&xt_rateest_mutex);
 	hlist_for_each_entry(est, &rateest_hash[h], list) {
 		if (strcmp(est->name, name) == 0) {
 			est->refcnt++;
-			mutex_unlock(&xt_rateest_mutex);
 			return est;
 		}
 	}
-	mutex_unlock(&xt_rateest_mutex);
+
 	return NULL;
 }
+
+struct xt_rateest *xt_rateest_lookup(const char *name)
+{
+	struct xt_rateest *est;
+
+	mutex_lock(&xt_rateest_mutex);
+	est = __xt_rateest_lookup(name);
+	mutex_unlock(&xt_rateest_mutex);
+	return est;
+}
 EXPORT_SYMBOL_GPL(xt_rateest_lookup);
 
 void xt_rateest_put(struct xt_rateest *est)
@@ -100,8 +108,10 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
 
 	net_get_random_once(&jhash_rnd, sizeof(jhash_rnd));
 
-	est = xt_rateest_lookup(info->name);
+	mutex_lock(&xt_rateest_mutex);
+	est = __xt_rateest_lookup(info->name);
 	if (est) {
+		mutex_unlock(&xt_rateest_mutex);
 		/*
 		 * If estimator parameters are specified, they must match the
 		 * existing estimator.
@@ -139,11 +149,13 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
 
 	info->est = est;
 	xt_rateest_hash_insert(est);
+	mutex_unlock(&xt_rateest_mutex);
 	return 0;
 
 err2:
 	kfree(est);
 err1:
+	mutex_unlock(&xt_rateest_mutex);
 	return ret;
 }
 

From c0ea1bcb39352b57ac5c4b6da8acd65bddeee2c5 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 6 Feb 2018 13:22:44 +0100
Subject: [PATCH 327/676] netfilter: nft_flow_offload: move flowtable cleanup
 routines to nf_flow_table

Move the flowtable cleanup routines to nf_flow_table and expose the
nf_flow_table_cleanup() helper function.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h |  3 +++
 net/netfilter/nf_flow_table.c         | 24 ++++++++++++++++++++++++
 net/netfilter/nft_flow_offload.c      | 19 +------------------
 3 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index b22b22082733c..ed49cd169ecfb 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -95,6 +95,9 @@ struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_t
 int nf_flow_table_iterate(struct nf_flowtable *flow_table,
 			  void (*iter)(struct flow_offload *flow, void *data),
 			  void *data);
+
+void nf_flow_table_cleanup(struct net *net, struct net_device *dev);
+
 void nf_flow_offload_work_gc(struct work_struct *work);
 extern const struct rhashtable_params nf_flow_offload_rhash_params;
 
diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c
index 2f5099cb85b85..04c08f6b9015a 100644
--- a/net/netfilter/nf_flow_table.c
+++ b/net/netfilter/nf_flow_table.c
@@ -4,6 +4,7 @@
 #include <linux/netfilter.h>
 #include <linux/rhashtable.h>
 #include <linux/netdevice.h>
+#include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_flow_table.h>
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_core.h>
@@ -425,5 +426,28 @@ int nf_flow_dnat_port(const struct flow_offload *flow,
 }
 EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
 
+static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
+{
+	struct net_device *dev = data;
+
+	if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex)
+		return;
+
+	flow_offload_dead(flow);
+}
+
+static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
+					  void *data)
+{
+	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, data);
+	flush_delayed_work(&flowtable->gc_work);
+}
+
+void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
+{
+	nft_flow_table_iterate(net, nf_flow_table_iterate_cleanup, dev);
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index e5c45c7ac02a1..b65829b2be228 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -194,23 +194,6 @@ static struct nft_expr_type nft_flow_offload_type __read_mostly = {
 	.owner		= THIS_MODULE,
 };
 
-static void flow_offload_iterate_cleanup(struct flow_offload *flow, void *data)
-{
-	struct net_device *dev = data;
-
-	if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex)
-		return;
-
-	flow_offload_dead(flow);
-}
-
-static void nft_flow_offload_iterate_cleanup(struct nf_flowtable *flowtable,
-					     void *data)
-{
-	nf_flow_table_iterate(flowtable, flow_offload_iterate_cleanup, data);
-	flush_delayed_work(&flowtable->gc_work);
-}
-
 static int flow_offload_netdev_event(struct notifier_block *this,
 				     unsigned long event, void *ptr)
 {
@@ -219,7 +202,7 @@ static int flow_offload_netdev_event(struct notifier_block *this,
 	if (event != NETDEV_DOWN)
 		return NOTIFY_DONE;
 
-	nft_flow_table_iterate(dev_net(dev), nft_flow_offload_iterate_cleanup, dev);
+	nf_flow_table_cleanup(dev_net(dev), dev);
 
 	return NOTIFY_DONE;
 }

From b408c5b04f82fe4e20bceb8e4f219453d4f21f02 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 6 Feb 2018 13:22:47 +0100
Subject: [PATCH 328/676] netfilter: nf_tables: fix flowtable free

Every flow_offload entry is added into the table twice. Because of this,
rhashtable_free_and_destroy can't be used, since it would call kfree for
each flow_offload object twice.

This patch cleans up the flowtable via nf_flow_table_iterate() to
schedule removal of entries by setting on the dying bit, then there is
an explicitly invocation of the garbage collector to release resources.

Based on patch from Felix Fietkau.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h   |  2 ++
 net/ipv4/netfilter/nf_flow_table_ipv4.c |  1 +
 net/ipv6/netfilter/nf_flow_table_ipv6.c |  1 +
 net/netfilter/nf_flow_table.c           | 25 +++++++++++++++++++------
 net/netfilter/nf_flow_table_inet.c      |  1 +
 net/netfilter/nf_tables_api.c           |  9 ++-------
 6 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index ed49cd169ecfb..020ae903066f5 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -14,6 +14,7 @@ struct nf_flowtable_type {
 	struct list_head		list;
 	int				family;
 	void				(*gc)(struct work_struct *work);
+	void				(*free)(struct nf_flowtable *ft);
 	const struct rhashtable_params	*params;
 	nf_hookfn			*hook;
 	struct module			*owner;
@@ -98,6 +99,7 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table,
 
 void nf_flow_table_cleanup(struct net *net, struct net_device *dev);
 
+void nf_flow_table_free(struct nf_flowtable *flow_table);
 void nf_flow_offload_work_gc(struct work_struct *work);
 extern const struct rhashtable_params nf_flow_offload_rhash_params;
 
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index b2d01eb25f2cb..25d2975da156f 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -260,6 +260,7 @@ static struct nf_flowtable_type flowtable_ipv4 = {
 	.family		= NFPROTO_IPV4,
 	.params		= &nf_flow_offload_rhash_params,
 	.gc		= nf_flow_offload_work_gc,
+	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_ip_hook,
 	.owner		= THIS_MODULE,
 };
diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
index fff21602875ad..d346705d6ee6b 100644
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -253,6 +253,7 @@ static struct nf_flowtable_type flowtable_ipv6 = {
 	.family		= NFPROTO_IPV6,
 	.params		= &nf_flow_offload_rhash_params,
 	.gc		= nf_flow_offload_work_gc,
+	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_ipv6_hook,
 	.owner		= THIS_MODULE,
 };
diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c
index 04c08f6b9015a..c17f1af42daa5 100644
--- a/net/netfilter/nf_flow_table.c
+++ b/net/netfilter/nf_flow_table.c
@@ -232,19 +232,16 @@ static inline bool nf_flow_is_dying(const struct flow_offload *flow)
 	return flow->flags & FLOW_OFFLOAD_DYING;
 }
 
-void nf_flow_offload_work_gc(struct work_struct *work)
+static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
 {
 	struct flow_offload_tuple_rhash *tuplehash;
-	struct nf_flowtable *flow_table;
 	struct rhashtable_iter hti;
 	struct flow_offload *flow;
 	int err;
 
-	flow_table = container_of(work, struct nf_flowtable, gc_work.work);
-
 	err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
 	if (err)
-		goto schedule;
+		return 0;
 
 	rhashtable_walk_start(&hti);
 
@@ -270,7 +267,16 @@ void nf_flow_offload_work_gc(struct work_struct *work)
 out:
 	rhashtable_walk_stop(&hti);
 	rhashtable_walk_exit(&hti);
-schedule:
+
+	return 1;
+}
+
+void nf_flow_offload_work_gc(struct work_struct *work)
+{
+	struct nf_flowtable *flow_table;
+
+	flow_table = container_of(work, struct nf_flowtable, gc_work.work);
+	nf_flow_offload_gc_step(flow_table);
 	queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
 }
 EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc);
@@ -449,5 +455,12 @@ void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
 
+void nf_flow_table_free(struct nf_flowtable *flow_table)
+{
+	nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
+	WARN_ON(!nf_flow_offload_gc_step(flow_table));
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_free);
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
index 281209aeba8fd..375a1881d93de 100644
--- a/net/netfilter/nf_flow_table_inet.c
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -24,6 +24,7 @@ static struct nf_flowtable_type flowtable_inet = {
 	.family		= NFPROTO_INET,
 	.params		= &nf_flow_offload_rhash_params,
 	.gc		= nf_flow_offload_work_gc,
+	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_inet_hook,
 	.owner		= THIS_MODULE,
 };
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 07dd1fac78a88..8b9fe30de0cdd 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5399,17 +5399,12 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
 	nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
 }
 
-static void nft_flowtable_destroy(void *ptr, void *arg)
-{
-	kfree(ptr);
-}
-
 static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
 {
 	cancel_delayed_work_sync(&flowtable->data.gc_work);
 	kfree(flowtable->name);
-	rhashtable_free_and_destroy(&flowtable->data.rhashtable,
-				    nft_flowtable_destroy, NULL);
+	flowtable->data.type->free(&flowtable->data);
+	rhashtable_destroy(&flowtable->data.rhashtable);
 	module_put(flowtable->data.type->owner);
 }
 

From 0a5f41767444cc3b4fc5573921ab914b4f78baaa Mon Sep 17 00:00:00 2001
From: Sodagudi Prasad <psodagud@codeaurora.org>
Date: Tue, 6 Feb 2018 15:46:51 -0800
Subject: [PATCH 329/676] kbuild: clang: disable unused variable warnings only
 when constant

Currently, GCC disables -Wunused-const-variable, but not
-Wunused-variable, so warns unused variables if they are
non-constant.

While, Clang does not warn unused variables at all regardless of
the const qualifier because -Wno-unused-const-variable is implied
by the stronger option -Wno-unused-variable.

Disable -Wunused-const-variable instead of -Wunused-variable so that
GCC and Clang work in the same way.

Signed-off-by: Prasad Sodagudi <psodagud@codeaurora.org>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 Makefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 11aff0f9e105f..ca0a57198ca60 100644
--- a/Makefile
+++ b/Makefile
@@ -700,7 +700,6 @@ KBUILD_CFLAGS += $(stackp-flag)
 
 ifeq ($(cc-name),clang)
 KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
-KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable)
 KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier)
 KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
 KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
@@ -718,9 +717,9 @@ else
 # These warnings generated too much noise in a regular build.
 # Use make W=1 to enable them (see scripts/Makefile.extrawarn)
 KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable)
-KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable)
 endif
 
+KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable)
 ifdef CONFIG_FRAME_POINTER
 KBUILD_CFLAGS	+= -fno-omit-frame-pointer -fno-optimize-sibling-calls
 else

From 3f2f7c553d077be6a30cb96b2976a2c940bf5335 Mon Sep 17 00:00:00 2001
From: Hui Wang <hui.wang@canonical.com>
Date: Mon, 29 Jan 2018 14:23:15 +0800
Subject: [PATCH 330/676] ALSA: hda - Fix headset mic detection problem for two
 Dell machines

One of them has the codec of alc256 and the other one has the codec
of alc289.

Cc: <stable@vger.kernel.org>
Signed-off-by: Hui Wang <hui.wang@canonical.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/patch_realtek.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 23475888192b5..b791efe07fc03 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -6734,6 +6734,11 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = {
 		{0x12, 0xb7a60130},
 		{0x14, 0x90170110},
 		{0x21, 0x02211020}),
+	SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
+		{0x12, 0x90a60130},
+		{0x14, 0x90170110},
+		{0x14, 0x01011020},
+		{0x21, 0x0221101f}),
 	SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
 		ALC256_STANDARD_PINS),
 	SND_HDA_PIN_QUIRK(0x10ec0256, 0x1043, "ASUS", ALC256_FIXUP_ASUS_MIC,
@@ -6803,6 +6808,10 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = {
 		{0x12, 0x90a60120},
 		{0x14, 0x90170110},
 		{0x21, 0x0321101f}),
+	SND_HDA_PIN_QUIRK(0x10ec0289, 0x1028, "Dell", ALC225_FIXUP_DELL1_MIC_NO_PRESENCE,
+		{0x12, 0xb7a60130},
+		{0x14, 0x90170110},
+		{0x21, 0x04211020}),
 	SND_HDA_PIN_QUIRK(0x10ec0290, 0x103c, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC1,
 		ALC290_STANDARD_PINS,
 		{0x15, 0x04211040},

From 40e2c4e5a7efcd50983aacbddd3c617e776018bf Mon Sep 17 00:00:00 2001
From: Kailang Yang <kailang@realtek.com>
Date: Fri, 2 Feb 2018 15:13:09 +0800
Subject: [PATCH 331/676] ALSA: hda/realtek - Add headset mode support for Dell
 laptop

This platform had two Dmic and single Dmic.
This update was for single Dmic.

This commit was for two Dmic.

Fixes: 75ee94b20b46 ("ALSA: hda - fix headset mic problem for Dell machines...")
Signed-off-by: Kailang Yang <kailang@realtek.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/patch_realtek.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index b791efe07fc03..0c61751caa0a2 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -6321,6 +6321,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
 	SND_PCI_QUIRK(0x1028, 0x075d, "Dell AIO", ALC298_FIXUP_SPK_VOLUME),
 	SND_PCI_QUIRK(0x1028, 0x0798, "Dell Inspiron 17 7000 Gaming", ALC256_FIXUP_DELL_INSPIRON_7559_SUBWOOFER),
 	SND_PCI_QUIRK(0x1028, 0x082a, "Dell XPS 13 9360", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE),
+	SND_PCI_QUIRK(0x1028, 0x084b, "Dell", ALC274_FIXUP_DELL_AIO_LINEOUT_VERB),
+	SND_PCI_QUIRK(0x1028, 0x084e, "Dell", ALC274_FIXUP_DELL_AIO_LINEOUT_VERB),
 	SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
 	SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE),
 	SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2),

From 61fcf8ece9b6b09450250c4ca40cc3b81a96a68d Mon Sep 17 00:00:00 2001
From: Kailang Yang <kailang@realtek.com>
Date: Fri, 2 Feb 2018 15:26:46 +0800
Subject: [PATCH 332/676] ALSA: hda/realtek - Enable Thinkpad Dock device for
 ALC298 platform

Thinkpad Dock device support for ALC298 platform.
It need to use SSID for the quirk table.
Because IdeaPad also has ALC298 platform.
Use verb for the quirk table will confuse.

Signed-off-by: Kailang Yang <kailang@realtek.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/patch_realtek.c | 42 +++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 0c61751caa0a2..32938ca8e5e36 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -4972,6 +4972,28 @@ static void alc_fixup_tpt440_dock(struct hda_codec *codec,
 	}
 }
 
+static void alc_fixup_tpt470_dock(struct hda_codec *codec,
+				  const struct hda_fixup *fix, int action)
+{
+	static const struct hda_pintbl pincfgs[] = {
+		{ 0x17, 0x21211010 }, /* dock headphone */
+		{ 0x19, 0x21a11010 }, /* dock mic */
+		{ }
+	};
+	struct alc_spec *spec = codec->spec;
+
+	if (action == HDA_FIXUP_ACT_PRE_PROBE) {
+		spec->parse_flags = HDA_PINCFG_NO_HP_FIXUP;
+		/* Enable DOCK device */
+		snd_hda_codec_write(codec, 0x17, 0,
+			    AC_VERB_SET_CONFIG_DEFAULT_BYTES_3, 0);
+		/* Enable DOCK device */
+		snd_hda_codec_write(codec, 0x19, 0,
+			    AC_VERB_SET_CONFIG_DEFAULT_BYTES_3, 0);
+		snd_hda_apply_pincfgs(codec, pincfgs);
+	}
+}
+
 static void alc_shutup_dell_xps13(struct hda_codec *codec)
 {
 	struct alc_spec *spec = codec->spec;
@@ -5446,6 +5468,7 @@ enum {
 	ALC700_FIXUP_INTEL_REFERENCE,
 	ALC274_FIXUP_DELL_BIND_DACS,
 	ALC274_FIXUP_DELL_AIO_LINEOUT_VERB,
+	ALC298_FIXUP_TPT470_DOCK,
 };
 
 static const struct hda_fixup alc269_fixups[] = {
@@ -6271,6 +6294,12 @@ static const struct hda_fixup alc269_fixups[] = {
 		.chained = true,
 		.chain_id = ALC274_FIXUP_DELL_BIND_DACS
 	},
+	[ALC298_FIXUP_TPT470_DOCK] = {
+		.type = HDA_FIXUP_FUNC,
+		.v.func = alc_fixup_tpt470_dock,
+		.chained = true,
+		.chain_id = ALC293_FIXUP_LENOVO_SPK_NOISE
+	},
 };
 
 static const struct snd_pci_quirk alc269_fixup_tbl[] = {
@@ -6452,8 +6481,16 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
 	SND_PCI_QUIRK(0x17aa, 0x2218, "Thinkpad X1 Carbon 2nd", ALC292_FIXUP_TPT440_DOCK),
 	SND_PCI_QUIRK(0x17aa, 0x2223, "ThinkPad T550", ALC292_FIXUP_TPT440_DOCK),
 	SND_PCI_QUIRK(0x17aa, 0x2226, "ThinkPad X250", ALC292_FIXUP_TPT440_DOCK),
+	SND_PCI_QUIRK(0x17aa, 0x222d, "Thinkpad", ALC298_FIXUP_TPT470_DOCK),
+	SND_PCI_QUIRK(0x17aa, 0x222e, "Thinkpad", ALC298_FIXUP_TPT470_DOCK),
 	SND_PCI_QUIRK(0x17aa, 0x2231, "Thinkpad T560", ALC292_FIXUP_TPT460),
 	SND_PCI_QUIRK(0x17aa, 0x2233, "Thinkpad", ALC292_FIXUP_TPT460),
+	SND_PCI_QUIRK(0x17aa, 0x2245, "Thinkpad T470", ALC298_FIXUP_TPT470_DOCK),
+	SND_PCI_QUIRK(0x17aa, 0x2246, "Thinkpad", ALC298_FIXUP_TPT470_DOCK),
+	SND_PCI_QUIRK(0x17aa, 0x2247, "Thinkpad", ALC298_FIXUP_TPT470_DOCK),
+	SND_PCI_QUIRK(0x17aa, 0x224b, "Thinkpad", ALC298_FIXUP_TPT470_DOCK),
+	SND_PCI_QUIRK(0x17aa, 0x224c, "Thinkpad", ALC298_FIXUP_TPT470_DOCK),
+	SND_PCI_QUIRK(0x17aa, 0x224d, "Thinkpad", ALC298_FIXUP_TPT470_DOCK),
 	SND_PCI_QUIRK(0x17aa, 0x30bb, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY),
 	SND_PCI_QUIRK(0x17aa, 0x30e2, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY),
 	SND_PCI_QUIRK(0x17aa, 0x310c, "ThinkCentre Station", ALC294_FIXUP_LENOVO_MIC_LOCATION),
@@ -6474,7 +6511,12 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
 	SND_PCI_QUIRK(0x17aa, 0x5050, "Thinkpad T560p", ALC292_FIXUP_TPT460),
 	SND_PCI_QUIRK(0x17aa, 0x5051, "Thinkpad L460", ALC292_FIXUP_TPT460),
 	SND_PCI_QUIRK(0x17aa, 0x5053, "Thinkpad T460", ALC292_FIXUP_TPT460),
+	SND_PCI_QUIRK(0x17aa, 0x505d, "Thinkpad", ALC298_FIXUP_TPT470_DOCK),
+	SND_PCI_QUIRK(0x17aa, 0x505f, "Thinkpad", ALC298_FIXUP_TPT470_DOCK),
+	SND_PCI_QUIRK(0x17aa, 0x5062, "Thinkpad", ALC298_FIXUP_TPT470_DOCK),
 	SND_PCI_QUIRK(0x17aa, 0x5109, "Thinkpad", ALC269_FIXUP_LIMIT_INT_MIC_BOOST),
+	SND_PCI_QUIRK(0x17aa, 0x511e, "Thinkpad", ALC298_FIXUP_TPT470_DOCK),
+	SND_PCI_QUIRK(0x17aa, 0x511f, "Thinkpad", ALC298_FIXUP_TPT470_DOCK),
 	SND_PCI_QUIRK(0x17aa, 0x3bf8, "Quanta FL1", ALC269_FIXUP_PCM_44K),
 	SND_PCI_QUIRK(0x17aa, 0x9e54, "LENOVO NB", ALC269_FIXUP_LENOVO_EAPD),
 	SND_PCI_QUIRK(0x1b7d, 0xa831, "Ordissimo EVE2 ", ALC269VB_FIXUP_ORDISSIMO_EVE2), /* Also known as Malata PC-B1303 */

From 890674343bb2d546825f713542b1e199f0728732 Mon Sep 17 00:00:00 2001
From: Shunyong Yang <shunyong.yang@hxt-semitech.com>
Date: Tue, 6 Feb 2018 14:37:12 +0800
Subject: [PATCH 333/676] ACPI / tables: Add IORT to injectable table list

Loading IORT table from initrd can be used to fix severe firmware
IORT defects temporarily before platform/BIOS vendor releases an
upgraded BIOS binary.

Moreover, it is very powerful to debug SMMU node/device probe, MSI
allocation, stream id translation and IORT table from firmware.

It is also very useful to enable SMMU and devices behind SMMU before
firmware is ready.

This patch adds ACPI_SIG_IORT to the table, which enables IORT
from initrd to override which from firmware.

Signed-off-by: Yang Shunyong <shunyong.yang@hxt-semitech.com>
Acked-by: Hanjun Guo <hanjun.guo@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/tables.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c
index 80ce2a7d224b6..7bcb66ccccf3e 100644
--- a/drivers/acpi/tables.c
+++ b/drivers/acpi/tables.c
@@ -456,7 +456,8 @@ static const char * const table_sigs[] = {
 	ACPI_SIG_SLIC, ACPI_SIG_SPCR, ACPI_SIG_SPMI, ACPI_SIG_TCPA,
 	ACPI_SIG_UEFI, ACPI_SIG_WAET, ACPI_SIG_WDAT, ACPI_SIG_WDDT,
 	ACPI_SIG_WDRT, ACPI_SIG_DSDT, ACPI_SIG_FADT, ACPI_SIG_PSDT,
-	ACPI_SIG_RSDT, ACPI_SIG_XSDT, ACPI_SIG_SSDT, NULL };
+	ACPI_SIG_RSDT, ACPI_SIG_XSDT, ACPI_SIG_SSDT, ACPI_SIG_IORT,
+	NULL };
 
 #define ACPI_HEADER_SIZE sizeof(struct acpi_table_header)
 

From b52f451105020677cad61a13b3b67af3fdedb91f Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Tue, 6 Feb 2018 17:36:17 -0600
Subject: [PATCH 334/676] ACPI / CPPC: Use 64-bit arithmetic instead of 32-bit

Add suffix ULL to constant 500 in order to give the compiler complete
information about the proper arithmetic to use. Notice that this
constant is used in a context that expects an expression of type
u64 (64 bits, unsigned).

The expression NUM_RETRIES * cppc_ss->latency at line 578, which at
preprocessing time translates to 500 * cppc_ss->latency is currently
being evaluated using 32-bit arithmetic.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/cppc_acpi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c
index 06ea4749ebd98..0afbb2658cbc0 100644
--- a/drivers/acpi/cppc_acpi.c
+++ b/drivers/acpi/cppc_acpi.c
@@ -119,7 +119,7 @@ static DEFINE_PER_CPU(struct cpc_desc *, cpc_desc_ptr);
  * to PCC commands. Keeping it high enough to cover emulators where
  * the processors run painfully slow.
  */
-#define NUM_RETRIES 500
+#define NUM_RETRIES 500ULL
 
 struct cppc_attr {
 	struct attribute attr;

From 59a3b3a8db16621574cbac69f6f1eddb9c60e821 Mon Sep 17 00:00:00 2001
From: Akshu Agrawal <Akshu.Agrawal@amd.com>
Date: Thu, 18 Jan 2018 15:51:30 +0530
Subject: [PATCH 335/676] cpufreq: AMD: Ignore the check for ProcFeedback in
 ST/CZ

In ST/CZ CPUID 8000_0007_EDX[11, ProcFeedbackInterface] is 0,
but the mechanism is still available and can be used.

Signed-off-by: Akshu Agrawal <akshu.agrawal@amd.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/amd_freq_sensitivity.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/amd_freq_sensitivity.c b/drivers/cpufreq/amd_freq_sensitivity.c
index 042023bbbf621..be926d9a66e57 100644
--- a/drivers/cpufreq/amd_freq_sensitivity.c
+++ b/drivers/cpufreq/amd_freq_sensitivity.c
@@ -14,6 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/types.h>
+#include <linux/pci.h>
 #include <linux/percpu-defs.h>
 #include <linux/init.h>
 #include <linux/mod_devicetable.h>
@@ -109,12 +110,18 @@ static unsigned int amd_powersave_bias_target(struct cpufreq_policy *policy,
 static int __init amd_freq_sensitivity_init(void)
 {
 	u64 val;
+	struct pci_dev *pcidev;
 
 	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
 		return -ENODEV;
 
-	if (!static_cpu_has(X86_FEATURE_PROC_FEEDBACK))
-		return -ENODEV;
+	pcidev = pci_get_device(PCI_VENDOR_ID_AMD,
+			PCI_DEVICE_ID_AMD_KERNCZ_SMBUS, NULL);
+
+	if (!pcidev) {
+		if (!static_cpu_has(X86_FEATURE_PROC_FEEDBACK))
+			return -ENODEV;
+	}
 
 	if (rdmsrl_safe(MSR_AMD64_FREQ_SENSITIVITY_ACTUAL, &val))
 		return -ENODEV;

From 0231d00082f61cfe03f2b7c27e5356f8cdf0312c Mon Sep 17 00:00:00 2001
From: Prarit Bhargava <prarit@redhat.com>
Date: Thu, 18 Jan 2018 10:09:51 -0500
Subject: [PATCH 336/676] ACPI: SPCR: Make SPCR available to x86

SPCR is currently only enabled or ARM64 and x86 can use SPCR to setup
an early console.

General fixes include updating Documentation & Kconfig (for x86),
updating comments, and changing parse_spcr() to acpi_parse_spcr(),
and earlycon_init_is_deferred to earlycon_acpi_spcr_enable to be
more descriptive.

On x86, many systems have a valid SPCR table but the table version is
not 2 so the table version check must be a warning.

On ARM64 when the kernel parameter earlycon is used both the early console
and console are enabled.  On x86, only the earlycon should be enabled by
by default.  Modify acpi_parse_spcr() to allow options for initializing
the early console and console separately.

Signed-off-by: Prarit Bhargava <prarit@redhat.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mark Salter <msalter@redhat.com>
Tested-by: Mark Salter <msalter@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 .../admin-guide/kernel-parameters.txt         |  9 ++++--
 arch/arm64/kernel/acpi.c                      |  4 +--
 arch/x86/kernel/acpi/boot.c                   |  3 ++
 drivers/acpi/Kconfig                          |  7 ++++-
 drivers/acpi/spcr.c                           | 29 ++++++++++---------
 drivers/tty/serial/earlycon.c                 | 15 ++++------
 include/linux/acpi.h                          |  7 +++--
 include/linux/serial_core.h                   |  4 +--
 8 files changed, 44 insertions(+), 34 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 1bdcf572c2ac7..aecb1c8ef6752 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -917,9 +917,12 @@
 
 	earlycon=	[KNL] Output early console device and options.
 
-			When used with no options, the early console is
-			determined by the stdout-path property in device
-			tree's chosen node.
+			[ARM64] The early console is determined by the
+			stdout-path property in device tree's chosen node,
+			or determined by the ACPI SPCR table.
+
+			[X86] When used with no options the early console is
+			determined by the ACPI SPCR table.
 
 		cdns,<addr>[,options]
 			Start an early, polled-mode console on a Cadence
diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index b3162715ed78d..2aa5609def27a 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -230,10 +230,10 @@ void __init acpi_boot_table_init(void)
 
 done:
 	if (acpi_disabled) {
-		if (earlycon_init_is_deferred)
+		if (earlycon_acpi_spcr_enable)
 			early_init_dt_scan_chosen_stdout();
 	} else {
-		parse_spcr(earlycon_init_is_deferred);
+		acpi_parse_spcr(earlycon_acpi_spcr_enable, true);
 		if (IS_ENABLED(CONFIG_ACPI_BGRT))
 			acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt);
 	}
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ec3a286163c37..2aa92094b59d4 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -36,6 +36,7 @@
 #include <linux/ioport.h>
 #include <linux/pci.h>
 #include <linux/efi-bgrt.h>
+#include <linux/serial_core.h>
 
 #include <asm/e820/api.h>
 #include <asm/irqdomain.h>
@@ -1625,6 +1626,8 @@ int __init acpi_boot_init(void)
 	if (!acpi_noirq)
 		x86_init.pci.init = pci_acpi_init;
 
+	/* Do not enable ACPI SPCR console by default */
+	acpi_parse_spcr(earlycon_acpi_spcr_enable, false);
 	return 0;
 }
 
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 46505396869e6..ddcb5f40e8ee8 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -79,7 +79,12 @@ config ACPI_DEBUGGER_USER
 endif
 
 config ACPI_SPCR_TABLE
-	bool
+	bool "ACPI Serial Port Console Redirection Support"
+	default y if X86
+	help
+	  Enable support for Serial Port Console Redirection (SPCR) Table.
+	  This table provides information about the configuration of the
+	  earlycon console.
 
 config ACPI_LPIT
 	bool
diff --git a/drivers/acpi/spcr.c b/drivers/acpi/spcr.c
index 324b35bfe781d..89e97d21a89ce 100644
--- a/drivers/acpi/spcr.c
+++ b/drivers/acpi/spcr.c
@@ -21,7 +21,7 @@
  * occasionally getting stuck as 1. To avoid the potential for a hang, check
  * TXFE == 0 instead of BUSY == 1. This may not be suitable for all UART
  * implementations, so only do so if an affected platform is detected in
- * parse_spcr().
+ * acpi_parse_spcr().
  */
 bool qdf2400_e44_present;
 EXPORT_SYMBOL(qdf2400_e44_present);
@@ -74,19 +74,21 @@ static bool xgene_8250_erratum_present(struct acpi_table_spcr *tb)
 }
 
 /**
- * parse_spcr() - parse ACPI SPCR table and add preferred console
+ * acpi_parse_spcr() - parse ACPI SPCR table and add preferred console
  *
- * @earlycon: set up earlycon for the console specified by the table
+ * @enable_earlycon: set up earlycon for the console specified by the table
+ * @enable_console: setup the console specified by the table.
  *
  * For the architectures with support for ACPI, CONFIG_ACPI_SPCR_TABLE may be
  * defined to parse ACPI SPCR table.  As a result of the parsing preferred
- * console is registered and if @earlycon is true, earlycon is set up.
+ * console is registered and if @enable_earlycon is true, earlycon is set up.
+ * If @enable_console is true the system console is also configured.
  *
  * When CONFIG_ACPI_SPCR_TABLE is defined, this function should be called
  * from arch initialization code as soon as the DT/ACPI decision is made.
  *
  */
-int __init parse_spcr(bool earlycon)
+int __init acpi_parse_spcr(bool enable_earlycon, bool enable_console)
 {
 	static char opts[64];
 	struct acpi_table_spcr *table;
@@ -105,11 +107,8 @@ int __init parse_spcr(bool earlycon)
 	if (ACPI_FAILURE(status))
 		return -ENOENT;
 
-	if (table->header.revision < 2) {
-		err = -ENOENT;
-		pr_err("wrong table version\n");
-		goto done;
-	}
+	if (table->header.revision < 2)
+		pr_info("SPCR table version %d\n", table->header.revision);
 
 	if (table->serial_port.space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) {
 		switch (ACPI_ACCESS_BIT_WIDTH((
@@ -185,7 +184,7 @@ int __init parse_spcr(bool earlycon)
 	 */
 	if (qdf2400_erratum_44_present(&table->header)) {
 		qdf2400_e44_present = true;
-		if (earlycon)
+		if (enable_earlycon)
 			uart = "qdf2400_e44";
 	}
 
@@ -205,11 +204,13 @@ int __init parse_spcr(bool earlycon)
 
 	pr_info("console: %s\n", opts);
 
-	if (earlycon)
+	if (enable_earlycon)
 		setup_earlycon(opts);
 
-	err = add_preferred_console(uart, 0, opts + strlen(uart) + 1);
-
+	if (enable_console)
+		err = add_preferred_console(uart, 0, opts + strlen(uart) + 1);
+	else
+		err = 0;
 done:
 	acpi_put_table((struct acpi_table_header *)table);
 	return err;
diff --git a/drivers/tty/serial/earlycon.c b/drivers/tty/serial/earlycon.c
index 4c8b80f1c688c..870e84fb6e39e 100644
--- a/drivers/tty/serial/earlycon.c
+++ b/drivers/tty/serial/earlycon.c
@@ -197,25 +197,20 @@ int __init setup_earlycon(char *buf)
 }
 
 /*
- * When CONFIG_ACPI_SPCR_TABLE is defined, "earlycon" without parameters in
- * command line does not start DT earlycon immediately, instead it defers
- * starting it until DT/ACPI decision is made.  At that time if ACPI is enabled
- * call parse_spcr(), else call early_init_dt_scan_chosen_stdout()
+ * This defers the initialization of the early console until after ACPI has
+ * been initialized.
  */
-bool earlycon_init_is_deferred __initdata;
+bool earlycon_acpi_spcr_enable __initdata;
 
 /* early_param wrapper for setup_earlycon() */
 static int __init param_setup_earlycon(char *buf)
 {
 	int err;
 
-	/*
-	 * Just 'earlycon' is a valid param for devicetree earlycons;
-	 * don't generate a warning from parse_early_params() in that case
-	 */
+	/* Just 'earlycon' is a valid param for devicetree and ACPI SPCR. */
 	if (!buf || !buf[0]) {
 		if (IS_ENABLED(CONFIG_ACPI_SPCR_TABLE)) {
-			earlycon_init_is_deferred = true;
+			earlycon_acpi_spcr_enable = true;
 			return 0;
 		} else if (!buf) {
 			return early_init_dt_scan_chosen_stdout();
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index b8f4c3c776e5d..e6b98a32495f3 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1249,9 +1249,12 @@ static inline bool acpi_has_watchdog(void) { return false; }
 
 #ifdef CONFIG_ACPI_SPCR_TABLE
 extern bool qdf2400_e44_present;
-int parse_spcr(bool earlycon);
+int acpi_parse_spcr(bool enable_earlycon, bool enable_console);
 #else
-static inline int parse_spcr(bool earlycon) { return 0; }
+static inline int acpi_parse_spcr(bool enable_earlycon, bool enable_console)
+{
+	return 0;
+}
 #endif
 
 #if IS_ENABLED(CONFIG_ACPI_GENERIC_GSI)
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 37b044e78333a..aefd0e5115dae 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -376,10 +376,10 @@ extern int of_setup_earlycon(const struct earlycon_id *match,
 			     const char *options);
 
 #ifdef CONFIG_SERIAL_EARLYCON
-extern bool earlycon_init_is_deferred __initdata;
+extern bool earlycon_acpi_spcr_enable __initdata;
 int setup_earlycon(char *buf);
 #else
-static const bool earlycon_init_is_deferred;
+static const bool earlycon_acpi_spcr_enable;
 static inline int setup_earlycon(char *buf) { return 0; }
 #endif
 

From 3d70671667f15f4bae260541cd91330ef20d2f2e Mon Sep 17 00:00:00 2001
From: Corentin LABBE <clabbe.montjoie@gmail.com>
Date: Thu, 18 Jan 2018 21:02:02 +0100
Subject: [PATCH 337/676] cpufreq: remove at32ap-cpufreq

Since AVR32 arch was removed, at32ap-cpufreq is useless.
Remove this driver.

Signed-off-by: Corentin Labbe <clabbe.montjoie@gmail.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/Kconfig          |  10 ---
 drivers/cpufreq/Makefile         |   1 -
 drivers/cpufreq/at32ap-cpufreq.c | 127 -------------------------------
 3 files changed, 138 deletions(-)
 delete mode 100644 drivers/cpufreq/at32ap-cpufreq.c

diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index d8addbce40bcc..608af20a34940 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -239,16 +239,6 @@ if PPC32 || PPC64
 source "drivers/cpufreq/Kconfig.powerpc"
 endif
 
-if AVR32
-config AVR32_AT32AP_CPUFREQ
-	bool "CPU frequency driver for AT32AP"
-	depends on PLATFORM_AT32AP
-	default n
-	help
-	  This enables the CPU frequency driver for AT32AP processors.
-	  If in doubt, say N.
-endif
-
 if IA64
 config IA64_ACPI_CPUFREQ
 	tristate "ACPI Processor P-States driver"
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index e07715ce88440..c60c1e141d9d9 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -100,7 +100,6 @@ obj-$(CONFIG_POWERNV_CPUFREQ)		+= powernv-cpufreq.o
 
 ##################################################################################
 # Other platform drivers
-obj-$(CONFIG_AVR32_AT32AP_CPUFREQ)	+= at32ap-cpufreq.o
 obj-$(CONFIG_BFIN_CPU_FREQ)		+= blackfin-cpufreq.o
 obj-$(CONFIG_BMIPS_CPUFREQ)		+= bmips-cpufreq.o
 obj-$(CONFIG_CRIS_MACH_ARTPEC3)		+= cris-artpec3-cpufreq.o
diff --git a/drivers/cpufreq/at32ap-cpufreq.c b/drivers/cpufreq/at32ap-cpufreq.c
deleted file mode 100644
index 7b612c8bb09ea..0000000000000
--- a/drivers/cpufreq/at32ap-cpufreq.c
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (C) 2004-2007 Atmel Corporation
- *
- * Based on MIPS implementation arch/mips/kernel/time.c
- *   Copyright 2001 MontaVista Software Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-/*#define DEBUG*/
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/io.h>
-#include <linux/clk.h>
-#include <linux/err.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-
-static struct cpufreq_frequency_table *freq_table;
-
-static unsigned int	ref_freq;
-static unsigned long	loops_per_jiffy_ref;
-
-static int at32_set_target(struct cpufreq_policy *policy, unsigned int index)
-{
-	unsigned int old_freq, new_freq;
-
-	old_freq = policy->cur;
-	new_freq = freq_table[index].frequency;
-
-	if (!ref_freq) {
-		ref_freq = old_freq;
-		loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
-	}
-
-	if (old_freq < new_freq)
-		boot_cpu_data.loops_per_jiffy = cpufreq_scale(
-				loops_per_jiffy_ref, ref_freq, new_freq);
-	clk_set_rate(policy->clk, new_freq * 1000);
-	if (new_freq < old_freq)
-		boot_cpu_data.loops_per_jiffy = cpufreq_scale(
-				loops_per_jiffy_ref, ref_freq, new_freq);
-
-	return 0;
-}
-
-static int at32_cpufreq_driver_init(struct cpufreq_policy *policy)
-{
-	unsigned int frequency, rate, min_freq;
-	struct clk *cpuclk;
-	int retval, steps, i;
-
-	if (policy->cpu != 0)
-		return -EINVAL;
-
-	cpuclk = clk_get(NULL, "cpu");
-	if (IS_ERR(cpuclk)) {
-		pr_debug("cpufreq: could not get CPU clk\n");
-		retval = PTR_ERR(cpuclk);
-		goto out_err;
-	}
-
-	min_freq = (clk_round_rate(cpuclk, 1) + 500) / 1000;
-	frequency = (clk_round_rate(cpuclk, ~0UL) + 500) / 1000;
-	policy->cpuinfo.transition_latency = 0;
-
-	/*
-	 * AVR32 CPU frequency rate scales in power of two between maximum and
-	 * minimum, also add space for the table end marker.
-	 *
-	 * Further validate that the frequency is usable, and append it to the
-	 * frequency table.
-	 */
-	steps = fls(frequency / min_freq) + 1;
-	freq_table = kzalloc(steps * sizeof(struct cpufreq_frequency_table),
-			GFP_KERNEL);
-	if (!freq_table) {
-		retval = -ENOMEM;
-		goto out_err_put_clk;
-	}
-
-	for (i = 0; i < (steps - 1); i++) {
-		rate = clk_round_rate(cpuclk, frequency * 1000) / 1000;
-
-		if (rate != frequency)
-			freq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
-		else
-			freq_table[i].frequency = frequency;
-
-		frequency /= 2;
-	}
-
-	policy->clk = cpuclk;
-	freq_table[steps - 1].frequency = CPUFREQ_TABLE_END;
-
-	retval = cpufreq_table_validate_and_show(policy, freq_table);
-	if (!retval) {
-		printk("cpufreq: AT32AP CPU frequency driver\n");
-		return 0;
-	}
-
-	kfree(freq_table);
-out_err_put_clk:
-	clk_put(cpuclk);
-out_err:
-	return retval;
-}
-
-static struct cpufreq_driver at32_driver = {
-	.name		= "at32ap",
-	.init		= at32_cpufreq_driver_init,
-	.verify		= cpufreq_generic_frequency_table_verify,
-	.target_index	= at32_set_target,
-	.get		= cpufreq_generic_get,
-	.flags		= CPUFREQ_STICKY,
-};
-
-static int __init at32_cpufreq_init(void)
-{
-	return cpufreq_register_driver(&at32_driver);
-}
-late_initcall(at32_cpufreq_init);

From eb85c50c24cf3e96f9aa01fd3e6219ad36845a96 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <Sudeep.Holla@arm.com>
Date: Mon, 22 Jan 2018 14:41:07 +0000
Subject: [PATCH 338/676] cpufreq: scpi: fix static checker warning cdev isn't
 an ERR_PTR

Commit 343a8d17fa8d (cpufreq: scpi: remove arm_big_little dependency)
leads to the following static checker warning:

	drivers/cpufreq/scpi-cpufreq.c:203 scpi_cpufreq_ready()
	warn: 'cdev' isn't an ERR_PTR

of_cpufreq_cooling_register() returns NULL on error. This patch removes
the incorrect IS_ERR check on the returned pointer.

Fixes: 343a8d17fa8d (cpufreq: scpi: remove arm_big_little dependency)
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/scpi-cpufreq.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c
index 247fcbfa4cb5b..efe32cbefa06b 100644
--- a/drivers/cpufreq/scpi-cpufreq.c
+++ b/drivers/cpufreq/scpi-cpufreq.c
@@ -197,11 +197,8 @@ static int scpi_cpufreq_exit(struct cpufreq_policy *policy)
 static void scpi_cpufreq_ready(struct cpufreq_policy *policy)
 {
 	struct scpi_data *priv = policy->driver_data;
-	struct thermal_cooling_device *cdev;
 
-	cdev = of_cpufreq_cooling_register(policy);
-	if (!IS_ERR(cdev))
-		priv->cdev = cdev;
+	priv->cdev = of_cpufreq_cooling_register(policy);
 }
 
 static struct cpufreq_driver scpi_cpufreq_driver = {

From d8ed9600581d40d818ae417b3086a333841b0559 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Wed, 7 Feb 2018 11:50:41 +0900
Subject: [PATCH 339/676] netfilter: remove useless prototype

prototype nf_ct_nat_offset is not used anymore.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
---
 include/net/netfilter/nf_conntrack.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index f5223bf2c420a..062dc19b58406 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -213,11 +213,6 @@ static inline bool nf_ct_kill(struct nf_conn *ct)
 	return nf_ct_delete(ct, 0, 0);
 }
 
-/* These are for NAT.  Icky. */
-extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct,
-			       enum ip_conntrack_dir dir,
-			       u32 seq);
-
 /* Set all unconfirmed conntrack as dying */
 void nf_ct_unconfirmed_destroy(struct net *);
 

From 0ff90b6c20340e57616a51ae1a1bf18156d6638a Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 7 Feb 2018 09:49:02 +0100
Subject: [PATCH 340/676] netfilter: nf_flow_offload: fix use-after-free and a
 resource leak

flow_offload_del frees the flow, so all associated resource must be
freed before.

Since the ct entry in struct flow_offload_entry was allocated by
flow_offload_alloc, it should be freed by flow_offload_free to take care
of the error handling path when flow_offload_add fails.

While at it, make flow_offload_del static, since it should never be
called directly, only from the gc step

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h |  1 -
 net/netfilter/nf_flow_table.c         | 27 +++++++--------------------
 2 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 020ae903066f5..833752dd0c583 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -90,7 +90,6 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct,
 void flow_offload_free(struct flow_offload *flow);
 
 int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow);
-void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow);
 struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table,
 						     struct flow_offload_tuple *tuple);
 int nf_flow_table_iterate(struct nf_flowtable *flow_table,
diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c
index c17f1af42daa5..ec410cae93071 100644
--- a/net/netfilter/nf_flow_table.c
+++ b/net/netfilter/nf_flow_table.c
@@ -125,7 +125,9 @@ void flow_offload_free(struct flow_offload *flow)
 	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
 	dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
 	e = container_of(flow, struct flow_offload_entry, flow);
-	kfree(e);
+	nf_ct_delete(e->ct, 0, 0);
+	nf_ct_put(e->ct);
+	kfree_rcu(e, rcu_head);
 }
 EXPORT_SYMBOL_GPL(flow_offload_free);
 
@@ -149,11 +151,9 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
 }
 EXPORT_SYMBOL_GPL(flow_offload_add);
 
-void flow_offload_del(struct nf_flowtable *flow_table,
-		      struct flow_offload *flow)
+static void flow_offload_del(struct nf_flowtable *flow_table,
+			     struct flow_offload *flow)
 {
-	struct flow_offload_entry *e;
-
 	rhashtable_remove_fast(&flow_table->rhashtable,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
 			       *flow_table->type->params);
@@ -161,10 +161,8 @@ void flow_offload_del(struct nf_flowtable *flow_table,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
 			       *flow_table->type->params);
 
-	e = container_of(flow, struct flow_offload_entry, flow);
-	kfree_rcu(e, rcu_head);
+	flow_offload_free(flow);
 }
-EXPORT_SYMBOL_GPL(flow_offload_del);
 
 struct flow_offload_tuple_rhash *
 flow_offload_lookup(struct nf_flowtable *flow_table,
@@ -175,15 +173,6 @@ flow_offload_lookup(struct nf_flowtable *flow_table,
 }
 EXPORT_SYMBOL_GPL(flow_offload_lookup);
 
-static void nf_flow_release_ct(const struct flow_offload *flow)
-{
-	struct flow_offload_entry *e;
-
-	e = container_of(flow, struct flow_offload_entry, flow);
-	nf_ct_delete(e->ct, 0, 0);
-	nf_ct_put(e->ct);
-}
-
 int nf_flow_table_iterate(struct nf_flowtable *flow_table,
 			  void (*iter)(struct flow_offload *flow, void *data),
 			  void *data)
@@ -259,10 +248,8 @@ static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
 		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
 
 		if (nf_flow_has_expired(flow) ||
-		    nf_flow_is_dying(flow)) {
+		    nf_flow_is_dying(flow))
 			flow_offload_del(flow_table, flow);
-			nf_flow_release_ct(flow);
-		}
 	}
 out:
 	rhashtable_walk_stop(&hti);

From a3381e3a65cbaf612c8f584906c4dba27e84267c Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 23 Jan 2018 21:43:08 +0100
Subject: [PATCH 341/676] PM / domains: Fix up domain-idle-states OF parsing

Commit b539cc82d493 (PM / Domains: Ignore domain-idle-states that are
not compatible), made it possible to ignore non-compatible
domain-idle-states OF nodes. However, in case that happens while doing
the OF parsing, the number of elements in the allocated array would
exceed the numbers actually needed, thus wasting memory.

Fix this by pre-iterating the genpd OF node and counting the number of
compatible domain-idle-states nodes, before doing the allocation. While
doing this, it makes sense to rework the code a bit to avoid open coding,
of parts responsible for the OF node iteration.

Let's also take the opportunity to clarify the function header for
of_genpd_parse_idle_states(), about what is being returned in case of
errors.

Fixes: b539cc82d493 (PM / Domains: Ignore domain-idle-states that are not compatible)
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Lina Iyer <ilina@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain.c | 76 ++++++++++++++++++++++---------------
 1 file changed, 45 insertions(+), 31 deletions(-)

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 528b24149bc70..1ea0e2502e8ed 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -2290,6 +2290,38 @@ static int genpd_parse_state(struct genpd_power_state *genpd_state,
 	return 0;
 }
 
+static int genpd_iterate_idle_states(struct device_node *dn,
+				     struct genpd_power_state *states)
+{
+	int ret;
+	struct of_phandle_iterator it;
+	struct device_node *np;
+	int i = 0;
+
+	ret = of_count_phandle_with_args(dn, "domain-idle-states", NULL);
+	if (ret <= 0)
+		return ret;
+
+	/* Loop over the phandles until all the requested entry is found */
+	of_for_each_phandle(&it, ret, dn, "domain-idle-states", NULL, 0) {
+		np = it.node;
+		if (!of_match_node(idle_state_match, np))
+			continue;
+		if (states) {
+			ret = genpd_parse_state(&states[i], np);
+			if (ret) {
+				pr_err("Parsing idle state node %pOF failed with err %d\n",
+				       np, ret);
+				of_node_put(np);
+				return ret;
+			}
+		}
+		i++;
+	}
+
+	return i;
+}
+
 /**
  * of_genpd_parse_idle_states: Return array of idle states for the genpd.
  *
@@ -2299,49 +2331,31 @@ static int genpd_parse_state(struct genpd_power_state *genpd_state,
  *
  * Returns the device states parsed from the OF node. The memory for the states
  * is allocated by this function and is the responsibility of the caller to
- * free the memory after use.
+ * free the memory after use. If no domain idle states is found it returns
+ * -EINVAL and in case of errors, a negative error code.
  */
 int of_genpd_parse_idle_states(struct device_node *dn,
 			struct genpd_power_state **states, int *n)
 {
 	struct genpd_power_state *st;
-	struct device_node *np;
-	int i = 0;
-	int err, ret;
-	int count;
-	struct of_phandle_iterator it;
-	const struct of_device_id *match_id;
+	int ret;
 
-	count = of_count_phandle_with_args(dn, "domain-idle-states", NULL);
-	if (count <= 0)
-		return -EINVAL;
+	ret = genpd_iterate_idle_states(dn, NULL);
+	if (ret <= 0)
+		return ret < 0 ? ret : -EINVAL;
 
-	st = kcalloc(count, sizeof(*st), GFP_KERNEL);
+	st = kcalloc(ret, sizeof(*st), GFP_KERNEL);
 	if (!st)
 		return -ENOMEM;
 
-	/* Loop over the phandles until all the requested entry is found */
-	of_for_each_phandle(&it, err, dn, "domain-idle-states", NULL, 0) {
-		np = it.node;
-		match_id = of_match_node(idle_state_match, np);
-		if (!match_id)
-			continue;
-		ret = genpd_parse_state(&st[i++], np);
-		if (ret) {
-			pr_err
-			("Parsing idle state node %pOF failed with err %d\n",
-							np, ret);
-			of_node_put(np);
-			kfree(st);
-			return ret;
-		}
+	ret = genpd_iterate_idle_states(dn, st);
+	if (ret <= 0) {
+		kfree(st);
+		return ret < 0 ? ret : -EINVAL;
 	}
 
-	*n = i;
-	if (!i)
-		kfree(st);
-	else
-		*states = st;
+	*states = st;
+	*n = ret;
 
 	return 0;
 }

From 168b6511e8e09de45bed272aaf76ef4d4ca3c05d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 2 Feb 2018 15:56:18 +0100
Subject: [PATCH 342/676] x86: hibernate: fix swsusp_arch_resume() prototype

The declaration for swsusp_arch_resume() marks it as 'asmlinkage',
but the definition in x86-32 does not, and it fails to include
the header with the declaration.  This leads to a warning when
building with link-time-optimizations:

kernel/power/power.h:108:23: error: type of 'swsusp_arch_resume' does not match original declaration [-Werror=lto-type-mismatch]
 extern asmlinkage int swsusp_arch_resume(void);
                       ^
arch/x86/power/hibernate_32.c:148:0: note: 'swsusp_arch_resume' was previously declared here
 int swsusp_arch_resume(void)

This moves the declaration into a globally visible header file
and fixes up both x86 definitions to match it.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/x86/power/hibernate_32.c | 2 +-
 arch/x86/power/hibernate_64.c | 2 +-
 include/linux/suspend.h       | 2 ++
 kernel/power/power.h          | 3 ---
 4 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c
index c35fdb585c685..afc4ed7b15782 100644
--- a/arch/x86/power/hibernate_32.c
+++ b/arch/x86/power/hibernate_32.c
@@ -145,7 +145,7 @@ static inline void resume_init_first_level_page_table(pgd_t *pg_dir)
 #endif
 }
 
-int swsusp_arch_resume(void)
+asmlinkage int swsusp_arch_resume(void)
 {
 	int error;
 
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index f910c514438f1..0ef5e52049689 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -174,7 +174,7 @@ static int relocate_restore_code(void)
 	return 0;
 }
 
-int swsusp_arch_resume(void)
+asmlinkage int swsusp_arch_resume(void)
 {
 	int error;
 
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index cc22a24516d69..440b62f7502eb 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -384,6 +384,8 @@ extern int swsusp_page_is_forbidden(struct page *);
 extern void swsusp_set_page_free(struct page *);
 extern void swsusp_unset_page_free(struct page *);
 extern unsigned long get_safe_page(gfp_t gfp_mask);
+extern asmlinkage int swsusp_arch_suspend(void);
+extern asmlinkage int swsusp_arch_resume(void);
 
 extern void hibernation_set_ops(const struct platform_hibernation_ops *ops);
 extern int hibernate(void);
diff --git a/kernel/power/power.h b/kernel/power/power.h
index f29cd178df90a..9e58bdc8a562b 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -104,9 +104,6 @@ extern int in_suspend;
 extern dev_t swsusp_resume_device;
 extern sector_t swsusp_resume_block;
 
-extern asmlinkage int swsusp_arch_suspend(void);
-extern asmlinkage int swsusp_arch_resume(void);
-
 extern int create_basic_memory_bitmaps(void);
 extern void free_basic_memory_bitmaps(void);
 extern int hibernate_preallocate_memory(void);

From c713fb071edc0efc01a955f65a006b0e1795d2eb Mon Sep 17 00:00:00 2001
From: Larry Finger <Larry.Finger@lwfinger.net>
Date: Mon, 5 Feb 2018 12:38:11 -0600
Subject: [PATCH 343/676] rtlwifi: rtl8821ae: Fix connection lost problem
 correctly

There has been a coding error in rtl8821ae since it was first introduced,
namely that an 8-bit register was read using a 16-bit read in
_rtl8821ae_dbi_read(). This error was fixed with commit 40b368af4b75
("rtlwifi: Fix alignment issues"); however, this change led to
instability in the connection. To restore stability, this change
was reverted in commit b8b8b16352cd ("rtlwifi: rtl8821ae: Fix connection
lost problem").

Unfortunately, the unaligned access causes machine checks in ARM
architecture, and we were finally forced to find the actual cause of the
problem on x86 platforms. Following a suggestion from Pkshih
<pkshih@realtek.com>, it was found that increasing the ASPM L1
latency from 0 to 7 fixed the instability. This parameter was varied to
see if a smaller value would work; however, it appears that 7 is the
safest value. A new symbol is defined for this quantity, thus it can be
easily changed if necessary.

Fixes: b8b8b16352cd ("rtlwifi: rtl8821ae: Fix connection lost problem")
Cc: Stable <stable@vger.kernel.org> # 4.14+
Fix-suggested-by: Pkshih <pkshih@realtek.com>
Signed-off-by: Larry Finger <Larry.Finger@lwfinger.net>
Tested-by: James Cameron <quozl@laptop.org>  # x86_64 OLPC NL3
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c | 5 +++--
 drivers/net/wireless/realtek/rtlwifi/wifi.h         | 1 +
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c
index f20e77b4bb65f..317c1b3101dad 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c
@@ -1123,7 +1123,7 @@ static u8 _rtl8821ae_dbi_read(struct rtl_priv *rtlpriv, u16 addr)
 	}
 	if (0 == tmp) {
 		read_addr = REG_DBI_RDATA + addr % 4;
-		ret = rtl_read_word(rtlpriv, read_addr);
+		ret = rtl_read_byte(rtlpriv, read_addr);
 	}
 	return ret;
 }
@@ -1165,7 +1165,8 @@ static void _rtl8821ae_enable_aspm_back_door(struct ieee80211_hw *hw)
 	}
 
 	tmp = _rtl8821ae_dbi_read(rtlpriv, 0x70f);
-	_rtl8821ae_dbi_write(rtlpriv, 0x70f, tmp | BIT(7));
+	_rtl8821ae_dbi_write(rtlpriv, 0x70f, tmp | BIT(7) |
+			     ASPM_L1_LATENCY << 3);
 
 	tmp = _rtl8821ae_dbi_read(rtlpriv, 0x719);
 	_rtl8821ae_dbi_write(rtlpriv, 0x719, tmp | BIT(3) | BIT(4));
diff --git a/drivers/net/wireless/realtek/rtlwifi/wifi.h b/drivers/net/wireless/realtek/rtlwifi/wifi.h
index a7aacbc3984ec..46dcb7fef1954 100644
--- a/drivers/net/wireless/realtek/rtlwifi/wifi.h
+++ b/drivers/net/wireless/realtek/rtlwifi/wifi.h
@@ -99,6 +99,7 @@
 #define RTL_USB_MAX_RX_COUNT			100
 #define QBSS_LOAD_SIZE				5
 #define MAX_WMMELE_LENGTH			64
+#define ASPM_L1_LATENCY				7
 
 #define TOTAL_CAM_ENTRY				32
 

From 0a7fe718239e2f8d9c067070f59f53b509bb17dc Mon Sep 17 00:00:00 2001
From: Yu Wang <yyuwang@codeaurora.org>
Date: Tue, 30 Jan 2018 14:06:01 +0200
Subject: [PATCH 344/676] ath10k: correct the length of DRAM dump for QCA6174
 hw3.x/QCA9377 hw1.1

The length of DRAM dump for QCA6174 hw3.0/hw3.2 and QCA9377 hw1.1
are less than the actual value, some coredump contents are missed.
To fix it, change the length from 0x90000 to 0xa8000.

Fixes: 703f261dd77f ("ath10k: add memory dump support for QCA6174/QCA9377")
Signed-off-by: Yu Wang <yyuwang@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/coredump.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath10k/coredump.c b/drivers/net/wireless/ath/ath10k/coredump.c
index 4dde126dab171..7173b3743b43b 100644
--- a/drivers/net/wireless/ath/ath10k/coredump.c
+++ b/drivers/net/wireless/ath/ath10k/coredump.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2011-2017 Qualcomm Atheros, Inc.
+ * Copyright (c) 2018, The Linux Foundation. All rights reserved.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -616,7 +617,7 @@ static const struct ath10k_mem_region qca6174_hw30_mem_regions[] = {
 	{
 		.type = ATH10K_MEM_REGION_TYPE_DRAM,
 		.start = 0x400000,
-		.len = 0x90000,
+		.len = 0xa8000,
 		.name = "DRAM",
 		.section_table = {
 			.sections = NULL,

From d5cc61119343b6f1b8716cfe591d4989ea0c5c28 Mon Sep 17 00:00:00 2001
From: Tobias Schramm <tobleminer@gmail.com>
Date: Tue, 30 Jan 2018 14:06:02 +0200
Subject: [PATCH 345/676] PCI: Add Ubiquiti Networks vendor ID

Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Tobias Schramm <tobleminer@gmail.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 include/linux/pci_ids.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index ab20dc5db423c..35871adfdc868 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -149,6 +149,8 @@
 #define PCI_VENDOR_ID_DYNALINK		0x0675
 #define PCI_DEVICE_ID_DYNALINK_IS64PH	0x1702
 
+#define PCI_VENDOR_ID_UBIQUITI		0x0777
+
 #define PCI_VENDOR_ID_BERKOM			0x0871
 #define PCI_DEVICE_ID_BERKOM_A1T		0xffa1
 #define PCI_DEVICE_ID_BERKOM_T_CONCEPT		0xffa2

From 34f1cb339cae5c0b6b75094e2d5c79d19be424ed Mon Sep 17 00:00:00 2001
From: Tobias Schramm <tobleminer@gmail.com>
Date: Tue, 30 Jan 2018 14:06:05 +0200
Subject: [PATCH 346/676] ath10k: add support for Ubiquiti rebranded QCA988X v2

Some modern Ubiquiti devices contain a rebranded QCA988X rev2 with
a custom Ubiquiti vendor and device id. This patch adds support for
those devices, treating them as a QCA988X v2.

Signed-off-by: Tobias Schramm <tobleminer@gmail.com>
[kvalo@codeaurora.org: rebase, add missing fields in hw_params, fix a long line in pci.c:61]
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/core.c | 29 ++++++++++++++++++++++++++
 drivers/net/wireless/ath/ath10k/hw.h   |  1 +
 drivers/net/wireless/ath/ath10k/pci.c  |  6 ++++++
 3 files changed, 36 insertions(+)

diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
index b0fdc10236193..6fb282f76804c 100644
--- a/drivers/net/wireless/ath/ath10k/core.c
+++ b/drivers/net/wireless/ath/ath10k/core.c
@@ -90,6 +90,35 @@ static const struct ath10k_hw_params ath10k_hw_params_list[] = {
 		.target_64bit = false,
 		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
 	},
+	{
+		.id = QCA988X_HW_2_0_VERSION,
+		.dev_id = QCA988X_2_0_DEVICE_ID_UBNT,
+		.name = "qca988x hw2.0 ubiquiti",
+		.patch_load_addr = QCA988X_HW_2_0_PATCH_LOAD_ADDR,
+		.uart_pin = 7,
+		.cc_wraparound_type = ATH10K_HW_CC_WRAP_SHIFTED_ALL,
+		.otp_exe_param = 0,
+		.channel_counters_freq_hz = 88000,
+		.max_probe_resp_desc_thres = 0,
+		.cal_data_len = 2116,
+		.fw = {
+			.dir = QCA988X_HW_2_0_FW_DIR,
+			.board = QCA988X_HW_2_0_BOARD_DATA_FILE,
+			.board_size = QCA988X_BOARD_DATA_SZ,
+			.board_ext_size = QCA988X_BOARD_EXT_DATA_SZ,
+		},
+		.hw_ops = &qca988x_ops,
+		.decap_align_bytes = 4,
+		.spectral_bin_discard = 0,
+		.vht160_mcs_rx_highest = 0,
+		.vht160_mcs_tx_highest = 0,
+		.n_cipher_suites = 8,
+		.num_peers = TARGET_TLV_NUM_PEERS,
+		.ast_skid_limit = 0x10,
+		.num_wds_entries = 0x20,
+		.target_64bit = false,
+		.rx_ring_fill_level = HTT_RX_RING_FILL_LEVEL,
+	},
 	{
 		.id = QCA9887_HW_1_0_VERSION,
 		.dev_id = QCA9887_1_0_DEVICE_ID,
diff --git a/drivers/net/wireless/ath/ath10k/hw.h b/drivers/net/wireless/ath/ath10k/hw.h
index 6203bc65799be..413b1b4321f77 100644
--- a/drivers/net/wireless/ath/ath10k/hw.h
+++ b/drivers/net/wireless/ath/ath10k/hw.h
@@ -22,6 +22,7 @@
 
 #define ATH10K_FW_DIR			"ath10k"
 
+#define QCA988X_2_0_DEVICE_ID_UBNT   (0x11ac)
 #define QCA988X_2_0_DEVICE_ID   (0x003c)
 #define QCA6164_2_1_DEVICE_ID   (0x0041)
 #define QCA6174_2_1_DEVICE_ID   (0x003e)
diff --git a/drivers/net/wireless/ath/ath10k/pci.c b/drivers/net/wireless/ath/ath10k/pci.c
index 355db6a0fcf3e..1b266cd0c2ec0 100644
--- a/drivers/net/wireless/ath/ath10k/pci.c
+++ b/drivers/net/wireless/ath/ath10k/pci.c
@@ -58,6 +58,9 @@ MODULE_PARM_DESC(reset_mode, "0: auto, 1: warm only (default: 0)");
 #define ATH10K_DIAG_TRANSFER_LIMIT	0x5000
 
 static const struct pci_device_id ath10k_pci_id_table[] = {
+	/* PCI-E QCA988X V2 (Ubiquiti branded) */
+	{ PCI_VDEVICE(UBIQUITI, QCA988X_2_0_DEVICE_ID_UBNT) },
+
 	{ PCI_VDEVICE(ATHEROS, QCA988X_2_0_DEVICE_ID) }, /* PCI-E QCA988X V2 */
 	{ PCI_VDEVICE(ATHEROS, QCA6164_2_1_DEVICE_ID) }, /* PCI-E QCA6164 V2.1 */
 	{ PCI_VDEVICE(ATHEROS, QCA6174_2_1_DEVICE_ID) }, /* PCI-E QCA6174 V2.1 */
@@ -74,6 +77,7 @@ static const struct ath10k_pci_supp_chip ath10k_pci_supp_chips[] = {
 	 * hacks. ath10k doesn't have them and these devices crash horribly
 	 * because of that.
 	 */
+	{ QCA988X_2_0_DEVICE_ID_UBNT, QCA988X_HW_2_0_CHIP_ID_REV },
 	{ QCA988X_2_0_DEVICE_ID, QCA988X_HW_2_0_CHIP_ID_REV },
 
 	{ QCA6164_2_1_DEVICE_ID, QCA6174_HW_2_1_CHIP_ID_REV },
@@ -2193,6 +2197,7 @@ static int ath10k_pci_get_num_banks(struct ath10k *ar)
 	struct ath10k_pci *ar_pci = ath10k_pci_priv(ar);
 
 	switch (ar_pci->pdev->device) {
+	case QCA988X_2_0_DEVICE_ID_UBNT:
 	case QCA988X_2_0_DEVICE_ID:
 	case QCA99X0_2_0_DEVICE_ID:
 	case QCA9888_2_0_DEVICE_ID:
@@ -3424,6 +3429,7 @@ static int ath10k_pci_probe(struct pci_dev *pdev,
 	u32 (*targ_cpu_to_ce_addr)(struct ath10k *ar, u32 addr);
 
 	switch (pci_dev->device) {
+	case QCA988X_2_0_DEVICE_ID_UBNT:
 	case QCA988X_2_0_DEVICE_ID:
 		hw_rev = ATH10K_HW_QCA988X;
 		pci_ps = false;

From b9607de6cf22a5cd268b9206177a9baafb6e8ac8 Mon Sep 17 00:00:00 2001
From: Wojciech Dubowik <Wojciech.Dubowik@neratec.com>
Date: Tue, 30 Jan 2018 14:06:07 +0200
Subject: [PATCH 347/676] ath9k: Fix get channel default noise floor

Commit 8da58553cc63 ("ath9k: Use calibrated noise floor value
when available") introduced regression in ath9k_hw_getchan_noise
where per chain nominal noise floor has been taken instead default
for channel.
Revert to original default channel noise floor.

Fixes: 8da58553cc63 ("ath9k: Use calibrated noise floor value when available")
Reported-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Signed-off-by: Wojciech Dubowik <Wojciech.Dubowik@neratec.com>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath9k/calib.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath9k/calib.c b/drivers/net/wireless/ath/ath9k/calib.c
index 3d9447e21025f..695c779ae8cf7 100644
--- a/drivers/net/wireless/ath/ath9k/calib.c
+++ b/drivers/net/wireless/ath/ath9k/calib.c
@@ -72,7 +72,7 @@ static s16 ath9k_hw_get_default_nf(struct ath_hw *ah,
 s16 ath9k_hw_getchan_noise(struct ath_hw *ah, struct ath9k_channel *chan,
 			   s16 nf)
 {
-	s8 noise = ath9k_hw_get_default_nf(ah, chan, 0);
+	s8 noise = ATH_DEFAULT_NOISE_FLOOR;
 
 	if (nf) {
 		s8 delta = nf - ATH9K_NF_CAL_NOISE_THRESH -

From 50e79e25250bf928369996277e85b00536b380c7 Mon Sep 17 00:00:00 2001
From: Yu Wang <yyuwang@codeaurora.org>
Date: Tue, 30 Jan 2018 14:06:08 +0200
Subject: [PATCH 348/676] ath10k: fix kernel panic issue during pci probe

If device gone during chip reset, ar->normal_mode_fw.board is not
initialized, but ath10k_debug_print_hwfw_info() will try to access its
member, which will cause 'kernel NULL pointer' issue. This was found
using a faulty device (pci link went down sometimes) in a random
insmod/rmmod/other-op test.
To fix it, check ar->normal_mode_fw.board before accessing the member.

pci 0000:02:00.0: BAR 0: assigned [mem 0xf7400000-0xf75fffff 64bit]
ath10k_pci 0000:02:00.0: enabling device (0000 -> 0002)
ath10k_pci 0000:02:00.0: pci irq msi oper_irq_mode 2 irq_mode 0 reset_mode 0
ath10k_pci 0000:02:00.0: failed to read device register, device is gone
ath10k_pci 0000:02:00.0: failed to wait for target init: -5
ath10k_pci 0000:02:00.0: failed to warm reset: -5
ath10k_pci 0000:02:00.0: firmware crashed during chip reset
ath10k_pci 0000:02:00.0: firmware crashed! (uuid 5d018951-b8e1-404a-8fde-923078b4423a)
ath10k_pci 0000:02:00.0: (null) target 0x00000000 chip_id 0x00340aff sub 0000:0000
ath10k_pci 0000:02:00.0: kconfig debug 1 debugfs 1 tracing 1 dfs 1 testmode 1
ath10k_pci 0000:02:00.0: firmware ver  api 0 features  crc32 00000000
...
BUG: unable to handle kernel NULL pointer dereference at 00000004
...
Call Trace:
 [<fb4e7882>] ath10k_print_driver_info+0x12/0x20 [ath10k_core]
 [<fb62b7dd>] ath10k_pci_fw_crashed_dump+0x6d/0x4d0 [ath10k_pci]
 [<fb629f07>] ? ath10k_pci_sleep.part.19+0x57/0xc0 [ath10k_pci]
 [<fb62c8ee>] ath10k_pci_hif_power_up+0x14e/0x1b0 [ath10k_pci]
 [<c10477fb>] ? do_page_fault+0xb/0x10
 [<fb4eb934>] ath10k_core_register_work+0x24/0x840 [ath10k_core]
 [<c18a00d8>] ? netlbl_unlhsh_remove+0x178/0x410
 [<c10477f0>] ? __do_page_fault+0x480/0x480
 [<c1068e44>] process_one_work+0x114/0x3e0
 [<c1069d07>] worker_thread+0x37/0x4a0
 [<c106e294>] kthread+0xa4/0xc0
 [<c1069cd0>] ? create_worker+0x180/0x180
 [<c106e1f0>] ? kthread_park+0x50/0x50
 [<c18ab4f7>] ret_from_fork+0x1b/0x28
 Code: 78 80 b8 50 09 00 00 00 75 5d 8d 75 94 c7 44 24 08 aa d7 52 fb c7 44 24 04 64 00 00 00
 89 34 24 e8 82 52 e2 c5 8b 83 dc 08 00 00 <8b> 50 04 8b 08 31 c0 e8 20 57 e3 c5 89 44 24 10 8b 83 58 09 00
 EIP: [<fb4e7754>]-
 ath10k_debug_print_board_info+0x34/0xb0 [ath10k_core]
 SS:ESP 0068:f4921d90
 CR2: 0000000000000004

Signed-off-by: Yu Wang <yyuwang@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/debug.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/debug.c b/drivers/net/wireless/ath/ath10k/debug.c
index 6d836a26272fe..554cd7856cb6e 100644
--- a/drivers/net/wireless/ath/ath10k/debug.c
+++ b/drivers/net/wireless/ath/ath10k/debug.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2005-2011 Atheros Communications Inc.
  * Copyright (c) 2011-2017 Qualcomm Atheros, Inc.
+ * Copyright (c) 2018, The Linux Foundation. All rights reserved.
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -81,6 +82,8 @@ void ath10k_debug_print_hwfw_info(struct ath10k *ar)
 void ath10k_debug_print_board_info(struct ath10k *ar)
 {
 	char boardinfo[100];
+	const struct firmware *board;
+	u32 crc;
 
 	if (ar->id.bmi_ids_valid)
 		scnprintf(boardinfo, sizeof(boardinfo), "%d:%d",
@@ -88,11 +91,16 @@ void ath10k_debug_print_board_info(struct ath10k *ar)
 	else
 		scnprintf(boardinfo, sizeof(boardinfo), "N/A");
 
+	board = ar->normal_mode_fw.board;
+	if (!IS_ERR_OR_NULL(board))
+		crc = crc32_le(0, board->data, board->size);
+	else
+		crc = 0;
+
 	ath10k_info(ar, "board_file api %d bmi_id %s crc32 %08x",
 		    ar->bd_api,
 		    boardinfo,
-		    crc32_le(0, ar->normal_mode_fw.board->data,
-			     ar->normal_mode_fw.board->size));
+		    crc);
 }
 
 void ath10k_debug_print_boot_info(struct ath10k *ar)

From 4e12d654ba068df06c5e4c8322d7dcced41e48ee Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <linux@rempel-privat.de>
Date: Tue, 30 Jan 2018 14:06:11 +0200
Subject: [PATCH 349/676] ath9k_htc: add Altai WA1011N-GU

as reported in:
https://github.com/qca/open-ath9k-htc-firmware/pull/71#issuecomment-361100070

Signed-off-by: Oleksij Rempel <linux@rempel-privat.de>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath9k/hif_usb.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/ath/ath9k/hif_usb.c b/drivers/net/wireless/ath/ath9k/hif_usb.c
index 56676eaff24c1..cb0eef13af1c8 100644
--- a/drivers/net/wireless/ath/ath9k/hif_usb.c
+++ b/drivers/net/wireless/ath/ath9k/hif_usb.c
@@ -24,6 +24,7 @@ static const struct usb_device_id ath9k_hif_usb_ids[] = {
 	{ USB_DEVICE(0x0cf3, 0x9271) }, /* Atheros */
 	{ USB_DEVICE(0x0cf3, 0x1006) }, /* Atheros */
 	{ USB_DEVICE(0x0846, 0x9030) }, /* Netgear N150 */
+	{ USB_DEVICE(0x07b8, 0x9271) }, /* Altai WA1011N-GU */
 	{ USB_DEVICE(0x07D1, 0x3A10) }, /* Dlink Wireless 150 */
 	{ USB_DEVICE(0x13D3, 0x3327) }, /* Azurewave */
 	{ USB_DEVICE(0x13D3, 0x3328) }, /* Azurewave */

From 2275cde4ccb319ae1eb1c6c717f0e547e62019ee Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 7 Feb 2018 09:13:04 -0500
Subject: [PATCH 350/676] SUNRPC: Queue latency-sensitive socket tasks to
 xprtiod

The response to a write_space notification is very latency sensitive,
so we should queue it to the lower latency xprtiod_workqueue. This
is something we already do for the other cases where an rpc task
holds the transport XPRT_LOCKED bitlock.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/sched.h |  3 +++
 net/sunrpc/sched.c           | 12 ++++++++++++
 net/sunrpc/xprt.c            |  3 ++-
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index d96e74e114c06..592653becd914 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -229,6 +229,9 @@ void		rpc_sleep_on_priority(struct rpc_wait_queue *,
 					struct rpc_task *,
 					rpc_action action,
 					int priority);
+void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq,
+		struct rpc_wait_queue *queue,
+		struct rpc_task *task);
 void		rpc_wake_up_queued_task(struct rpc_wait_queue *,
 					struct rpc_task *);
 void		rpc_wake_up(struct rpc_wait_queue *);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 896691afbb1a8..96fdf6011c539 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -458,6 +458,18 @@ static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct r
 	rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task);
 }
 
+/*
+ * Wake up a task on a specific queue
+ */
+void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq,
+		struct rpc_wait_queue *queue,
+		struct rpc_task *task)
+{
+	spin_lock_bh(&queue->lock);
+	rpc_wake_up_task_on_wq_queue_locked(wq, queue, task);
+	spin_unlock_bh(&queue->lock);
+}
+
 /*
  * Wake up a task on a specific queue
  */
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 2436fd1125fc6..8f0ad4f268da5 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -517,7 +517,8 @@ void xprt_write_space(struct rpc_xprt *xprt)
 	if (xprt->snd_task) {
 		dprintk("RPC:       write space: waking waiting task on "
 				"xprt %p\n", xprt);
-		rpc_wake_up_queued_task(&xprt->pending, xprt->snd_task);
+		rpc_wake_up_queued_task_on_wq(xprtiod_workqueue,
+				&xprt->pending, xprt->snd_task);
 	}
 	spin_unlock_bh(&xprt->transport_lock);
 }

From e856f3a7d706b37d8be9b41e41b19f4919570e57 Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@lip6.fr>
Date: Thu, 1 Feb 2018 10:48:52 +0100
Subject: [PATCH 351/676] coccinelle: devm_free: reduce false positives

Some files use both a non-devm allocation and a devm_allocation.  Don't
complain about a free when the same function contains a non-devm
allocation.

Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 scripts/coccinelle/free/devm_free.cocci | 55 ++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/scripts/coccinelle/free/devm_free.cocci b/scripts/coccinelle/free/devm_free.cocci
index c990d2c7ee161..b2a2cf8bf81fa 100644
--- a/scripts/coccinelle/free/devm_free.cocci
+++ b/scripts/coccinelle/free/devm_free.cocci
@@ -56,9 +56,62 @@ expression x;
  x = devm_ioport_map(...)
 )
 
+@safe depends on context || org || report exists@
+expression x;
+position p;
+@@
+
+(
+ x = kmalloc(...)
+|
+ x = kvasprintf(...)
+|
+ x = kasprintf(...)
+|
+ x = kzalloc(...)
+|
+ x = kmalloc_array(...)
+|
+ x = kcalloc(...)
+|
+ x = kstrdup(...)
+|
+ x = kmemdup(...)
+|
+ x = get_free_pages(...)
+|
+ x = request_irq(...)
+|
+ x = ioremap(...)
+|
+ x = ioremap_nocache(...)
+|
+ x = ioport_map(...)
+)
+...
+(
+ kfree@p(x)
+|
+ kzfree@p(x)
+|
+ __krealloc@p(x, ...)
+|
+ krealloc@p(x, ...)
+|
+ free_pages@p(x, ...)
+|
+ free_page@p(x)
+|
+ free_irq@p(x)
+|
+ iounmap@p(x)
+|
+ ioport_unmap@p(x)
+)
+
 @pb@
 expression r.x;
-position p;
+position p != safe.p;
 @@
 
 (

From f19fbd5ed642dc31c809596412dab1ed56f2f156 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 26 Jan 2018 12:46:47 +0100
Subject: [PATCH 352/676] s390: introduce execute-trampolines for branches

Add CONFIG_EXPOLINE to enable the use of the new -mindirect-branch= and
-mfunction_return= compiler options to create a kernel fortified against
the specte v2 attack.

With CONFIG_EXPOLINE=y all indirect branches will be issued with an
execute type instruction. For z10 or newer the EXRL instruction will
be used, for older machines the EX instruction. The typical indirect
call

	basr	%r14,%r1

is replaced with a PC relative call to a new thunk

	brasl	%r14,__s390x_indirect_jump_r1

The thunk contains the EXRL/EX instruction to the indirect branch

__s390x_indirect_jump_r1:
	exrl	0,0f
	j	.
0:	br	%r1

The detour via the execute type instruction has a performance impact.
To get rid of the detour the new kernel parameter "nospectre_v2" and
"spectre_v2=[on,off,auto]" can be used. If the parameter is specified
the kernel and module code will be patched at runtime.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/Kconfig                     |  28 +++++++
 arch/s390/Makefile                    |  10 +++
 arch/s390/include/asm/lowcore.h       |   6 +-
 arch/s390/include/asm/nospec-branch.h |  18 ++++
 arch/s390/kernel/Makefile             |   4 +
 arch/s390/kernel/entry.S              | 113 ++++++++++++++++++++------
 arch/s390/kernel/module.c             |  62 ++++++++++++--
 arch/s390/kernel/nospec-branch.c      | 100 +++++++++++++++++++++++
 arch/s390/kernel/setup.c              |   4 +
 arch/s390/kernel/smp.c                |   1 +
 arch/s390/kernel/vmlinux.lds.S        |  14 ++++
 drivers/s390/char/Makefile            |   2 +
 12 files changed, 327 insertions(+), 35 deletions(-)
 create mode 100644 arch/s390/include/asm/nospec-branch.h
 create mode 100644 arch/s390/kernel/nospec-branch.c

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index d514e25095c26..0636229dae227 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -557,6 +557,34 @@ config KERNEL_NOBP
 
 	  If unsure, say N.
 
+config EXPOLINE
+	def_bool n
+	prompt "Avoid speculative indirect branches in the kernel"
+	help
+	  Compile the kernel with the expoline compiler options to guard
+	  against kernel-to-user data leaks by avoiding speculative indirect
+	  branches.
+	  Requires a compiler with -mindirect-branch=thunk support for full
+	  protection. The kernel may run slower.
+
+	  If unsure, say N.
+
+choice
+	prompt "Expoline default"
+	depends on EXPOLINE
+	default EXPOLINE_FULL
+
+config EXPOLINE_OFF
+	bool "spectre_v2=off"
+
+config EXPOLINE_MEDIUM
+	bool "spectre_v2=auto"
+
+config EXPOLINE_FULL
+	bool "spectre_v2=on"
+
+endchoice
+
 endmenu
 
 menu "Memory setup"
diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index fd691c4ff89ec..2ced3239cb847 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -78,6 +78,16 @@ ifeq ($(call cc-option-yn,-mwarn-dynamicstack),y)
 cflags-$(CONFIG_WARN_DYNAMIC_STACK) += -mwarn-dynamicstack
 endif
 
+ifdef CONFIG_EXPOLINE
+  ifeq ($(call cc-option-yn,$(CC_FLAGS_MARCH) -mindirect-branch=thunk),y)
+    CC_FLAGS_EXPOLINE := -mindirect-branch=thunk
+    CC_FLAGS_EXPOLINE += -mfunction-return=thunk
+    CC_FLAGS_EXPOLINE += -mindirect-branch-table
+    export CC_FLAGS_EXPOLINE
+    cflags-y += $(CC_FLAGS_EXPOLINE)
+  endif
+endif
+
 ifdef CONFIG_FUNCTION_TRACER
 # make use of hotpatch feature if the compiler supports it
 cc_hotpatch	:= -mhotpatch=0,3
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index c63986aee9427..5bc888841eafe 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -136,7 +136,11 @@ struct lowcore {
 	__u64	vdso_per_cpu_data;		/* 0x03b8 */
 	__u64	machine_flags;			/* 0x03c0 */
 	__u64	gmap;				/* 0x03c8 */
-	__u8	pad_0x03d0[0x0e00-0x03d0];	/* 0x03d0 */
+	__u8	pad_0x03d0[0x0400-0x03d0];	/* 0x03d0 */
+
+	/* br %r1 trampoline */
+	__u16	br_r1_trampoline;		/* 0x0400 */
+	__u8	pad_0x0402[0x0e00-0x0402];	/* 0x0402 */
 
 	/*
 	 * 0xe00 contains the address of the IPL Parameter Information
diff --git a/arch/s390/include/asm/nospec-branch.h b/arch/s390/include/asm/nospec-branch.h
new file mode 100644
index 0000000000000..7df48e5cf36f7
--- /dev/null
+++ b/arch/s390/include/asm/nospec-branch.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_S390_EXPOLINE_H
+#define _ASM_S390_EXPOLINE_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+extern int nospec_call_disable;
+extern int nospec_return_disable;
+
+void nospec_init_branches(void);
+void nospec_call_revert(s32 *start, s32 *end);
+void nospec_return_revert(s32 *start, s32 *end);
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_S390_EXPOLINE_H */
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 909bce65cb2bd..7f27e3da9709c 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -29,6 +29,7 @@ UBSAN_SANITIZE_early.o	:= n
 #
 ifneq ($(CC_FLAGS_MARCH),-march=z900)
 CFLAGS_REMOVE_als.o	+= $(CC_FLAGS_MARCH)
+CFLAGS_REMOVE_als.o	+= $(CC_FLAGS_EXPOLINE)
 CFLAGS_als.o		+= -march=z900
 AFLAGS_REMOVE_head.o	+= $(CC_FLAGS_MARCH)
 AFLAGS_head.o		+= -march=z900
@@ -63,6 +64,9 @@ obj-y	+= entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o
 
 extra-y				+= head.o head64.o vmlinux.lds
 
+obj-$(CONFIG_EXPOLINE)		+= nospec-branch.o
+CFLAGS_REMOVE_expoline.o	+= $(CC_FLAGS_EXPOLINE)
+
 obj-$(CONFIG_MODULES)		+= module.o
 obj-$(CONFIG_SMP)		+= smp.o
 obj-$(CONFIG_SCHED_TOPOLOGY)	+= topology.o
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 53145b5d987de..13a133a6015c9 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -222,6 +222,68 @@ _PIF_WORK	= (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART)
 	.popsection
 	.endm
 
+#ifdef CONFIG_EXPOLINE
+
+	.macro GEN_BR_THUNK name,reg,tmp
+	.section .text.\name,"axG",@progbits,\name,comdat
+	.globl \name
+	.hidden \name
+	.type \name,@function
+\name:
+	.cfi_startproc
+#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES
+	exrl	0,0f
+#else
+	larl	\tmp,0f
+	ex	0,0(\tmp)
+#endif
+	j	.
+0:	br	\reg
+	.cfi_endproc
+	.endm
+
+	GEN_BR_THUNK __s390x_indirect_jump_r1use_r9,%r9,%r1
+	GEN_BR_THUNK __s390x_indirect_jump_r1use_r14,%r14,%r1
+	GEN_BR_THUNK __s390x_indirect_jump_r11use_r14,%r14,%r11
+
+	.macro BASR_R14_R9
+0:	brasl	%r14,__s390x_indirect_jump_r1use_r9
+	.pushsection .s390_indirect_branches,"a",@progbits
+	.long	0b-.
+	.popsection
+	.endm
+
+	.macro BR_R1USE_R14
+0:	jg	__s390x_indirect_jump_r1use_r14
+	.pushsection .s390_indirect_branches,"a",@progbits
+	.long	0b-.
+	.popsection
+	.endm
+
+	.macro BR_R11USE_R14
+0:	jg	__s390x_indirect_jump_r11use_r14
+	.pushsection .s390_indirect_branches,"a",@progbits
+	.long	0b-.
+	.popsection
+	.endm
+
+#else	/* CONFIG_EXPOLINE */
+
+	.macro BASR_R14_R9
+	basr	%r14,%r9
+	.endm
+
+	.macro BR_R1USE_R14
+	br	%r14
+	.endm
+
+	.macro BR_R11USE_R14
+	br	%r14
+	.endm
+
+#endif /* CONFIG_EXPOLINE */
+
+
 	.section .kprobes.text, "ax"
 .Ldummy:
 	/*
@@ -237,7 +299,7 @@ _PIF_WORK	= (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART)
 ENTRY(__bpon)
 	.globl __bpon
 	BPON
-	br	%r14
+	BR_R1USE_R14
 
 /*
  * Scheduler resume function, called by switch_to
@@ -261,9 +323,9 @@ ENTRY(__switch_to)
 	mvc	__LC_CURRENT_PID(4,%r0),0(%r3)	# store pid of next
 	lmg	%r6,%r15,__SF_GPRS(%r15)	# load gprs of next task
 	TSTMSK	__LC_MACHINE_FLAGS,MACHINE_FLAG_LPP
-	bzr	%r14
+	jz	0f
 	.insn	s,0xb2800000,__LC_LPP		# set program parameter
-	br	%r14
+0:	BR_R1USE_R14
 
 .L__critical_start:
 
@@ -330,7 +392,7 @@ sie_exit:
 	xgr	%r5,%r5
 	lmg	%r6,%r14,__SF_GPRS(%r15)	# restore kernel registers
 	lg	%r2,__SF_EMPTY+16(%r15)		# return exit reason code
-	br	%r14
+	BR_R1USE_R14
 .Lsie_fault:
 	lghi	%r14,-EFAULT
 	stg	%r14,__SF_EMPTY+16(%r15)	# set exit reason code
@@ -389,7 +451,7 @@ ENTRY(system_call)
 	lgf	%r9,0(%r8,%r10)			# get system call add.
 	TSTMSK	__TI_flags(%r12),_TIF_TRACE
 	jnz	.Lsysc_tracesys
-	basr	%r14,%r9			# call sys_xxxx
+	BASR_R14_R9				# call sys_xxxx
 	stg	%r2,__PT_R2(%r11)		# store return value
 
 .Lsysc_return:
@@ -574,7 +636,7 @@ ENTRY(system_call)
 	lmg	%r3,%r7,__PT_R3(%r11)
 	stg	%r7,STACK_FRAME_OVERHEAD(%r15)
 	lg	%r2,__PT_ORIG_GPR2(%r11)
-	basr	%r14,%r9		# call sys_xxx
+	BASR_R14_R9			# call sys_xxx
 	stg	%r2,__PT_R2(%r11)	# store return value
 .Lsysc_tracenogo:
 	TSTMSK	__TI_flags(%r12),_TIF_TRACE
@@ -598,7 +660,7 @@ ENTRY(ret_from_fork)
 	lmg	%r9,%r10,__PT_R9(%r11)	# load gprs
 ENTRY(kernel_thread_starter)
 	la	%r2,0(%r10)
-	basr	%r14,%r9
+	BASR_R14_R9
 	j	.Lsysc_tracenogo
 
 /*
@@ -678,9 +740,9 @@ ENTRY(pgm_check_handler)
 	nill	%r10,0x007f
 	sll	%r10,2
 	je	.Lpgm_return
-	lgf	%r1,0(%r10,%r1)		# load address of handler routine
+	lgf	%r9,0(%r10,%r1)		# load address of handler routine
 	lgr	%r2,%r11		# pass pointer to pt_regs
-	basr	%r14,%r1		# branch to interrupt-handler
+	BASR_R14_R9			# branch to interrupt-handler
 .Lpgm_return:
 	LOCKDEP_SYS_EXIT
 	tm	__PT_PSW+1(%r11),0x01	# returning to user ?
@@ -998,7 +1060,7 @@ ENTRY(psw_idle)
 	stpt	__TIMER_IDLE_ENTER(%r2)
 .Lpsw_idle_lpsw:
 	lpswe	__SF_EMPTY(%r15)
-	br	%r14
+	BR_R1USE_R14
 .Lpsw_idle_end:
 
 /*
@@ -1012,7 +1074,7 @@ ENTRY(save_fpu_regs)
 	lg	%r2,__LC_CURRENT
 	aghi	%r2,__TASK_thread
 	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU
-	bor	%r14
+	jo	.Lsave_fpu_regs_exit
 	stfpc	__THREAD_FPU_fpc(%r2)
 	lg	%r3,__THREAD_FPU_regs(%r2)
 	TSTMSK	__LC_MACHINE_FLAGS,MACHINE_FLAG_VX
@@ -1039,7 +1101,8 @@ ENTRY(save_fpu_regs)
 	std	15,120(%r3)
 .Lsave_fpu_regs_done:
 	oi	__LC_CPU_FLAGS+7,_CIF_FPU
-	br	%r14
+.Lsave_fpu_regs_exit:
+	BR_R1USE_R14
 .Lsave_fpu_regs_end:
 EXPORT_SYMBOL(save_fpu_regs)
 
@@ -1057,7 +1120,7 @@ load_fpu_regs:
 	lg	%r4,__LC_CURRENT
 	aghi	%r4,__TASK_thread
 	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU
-	bnor	%r14
+	jno	.Lload_fpu_regs_exit
 	lfpc	__THREAD_FPU_fpc(%r4)
 	TSTMSK	__LC_MACHINE_FLAGS,MACHINE_FLAG_VX
 	lg	%r4,__THREAD_FPU_regs(%r4)	# %r4 <- reg save area
@@ -1084,7 +1147,8 @@ load_fpu_regs:
 	ld	15,120(%r4)
 .Lload_fpu_regs_done:
 	ni	__LC_CPU_FLAGS+7,255-_CIF_FPU
-	br	%r14
+.Lload_fpu_regs_exit:
+	BR_R1USE_R14
 .Lload_fpu_regs_end:
 
 .L__critical_end:
@@ -1301,7 +1365,7 @@ cleanup_critical:
 	jl	0f
 	clg	%r9,BASED(.Lcleanup_table+104)	# .Lload_fpu_regs_end
 	jl	.Lcleanup_load_fpu_regs
-0:	br	%r14
+0:	BR_R11USE_R14
 
 	.align	8
 .Lcleanup_table:
@@ -1337,7 +1401,7 @@ cleanup_critical:
 	ni	__SIE_PROG0C+3(%r9),0xfe	# no longer in SIE
 	lctlg	%c1,%c1,__LC_USER_ASCE		# load primary asce
 	larl	%r9,sie_exit			# skip forward to sie_exit
-	br	%r14
+	BR_R11USE_R14
 #endif
 
 .Lcleanup_system_call:
@@ -1390,7 +1454,7 @@ cleanup_critical:
 	stg	%r15,56(%r11)		# r15 stack pointer
 	# set new psw address and exit
 	larl	%r9,.Lsysc_do_svc
-	br	%r14
+	BR_R11USE_R14
 .Lcleanup_system_call_insn:
 	.quad	system_call
 	.quad	.Lsysc_stmg
@@ -1402,7 +1466,7 @@ cleanup_critical:
 
 .Lcleanup_sysc_tif:
 	larl	%r9,.Lsysc_tif
-	br	%r14
+	BR_R11USE_R14
 
 .Lcleanup_sysc_restore:
 	# check if stpt has been executed
@@ -1419,14 +1483,14 @@ cleanup_critical:
 	mvc	0(64,%r11),__PT_R8(%r9)
 	lmg	%r0,%r7,__PT_R0(%r9)
 1:	lmg	%r8,%r9,__LC_RETURN_PSW
-	br	%r14
+	BR_R11USE_R14
 .Lcleanup_sysc_restore_insn:
 	.quad	.Lsysc_exit_timer
 	.quad	.Lsysc_done - 4
 
 .Lcleanup_io_tif:
 	larl	%r9,.Lio_tif
-	br	%r14
+	BR_R11USE_R14
 
 .Lcleanup_io_restore:
 	# check if stpt has been executed
@@ -1440,7 +1504,7 @@ cleanup_critical:
 	mvc	0(64,%r11),__PT_R8(%r9)
 	lmg	%r0,%r7,__PT_R0(%r9)
 1:	lmg	%r8,%r9,__LC_RETURN_PSW
-	br	%r14
+	BR_R11USE_R14
 .Lcleanup_io_restore_insn:
 	.quad	.Lio_exit_timer
 	.quad	.Lio_done - 4
@@ -1493,17 +1557,17 @@ cleanup_critical:
 	# prepare return psw
 	nihh	%r8,0xfcfd		# clear irq & wait state bits
 	lg	%r9,48(%r11)		# return from psw_idle
-	br	%r14
+	BR_R11USE_R14
 .Lcleanup_idle_insn:
 	.quad	.Lpsw_idle_lpsw
 
 .Lcleanup_save_fpu_regs:
 	larl	%r9,save_fpu_regs
-	br	%r14
+	BR_R11USE_R14
 
 .Lcleanup_load_fpu_regs:
 	larl	%r9,load_fpu_regs
-	br	%r14
+	BR_R11USE_R14
 
 /*
  * Integer constants
@@ -1523,7 +1587,6 @@ cleanup_critical:
 .Lsie_crit_mcck_length:
 	.quad   .Lsie_skip - .Lsie_entry
 #endif
-
 	.section .rodata, "a"
 #define SYSCALL(esame,emu)	.long esame
 	.globl	sys_call_table
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index b7abfad4fd7df..1fc6d1ff92d3e 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -19,6 +19,8 @@
 #include <linux/moduleloader.h>
 #include <linux/bug.h>
 #include <asm/alternative.h>
+#include <asm/nospec-branch.h>
+#include <asm/facility.h>
 
 #if 0
 #define DEBUGP printk
@@ -156,7 +158,11 @@ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 	me->arch.got_offset = me->core_layout.size;
 	me->core_layout.size += me->arch.got_size;
 	me->arch.plt_offset = me->core_layout.size;
-	me->core_layout.size += me->arch.plt_size;
+	if (me->arch.plt_size) {
+		if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_call_disable)
+			me->arch.plt_size += PLT_ENTRY_SIZE;
+		me->core_layout.size += me->arch.plt_size;
+	}
 	return 0;
 }
 
@@ -310,9 +316,21 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
 			unsigned int *ip;
 			ip = me->core_layout.base + me->arch.plt_offset +
 				info->plt_offset;
-			ip[0] = 0x0d10e310; /* basr 1,0; lg 1,10(1); br 1 */
-			ip[1] = 0x100a0004;
-			ip[2] = 0x07f10000;
+			ip[0] = 0x0d10e310;	/* basr 1,0  */
+			ip[1] = 0x100a0004;	/* lg	1,10(1) */
+			if (IS_ENABLED(CONFIG_EXPOLINE) &&
+			    !nospec_call_disable) {
+				unsigned int *ij;
+				ij = me->core_layout.base +
+					me->arch.plt_offset +
+					me->arch.plt_size - PLT_ENTRY_SIZE;
+				ip[2] = 0xa7f40000 +	/* j __jump_r1 */
+					(unsigned int)(u16)
+					(((unsigned long) ij - 8 -
+					  (unsigned long) ip) / 2);
+			} else {
+				ip[2] = 0x07f10000;	/* br %r1 */
+			}
 			ip[3] = (unsigned int) (val >> 32);
 			ip[4] = (unsigned int) val;
 			info->plt_initialized = 1;
@@ -418,16 +436,42 @@ int module_finalize(const Elf_Ehdr *hdr,
 		    struct module *me)
 {
 	const Elf_Shdr *s;
-	char *secstrings;
+	char *secstrings, *secname;
+	void *aseg;
+
+	if (IS_ENABLED(CONFIG_EXPOLINE) &&
+	    !nospec_call_disable && me->arch.plt_size) {
+		unsigned int *ij;
+
+		ij = me->core_layout.base + me->arch.plt_offset +
+			me->arch.plt_size - PLT_ENTRY_SIZE;
+		if (test_facility(35)) {
+			ij[0] = 0xc6000000;	/* exrl	%r0,.+10	*/
+			ij[1] = 0x0005a7f4;	/* j	.		*/
+			ij[2] = 0x000007f1;	/* br	%r1		*/
+		} else {
+			ij[0] = 0x44000000 | (unsigned int)
+				offsetof(struct lowcore, br_r1_trampoline);
+			ij[1] = 0xa7f40000;	/* j	.		*/
+		}
+	}
 
 	secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
 	for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
-		if (!strcmp(".altinstructions", secstrings + s->sh_name)) {
-			/* patch .altinstructions */
-			void *aseg = (void *)s->sh_addr;
+		aseg = (void *) s->sh_addr;
+		secname = secstrings + s->sh_name;
 
+		if (!strcmp(".altinstructions", secname))
+			/* patch .altinstructions */
 			apply_alternatives(aseg, aseg + s->sh_size);
-		}
+
+		if (IS_ENABLED(CONFIG_EXPOLINE) &&
+		    (!strcmp(".nospec_call_table", secname)))
+			nospec_call_revert(aseg, aseg + s->sh_size);
+
+		if (IS_ENABLED(CONFIG_EXPOLINE) &&
+		    (!strcmp(".nospec_return_table", secname)))
+			nospec_return_revert(aseg, aseg + s->sh_size);
 	}
 
 	jump_label_apply_nops(me);
diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c
new file mode 100644
index 0000000000000..69d7fcf481588
--- /dev/null
+++ b/arch/s390/kernel/nospec-branch.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/module.h>
+#include <asm/nospec-branch.h>
+
+int nospec_call_disable = IS_ENABLED(EXPOLINE_OFF);
+int nospec_return_disable = !IS_ENABLED(EXPOLINE_FULL);
+
+static int __init nospectre_v2_setup_early(char *str)
+{
+	nospec_call_disable = 1;
+	nospec_return_disable = 1;
+	return 0;
+}
+early_param("nospectre_v2", nospectre_v2_setup_early);
+
+static int __init spectre_v2_setup_early(char *str)
+{
+	if (str && !strncmp(str, "on", 2)) {
+		nospec_call_disable = 0;
+		nospec_return_disable = 0;
+	}
+	if (str && !strncmp(str, "off", 3)) {
+		nospec_call_disable = 1;
+		nospec_return_disable = 1;
+	}
+	if (str && !strncmp(str, "auto", 4)) {
+		nospec_call_disable = 0;
+		nospec_return_disable = 1;
+	}
+	return 0;
+}
+early_param("spectre_v2", spectre_v2_setup_early);
+
+static void __init_or_module __nospec_revert(s32 *start, s32 *end)
+{
+	enum { BRCL_EXPOLINE, BRASL_EXPOLINE } type;
+	u8 *instr, *thunk, *br;
+	u8 insnbuf[6];
+	s32 *epo;
+
+	/* Second part of the instruction replace is always a nop */
+	memcpy(insnbuf + 2, (char[]) { 0x47, 0x00, 0x00, 0x00 }, 4);
+	for (epo = start; epo < end; epo++) {
+		instr = (u8 *) epo + *epo;
+		if (instr[0] == 0xc0 && (instr[1] & 0x0f) == 0x04)
+			type = BRCL_EXPOLINE;	/* brcl instruction */
+		else if (instr[0] == 0xc0 && (instr[1] & 0x0f) == 0x05)
+			type = BRASL_EXPOLINE;	/* brasl instruction */
+		else
+			continue;
+		thunk = instr + (*(int *)(instr + 2)) * 2;
+		if (thunk[0] == 0xc6 && thunk[1] == 0x00)
+			/* exrl %r0,<target-br> */
+			br = thunk + (*(int *)(thunk + 2)) * 2;
+		else if (thunk[0] == 0xc0 && (thunk[1] & 0x0f) == 0x00 &&
+			 thunk[6] == 0x44 && thunk[7] == 0x00 &&
+			 (thunk[8] & 0x0f) == 0x00 && thunk[9] == 0x00 &&
+			 (thunk[1] & 0xf0) == (thunk[8] & 0xf0))
+			/* larl %rx,<target br> + ex %r0,0(%rx) */
+			br = thunk + (*(int *)(thunk + 2)) * 2;
+		else
+			continue;
+		if (br[0] != 0x07 || (br[1] & 0xf0) != 0xf0)
+			continue;
+		switch (type) {
+		case BRCL_EXPOLINE:
+			/* brcl to thunk, replace with br + nop */
+			insnbuf[0] = br[0];
+			insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f);
+			break;
+		case BRASL_EXPOLINE:
+			/* brasl to thunk, replace with basr + nop */
+			insnbuf[0] = 0x0d;
+			insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f);
+			break;
+		}
+
+		s390_kernel_write(instr, insnbuf, 6);
+	}
+}
+
+void __init_or_module nospec_call_revert(s32 *start, s32 *end)
+{
+	if (nospec_call_disable)
+		__nospec_revert(start, end);
+}
+
+void __init_or_module nospec_return_revert(s32 *start, s32 *end)
+{
+	if (nospec_return_disable)
+		__nospec_revert(start, end);
+}
+
+extern s32 __nospec_call_start[], __nospec_call_end[];
+extern s32 __nospec_return_start[], __nospec_return_end[];
+void __init nospec_init_branches(void)
+{
+	nospec_call_revert(__nospec_call_start, __nospec_call_end);
+	nospec_return_revert(__nospec_return_start, __nospec_return_end);
+}
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index bcd2a4a3937e8..a6a91f01a17a3 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -68,6 +68,7 @@
 #include <asm/sysinfo.h>
 #include <asm/numa.h>
 #include <asm/alternative.h>
+#include <asm/nospec-branch.h>
 #include "entry.h"
 
 /*
@@ -379,6 +380,7 @@ static void __init setup_lowcore(void)
 	lc->spinlock_index = 0;
 	arch_spin_lock_setup(0);
 #endif
+	lc->br_r1_trampoline = 0x07f1;	/* br %r1 */
 
 	set_prefix((u32)(unsigned long) lc);
 	lowcore_ptr[0] = lc;
@@ -954,6 +956,8 @@ void __init setup_arch(char **cmdline_p)
 	set_preferred_console();
 
 	apply_alternative_instructions();
+	if (IS_ENABLED(CONFIG_EXPOLINE))
+		nospec_init_branches();
 
 	/* Setup zfcpdump support */
 	setup_zfcpdump();
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 2fd7d609dae08..a4a9fe1934e9f 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -214,6 +214,7 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
 	lc->cpu_nr = cpu;
 	lc->spinlock_lockval = arch_spin_lockval(cpu);
 	lc->spinlock_index = 0;
+	lc->br_r1_trampoline = 0x07f1;	/* br %r1 */
 	if (nmi_alloc_per_cpu(lc))
 		goto out;
 	if (vdso_alloc_per_cpu(lc))
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 608cf2987d196..08d12cfaf0918 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -123,6 +123,20 @@ SECTIONS
 		*(.altinstr_replacement)
 	}
 
+	/*
+	 * Table with the patch locations to undo expolines
+	*/
+	.nospec_call_table : {
+		__nospec_call_start = . ;
+		*(.s390_indirect*)
+		__nospec_call_end = . ;
+	}
+	.nospec_return_table : {
+		__nospec_return_start = . ;
+		*(.s390_return*)
+		__nospec_return_end = . ;
+	}
+
 	/* early.c uses stsi, which requires page aligned data. */
 	. = ALIGN(PAGE_SIZE);
 	INIT_DATA_SECTION(0x100)
diff --git a/drivers/s390/char/Makefile b/drivers/s390/char/Makefile
index 614b44e70a281..a2b33a22c82a8 100644
--- a/drivers/s390/char/Makefile
+++ b/drivers/s390/char/Makefile
@@ -19,6 +19,8 @@ endif
 
 CFLAGS_sclp_early_core.o		+= -D__NO_FORTIFY
 
+CFLAGS_REMOVE_sclp_early_core.o	+= $(CC_FLAGS_EXPOLINE)
+
 obj-y += ctrlchar.o keyboard.o defkeymap.o sclp.o sclp_rw.o sclp_quiesce.o \
 	 sclp_cmd.o sclp_config.o sclp_cpi_sys.o sclp_ocf.o sclp_ctl.o \
 	 sclp_early.o sclp_early_core.o

From a2b0fe7435faee6f6fbb27409878013bc4727e98 Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@lip6.fr>
Date: Sat, 3 Feb 2018 08:44:58 +0100
Subject: [PATCH 353/676] coccinelle: deref_null: avoid useless computation

The effect of the rules ifm1, pr11, and pr12 is only used in the final rule,
which depends on context && !org && !report.  Thus these rules should only
be performed in those circumstances.

Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 scripts/coccinelle/null/deref_null.cocci | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/coccinelle/null/deref_null.cocci b/scripts/coccinelle/null/deref_null.cocci
index f192d6035d023..b16ccb7663a7d 100644
--- a/scripts/coccinelle/null/deref_null.cocci
+++ b/scripts/coccinelle/null/deref_null.cocci
@@ -212,7 +212,7 @@ else S3
 // The following three rules are duplicates of ifm, pr1 and pr2 respectively.
 // It is need because the previous rule as already made a "change".
 
-@ifm1@
+@ifm1 depends on context && !org && !report@
 expression *E;
 statement S1,S2;
 position p1;
@@ -220,7 +220,7 @@ position p1;
 
 if@p1 ((E == NULL && ...) || ...) S1 else S2
 
-@pr11 expression@
+@pr11 depends on context && !org && !report expression@
 expression *ifm1.E;
 identifier f;
 position p1;
@@ -228,7 +228,7 @@ position p1;
 
  (E != NULL && ...) ? <+...E->f@p1...+> : ...
 
-@pr12 expression@
+@pr12 depends on context && !org && !report expression@
 expression *ifm1.E;
 identifier f;
 position p2;

From ede2e520a1484924a55c8fc77a8256f5d33d2ab2 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Wed, 31 Jan 2018 14:34:37 -0600
Subject: [PATCH 354/676] Add some missing debug fields in server and tcon
 structs

Allow dumping out debug information on dialect, signing, unix extensions
and encryption

Signed-off-by: Steve French <smfrench@gmail.com>
Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com>
---
 fs/cifs/cifs_debug.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index c7a863219fa36..e35e711db68ec 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -128,6 +128,10 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon)
 		seq_puts(m, " type: CDROM ");
 	else
 		seq_printf(m, " type: %d ", dev_type);
+	if (tcon->seal)
+		seq_printf(m, " Encrypted");
+	if (tcon->unix_ext)
+		seq_printf(m, " POSIX Extensions");
 	if (tcon->ses->server->ops->dump_share_caps)
 		tcon->ses->server->ops->dump_share_caps(m, tcon);
 
@@ -246,7 +250,10 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 			atomic_read(&server->smbd_conn->mr_used_count));
 skip_rdma:
 #endif
-		seq_printf(m, "\nNumber of credits: %d", server->credits);
+		seq_printf(m, "\nNumber of credits: %d Dialect 0x%x",
+			server->credits,  server->dialect);
+		if (server->sign)
+			seq_printf(m, " signed");
 		i++;
 		list_for_each(tmp2, &server->smb_ses_list) {
 			ses = list_entry(tmp2, struct cifs_ses,

From ade7db991b47ab3016a414468164f4966bd08202 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 2 Feb 2018 16:48:47 +0100
Subject: [PATCH 355/676] cifs: silence compiler warnings showing up with
 gcc-8.0.0

This bug was fixed before, but came up again with the latest
compiler in another function:

fs/cifs/cifssmb.c: In function 'CIFSSMBSetEA':
fs/cifs/cifssmb.c:6362:3: error: 'strncpy' offset 8 is out of the bounds [0, 4] [-Werror=array-bounds]
   strncpy(parm_data->list[0].name, ea_name, name_len);

Let's apply the same fix that was used for the other instances.

Fixes: b2a3ad9ca502 ("cifs: silence compiler warnings showing up with gcc-4.7.0")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/cifssmb.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 4e0922d24eb21..9ceebf30eb22e 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -6343,9 +6343,7 @@ CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon,
 	pSMB->InformationLevel =
 		cpu_to_le16(SMB_SET_FILE_EA);
 
-	parm_data =
-		(struct fealist *) (((char *) &pSMB->hdr.Protocol) +
-				       offset);
+	parm_data = (void *)pSMB + offsetof(struct smb_hdr, Protocol) + offset;
 	pSMB->ParameterOffset = cpu_to_le16(param_offset);
 	pSMB->DataOffset = cpu_to_le16(offset);
 	pSMB->SetupCount = 1;

From f9de151bf2b8055563b85ff6896fd86891bbe6c5 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Sat, 3 Feb 2018 19:45:07 -0600
Subject: [PATCH 356/676] address lock imbalance warnings in smbdirect.c

Although at least one of these was an overly strict sparse warning
in the new smbdirect code, it is cleaner to fix - so no warnings.

Signed-off-by: Steve French <smfrench@gmail.com>
Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com>
---
 fs/cifs/smbdirect.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 5130492847eb1..91710eb571fb3 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -217,9 +217,10 @@ static void smbd_destroy_rdma_work(struct work_struct *work)
 			spin_unlock_irqrestore(
 				&info->reassembly_queue_lock, flags);
 			put_receive_buffer(info, response);
-		}
+		} else
+			spin_unlock_irqrestore(&info->reassembly_queue_lock, flags);
 	} while (response);
-	spin_unlock_irqrestore(&info->reassembly_queue_lock, flags);
+
 	info->reassembly_data_length = 0;
 
 	log_rdma_event(INFO, "free receive buffers\n");
@@ -1934,15 +1935,16 @@ static int smbd_recv_buf(struct smbd_connection *info, char *buf,
 				 * No need to lock if we are not at the
 				 * end of the queue
 				 */
-				if (!queue_length)
+				if (queue_length)
+					list_del(&response->list);
+				else {
 					spin_lock_irq(
 						&info->reassembly_queue_lock);
-				list_del(&response->list);
-				queue_removed++;
-				if (!queue_length)
+					list_del(&response->list);
 					spin_unlock_irq(
 						&info->reassembly_queue_lock);
-
+				}
+				queue_removed++;
 				info->count_reassembly_queue--;
 				info->count_dequeue_reassembly_queue++;
 				put_receive_buffer(info, response);

From 5f60a56494ea5518376b274dd93b3ceee9a783fb Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Mon, 5 Feb 2018 14:46:18 -0600
Subject: [PATCH 357/676] Add missing structs and defines from recent SMB3.1.1
 documentation

The last two updates to MS-SMB2 protocol documentation added various
flags and structs (especially relating to SMB3.1.1 tree connect).
Add missing defines and structs to smb2pdu.h

Signed-off-by: Steve French <smfrench@gmail.com>
Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com>
---
 fs/cifs/smb2pdu.h | 114 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 112 insertions(+), 2 deletions(-)

diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 6eb9f9691ed40..2a2b34ccaf492 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -192,6 +192,35 @@ struct smb2_symlink_err_rsp {
 	__u8  PathBuffer[0];
 } __packed;
 
+/* SMB 3.1.1 and later dialects. See MS-SMB2 section 2.2.2.1 */
+struct smb2_error_context_rsp {
+	__le32 ErrorDataLength;
+	__le32 ErrorId;
+	__u8  ErrorContextData; /* ErrorDataLength long array */
+} __packed;
+
+/* Defines for Type field below (see MS-SMB2 2.2.2.2.2.1) */
+#define MOVE_DST_IPADDR_V4	cpu_to_le32(0x00000001)
+#define MOVE_DST_IPADDR_V6	cpu_to_le32(0x00000002)
+
+struct move_dst_ipaddr {
+	__le32 Type;
+	__u32  Reserved;
+	__u8   address[16]; /* IPv4 followed by 12 bytes rsvd or IPv6 address */
+} __packed;
+
+struct share_redirect_error_context_rsp {
+	__le32 StructureSize;
+	__le32 NotificationType;
+	__le32 ResourceNameOffset;
+	__le32 ResourceNameLength;
+	__le16 Flags;
+	__le16 TargetType;
+	__le32 IPAddrCount;
+	struct move_dst_ipaddr IpAddrMoveList[0];
+	/* __u8 ResourceName[] */ /* Name of share as counted Unicode string */
+} __packed;
+
 #define SMB2_CLIENT_GUID_SIZE 16
 
 struct smb2_negotiate_req {
@@ -320,7 +349,9 @@ struct smb2_logoff_rsp {
 } __packed;
 
 /* Flags/Reserved for SMB3.1.1 */
-#define SMB2_SHAREFLAG_CLUSTER_RECONNECT	0x0001
+#define SMB2_TREE_CONNECT_FLAG_CLUSTER_RECONNECT cpu_to_le16(0x0001)
+#define SMB2_TREE_CONNECT_FLAG_REDIRECT_TO_OWNER cpu_to_le16(0x0002)
+#define SMB2_TREE_CONNECT_FLAG_EXTENSION_PRESENT cpu_to_le16(0x0004)
 
 struct smb2_tree_connect_req {
 	struct smb2_sync_hdr sync_hdr;
@@ -331,6 +362,82 @@ struct smb2_tree_connect_req {
 	__u8   Buffer[1];	/* variable length */
 } __packed;
 
+/* See MS-SMB2 section 2.2.9.2 */
+/* Context Types */
+#define SMB2_RESERVED_TREE_CONNECT_CONTEXT_ID 0x0000
+#define SMB2_REMOTED_IDENTITY_TREE_CONNECT_CONTEXT_ID cpu_to_le16(0x0001)
+
+struct tree_connect_contexts {
+	__le16 ContextType;
+	__le16 DataLength;
+	__le32 Reserved;
+	__u8   Data[0];
+} __packed;
+
+/* Remoted identity tree connect context structures - see MS-SMB2 2.2.9.2.1 */
+struct smb3_blob_data {
+	__le16 BlobSize;
+	__u8   BlobData[0];
+} __packed;
+
+/* Valid values for Attr */
+#define SE_GROUP_MANDATORY		0x00000001
+#define SE_GROUP_ENABLED_BY_DEFAULT	0x00000002
+#define SE_GROUP_ENABLED		0x00000004
+#define SE_GROUP_OWNER			0x00000008
+#define SE_GROUP_USE_FOR_DENY_ONLY	0x00000010
+#define SE_GROUP_INTEGRITY		0x00000020
+#define SE_GROUP_INTEGRITY_ENABLED	0x00000040
+#define SE_GROUP_RESOURCE		0x20000000
+#define SE_GROUP_LOGON_ID		0xC0000000
+
+/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */
+
+struct sid_array_data {
+	__le16 SidAttrCount;
+	/* SidAttrList - array of sid_attr_data structs */
+} __packed;
+
+struct luid_attr_data {
+
+} __packed;
+
+/*
+ * struct privilege_data is the same as BLOB_DATA - see MS-SMB2 2.2.9.2.1.5
+ * but with size of LUID_ATTR_DATA struct and BlobData set to LUID_ATTR DATA
+ */
+
+struct privilege_array_data {
+	__le16 PrivilegeCount;
+	/* array of privilege_data structs */
+} __packed;
+
+struct remoted_identity_tcon_context {
+	__le16 TicketType; /* must be 0x0001 */
+	__le16 TicketSize; /* total size of this struct */
+	__le16 User; /* offset to SID_ATTR_DATA struct with user info */
+	__le16 UserName; /* offset to null terminated Unicode username string */
+	__le16 Domain; /* offset to null terminated Unicode domain name */
+	__le16 Groups; /* offset to SID_ARRAY_DATA struct with group info */
+	__le16 RestrictedGroups; /* similar to above */
+	__le16 Privileges; /* offset to PRIVILEGE_ARRAY_DATA struct */
+	__le16 PrimaryGroup; /* offset to SID_ARRAY_DATA struct */
+	__le16 Owner; /* offset to BLOB_DATA struct */
+	__le16 DefaultDacl; /* offset to BLOB_DATA struct */
+	__le16 DeviceGroups; /* offset to SID_ARRAY_DATA struct */
+	__le16 UserClaims; /* offset to BLOB_DATA struct */
+	__le16 DeviceClaims; /* offset to BLOB_DATA struct */
+	__u8   TicketInfo[0]; /* variable length buf - remoted identity data */
+} __packed;
+
+struct smb2_tree_connect_req_extension {
+	__le32 TreeConnectContextOffset;
+	__le16 TreeConnectContextCount;
+	__u8  Reserved[10];
+	__u8  PathName[0]; /* variable sized array */
+	/* followed by array of TreeConnectContexts */
+} __packed;
+
 struct smb2_tree_connect_rsp {
 	struct smb2_hdr hdr;
 	__le16 StructureSize;	/* Must be 16 */
@@ -365,7 +472,8 @@ struct smb2_tree_connect_rsp {
 #define SHI1005_FLAGS_ENABLE_HASH_V1			0x00002000
 #define SHI1005_FLAGS_ENABLE_HASH_V2			0x00004000
 #define SHI1005_FLAGS_ENCRYPT_DATA			0x00008000
-#define SHI1005_FLAGS_ALL				0x0000FF33
+#define SMB2_SHAREFLAG_IDENTITY_REMOTING		0x00040000 /* 3.1.1 */
+#define SHI1005_FLAGS_ALL				0x0004FF33
 
 /* Possible share capabilities */
 #define SMB2_SHARE_CAP_DFS	cpu_to_le32(0x00000008) /* all dialects */
@@ -373,6 +481,7 @@ struct smb2_tree_connect_rsp {
 #define SMB2_SHARE_CAP_SCALEOUT	cpu_to_le32(0x00000020) /* 3.0 */
 #define SMB2_SHARE_CAP_CLUSTER	cpu_to_le32(0x00000040) /* 3.0 */
 #define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */
+#define SMB2_SHARE_CAP_REDIRECT_TO_OWNER cpu_to_le32(0x00000100) /* 3.1.1 */
 
 struct smb2_tree_disconnect_req {
 	struct smb2_sync_hdr sync_hdr;
@@ -556,6 +665,7 @@ struct create_context {
 #define SMB2_LEASE_WRITE_CACHING	cpu_to_le32(0x04)
 
 #define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS cpu_to_le32(0x02)
+#define SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET cpu_to_le32(0x00000004)
 
 #define SMB2_LEASE_KEY_SIZE 16
 

From a728eacbbdd229d1d903e46261c57d5206f87a4a Mon Sep 17 00:00:00 2001
From: Tang Junhui <tang.junhui@zte.com.cn>
Date: Wed, 7 Feb 2018 11:41:39 -0800
Subject: [PATCH 358/676] bcache: add journal statistic

Sometimes, Journal takes up a lot of CPU, we need statistics
to know what's the journal is doing. So this patch provide
some journal statistics:
1) reclaim: how many times the journal try to reclaim resource,
   usually the journal bucket or/and the pin are exhausted.
2) flush_write: how many times the journal try to flush btree node
   to cache device, usually the journal bucket are exhausted.
3) retry_flush_write: how many times the journal retry to flush
   the next btree node, usually the previous tree node have been
   flushed by other thread.
we show these statistic by sysfs interface. Through these statistics
We can totally see the status of journal module when the CPU is too
high.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/bcache.h  |  4 ++++
 drivers/md/bcache/journal.c |  5 +++++
 drivers/md/bcache/sysfs.c   | 15 +++++++++++++++
 3 files changed, 24 insertions(+)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 5e2d4e80198e5..b98d7705acb67 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -658,6 +658,10 @@ struct cache_set {
 	atomic_long_t		writeback_keys_done;
 	atomic_long_t		writeback_keys_failed;
 
+	atomic_long_t		reclaim;
+	atomic_long_t		flush_write;
+	atomic_long_t		retry_flush_write;
+
 	enum			{
 		ON_ERROR_UNREGISTER,
 		ON_ERROR_PANIC,
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index a87165c1d8e52..f5296007a9d50 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -377,6 +377,8 @@ static void btree_flush_write(struct cache_set *c)
 	 */
 	struct btree *b, *best;
 	unsigned i;
+
+	atomic_long_inc(&c->flush_write);
 retry:
 	best = NULL;
 
@@ -397,6 +399,7 @@ static void btree_flush_write(struct cache_set *c)
 		if (!btree_current_write(b)->journal) {
 			mutex_unlock(&b->write_lock);
 			/* We raced */
+			atomic_long_inc(&c->retry_flush_write);
 			goto retry;
 		}
 
@@ -476,6 +479,8 @@ static void journal_reclaim(struct cache_set *c)
 	unsigned iter, n = 0;
 	atomic_t p;
 
+	atomic_long_inc(&c->reclaim);
+
 	while (!atomic_read(&fifo_front(&c->journal.pin)))
 		fifo_pop(&c->journal.pin, p);
 
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index b4184092c7279..46e7a6b3e7c7d 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -65,6 +65,9 @@ read_attribute(bset_tree_stats);
 
 read_attribute(state);
 read_attribute(cache_read_races);
+read_attribute(reclaim);
+read_attribute(flush_write);
+read_attribute(retry_flush_write);
 read_attribute(writeback_keys_done);
 read_attribute(writeback_keys_failed);
 read_attribute(io_errors);
@@ -545,6 +548,15 @@ SHOW(__bch_cache_set)
 	sysfs_print(cache_read_races,
 		    atomic_long_read(&c->cache_read_races));
 
+	sysfs_print(reclaim,
+		    atomic_long_read(&c->reclaim));
+
+	sysfs_print(flush_write,
+		    atomic_long_read(&c->flush_write));
+
+	sysfs_print(retry_flush_write,
+		    atomic_long_read(&c->retry_flush_write));
+
 	sysfs_print(writeback_keys_done,
 		    atomic_long_read(&c->writeback_keys_done));
 	sysfs_print(writeback_keys_failed,
@@ -731,6 +743,9 @@ static struct attribute *bch_cache_set_internal_files[] = {
 
 	&sysfs_bset_tree_stats,
 	&sysfs_cache_read_races,
+	&sysfs_reclaim,
+	&sysfs_flush_write,
+	&sysfs_retry_flush_write,
 	&sysfs_writeback_keys_done,
 	&sysfs_writeback_keys_failed,
 

From c4dc2497d50d9c6fb16aa0d07b6a14f3b2adb1e0 Mon Sep 17 00:00:00 2001
From: Tang Junhui <tang.junhui@zte.com.cn>
Date: Wed, 7 Feb 2018 11:41:40 -0800
Subject: [PATCH 359/676] bcache: fix high CPU occupancy during journal

After long time small writing I/O running, we found the occupancy of CPU
is very high and I/O performance has been reduced by about half:

[root@ceph151 internal]# top
top - 15:51:05 up 1 day,2:43,  4 users,  load average: 16.89, 15.15, 16.53
Tasks: 2063 total,   4 running, 2059 sleeping,   0 stopped,   0 zombie
%Cpu(s):4.3 us, 17.1 sy 0.0 ni, 66.1 id, 12.0 wa,  0.0 hi,  0.5 si,  0.0 st
KiB Mem : 65450044 total, 24586420 free, 38909008 used,  1954616 buff/cache
KiB Swap: 65667068 total, 65667068 free,        0 used. 25136812 avail Mem

  PID USER PR NI    VIRT    RES    SHR S %CPU %MEM     TIME+ COMMAND
 2023 root 20  0       0      0      0 S 55.1  0.0   0:04.42 kworker/11:191
14126 root 20  0       0      0      0 S 42.9  0.0   0:08.72 kworker/10:3
 9292 root 20  0       0      0      0 S 30.4  0.0   1:10.99 kworker/6:1
 8553 ceph 20  0 4242492 1.805g  18804 S 30.0  2.9 410:07.04 ceph-osd
12287 root 20  0       0      0      0 S 26.7  0.0   0:28.13 kworker/7:85
31019 root 20  0       0      0      0 S 26.1  0.0   1:30.79 kworker/22:1
 1787 root 20  0       0      0      0 R 25.7  0.0   5:18.45 kworker/8:7
32169 root 20  0       0      0      0 S 14.5  0.0   1:01.92 kworker/23:1
21476 root 20  0       0      0      0 S 13.9  0.0   0:05.09 kworker/1:54
 2204 root 20  0       0      0      0 S 12.5  0.0   1:25.17 kworker/9:10
16994 root 20  0       0      0      0 S 12.2  0.0   0:06.27 kworker/5:106
15714 root 20  0       0      0      0 R 10.9  0.0   0:01.85 kworker/19:2
 9661 ceph 20  0 4246876 1.731g  18800 S 10.6  2.8 403:00.80 ceph-osd
11460 ceph 20  0 4164692 2.206g  18876 S 10.6  3.5 360:27.19 ceph-osd
 9960 root 20  0       0      0      0 S 10.2  0.0   0:02.75 kworker/2:139
11699 ceph 20  0 4169244 1.920g  18920 S 10.2  3.1 355:23.67 ceph-osd
 6843 ceph 20  0 4197632 1.810g  18900 S  9.6  2.9 380:08.30 ceph-osd

The kernel work consumed a lot of CPU, and I found they are running journal
work, The journal is reclaiming source and flush btree node with surprising
frequency.

Through further analysis, we found that in btree_flush_write(), we try to
get a btree node with the smallest fifo idex to flush by traverse all the
btree nodein c->bucket_hash, after we getting it, since no locker protects
it, this btree node may have been written to cache device by other works,
and if this occurred, we retry to traverse in c->bucket_hash and get
another btree node. When the problem occurrd, the retry times is very high,
and we consume a lot of CPU in looking for a appropriate btree node.

In this patch, we try to record 128 btree nodes with the smallest fifo idex
in heap, and pop one by one when we need to flush btree node. It greatly
reduces the time for the loop to find the appropriate BTREE node, and also
reduce the occupancy of CPU.

[note by mpl: this triggers a checkpatch error because of adjacent,
pre-existing style violations]

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/bcache.h  |  2 ++
 drivers/md/bcache/journal.c | 47 +++++++++++++++++++++++++------------
 drivers/md/bcache/util.h    |  2 ++
 3 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index b98d7705acb67..a857eb3c10de6 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -679,6 +679,8 @@ struct cache_set {
 
 #define BUCKET_HASH_BITS	12
 	struct hlist_head	bucket_hash[1 << BUCKET_HASH_BITS];
+
+	DECLARE_HEAP(struct btree *, flush_btree);
 };
 
 struct bbio {
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index f5296007a9d50..1b736b8607399 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -368,6 +368,12 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
 }
 
 /* Journalling */
+#define journal_max_cmp(l, r) \
+	(fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \
+	 fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
+#define journal_min_cmp(l, r) \
+	(fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \
+	 fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
 
 static void btree_flush_write(struct cache_set *c)
 {
@@ -375,25 +381,35 @@ static void btree_flush_write(struct cache_set *c)
 	 * Try to find the btree node with that references the oldest journal
 	 * entry, best is our current candidate and is locked if non NULL:
 	 */
-	struct btree *b, *best;
-	unsigned i;
+	struct btree *b;
+	int i;
 
 	atomic_long_inc(&c->flush_write);
+
 retry:
-	best = NULL;
-
-	for_each_cached_btree(b, c, i)
-		if (btree_current_write(b)->journal) {
-			if (!best)
-				best = b;
-			else if (journal_pin_cmp(c,
-					btree_current_write(best)->journal,
-					btree_current_write(b)->journal)) {
-				best = b;
+	spin_lock(&c->journal.lock);
+	if (heap_empty(&c->flush_btree)) {
+		for_each_cached_btree(b, c, i)
+			if (btree_current_write(b)->journal) {
+				if (!heap_full(&c->flush_btree))
+					heap_add(&c->flush_btree, b,
+						 journal_max_cmp);
+				else if (journal_max_cmp(b,
+					 heap_peek(&c->flush_btree))) {
+					c->flush_btree.data[0] = b;
+					heap_sift(&c->flush_btree, 0,
+						  journal_max_cmp);
+				}
 			}
-		}
 
-	b = best;
+		for (i = c->flush_btree.used / 2 - 1; i >= 0; --i)
+			heap_sift(&c->flush_btree, i, journal_min_cmp);
+	}
+
+	b = NULL;
+	heap_pop(&c->flush_btree, b, journal_min_cmp);
+	spin_unlock(&c->journal.lock);
+
 	if (b) {
 		mutex_lock(&b->write_lock);
 		if (!btree_current_write(b)->journal) {
@@ -824,7 +840,8 @@ int bch_journal_alloc(struct cache_set *c)
 	j->w[0].c = c;
 	j->w[1].c = c;
 
-	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+	if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) ||
+	    !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
 	    !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
 	    !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
 		return -ENOMEM;
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 4df4c5c1cab2e..a6763db7f061b 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -112,6 +112,8 @@ do {									\
 
 #define heap_full(h)	((h)->used == (h)->size)
 
+#define heap_empty(h)	((h)->used == 0)
+
 #define DECLARE_FIFO(type, name)					\
 	struct {							\
 		size_t front, back, size, mask;				\

From 99361bbf26337186f02561109c17a4c4b1a7536a Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Wed, 7 Feb 2018 11:41:41 -0800
Subject: [PATCH 360/676] bcache: properly set task state in
 bch_writeback_thread()

Kernel thread routine bch_writeback_thread() has the following code block,

447         down_write(&dc->writeback_lock);
448~450     if (check conditions) {
451                 up_write(&dc->writeback_lock);
452                 set_current_state(TASK_INTERRUPTIBLE);
453
454                 if (kthread_should_stop())
455                         return 0;
456
457                 schedule();
458                 continue;
459         }

If condition check is true, its task state is set to TASK_INTERRUPTIBLE
and call schedule() to wait for others to wake up it.

There are 2 issues in current code,
1, Task state is set to TASK_INTERRUPTIBLE after the condition checks, if
   another process changes the condition and call wake_up_process(dc->
   writeback_thread), then at line 452 task state is set back to
   TASK_INTERRUPTIBLE, the writeback kernel thread will lose a chance to be
   waken up.
2, At line 454 if kthread_should_stop() is true, writeback kernel thread
   will return to kernel/kthread.c:kthread() with TASK_INTERRUPTIBLE and
   call do_exit(). It is not good to enter do_exit() with task state
   TASK_INTERRUPTIBLE, in following code path might_sleep() is called and a
   warning message is reported by __might_sleep(): "WARNING: do not call
   blocking ops when !TASK_RUNNING; state=1 set at [xxxx]".

For the first issue, task state should be set before condition checks.
Ineed because dc->writeback_lock is required when modifying all the
conditions, calling set_current_state() inside code block where dc->
writeback_lock is hold is safe. But this is quite implicit, so I still move
set_current_state() before all the condition checks.

For the second issue, frankley speaking it does not hurt when kernel thread
exits with TASK_INTERRUPTIBLE state, but this warning message scares users,
makes them feel there might be something risky with bcache and hurt their
data.  Setting task state to TASK_RUNNING before returning fixes this
problem.

In alloc.c:allocator_wait(), there is also a similar issue, and is also
fixed in this patch.

Changelog:
v3: merge two similar fixes into one patch
v2: fix the race issue in v1 patch.
v1: initial buggy fix.

Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/alloc.c     | 4 +++-
 drivers/md/bcache/writeback.c | 7 +++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 6cc6c0f9c3a95..458e1d38577db 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -287,8 +287,10 @@ do {									\
 			break;						\
 									\
 		mutex_unlock(&(ca)->set->bucket_lock);			\
-		if (kthread_should_stop())				\
+		if (kthread_should_stop()) {				\
+			set_current_state(TASK_RUNNING);		\
 			return 0;					\
+		}							\
 									\
 		schedule();						\
 		mutex_lock(&(ca)->set->bucket_lock);			\
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 51306a19ab032..58218f7e77c3a 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -564,18 +564,21 @@ static int bch_writeback_thread(void *arg)
 
 	while (!kthread_should_stop()) {
 		down_write(&dc->writeback_lock);
+		set_current_state(TASK_INTERRUPTIBLE);
 		if (!atomic_read(&dc->has_dirty) ||
 		    (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
 		     !dc->writeback_running)) {
 			up_write(&dc->writeback_lock);
-			set_current_state(TASK_INTERRUPTIBLE);
 
-			if (kthread_should_stop())
+			if (kthread_should_stop()) {
+				set_current_state(TASK_RUNNING);
 				return 0;
+			}
 
 			schedule();
 			continue;
 		}
+		set_current_state(TASK_RUNNING);
 
 		searched_full_index = refill_dirty(dc);
 

From 7ba0d830dc0e4c7e88602c91656029b6ae8a1766 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Wed, 7 Feb 2018 11:41:42 -0800
Subject: [PATCH 361/676] bcache: set error_limit correctly

Struct cache uses io_errors for two purposes,
- Error decay: when cache set error_decay is set, io_errors is used to
  generate a small piece of delay when I/O error happens.
- I/O errors counter: in order to generate big enough value for error
  decay, I/O errors counter value is stored by left shifting 20 bits (a.k.a
  IO_ERROR_SHIFT).

In function bch_count_io_errors(), if I/O errors counter reaches cache set
error limit, bch_cache_set_error() will be called to retire the whold cache
set. But current code is problematic when checking the error limit, see the
following code piece from bch_count_io_errors(),

 90     if (error) {
 91             char buf[BDEVNAME_SIZE];
 92             unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT,
 93                                                 &ca->io_errors);
 94             errors >>= IO_ERROR_SHIFT;
 95
 96             if (errors < ca->set->error_limit)
 97                     pr_err("%s: IO error on %s, recovering",
 98                            bdevname(ca->bdev, buf), m);
 99             else
100                     bch_cache_set_error(ca->set,
101                                         "%s: too many IO errors %s",
102                                         bdevname(ca->bdev, buf), m);
103     }

At line 94, errors is right shifting IO_ERROR_SHIFT bits, now it is real
errors counter to compare at line 96. But ca->set->error_limit is initia-
lized with an amplified value in bch_cache_set_alloc(),
1545         c->error_limit  = 8 << IO_ERROR_SHIFT;

It means by default, in bch_count_io_errors(), before 8<<20 errors happened
bch_cache_set_error() won't be called to retire the problematic cache
device. If the average request size is 64KB, it means bcache won't handle
failed device until 512GB data is requested. This is too large to be an I/O
threashold. So I believe the correct error limit should be much less.

This patch sets default cache set error limit to 8, then in
bch_count_io_errors() when errors counter reaches 8 (if it is default
value), function bch_cache_set_error() will be called to retire the whole
cache set. This patch also removes bits shifting when store or show
io_error_limit value via sysfs interface.

Nowadays most of SSDs handle internal flash failure automatically by LBA
address re-indirect mapping. If an I/O error can be observed by upper layer
code, it will be a notable error because that SSD can not re-indirect
map the problematic LBA address to an available flash block. This situation
indicates the whole SSD will be failed very soon. Therefore setting 8 as
the default io error limit value makes sense, it is enough for most of
cache devices.

Changelog:
v2: add reviewed-by from Hannes.
v1: initial version for review.

Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Tang Junhui <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/bcache.h | 1 +
 drivers/md/bcache/super.c  | 2 +-
 drivers/md/bcache/sysfs.c  | 4 ++--
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index a857eb3c10de6..b8c2e1bef1f17 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -666,6 +666,7 @@ struct cache_set {
 		ON_ERROR_UNREGISTER,
 		ON_ERROR_PANIC,
 	}			on_error;
+#define DEFAULT_IO_ERROR_LIMIT 8
 	unsigned		error_limit;
 	unsigned		error_decay;
 
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 133b81225ea9c..e29150ec17e7d 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1553,7 +1553,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 
 	c->congested_read_threshold_us	= 2000;
 	c->congested_write_threshold_us	= 20000;
-	c->error_limit	= 8 << IO_ERROR_SHIFT;
+	c->error_limit	= DEFAULT_IO_ERROR_LIMIT;
 
 	return c;
 err:
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 46e7a6b3e7c7d..c524305cc9a7c 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -568,7 +568,7 @@ SHOW(__bch_cache_set)
 
 	/* See count_io_errors for why 88 */
 	sysfs_print(io_error_halflife,	c->error_decay * 88);
-	sysfs_print(io_error_limit,	c->error_limit >> IO_ERROR_SHIFT);
+	sysfs_print(io_error_limit,	c->error_limit);
 
 	sysfs_hprint(congested,
 		     ((uint64_t) bch_get_congested(c)) << 9);
@@ -668,7 +668,7 @@ STORE(__bch_cache_set)
 	}
 
 	if (attr == &sysfs_io_error_limit)
-		c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;
+		c->error_limit = strtoul_or_return(buf);
 
 	/* See count_io_errors() for why 88 */
 	if (attr == &sysfs_io_error_halflife)

From 682811b3ce1a5a4e20d700939a9042f01dbc66c4 Mon Sep 17 00:00:00 2001
From: Tang Junhui <tang.junhui@zte.com.cn>
Date: Wed, 7 Feb 2018 11:41:43 -0800
Subject: [PATCH 362/676] bcache: fix for allocator and register thread race
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After long time running of random small IO writing,
I reboot the machine, and after the machine power on,
I found bcache got stuck, the stack is:
[root@ceph153 ~]# cat /proc/2510/task/*/stack
[<ffffffffa06b2455>] closure_sync+0x25/0x90 [bcache]
[<ffffffffa06b6be8>] bch_journal+0x118/0x2b0 [bcache]
[<ffffffffa06b6dc7>] bch_journal_meta+0x47/0x70 [bcache]
[<ffffffffa06be8f7>] bch_prio_write+0x237/0x340 [bcache]
[<ffffffffa06a8018>] bch_allocator_thread+0x3c8/0x3d0 [bcache]
[<ffffffff810a631f>] kthread+0xcf/0xe0
[<ffffffff8164c318>] ret_from_fork+0x58/0x90
[<ffffffffffffffff>] 0xffffffffffffffff
[root@ceph153 ~]# cat /proc/2038/task/*/stack
[<ffffffffa06b1abd>] __bch_btree_map_nodes+0x12d/0x150 [bcache]
[<ffffffffa06b1bd1>] bch_btree_insert+0xf1/0x170 [bcache]
[<ffffffffa06b637f>] bch_journal_replay+0x13f/0x230 [bcache]
[<ffffffffa06c75fe>] run_cache_set+0x79a/0x7c2 [bcache]
[<ffffffffa06c0cf8>] register_bcache+0xd48/0x1310 [bcache]
[<ffffffff812f702f>] kobj_attr_store+0xf/0x20
[<ffffffff8125b216>] sysfs_write_file+0xc6/0x140
[<ffffffff811dfbfd>] vfs_write+0xbd/0x1e0
[<ffffffff811e069f>] SyS_write+0x7f/0xe0
[<ffffffff8164c3c9>] system_call_fastpath+0x16/0x1
The stack shows the register thread and allocator thread
were getting stuck when registering cache device.

I reboot the machine several times, the issue always
exsit in this machine.

I debug the code, and found the call trace as bellow:
register_bcache()
   ==>run_cache_set()
      ==>bch_journal_replay()
         ==>bch_btree_insert()
            ==>__bch_btree_map_nodes()
               ==>btree_insert_fn()
                  ==>btree_split() //node need split
                     ==>btree_check_reserve()
In btree_check_reserve(), It will check if there is enough buckets
of RESERVE_BTREE type, since allocator thread did not work yet, so
no buckets of RESERVE_BTREE type allocated, so the register thread
waits on c->btree_cache_wait, and goes to sleep.

Then the allocator thread initialized, the call trace is bellow:
bch_allocator_thread()
==>bch_prio_write()
   ==>bch_journal_meta()
      ==>bch_journal()
         ==>journal_wait_for_write()
In journal_wait_for_write(), It will check if journal is full by
journal_full(), but the long time random small IO writing
causes the exhaustion of journal buckets(journal.blocks_free=0),
In order to release the journal buckets,
the allocator calls btree_flush_write() to flush keys to
btree nodes, and waits on c->journal.wait until btree nodes writing
over or there has already some journal buckets space, then the
allocator thread goes to sleep. but in btree_flush_write(), since
bch_journal_replay() is not finished, so no btree nodes have journal
(condition "if (btree_current_write(b)->journal)" never satisfied),
so we got no btree node to flush, no journal bucket released,
and allocator sleep all the times.

Through the above analysis, we can see that:
1) Register thread wait for allocator thread to allocate buckets of
   RESERVE_BTREE type;
2) Alloctor thread wait for register thread to replay journal, so it
   can flush btree nodes and get journal bucket.
   then they are all got stuck by waiting for each other.

Hua Rui provided a patch for me, by allocating some buckets of
RESERVE_BTREE type in advance, so the register thread can get bucket
when btree node splitting and no need to waiting for the allocator
thread. I tested it, it has effect, and register thread run a step
forward, but finally are still got stuck, the reason is only 8 bucket
of RESERVE_BTREE type were allocated, and in bch_journal_replay(),
after 2 btree nodes splitting, only 4 bucket of RESERVE_BTREE type left,
then btree_check_reserve() is not satisfied anymore, so it goes to sleep
again, and in the same time, alloctor thread did not flush enough btree
nodes to release a journal bucket, so they all got stuck again.

So we need to allocate more buckets of RESERVE_BTREE type in advance,
but how much is enough?  By experience and test, I think it should be
as much as journal buckets. Then I modify the code as this patch,
and test in the machine, and it works.

This patch modified base on Hua Rui’s patch, and allocate more buckets
of RESERVE_BTREE type in advance to avoid register thread and allocate
thread going to wait for each other.

[patch v2] ca->sb.njournal_buckets would be 0 in the first time after
cache creation, and no journal exists, so just 8 btree buckets is OK.

Signed-off-by: Hua Rui <huarui.dev@gmail.com>
Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/btree.c |  9 ++++++---
 drivers/md/bcache/super.c | 13 ++++++++++++-
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index bf3a48aa9a9a4..fad9fe8817eb1 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1869,14 +1869,17 @@ void bch_initial_gc_finish(struct cache_set *c)
 	 */
 	for_each_cache(ca, c, i) {
 		for_each_bucket(b, ca) {
-			if (fifo_full(&ca->free[RESERVE_PRIO]))
+			if (fifo_full(&ca->free[RESERVE_PRIO]) &&
+			    fifo_full(&ca->free[RESERVE_BTREE]))
 				break;
 
 			if (bch_can_invalidate_bucket(ca, b) &&
 			    !GC_MARK(b)) {
 				__bch_invalidate_one_bucket(ca, b);
-				fifo_push(&ca->free[RESERVE_PRIO],
-					  b - ca->buckets);
+				if (!fifo_push(&ca->free[RESERVE_PRIO],
+				   b - ca->buckets))
+					fifo_push(&ca->free[RESERVE_BTREE],
+						  b - ca->buckets);
 			}
 		}
 	}
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index e29150ec17e7d..a2ad37a8afc05 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1833,6 +1833,7 @@ void bch_cache_release(struct kobject *kobj)
 static int cache_alloc(struct cache *ca)
 {
 	size_t free;
+	size_t btree_buckets;
 	struct bucket *b;
 
 	__module_get(THIS_MODULE);
@@ -1840,9 +1841,19 @@ static int cache_alloc(struct cache *ca)
 
 	bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);
 
+	/*
+	 * when ca->sb.njournal_buckets is not zero, journal exists,
+	 * and in bch_journal_replay(), tree node may split,
+	 * so bucket of RESERVE_BTREE type is needed,
+	 * the worst situation is all journal buckets are valid journal,
+	 * and all the keys need to replay,
+	 * so the number of  RESERVE_BTREE type buckets should be as much
+	 * as journal buckets
+	 */
+	btree_buckets = ca->sb.njournal_buckets ?: 8;
 	free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
 
-	if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
+	if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets, GFP_KERNEL) ||
 	    !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
 	    !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
 	    !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||

From 7a5e3ecbe5b7b58e9a78a3738b28244982822e1c Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Wed, 7 Feb 2018 11:41:44 -0800
Subject: [PATCH 363/676] bcache: set writeback_rate_update_seconds in range
 [1, 60] seconds

dc->writeback_rate_update_seconds can be set via sysfs and its value can
be set to [1, ULONG_MAX].  It does not make sense to set such a large
value, 60 seconds is long enough value considering the default 5 seconds
works well for long time.

Because dc->writeback_rate_update is a special delayed work, it re-arms
itself inside the delayed work routine update_writeback_rate(). When
stopping it by cancel_delayed_work_sync(), there should be a timeout to
wait and make sure the re-armed delayed work is stopped too. A small max
value of dc->writeback_rate_update_seconds is also helpful to decide a
reasonable small timeout.

This patch limits sysfs interface to set dc->writeback_rate_update_seconds
in range of [1, 60] seconds, and replaces the hand-coded number by macros.

Changelog:
v2: fix a rebase typo in v4, which is pointed out by Michael Lyle.
v1: initial version.

Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/sysfs.c     | 4 +++-
 drivers/md/bcache/writeback.c | 2 +-
 drivers/md/bcache/writeback.h | 3 +++
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index c524305cc9a7c..4a6a697e1680d 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -218,7 +218,9 @@ STORE(__cached_dev)
 	sysfs_strtoul_clamp(writeback_rate,
 			    dc->writeback_rate.rate, 1, INT_MAX);
 
-	d_strtoul_nonzero(writeback_rate_update_seconds);
+	sysfs_strtoul_clamp(writeback_rate_update_seconds,
+			    dc->writeback_rate_update_seconds,
+			    1, WRITEBACK_RATE_UPDATE_SECS_MAX);
 	d_strtoul(writeback_rate_i_term_inverse);
 	d_strtoul_nonzero(writeback_rate_p_term_inverse);
 
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 58218f7e77c3a..f1d2fc15abcc0 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -655,7 +655,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 	dc->writeback_rate.rate		= 1024;
 	dc->writeback_rate_minimum	= 8;
 
-	dc->writeback_rate_update_seconds = 5;
+	dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
 	dc->writeback_rate_p_term_inverse = 40;
 	dc->writeback_rate_i_term_inverse = 10000;
 
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 66f1c527fa243..587b255998568 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -8,6 +8,9 @@
 #define MAX_WRITEBACKS_IN_PASS  5
 #define MAX_WRITESIZE_IN_PASS   5000	/* *512b */
 
+#define WRITEBACK_RATE_UPDATE_SECS_MAX		60
+#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT	5
+
 /*
  * 14 (16384ths) is chosen here as something that each backing device
  * should be a reasonable fraction of the share, and not to blow up

From 7f4fc93d4713394ee8f1cd44c238e046e11b4f15 Mon Sep 17 00:00:00 2001
From: Tang Junhui <tang.junhui@zte.com.cn>
Date: Wed, 7 Feb 2018 11:41:45 -0800
Subject: [PATCH 364/676] bcache: return attach error when no cache set exist

I attach a back-end device to a cache set, and the cache set is not
registered yet, this back-end device did not attach successfully, and no
error returned:
[root]# echo 87859280-fec6-4bcc-20df7ca8f86b > /sys/block/sde/bcache/attach
[root]#

In sysfs_attach(), the return value "v" is initialized to "size" in
the beginning, and if no cache set exist in bch_cache_sets, the "v" value
would not change any more, and return to sysfs, sysfs regard it as success
since the "size" is a positive number.

This patch fixes this issue by assigning "v" with "-ENOENT" in the
initialization.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/sysfs.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 4a6a697e1680d..a7552f509f50e 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -198,7 +198,7 @@ STORE(__cached_dev)
 {
 	struct cached_dev *dc = container_of(kobj, struct cached_dev,
 					     disk.kobj);
-	ssize_t v = size;
+	ssize_t v;
 	struct cache_set *c;
 	struct kobj_uevent_env *env;
 
@@ -275,6 +275,7 @@ STORE(__cached_dev)
 		if (bch_parse_uuid(buf, dc->sb.set_uuid) < 16)
 			return -EINVAL;
 
+		v = -ENOENT;
 		list_for_each_entry(c, &bch_cache_sets, list) {
 			v = bch_cached_dev_attach(dc, c);
 			if (!v)
@@ -282,7 +283,7 @@ STORE(__cached_dev)
 		}
 
 		pr_err("Can't attach %s: cache set not found", buf);
-		size = v;
+		return v;
 	}
 
 	if (attr == &sysfs_detach && dc->disk.c)

From 73ac105be390c1de42a2f21643c9778a5e002930 Mon Sep 17 00:00:00 2001
From: Tang Junhui <tang.junhui@zte.com.cn>
Date: Wed, 7 Feb 2018 11:41:46 -0800
Subject: [PATCH 365/676] bcache: fix for data collapse after re-attaching an
 attached device

back-end device sdm has already attached a cache_set with ID
f67ebe1f-f8bc-4d73-bfe5-9dc88607f119, then try to attach with
another cache set, and it returns with an error:
[root]# cd /sys/block/sdm/bcache
[root]# echo 5ccd0a63-148e-48b8-afa2-aca9cbd6279f > attach
-bash: echo: write error: Invalid argument

After that, execute a command to modify the label of bcache
device:
[root]# echo data_disk1 > label

Then we reboot the system, when the system power on, the back-end
device can not attach to cache_set, a messages show in the log:
Feb  5 12:05:52 ceph152 kernel: [922385.508498] bcache:
bch_cached_dev_attach() couldn't find uuid for sdm in set

In sysfs_attach(), dc->sb.set_uuid was assigned to the value
which input through sysfs, no matter whether it is success
or not in bch_cached_dev_attach(). For example, If the back-end
device has already attached to an cache set, bch_cached_dev_attach()
would fail, but dc->sb.set_uuid was changed. Then modify the
label of bcache device, it will call bch_write_bdev_super(),
which would write the dc->sb.set_uuid to the super block, so we
record a wrong cache set ID in the super block, after the system
reboot, the cache set couldn't find the uuid of the back-end
device, so the bcache device couldn't exist and use any more.

In this patch, we don't assigned cache set ID to dc->sb.set_uuid
in sysfs_attach() directly, but input it into bch_cached_dev_attach(),
and assigned dc->sb.set_uuid to the cache set ID after the back-end
device attached to the cache set successful.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/bcache.h |  2 +-
 drivers/md/bcache/super.c  | 10 ++++++----
 drivers/md/bcache/sysfs.c  |  6 ++++--
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index b8c2e1bef1f17..12e5197f186cd 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -924,7 +924,7 @@ void bcache_write_super(struct cache_set *);
 
 int bch_flash_dev_create(struct cache_set *c, uint64_t size);
 
-int bch_cached_dev_attach(struct cached_dev *, struct cache_set *);
+int bch_cached_dev_attach(struct cached_dev *, struct cache_set *, uint8_t *);
 void bch_cached_dev_detach(struct cached_dev *);
 void bch_cached_dev_run(struct cached_dev *);
 void bcache_device_stop(struct bcache_device *);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index a2ad37a8afc05..3128957880367 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -957,7 +957,8 @@ void bch_cached_dev_detach(struct cached_dev *dc)
 	cached_dev_put(dc);
 }
 
-int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
+int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
+			  uint8_t *set_uuid)
 {
 	uint32_t rtime = cpu_to_le32(get_seconds());
 	struct uuid_entry *u;
@@ -965,7 +966,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
 
 	bdevname(dc->bdev, buf);
 
-	if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
+	if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) ||
+	    (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)))
 		return -ENOENT;
 
 	if (dc->disk.c) {
@@ -1194,7 +1196,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
 
 	list_add(&dc->list, &uncached_devices);
 	list_for_each_entry(c, &bch_cache_sets, list)
-		bch_cached_dev_attach(dc, c);
+		bch_cached_dev_attach(dc, c, NULL);
 
 	if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
 	    BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
@@ -1716,7 +1718,7 @@ static void run_cache_set(struct cache_set *c)
 	bcache_write_super(c);
 
 	list_for_each_entry_safe(dc, t, &uncached_devices, list)
-		bch_cached_dev_attach(dc, c);
+		bch_cached_dev_attach(dc, c, NULL);
 
 	flash_devs_run(c);
 
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index a7552f509f50e..78cd7bd50fddd 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -272,12 +272,14 @@ STORE(__cached_dev)
 	}
 
 	if (attr == &sysfs_attach) {
-		if (bch_parse_uuid(buf, dc->sb.set_uuid) < 16)
+		uint8_t		set_uuid[16];
+
+		if (bch_parse_uuid(buf, set_uuid) < 16)
 			return -EINVAL;
 
 		v = -ENOENT;
 		list_for_each_entry(c, &bch_cache_sets, list) {
-			v = bch_cached_dev_attach(dc, c);
+			v = bch_cached_dev_attach(dc, c, set_uuid);
 			if (!v)
 				return size;
 		}

From a7877390614770965a6925dfed79cbd3eeeb61e0 Mon Sep 17 00:00:00 2001
From: Paolo Valente <paolo.valente@linaro.org>
Date: Wed, 7 Feb 2018 22:19:20 +0100
Subject: [PATCH 366/676] block, bfq: add requeue-request hook

Commit 'a6a252e64914 ("blk-mq-sched: decide how to handle flush rq via
RQF_FLUSH_SEQ")' makes all non-flush re-prepared requests for a device
be re-inserted into the active I/O scheduler for that device. As a
consequence, I/O schedulers may get the same request inserted again,
even several times, without a finish_request invoked on that request
before each re-insertion.

This fact is the cause of the failure reported in [1]. For an I/O
scheduler, every re-insertion of the same re-prepared request is
equivalent to the insertion of a new request. For schedulers like
mq-deadline or kyber, this fact causes no harm. In contrast, it
confuses a stateful scheduler like BFQ, which keeps state for an I/O
request, until the finish_request hook is invoked on the request. In
particular, BFQ may get stuck, waiting forever for the number of
request dispatches, of the same request, to be balanced by an equal
number of request completions (while there will be one completion for
that request). In this state, BFQ may refuse to serve I/O requests
from other bfq_queues. The hang reported in [1] then follows.

However, the above re-prepared requests undergo a requeue, thus the
requeue_request hook of the active elevator is invoked for these
requests, if set. This commit then addresses the above issue by
properly implementing the hook requeue_request in BFQ.

[1] https://marc.info/?l=linux-block&m=151211117608676

Reported-by: Ivan Kozik <ivan@ludios.org>
Reported-by: Alban Browaeys <alban.browaeys@gmail.com>
Tested-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Serena Ziviani <ziviani.serena@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-iosched.c | 107 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 82 insertions(+), 25 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 47e6ec7427c44..aeca22d911010 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -3823,24 +3823,26 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 		}
 
 		/*
-		 * We exploit the bfq_finish_request hook to decrement
-		 * rq_in_driver, but bfq_finish_request will not be
-		 * invoked on this request. So, to avoid unbalance,
-		 * just start this request, without incrementing
-		 * rq_in_driver. As a negative consequence,
-		 * rq_in_driver is deceptively lower than it should be
-		 * while this request is in service. This may cause
-		 * bfq_schedule_dispatch to be invoked uselessly.
+		 * We exploit the bfq_finish_requeue_request hook to
+		 * decrement rq_in_driver, but
+		 * bfq_finish_requeue_request will not be invoked on
+		 * this request. So, to avoid unbalance, just start
+		 * this request, without incrementing rq_in_driver. As
+		 * a negative consequence, rq_in_driver is deceptively
+		 * lower than it should be while this request is in
+		 * service. This may cause bfq_schedule_dispatch to be
+		 * invoked uselessly.
 		 *
 		 * As for implementing an exact solution, the
-		 * bfq_finish_request hook, if defined, is probably
-		 * invoked also on this request. So, by exploiting
-		 * this hook, we could 1) increment rq_in_driver here,
-		 * and 2) decrement it in bfq_finish_request. Such a
-		 * solution would let the value of the counter be
-		 * always accurate, but it would entail using an extra
-		 * interface function. This cost seems higher than the
-		 * benefit, being the frequency of non-elevator-private
+		 * bfq_finish_requeue_request hook, if defined, is
+		 * probably invoked also on this request. So, by
+		 * exploiting this hook, we could 1) increment
+		 * rq_in_driver here, and 2) decrement it in
+		 * bfq_finish_requeue_request. Such a solution would
+		 * let the value of the counter be always accurate,
+		 * but it would entail using an extra interface
+		 * function. This cost seems higher than the benefit,
+		 * being the frequency of non-elevator-private
 		 * requests very low.
 		 */
 		goto start_rq;
@@ -4515,6 +4517,8 @@ static inline void bfq_update_insert_stats(struct request_queue *q,
 					   unsigned int cmd_flags) {}
 #endif
 
+static void bfq_prepare_request(struct request *rq, struct bio *bio);
+
 static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 			       bool at_head)
 {
@@ -4541,6 +4545,18 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 		else
 			list_add_tail(&rq->queuelist, &bfqd->dispatch);
 	} else {
+		if (WARN_ON_ONCE(!bfqq)) {
+			/*
+			 * This should never happen. Most likely rq is
+			 * a requeued regular request, being
+			 * re-inserted without being first
+			 * re-prepared. Do a prepare, to avoid
+			 * failure.
+			 */
+			bfq_prepare_request(rq, rq->bio);
+			bfqq = RQ_BFQQ(rq);
+		}
+
 		idle_timer_disabled = __bfq_insert_request(bfqd, rq);
 		/*
 		 * Update bfqq, because, if a queue merge has occurred
@@ -4697,22 +4713,44 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 		bfq_schedule_dispatch(bfqd);
 }
 
-static void bfq_finish_request_body(struct bfq_queue *bfqq)
+static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq)
 {
 	bfqq->allocated--;
 
 	bfq_put_queue(bfqq);
 }
 
-static void bfq_finish_request(struct request *rq)
+/*
+ * Handle either a requeue or a finish for rq. The things to do are
+ * the same in both cases: all references to rq are to be dropped. In
+ * particular, rq is considered completed from the point of view of
+ * the scheduler.
+ */
+static void bfq_finish_requeue_request(struct request *rq)
 {
-	struct bfq_queue *bfqq;
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
 	struct bfq_data *bfqd;
 
-	if (!rq->elv.icq)
+	/*
+	 * Requeue and finish hooks are invoked in blk-mq without
+	 * checking whether the involved request is actually still
+	 * referenced in the scheduler. To handle this fact, the
+	 * following two checks make this function exit in case of
+	 * spurious invocations, for which there is nothing to do.
+	 *
+	 * First, check whether rq has nothing to do with an elevator.
+	 */
+	if (unlikely(!(rq->rq_flags & RQF_ELVPRIV)))
+		return;
+
+	/*
+	 * rq either is not associated with any icq, or is an already
+	 * requeued request that has not (yet) been re-inserted into
+	 * a bfq_queue.
+	 */
+	if (!rq->elv.icq || !bfqq)
 		return;
 
-	bfqq = RQ_BFQQ(rq);
 	bfqd = bfqq->bfqd;
 
 	if (rq->rq_flags & RQF_STARTED)
@@ -4727,13 +4765,14 @@ static void bfq_finish_request(struct request *rq)
 		spin_lock_irqsave(&bfqd->lock, flags);
 
 		bfq_completed_request(bfqq, bfqd);
-		bfq_finish_request_body(bfqq);
+		bfq_finish_requeue_request_body(bfqq);
 
 		spin_unlock_irqrestore(&bfqd->lock, flags);
 	} else {
 		/*
 		 * Request rq may be still/already in the scheduler,
-		 * in which case we need to remove it. And we cannot
+		 * in which case we need to remove it (this should
+		 * never happen in case of requeue). And we cannot
 		 * defer such a check and removal, to avoid
 		 * inconsistencies in the time interval from the end
 		 * of this function to the start of the deferred work.
@@ -4748,9 +4787,26 @@ static void bfq_finish_request(struct request *rq)
 			bfqg_stats_update_io_remove(bfqq_group(bfqq),
 						    rq->cmd_flags);
 		}
-		bfq_finish_request_body(bfqq);
+		bfq_finish_requeue_request_body(bfqq);
 	}
 
+	/*
+	 * Reset private fields. In case of a requeue, this allows
+	 * this function to correctly do nothing if it is spuriously
+	 * invoked again on this same request (see the check at the
+	 * beginning of the function). Probably, a better general
+	 * design would be to prevent blk-mq from invoking the requeue
+	 * or finish hooks of an elevator, for a request that is not
+	 * referred by that elevator.
+	 *
+	 * Resetting the following fields would break the
+	 * request-insertion logic if rq is re-inserted into a bfq
+	 * internal queue, without a re-preparation. Here we assume
+	 * that re-insertions of requeued requests, without
+	 * re-preparation, can happen only for pass_through or at_head
+	 * requests (which are not re-inserted into bfq internal
+	 * queues).
+	 */
 	rq->elv.priv[0] = NULL;
 	rq->elv.priv[1] = NULL;
 }
@@ -5426,7 +5482,8 @@ static struct elevator_type iosched_bfq_mq = {
 	.ops.mq = {
 		.limit_depth		= bfq_limit_depth,
 		.prepare_request	= bfq_prepare_request,
-		.finish_request		= bfq_finish_request,
+		.requeue_request        = bfq_finish_requeue_request,
+		.finish_request		= bfq_finish_requeue_request,
 		.exit_icq		= bfq_exit_icq,
 		.insert_requests	= bfq_insert_requests,
 		.dispatch_request	= bfq_dispatch_request,

From 035d808f7cfd578dc210b584beb17e365f34d39a Mon Sep 17 00:00:00 2001
From: Naresh Kamboju <naresh.kamboju@linaro.org>
Date: Wed, 7 Feb 2018 23:45:34 +0530
Subject: [PATCH 367/676] selftests: bpf: test_kmod.sh: check the module path
 before insmod

test_kmod.sh reported false failure when module not present.
Check test_bpf.ko is present in the path before loading it.

Two cases to be addressed here,
In the development process of test_bpf.c unit testing will be done by
developers by using "insmod $SRC_TREE/lib/test_bpf.ko"

On the other hand testers run full tests by installing modules on device
under test (DUT) and followed by modprobe to insert the modules accordingly.

Signed-off-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_kmod.sh | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_kmod.sh b/tools/testing/selftests/bpf/test_kmod.sh
index ed4774d8d6ed1..35669ccd4d23b 100755
--- a/tools/testing/selftests/bpf/test_kmod.sh
+++ b/tools/testing/selftests/bpf/test_kmod.sh
@@ -10,9 +10,21 @@ test_run()
 
 	echo "[ JIT enabled:$1 hardened:$2 ]"
 	dmesg -C
-	insmod $SRC_TREE/lib/test_bpf.ko 2> /dev/null
-	if [ $? -ne 0 ]; then
-		rc=1
+	if [ -f ${SRC_TREE}/lib/test_bpf.ko ]; then
+		insmod ${SRC_TREE}/lib/test_bpf.ko 2> /dev/null
+		if [ $? -ne 0 ]; then
+			rc=1
+		fi
+	else
+		# Use modprobe dry run to check for missing test_bpf module
+		if ! /sbin/modprobe -q -n test_bpf; then
+			echo "test_bpf: [SKIP]"
+		elif /sbin/modprobe -q test_bpf; then
+			echo "test_bpf: ok"
+		else
+			echo "test_bpf: [FAIL]"
+			rc=1
+		fi
 	fi
 	rmmod  test_bpf 2> /dev/null
 	dmesg | grep FAIL

From 90ea9f1b60c679049619a79d9fc1557bc41c4973 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 6 Feb 2018 07:58:49 -0500
Subject: [PATCH 368/676] Make the xprtiod workqueue unbounded.

This should help reduce the latency on replies.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 96fdf6011c539..25e6051e97f29 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -1109,7 +1109,7 @@ static int rpciod_start(void)
 		goto out_failed;
 	rpciod_workqueue = wq;
 	/* Note: highpri because network receive is latency sensitive */
-	wq = alloc_workqueue("xprtiod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+	wq = alloc_workqueue("xprtiod", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_HIGHPRI, 0);
 	if (!wq)
 		goto free_rpciod;
 	xprtiod_workqueue = wq;

From e729452ec33304260d4741e9ce67784a457ee1e6 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Tue, 6 Feb 2018 21:17:17 +0100
Subject: [PATCH 369/676] cxgb4: Fix error handling path in 'init_one()'

Commit baf5086840ab1 ("cxgb4: restructure VF mgmt code") has reordered
some code but an error handling label has not been updated accordingly.
So fix it and free 'adapter' if 't4_wait_dev_ready()' fails.

Fixes: baf5086840ab1 ("cxgb4: restructure VF mgmt code")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 1ca2a39ed0f85..56bc626ef0068 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -5166,7 +5166,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	adapter->regs = regs;
 	err = t4_wait_dev_ready(regs);
 	if (err < 0)
-		goto out_unmap_bar0;
+		goto out_free_adapter;
 
 	/* We control everything through one PF */
 	whoami = readl(regs + PL_WHOAMI_A);

From 17e9e23b130e4e269fa53c2370325249f3ba75dd Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 6 Feb 2018 16:44:12 +0000
Subject: [PATCH 370/676] rxrpc: Fix received abort handling

AF_RXRPC is incorrectly sending back to the server any abort it receives
for a client connection.  This is due to the final-ACK offload to the
connection event processor patch.  The abort code is copied into the
last-call information on the connection channel and then the event
processor is set.

Instead, the following should be done:

 (1) In the case of a final-ACK for a successful call, the ACK should be
     scheduled as before.

 (2) In the case of a locally generated ABORT, the ABORT details should be
     cached for sending in response to further packets related to that
     call and no further action scheduled at call disconnect time.

 (3) In the case of an ACK received from the peer, the call should be
     considered dead, no ABORT should be transmitted at this time.  In
     response to further non-ABORT packets from the peer relating to this
     call, an RX_USER_ABORT ABORT should be transmitted.

 (4) In the case of a call killed due to network error, an RX_USER_ABORT
     ABORT should be cached for transmission in response to further
     packets, but no ABORT should be sent at this time.

Fixes: 3136ef49a14c ("rxrpc: Delay terminal ACK transmission on a client call")
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rxrpc/conn_client.c |  3 ++-
 net/rxrpc/conn_object.c | 16 ++++++++++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 7f74ca3059f8f..0641750680593 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -834,7 +834,8 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
 	 * can be skipped if we find a follow-on call.  The first DATA packet
 	 * of the follow on call will implicitly ACK this call.
 	 */
-	if (test_bit(RXRPC_CALL_EXPOSED, &call->flags)) {
+	if (call->completion == RXRPC_CALL_SUCCEEDED &&
+	    test_bit(RXRPC_CALL_EXPOSED, &call->flags)) {
 		unsigned long final_ack_at = jiffies + 2;
 
 		WRITE_ONCE(chan->final_ack_at, final_ack_at);
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index c628351eb9008..ccbac190add1f 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -177,13 +177,21 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn,
 		 * through the channel, whilst disposing of the actual call record.
 		 */
 		trace_rxrpc_disconnect_call(call);
-		if (call->abort_code) {
-			chan->last_abort = call->abort_code;
-			chan->last_type = RXRPC_PACKET_TYPE_ABORT;
-		} else {
+		switch (call->completion) {
+		case RXRPC_CALL_SUCCEEDED:
 			chan->last_seq = call->rx_hard_ack;
 			chan->last_type = RXRPC_PACKET_TYPE_ACK;
+			break;
+		case RXRPC_CALL_LOCALLY_ABORTED:
+			chan->last_abort = call->abort_code;
+			chan->last_type = RXRPC_PACKET_TYPE_ABORT;
+			break;
+		default:
+			chan->last_abort = RX_USER_ABORT;
+			chan->last_type = RXRPC_PACKET_TYPE_ABORT;
+			break;
 		}
+
 		/* Sync with rxrpc_conn_retransmit(). */
 		smp_wmb();
 		chan->last_call = chan->call_id;

From c861ef83d771362ed0475cd510eb56cf4126ef34 Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@oracle.com>
Date: Tue, 6 Feb 2018 11:34:23 -0800
Subject: [PATCH 371/676] sun: Add SPDX license tags to Sun network drivers

Add the appropriate SPDX license tags to the Sun network drivers
as outlined in Documentation/process/license-rules.rst.

Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sun/Kconfig          | 1 +
 drivers/net/ethernet/sun/cassini.c        | 1 +
 drivers/net/ethernet/sun/cassini.h        | 1 +
 drivers/net/ethernet/sun/ldmvsw.c         | 1 +
 drivers/net/ethernet/sun/niu.c            | 1 +
 drivers/net/ethernet/sun/sunbmac.c        | 1 +
 drivers/net/ethernet/sun/sungem.c         | 1 +
 drivers/net/ethernet/sun/sunhme.c         | 1 +
 drivers/net/ethernet/sun/sunqe.c          | 1 +
 drivers/net/ethernet/sun/sunvnet.c        | 1 +
 drivers/net/ethernet/sun/sunvnet_common.c | 1 +
 11 files changed, 11 insertions(+)

diff --git a/drivers/net/ethernet/sun/Kconfig b/drivers/net/ethernet/sun/Kconfig
index b2caf5132bd2b..7b982e02ea3a4 100644
--- a/drivers/net/ethernet/sun/Kconfig
+++ b/drivers/net/ethernet/sun/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
 #
 # Sun network device configuration
 #
diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c
index 113bd57e2ea04..9020b084b9538 100644
--- a/drivers/net/ethernet/sun/cassini.c
+++ b/drivers/net/ethernet/sun/cassini.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* cassini.c: Sun Microsystems Cassini(+) ethernet driver.
  *
  * Copyright (C) 2004 Sun Microsystems Inc.
diff --git a/drivers/net/ethernet/sun/cassini.h b/drivers/net/ethernet/sun/cassini.h
index 882ce168a799b..13f3860496a86 100644
--- a/drivers/net/ethernet/sun/cassini.h
+++ b/drivers/net/ethernet/sun/cassini.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /* $Id: cassini.h,v 1.16 2004/08/17 21:15:16 zaumen Exp $
  * cassini.h: Definitions for Sun Microsystems Cassini(+) ethernet driver.
  *
diff --git a/drivers/net/ethernet/sun/ldmvsw.c b/drivers/net/ethernet/sun/ldmvsw.c
index 5ea037672e6f2..a5dd627fe2f92 100644
--- a/drivers/net/ethernet/sun/ldmvsw.c
+++ b/drivers/net/ethernet/sun/ldmvsw.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* ldmvsw.c: Sun4v LDOM Virtual Switch Driver.
  *
  * Copyright (C) 2016-2017 Oracle. All rights reserved.
diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c
index 06001bacbe0fe..8dd545fed30d2 100644
--- a/drivers/net/ethernet/sun/niu.c
+++ b/drivers/net/ethernet/sun/niu.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* niu.c: Neptune ethernet driver.
  *
  * Copyright (C) 2007, 2008 David S. Miller (davem@davemloft.net)
diff --git a/drivers/net/ethernet/sun/sunbmac.c b/drivers/net/ethernet/sun/sunbmac.c
index 0b1f41f6bceba..f047b27971564 100644
--- a/drivers/net/ethernet/sun/sunbmac.c
+++ b/drivers/net/ethernet/sun/sunbmac.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* sunbmac.c: Driver for Sparc BigMAC 100baseT ethernet adapters.
  *
  * Copyright (C) 1997, 1998, 1999, 2003, 2008 David S. Miller (davem@davemloft.net)
diff --git a/drivers/net/ethernet/sun/sungem.c b/drivers/net/ethernet/sun/sungem.c
index a7afcee3c5ae6..7a16d40a72d13 100644
--- a/drivers/net/ethernet/sun/sungem.c
+++ b/drivers/net/ethernet/sun/sungem.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* $Id: sungem.c,v 1.44.2.22 2002/03/13 01:18:12 davem Exp $
  * sungem.c: Sun GEM ethernet driver.
  *
diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c
index 0431f1e5f5112..06da2f59fcbff 100644
--- a/drivers/net/ethernet/sun/sunhme.c
+++ b/drivers/net/ethernet/sun/sunhme.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* sunhme.c: Sparc HME/BigMac 10/100baseT half/full duplex auto switching,
  *           auto carrier detecting ethernet driver.  Also known as the
  *           "Happy Meal Ethernet" found on SunSwift SBUS cards.
diff --git a/drivers/net/ethernet/sun/sunqe.c b/drivers/net/ethernet/sun/sunqe.c
index a6bcdcdd947e3..7fe0d5e339221 100644
--- a/drivers/net/ethernet/sun/sunqe.c
+++ b/drivers/net/ethernet/sun/sunqe.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* sunqe.c: Sparc QuadEthernet 10baseT SBUS card driver.
  *          Once again I am out to prove that every ethernet
  *          controller out there can be most efficiently programmed
diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c
index 27fb226388852..63d3d6b215f30 100644
--- a/drivers/net/ethernet/sun/sunvnet.c
+++ b/drivers/net/ethernet/sun/sunvnet.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* sunvnet.c: Sun LDOM Virtual Network Driver.
  *
  * Copyright (C) 2007, 2008 David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/ethernet/sun/sunvnet_common.c b/drivers/net/ethernet/sun/sunvnet_common.c
index 8aa3ce46bb81e..d8f4c3f281505 100644
--- a/drivers/net/ethernet/sun/sunvnet_common.c
+++ b/drivers/net/ethernet/sun/sunvnet_common.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /* sunvnet.c: Sun LDOM Virtual Network Driver.
  *
  * Copyright (C) 2007, 2008 David S. Miller <davem@davemloft.net>

From 58e354c01b1906b603dcdb28ec35250b09eac625 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 6 Feb 2018 12:14:12 -0800
Subject: [PATCH 372/676] net/ipv6: Handle reject routes with onlink flag

Verification of nexthops with onlink flag need to handle unreachable
routes. The lookup is only intended to validate the gateway address
is not a local address and if the gateway resolves the egress device
must match the given device. Hence, hitting any default reject route
is ok.

Fixes: fc1e64e1092f ("net/ipv6: Add support for onlink flag")
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index fb2d251c05003..69c43d289c699 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2488,7 +2488,8 @@ static int ip6_route_check_nh_onlink(struct net *net,
 	err = 0;
 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
 	if (grt) {
-		if (grt->rt6i_flags & flags || dev != grt->dst.dev) {
+		if (!grt->dst.error &&
+		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
 			NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
 			err = -EINVAL;
 		}

From 44750f8483a1bc4bda58ec4f7b7af5b053833d49 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 6 Feb 2018 13:17:06 -0800
Subject: [PATCH 373/676] net/ipv6: onlink nexthop checks should default to
 main table

Because of differences in how ipv4 and ipv6 handle fib lookups,
verification of nexthops with onlink flag need to default to the main
table rather than the local table used by IPv4. As it stands an
address within a connected route on device 1 can be used with
onlink on device 2. Updating the table properly rejects the route
due to the egress device mismatch.

Update the extack message as well to show it could be a device
mismatch for the nexthop spec.

Fixes: fc1e64e1092f ("net/ipv6: Add support for onlink flag")
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 69c43d289c699..9dcfadddd8005 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2479,7 +2479,7 @@ static int ip6_route_check_nh_onlink(struct net *net,
 				     struct net_device *dev,
 				     struct netlink_ext_ack *extack)
 {
-	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
+	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
 	struct rt6_info *grt;
@@ -2490,7 +2490,8 @@ static int ip6_route_check_nh_onlink(struct net *net,
 	if (grt) {
 		if (!grt->dst.error &&
 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
-			NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
+			NL_SET_ERR_MSG(extack,
+				       "Nexthop has invalid gateway or device mismatch");
 			err = -EINVAL;
 		}
 

From bc6d33c8d93f5999920e97a8c6330b8910053d4f Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Tue, 6 Feb 2018 13:15:20 -0800
Subject: [PATCH 374/676] i40e: Fix the number of queues available to be mapped
 for use

Fix the number of queues per enabled TC and report available queues
to the kernel without having to limit them to the max RSS limit so
they are available to be mapped for XPS. This allows a queue per
processing thread available for handling traffic for the given
traffic class.

Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 27 +++++++++++----------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index f95ce9b5e4fbe..e31adbc75f9cc 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1785,7 +1785,7 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi,
 	struct i40e_pf *pf = vsi->back;
 	u16 sections = 0;
 	u8 netdev_tc = 0;
-	u16 numtc = 0;
+	u16 numtc = 1;
 	u16 qcount;
 	u8 offset;
 	u16 qmap;
@@ -1795,9 +1795,11 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi,
 	sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID;
 	offset = 0;
 
+	/* Number of queues per enabled TC */
+	num_tc_qps = vsi->alloc_queue_pairs;
 	if (enabled_tc && (vsi->back->flags & I40E_FLAG_DCB_ENABLED)) {
 		/* Find numtc from enabled TC bitmap */
-		for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) {
+		for (i = 0, numtc = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) {
 			if (enabled_tc & BIT(i)) /* TC is enabled */
 				numtc++;
 		}
@@ -1805,18 +1807,13 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi,
 			dev_warn(&pf->pdev->dev, "DCB is enabled but no TC enabled, forcing TC0\n");
 			numtc = 1;
 		}
-	} else {
-		/* At least TC0 is enabled in non-DCB, non-MQPRIO case */
-		numtc = 1;
+		num_tc_qps = num_tc_qps / numtc;
+		num_tc_qps = min_t(int, num_tc_qps,
+				   i40e_pf_get_max_q_per_tc(pf));
 	}
 
 	vsi->tc_config.numtc = numtc;
 	vsi->tc_config.enabled_tc = enabled_tc ? enabled_tc : 1;
-	/* Number of queues per enabled TC */
-	qcount = vsi->alloc_queue_pairs;
-
-	num_tc_qps = qcount / numtc;
-	num_tc_qps = min_t(int, num_tc_qps, i40e_pf_get_max_q_per_tc(pf));
 
 	/* Do not allow use more TC queue pairs than MSI-X vectors exist */
 	if (pf->flags & I40E_FLAG_MSIX_ENABLED)
@@ -1831,9 +1828,13 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi,
 
 			switch (vsi->type) {
 			case I40E_VSI_MAIN:
-				qcount = min_t(int, pf->alloc_rss_size,
-					       num_tc_qps);
-				break;
+				if (!(pf->flags & (I40E_FLAG_FD_SB_ENABLED |
+				    I40E_FLAG_FD_ATR_ENABLED)) ||
+				    vsi->tc_config.enabled_tc != 1) {
+					qcount = min_t(int, pf->alloc_rss_size,
+						       num_tc_qps);
+					break;
+				}
 			case I40E_VSI_FDIR:
 			case I40E_VSI_SRIOV:
 			case I40E_VSI_VMDQ2:

From 3468656fd7599b0cb1092bb1ee717d1a984e93ee Mon Sep 17 00:00:00 2001
From: John Allen <jallen@linux.vnet.ibm.com>
Date: Tue, 6 Feb 2018 16:21:49 -0600
Subject: [PATCH 375/676] ibmvnic: Fix rx queue cleanup for non-fatal resets

At some point, a check was added to exit the polling routine during resets.
This makes sense for most reset conditions, but for a non-fatal error, we
expect the polling routine to continue running to properly clean up the rx
queues. This patch checks if we are performing a non-fatal reset and if we
are, continues normal polling operation.

Signed-off-by: John Allen <jallen@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index afaf29b201dc6..8dc8fc5037504 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1831,7 +1831,8 @@ static int ibmvnic_poll(struct napi_struct *napi, int budget)
 		u16 offset;
 		u8 flags = 0;
 
-		if (unlikely(adapter->resetting)) {
+		if (unlikely(adapter->resetting &&
+			     adapter->reset_reason != VNIC_RESET_NON_FATAL)) {
 			enable_scrq_irq(adapter, adapter->rx_scrq[scrq_num]);
 			napi_complete_done(napi, frames_processed);
 			return frames_processed;

From b0992eca00c490c0923044b7d8b853c212b3cacc Mon Sep 17 00:00:00 2001
From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Date: Tue, 6 Feb 2018 17:25:23 -0600
Subject: [PATCH 376/676] ibmvnic: Ensure that buffers are NULL after free

This change will guard against a double free in the case that the
buffers were previously freed at some other time, such as during
a device reset. It resolves a kernel oops that occurred when changing
the VNIC device's MTU.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 8dc8fc5037504..8dabc9d9dfa60 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -354,6 +354,8 @@ static void release_stats_buffers(struct ibmvnic_adapter *adapter)
 {
 	kfree(adapter->tx_stats_buffers);
 	kfree(adapter->rx_stats_buffers);
+	adapter->tx_stats_buffers = NULL;
+	adapter->rx_stats_buffers = NULL;
 }
 
 static int init_stats_buffers(struct ibmvnic_adapter *adapter)
@@ -599,6 +601,8 @@ static void release_vpd_data(struct ibmvnic_adapter *adapter)
 
 	kfree(adapter->vpd->buff);
 	kfree(adapter->vpd);
+
+	adapter->vpd = NULL;
 }
 
 static void release_tx_pools(struct ibmvnic_adapter *adapter)
@@ -909,6 +913,7 @@ static int ibmvnic_get_vpd(struct ibmvnic_adapter *adapter)
 	if (dma_mapping_error(dev, adapter->vpd->dma_addr)) {
 		dev_err(dev, "Could not map VPD buffer\n");
 		kfree(adapter->vpd->buff);
+		adapter->vpd->buff = NULL;
 		return -ENOMEM;
 	}
 

From 62f94c2101f35cd45775df00ba09bde77580e26a Mon Sep 17 00:00:00 2001
From: Grygorii Strashko <grygorii.strashko@ti.com>
Date: Tue, 6 Feb 2018 19:17:06 -0600
Subject: [PATCH 377/676] net: ethernet: ti: cpsw: fix net watchdog timeout

It was discovered that simple program which indefinitely sends 200b UDP
packets and runs on TI AM574x SoC (SMP) under RT Kernel triggers network
watchdog timeout in TI CPSW driver (<6 hours run). The network watchdog
timeout is triggered due to race between cpsw_ndo_start_xmit() and
cpsw_tx_handler() [NAPI]

cpsw_ndo_start_xmit()
	if (unlikely(!cpdma_check_free_tx_desc(txch))) {
		txq = netdev_get_tx_queue(ndev, q_idx);
		netif_tx_stop_queue(txq);

^^ as per [1] barier has to be used after set_bit() otherwise new value
might not be visible to other cpus
	}

cpsw_tx_handler()
	if (unlikely(netif_tx_queue_stopped(txq)))
		netif_tx_wake_queue(txq);

and when it happens ndev TX queue became disabled forever while driver's HW
TX queue is empty.

Fix this, by adding smp_mb__after_atomic() after netif_tx_stop_queue()
calls and double check for free TX descriptors after stopping ndev TX queue
- if there are free TX descriptors wake up ndev TX queue.

[1] https://www.kernel.org/doc/html/latest/core-api/atomic_ops.html
Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
Reviewed-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ti/cpsw.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 3c85a0885f9bb..1b1b78fdc1384 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -1636,6 +1636,7 @@ static netdev_tx_t cpsw_ndo_start_xmit(struct sk_buff *skb,
 		q_idx = q_idx % cpsw->tx_ch_num;
 
 	txch = cpsw->txv[q_idx].ch;
+	txq = netdev_get_tx_queue(ndev, q_idx);
 	ret = cpsw_tx_packet_submit(priv, skb, txch);
 	if (unlikely(ret != 0)) {
 		cpsw_err(priv, tx_err, "desc submit failed\n");
@@ -1646,15 +1647,26 @@ static netdev_tx_t cpsw_ndo_start_xmit(struct sk_buff *skb,
 	 * tell the kernel to stop sending us tx frames.
 	 */
 	if (unlikely(!cpdma_check_free_tx_desc(txch))) {
-		txq = netdev_get_tx_queue(ndev, q_idx);
 		netif_tx_stop_queue(txq);
+
+		/* Barrier, so that stop_queue visible to other cpus */
+		smp_mb__after_atomic();
+
+		if (cpdma_check_free_tx_desc(txch))
+			netif_tx_wake_queue(txq);
 	}
 
 	return NETDEV_TX_OK;
 fail:
 	ndev->stats.tx_dropped++;
-	txq = netdev_get_tx_queue(ndev, skb_get_queue_mapping(skb));
 	netif_tx_stop_queue(txq);
+
+	/* Barrier, so that stop_queue visible to other cpus */
+	smp_mb__after_atomic();
+
+	if (cpdma_check_free_tx_desc(txch))
+		netif_tx_wake_queue(txq);
+
 	return NETDEV_TX_BUSY;
 }
 

From 043e337f555e610ad8237fd23522d97c968d72b9 Mon Sep 17 00:00:00 2001
From: "Md. Islam" <mislam4@kent.edu>
Date: Tue, 6 Feb 2018 23:14:18 -0500
Subject: [PATCH 378/676] sch_netem: Bug fixing in calculating Netem interval

In Kernel 4.15.0+, Netem does not work properly.

Netem setup:

tc qdisc add dev h1-eth0 root handle 1: netem delay 10ms 2ms

Result:

PING 172.16.101.2 (172.16.101.2) 56(84) bytes of data.
64 bytes from 172.16.101.2: icmp_seq=1 ttl=64 time=22.8 ms
64 bytes from 172.16.101.2: icmp_seq=2 ttl=64 time=10.9 ms
64 bytes from 172.16.101.2: icmp_seq=3 ttl=64 time=10.9 ms
64 bytes from 172.16.101.2: icmp_seq=5 ttl=64 time=11.4 ms
64 bytes from 172.16.101.2: icmp_seq=6 ttl=64 time=11.8 ms
64 bytes from 172.16.101.2: icmp_seq=4 ttl=64 time=4303 ms
64 bytes from 172.16.101.2: icmp_seq=10 ttl=64 time=11.2 ms
64 bytes from 172.16.101.2: icmp_seq=11 ttl=64 time=10.3 ms
64 bytes from 172.16.101.2: icmp_seq=7 ttl=64 time=4304 ms
64 bytes from 172.16.101.2: icmp_seq=8 ttl=64 time=4303 ms

Patch:

(rnd % (2 * sigma)) - sigma was overflowing s32. After applying the
patch, I found following output which is desirable.

PING 172.16.101.2 (172.16.101.2) 56(84) bytes of data.
64 bytes from 172.16.101.2: icmp_seq=1 ttl=64 time=21.1 ms
64 bytes from 172.16.101.2: icmp_seq=2 ttl=64 time=8.46 ms
64 bytes from 172.16.101.2: icmp_seq=3 ttl=64 time=9.00 ms
64 bytes from 172.16.101.2: icmp_seq=4 ttl=64 time=11.8 ms
64 bytes from 172.16.101.2: icmp_seq=5 ttl=64 time=8.36 ms
64 bytes from 172.16.101.2: icmp_seq=6 ttl=64 time=11.8 ms
64 bytes from 172.16.101.2: icmp_seq=7 ttl=64 time=8.11 ms
64 bytes from 172.16.101.2: icmp_seq=8 ttl=64 time=10.0 ms
64 bytes from 172.16.101.2: icmp_seq=9 ttl=64 time=11.3 ms
64 bytes from 172.16.101.2: icmp_seq=10 ttl=64 time=11.5 ms
64 bytes from 172.16.101.2: icmp_seq=11 ttl=64 time=10.2 ms

Reviewed-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_netem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 7bbc13b8ca47f..7c179addebcd2 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -327,7 +327,7 @@ static s64 tabledist(s64 mu, s32 sigma,
 
 	/* default uniform distribution */
 	if (dist == NULL)
-		return (rnd % (2 * sigma)) - sigma + mu;
+		return ((rnd % (2 * sigma)) + mu) - sigma;
 
 	t = dist->table[rnd % dist->size];
 	x = (sigma % NETEM_DIST_SCALE) * t;

From 5c487bb9adddbc1d23433e09d2548759375c2b52 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Tue, 6 Feb 2018 20:50:23 -0800
Subject: [PATCH 379/676] tcp: tracepoint: only call trace_tcp_send_reset with
 full socket

tracepoint tcp_send_reset requires a full socket to work. However, it
may be called when in TCP_TIME_WAIT:

        case TCP_TW_RST:
                tcp_v6_send_reset(sk, skb);
                inet_twsk_deschedule_put(inet_twsk(sk));
                goto discard_it;

To avoid this problem, this patch checks the socket with sk_fullsock()
before calling trace_tcp_send_reset().

Fixes: c24b14c46bb8 ("tcp: add tracepoint trace_tcp_send_reset")
Signed-off-by: Song Liu <songliubraving@fb.com>
Reviewed-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 3 ++-
 net/ipv6/tcp_ipv6.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 95738aa0d8a60..f8ad397e285e9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -705,7 +705,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	 */
 	if (sk) {
 		arg.bound_dev_if = sk->sk_bound_dev_if;
-		trace_tcp_send_reset(sk, skb);
+		if (sk_fullsock(sk))
+			trace_tcp_send_reset(sk, skb);
 	}
 
 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index a1ab29e2ab3bb..412139f4eccd9 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -942,7 +942,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 
 	if (sk) {
 		oif = sk->sk_bound_dev_if;
-		trace_tcp_send_reset(sk, skb);
+		if (sk_fullsock(sk))
+			trace_tcp_send_reset(sk, skb);
 	}
 
 	tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0);

From 57ea5f161a7de5b1913c212d04f57a175b159fdf Mon Sep 17 00:00:00 2001
From: Ulf Magnusson <ulfalizer@gmail.com>
Date: Mon, 5 Feb 2018 02:21:14 +0100
Subject: [PATCH 380/676] KVM: PPC: Book3S PR: Fix broken select due to
 misspelling

Commit 76d837a4c0f9 ("KVM: PPC: Book3S PR: Don't include SPAPR TCE code
on non-pseries platforms") added a reference to the globally undefined
symbol PPC_SERIES. Looking at the rest of the commit, PPC_PSERIES was
probably intended.

Change PPC_SERIES to PPC_PSERIES.

Discovered with the
https://github.com/ulfalizer/Kconfiglib/blob/master/examples/list_undefined.py
script.

Fixes: 76d837a4c0f9 ("KVM: PPC: Book3S PR: Don't include SPAPR TCE code on non-pseries platforms")
Cc: stable@vger.kernel.org # v4.12+
Signed-off-by: Ulf Magnusson <ulfalizer@gmail.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index b12b8eb39c297..648160334abf5 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -68,7 +68,7 @@ config KVM_BOOK3S_64
 	select KVM_BOOK3S_64_HANDLER
 	select KVM
 	select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE
-	select SPAPR_TCE_IOMMU if IOMMU_SUPPORT && (PPC_SERIES || PPC_POWERNV)
+	select SPAPR_TCE_IOMMU if IOMMU_SUPPORT && (PPC_PSERIES || PPC_POWERNV)
 	---help---
 	  Support running unmodified book3s_64 and book3s_32 guest kernels
 	  in virtual machines on book3s_64 host processors.

From 48973df8c9c51612acc870ad1a885b0cf27c3356 Mon Sep 17 00:00:00 2001
From: Ulf Magnusson <ulfalizer@gmail.com>
Date: Mon, 5 Feb 2018 02:21:20 +0100
Subject: [PATCH 381/676] s390/kconfig: Remove ARCH_WANTS_PROT_NUMA_PROT_NONE
 select

The ARCH_WANTS_PROT_NUMA_PROT_NONE symbol was removed by
commit 6a33979d5bd7 ("mm: remove misleading ARCH_USES_NUMA_PROT_NONE"),
but S390 still selects it.

Remove the ARCH_WANTS_PROT_NUMA_PROT_NONE select from the S390 symbol.

Discovered with the
https://github.com/ulfalizer/Kconfiglib/blob/master/examples/list_undefined.py
script.

Signed-off-by: Ulf Magnusson <ulfalizer@gmail.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 0636229dae227..eaee7087886fa 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -112,7 +112,6 @@ config S390
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
-	select ARCH_WANTS_PROT_NUMA_PROT_NONE
 	select ARCH_WANTS_UBSAN_NO_NULL
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select BUILDTIME_EXTABLE_SORT

From 43cdd1b716b26f6af16da4e145b6578f98798bf6 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 19 Jan 2018 10:06:03 +0100
Subject: [PATCH 382/676] ACPI: sbshc: remove raw pointer from printk() message

There's no need to be printing a raw kernel pointer to the kernel log at
every boot.  So just remove it, and change the whole message to use the
correct dev_info() call at the same time.

Reported-by: Wang Qize <wang_qize@venustech.com.cn>
Cc: All applicable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/sbshc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/sbshc.c b/drivers/acpi/sbshc.c
index 2fa8304171e09..7a3431018e0ab 100644
--- a/drivers/acpi/sbshc.c
+++ b/drivers/acpi/sbshc.c
@@ -275,8 +275,8 @@ static int acpi_smbus_hc_add(struct acpi_device *device)
 	device->driver_data = hc;
 
 	acpi_ec_add_query_handler(hc->ec, hc->query_bit, NULL, smbus_alarm, hc);
-	printk(KERN_INFO PREFIX "SBS HC: EC = 0x%p, offset = 0x%0x, query_bit = 0x%0x\n",
-		hc->ec, hc->offset, hc->query_bit);
+	dev_info(&device->dev, "SBS HC: offset = 0x%0x, query_bit = 0x%0x\n",
+		 hc->offset, hc->query_bit);
 
 	return 0;
 }

From 0725390da9fb5add90a6ee5d5bbe58b5f47b25ac Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Tue, 23 Jan 2018 02:09:02 +0000
Subject: [PATCH 383/676] cpufreq: scpi: fix error return code in
 scpi_cpufreq_init()

Fix to return a negative error code from the clk_get() error handling
case instead of 0, as done elsewhere in this function.

Fixes: 343a8d17fa8d (cpufreq: scpi: remove arm_big_little dependency)
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/scpi-cpufreq.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c
index efe32cbefa06b..c32a833e1b005 100644
--- a/drivers/cpufreq/scpi-cpufreq.c
+++ b/drivers/cpufreq/scpi-cpufreq.c
@@ -145,6 +145,7 @@ static int scpi_cpufreq_init(struct cpufreq_policy *policy)
 	if (IS_ERR(priv->clk)) {
 		dev_err(cpu_dev, "%s: Failed to get clk for cpu: %d\n",
 			__func__, cpu_dev->id);
+		ret = PTR_ERR(priv->clk);
 		goto out_free_cpufreq_table;
 	}
 

From 70f6bf2a3b7e40c3f802b0ea837762a8bc6c1430 Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Mon, 29 Jan 2018 10:27:57 +0800
Subject: [PATCH 384/676] cpufreq: intel_pstate: Enable HWP during system
 resume on CPU0

When maxcpus=1 is in the kernel command line, the BP is responsible
for re-enabling the HWP - because currently only the APs invoke
intel_pstate_hwp_enable() during their online process - which might
put the system into unstable state after resume.

Fix this by enabling the HWP explicitly on BP during resume.

Reported-by: Doug Smythies <dsmythies@telus.net>
Suggested-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Yu Chen <yu.c.chen@intel.com>
[ rjw: Subject/changelog, minor modifications ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 7edf7a0e5a96a..6d084c61ee253 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -779,6 +779,8 @@ static int intel_pstate_hwp_save_state(struct cpufreq_policy *policy)
 	return 0;
 }
 
+static void intel_pstate_hwp_enable(struct cpudata *cpudata);
+
 static int intel_pstate_resume(struct cpufreq_policy *policy)
 {
 	if (!hwp_active)
@@ -786,6 +788,9 @@ static int intel_pstate_resume(struct cpufreq_policy *policy)
 
 	mutex_lock(&intel_pstate_limits_lock);
 
+	if (policy->cpu == 0)
+		intel_pstate_hwp_enable(all_cpu_data[policy->cpu]);
+
 	all_cpu_data[policy->cpu]->epp_policy = 0;
 	intel_pstate_hwp_set(policy->cpu);
 

From ffd81dcfef85a33729f90e4acd2f61a68e56b993 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Tue, 30 Jan 2018 06:42:37 +0100
Subject: [PATCH 385/676] cpufreq: Add and use
 cpufreq_for_each_{valid_,}entry_idx()

Pointer subtraction is slow and tedious. Therefore, replace all instances
where cpufreq_for_each_{valid_,}entry loops contained such substractions
with an iteration macro providing an index to the frequency_table entry.

Suggested-by: Al Viro <viro@ZenIV.linux.org.uk>
Link: http://lkml.kernel.org/r/20180120020237.GM13338@ZenIV.linux.org.uk
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/cpu-freq/cpu-drivers.txt |   4 +
 drivers/cpufreq/exynos5440-cpufreq.c   |   7 +-
 drivers/cpufreq/freq_table.c           |   8 +-
 drivers/cpufreq/longhaul.c             |   4 +-
 drivers/cpufreq/pasemi-cpufreq.c       |   6 +-
 drivers/sh/clk/core.c                  |   5 +-
 drivers/staging/irda/drivers/sh_sir.c  |   4 +-
 include/linux/cpufreq.h                | 125 ++++++++++++++++---------
 8 files changed, 100 insertions(+), 63 deletions(-)

diff --git a/Documentation/cpu-freq/cpu-drivers.txt b/Documentation/cpu-freq/cpu-drivers.txt
index 434c49cc7330a..61546ac578d60 100644
--- a/Documentation/cpu-freq/cpu-drivers.txt
+++ b/Documentation/cpu-freq/cpu-drivers.txt
@@ -291,3 +291,7 @@ For example:
 		/* Do something with pos */
 		pos->frequency = ...
 	}
+
+If you need to work with the position of pos within driver_freq_table,
+do not subtract the pointers, as it is quite costly. Instead, use the
+macros cpufreq_for_each_entry_idx() and cpufreq_for_each_valid_entry_idx().
diff --git a/drivers/cpufreq/exynos5440-cpufreq.c b/drivers/cpufreq/exynos5440-cpufreq.c
index b6b369c222726..932caa386ecec 100644
--- a/drivers/cpufreq/exynos5440-cpufreq.c
+++ b/drivers/cpufreq/exynos5440-cpufreq.c
@@ -115,10 +115,10 @@ static struct cpufreq_freqs freqs;
 static int init_div_table(void)
 {
 	struct cpufreq_frequency_table *pos, *freq_tbl = dvfs_info->freq_table;
-	unsigned int tmp, clk_div, ema_div, freq, volt_id;
+	unsigned int tmp, clk_div, ema_div, freq, volt_id, idx;
 	struct dev_pm_opp *opp;
 
-	cpufreq_for_each_entry(pos, freq_tbl) {
+	cpufreq_for_each_entry_idx(pos, freq_tbl, idx) {
 		opp = dev_pm_opp_find_freq_exact(dvfs_info->dev,
 					pos->frequency * 1000, true);
 		if (IS_ERR(opp)) {
@@ -154,8 +154,7 @@ static int init_div_table(void)
 		tmp = (clk_div | ema_div | (volt_id << P0_7_VDD_SHIFT)
 			| ((freq / FREQ_UNIT) << P0_7_FREQ_SHIFT));
 
-		__raw_writel(tmp, dvfs_info->base + XMU_PMU_P0_7 + 4 *
-						(pos - freq_tbl));
+		__raw_writel(tmp, dvfs_info->base + XMU_PMU_P0_7 + 4 * idx);
 		dev_pm_opp_put(opp);
 	}
 
diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c
index 3bbbf9e6960cd..6d007f824ca74 100644
--- a/drivers/cpufreq/freq_table.c
+++ b/drivers/cpufreq/freq_table.c
@@ -143,10 +143,9 @@ int cpufreq_table_index_unsorted(struct cpufreq_policy *policy,
 		break;
 	}
 
-	cpufreq_for_each_valid_entry(pos, table) {
+	cpufreq_for_each_valid_entry_idx(pos, table, i) {
 		freq = pos->frequency;
 
-		i = pos - table;
 		if ((freq < policy->min) || (freq > policy->max))
 			continue;
 		if (freq == target_freq) {
@@ -211,15 +210,16 @@ int cpufreq_frequency_table_get_index(struct cpufreq_policy *policy,
 		unsigned int freq)
 {
 	struct cpufreq_frequency_table *pos, *table = policy->freq_table;
+	int idx;
 
 	if (unlikely(!table)) {
 		pr_debug("%s: Unable to find frequency table\n", __func__);
 		return -ENOENT;
 	}
 
-	cpufreq_for_each_valid_entry(pos, table)
+	cpufreq_for_each_valid_entry_idx(pos, table, idx)
 		if (pos->frequency == freq)
-			return pos - table;
+			return idx;
 
 	return -EINVAL;
 }
diff --git a/drivers/cpufreq/longhaul.c b/drivers/cpufreq/longhaul.c
index 5faa37c5b0910..942632a27b50f 100644
--- a/drivers/cpufreq/longhaul.c
+++ b/drivers/cpufreq/longhaul.c
@@ -600,7 +600,7 @@ static void longhaul_setup_voltagescaling(void)
 	/* Calculate kHz for one voltage step */
 	kHz_step = (highest_speed - min_vid_speed) / numvscales;
 
-	cpufreq_for_each_entry(freq_pos, longhaul_table) {
+	cpufreq_for_each_entry_idx(freq_pos, longhaul_table, j) {
 		speed = freq_pos->frequency;
 		if (speed > min_vid_speed)
 			pos = (speed - min_vid_speed) / kHz_step + minvid.pos;
@@ -609,7 +609,7 @@ static void longhaul_setup_voltagescaling(void)
 		freq_pos->driver_data |= mV_vrm_table[pos] << 8;
 		vid = vrm_mV_table[mV_vrm_table[pos]];
 		pr_info("f: %d kHz, index: %d, vid: %d mV\n",
-			speed, (int)(freq_pos - longhaul_table), vid.mV);
+			speed, j, vid.mV);
 	}
 
 	can_scale_voltage = 1;
diff --git a/drivers/cpufreq/pasemi-cpufreq.c b/drivers/cpufreq/pasemi-cpufreq.c
index b257fc7d52041..75dfbd2a58ea6 100644
--- a/drivers/cpufreq/pasemi-cpufreq.c
+++ b/drivers/cpufreq/pasemi-cpufreq.c
@@ -139,7 +139,7 @@ static int pas_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	struct cpufreq_frequency_table *pos;
 	const u32 *max_freqp;
 	u32 max_freq;
-	int cur_astate;
+	int cur_astate, idx;
 	struct resource res;
 	struct device_node *cpu, *dn;
 	int err = -ENODEV;
@@ -198,9 +198,9 @@ static int pas_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	pr_debug("initializing frequency table\n");
 
 	/* initialize frequency table */
-	cpufreq_for_each_entry(pos, pas_freqs) {
+	cpufreq_for_each_entry_idx(pos, pas_freqs, idx) {
 		pos->frequency = get_astate_freq(pos->driver_data) * 100000;
-		pr_debug("%d: %d\n", (int)(pos - pas_freqs), pos->frequency);
+		pr_debug("%d: %d\n", idx, pos->frequency);
 	}
 
 	cur_astate = get_cur_astate(policy->cpu);
diff --git a/drivers/sh/clk/core.c b/drivers/sh/clk/core.c
index 92863e3818e5c..9475353f49d6c 100644
--- a/drivers/sh/clk/core.c
+++ b/drivers/sh/clk/core.c
@@ -197,10 +197,11 @@ int clk_rate_table_find(struct clk *clk,
 			unsigned long rate)
 {
 	struct cpufreq_frequency_table *pos;
+	int idx;
 
-	cpufreq_for_each_valid_entry(pos, freq_table)
+	cpufreq_for_each_valid_entry_idx(pos, freq_table, idx)
 		if (pos->frequency == rate)
-			return pos - freq_table;
+			return idx;
 
 	return -ENOENT;
 }
diff --git a/drivers/staging/irda/drivers/sh_sir.c b/drivers/staging/irda/drivers/sh_sir.c
index fede6864c737b..0d0687cc454ab 100644
--- a/drivers/staging/irda/drivers/sh_sir.c
+++ b/drivers/staging/irda/drivers/sh_sir.c
@@ -226,7 +226,7 @@ static u32 sh_sir_find_sclk(struct clk *irda_clk)
 	clk_put(pclk);
 
 	/* IrDA can not set over peripheral_clk */
-	cpufreq_for_each_valid_entry(pos, freq_table) {
+	cpufreq_for_each_valid_entry_idx(pos, freq_table, index) {
 		u32 freq = pos->frequency;
 
 		/* IrDA should not over peripheral_clk */
@@ -236,7 +236,7 @@ static u32 sh_sir_find_sclk(struct clk *irda_clk)
 		tmp = freq % SCLK_BASE;
 		if (tmp < min) {
 			min = tmp;
-			index = pos - freq_table;
+			break;
 		}
 	}
 
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 065f3a8eb4861..21e8d248d9564 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -628,6 +628,18 @@ static inline void dev_pm_opp_free_cpufreq_table(struct device *dev,
 #define cpufreq_for_each_entry(pos, table)	\
 	for (pos = table; pos->frequency != CPUFREQ_TABLE_END; pos++)
 
+/*
+ * cpufreq_for_each_entry_idx -	iterate over a cpufreq_frequency_table
+ *	with index
+ * @pos:	the cpufreq_frequency_table * to use as a loop cursor.
+ * @table:	the cpufreq_frequency_table * to iterate over.
+ * @idx:	the table entry currently being processed
+ */
+
+#define cpufreq_for_each_entry_idx(pos, table, idx)	\
+	for (pos = table, idx = 0; pos->frequency != CPUFREQ_TABLE_END; \
+		pos++, idx++)
+
 /*
  * cpufreq_for_each_valid_entry -     iterate over a cpufreq_frequency_table
  *	excluding CPUFREQ_ENTRY_INVALID frequencies.
@@ -641,6 +653,21 @@ static inline void dev_pm_opp_free_cpufreq_table(struct device *dev,
 			continue;					\
 		else
 
+/*
+ * cpufreq_for_each_valid_entry_idx -     iterate with index over a cpufreq
+ *	frequency_table excluding CPUFREQ_ENTRY_INVALID frequencies.
+ * @pos:	the cpufreq_frequency_table * to use as a loop cursor.
+ * @table:	the cpufreq_frequency_table * to iterate over.
+ * @idx:	the table entry currently being processed
+ */
+
+#define cpufreq_for_each_valid_entry_idx(pos, table, idx)		\
+	cpufreq_for_each_entry_idx(pos, table, idx)			\
+		if (pos->frequency == CPUFREQ_ENTRY_INVALID)		\
+			continue;					\
+		else
+
+
 int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy,
 				    struct cpufreq_frequency_table *table);
 
@@ -667,19 +694,20 @@ static inline int cpufreq_table_find_index_al(struct cpufreq_policy *policy,
 					      unsigned int target_freq)
 {
 	struct cpufreq_frequency_table *table = policy->freq_table;
-	struct cpufreq_frequency_table *pos, *best = table - 1;
+	struct cpufreq_frequency_table *pos;
 	unsigned int freq;
+	int idx, best = -1;
 
-	cpufreq_for_each_valid_entry(pos, table) {
+	cpufreq_for_each_valid_entry_idx(pos, table, idx) {
 		freq = pos->frequency;
 
 		if (freq >= target_freq)
-			return pos - table;
+			return idx;
 
-		best = pos;
+		best = idx;
 	}
 
-	return best - table;
+	return best;
 }
 
 /* Find lowest freq at or above target in a table in descending order */
@@ -687,28 +715,29 @@ static inline int cpufreq_table_find_index_dl(struct cpufreq_policy *policy,
 					      unsigned int target_freq)
 {
 	struct cpufreq_frequency_table *table = policy->freq_table;
-	struct cpufreq_frequency_table *pos, *best = table - 1;
+	struct cpufreq_frequency_table *pos;
 	unsigned int freq;
+	int idx, best = -1;
 
-	cpufreq_for_each_valid_entry(pos, table) {
+	cpufreq_for_each_valid_entry_idx(pos, table, idx) {
 		freq = pos->frequency;
 
 		if (freq == target_freq)
-			return pos - table;
+			return idx;
 
 		if (freq > target_freq) {
-			best = pos;
+			best = idx;
 			continue;
 		}
 
 		/* No freq found above target_freq */
-		if (best == table - 1)
-			return pos - table;
+		if (best == -1)
+			return idx;
 
-		return best - table;
+		return best;
 	}
 
-	return best - table;
+	return best;
 }
 
 /* Works only on sorted freq-tables */
@@ -728,28 +757,29 @@ static inline int cpufreq_table_find_index_ah(struct cpufreq_policy *policy,
 					      unsigned int target_freq)
 {
 	struct cpufreq_frequency_table *table = policy->freq_table;
-	struct cpufreq_frequency_table *pos, *best = table - 1;
+	struct cpufreq_frequency_table *pos;
 	unsigned int freq;
+	int idx, best = -1;
 
-	cpufreq_for_each_valid_entry(pos, table) {
+	cpufreq_for_each_valid_entry_idx(pos, table, idx) {
 		freq = pos->frequency;
 
 		if (freq == target_freq)
-			return pos - table;
+			return idx;
 
 		if (freq < target_freq) {
-			best = pos;
+			best = idx;
 			continue;
 		}
 
 		/* No freq found below target_freq */
-		if (best == table - 1)
-			return pos - table;
+		if (best == -1)
+			return idx;
 
-		return best - table;
+		return best;
 	}
 
-	return best - table;
+	return best;
 }
 
 /* Find highest freq at or below target in a table in descending order */
@@ -757,19 +787,20 @@ static inline int cpufreq_table_find_index_dh(struct cpufreq_policy *policy,
 					      unsigned int target_freq)
 {
 	struct cpufreq_frequency_table *table = policy->freq_table;
-	struct cpufreq_frequency_table *pos, *best = table - 1;
+	struct cpufreq_frequency_table *pos;
 	unsigned int freq;
+	int idx, best = -1;
 
-	cpufreq_for_each_valid_entry(pos, table) {
+	cpufreq_for_each_valid_entry_idx(pos, table, idx) {
 		freq = pos->frequency;
 
 		if (freq <= target_freq)
-			return pos - table;
+			return idx;
 
-		best = pos;
+		best = idx;
 	}
 
-	return best - table;
+	return best;
 }
 
 /* Works only on sorted freq-tables */
@@ -789,32 +820,33 @@ static inline int cpufreq_table_find_index_ac(struct cpufreq_policy *policy,
 					      unsigned int target_freq)
 {
 	struct cpufreq_frequency_table *table = policy->freq_table;
-	struct cpufreq_frequency_table *pos, *best = table - 1;
+	struct cpufreq_frequency_table *pos;
 	unsigned int freq;
+	int idx, best = -1;
 
-	cpufreq_for_each_valid_entry(pos, table) {
+	cpufreq_for_each_valid_entry_idx(pos, table, idx) {
 		freq = pos->frequency;
 
 		if (freq == target_freq)
-			return pos - table;
+			return idx;
 
 		if (freq < target_freq) {
-			best = pos;
+			best = idx;
 			continue;
 		}
 
 		/* No freq found below target_freq */
-		if (best == table - 1)
-			return pos - table;
+		if (best == -1)
+			return idx;
 
 		/* Choose the closest freq */
-		if (target_freq - best->frequency > freq - target_freq)
-			return pos - table;
+		if (target_freq - table[best].frequency > freq - target_freq)
+			return idx;
 
-		return best - table;
+		return best;
 	}
 
-	return best - table;
+	return best;
 }
 
 /* Find closest freq to target in a table in descending order */
@@ -822,32 +854,33 @@ static inline int cpufreq_table_find_index_dc(struct cpufreq_policy *policy,
 					      unsigned int target_freq)
 {
 	struct cpufreq_frequency_table *table = policy->freq_table;
-	struct cpufreq_frequency_table *pos, *best = table - 1;
+	struct cpufreq_frequency_table *pos;
 	unsigned int freq;
+	int idx, best = -1;
 
-	cpufreq_for_each_valid_entry(pos, table) {
+	cpufreq_for_each_valid_entry_idx(pos, table, idx) {
 		freq = pos->frequency;
 
 		if (freq == target_freq)
-			return pos - table;
+			return idx;
 
 		if (freq > target_freq) {
-			best = pos;
+			best = idx;
 			continue;
 		}
 
 		/* No freq found above target_freq */
-		if (best == table - 1)
-			return pos - table;
+		if (best == -1)
+			return idx;
 
 		/* Choose the closest freq */
-		if (best->frequency - target_freq > target_freq - freq)
-			return pos - table;
+		if (table[best].frequency - target_freq > target_freq - freq)
+			return idx;
 
-		return best - table;
+		return best;
 	}
 
-	return best - table;
+	return best;
 }
 
 /* Works only on sorted freq-tables */

From d0404738c687c0ecaa7d6b7c5c39e4c0dac791e6 Mon Sep 17 00:00:00 2001
From: Nicolas Chauvet <kwizart@gmail.com>
Date: Tue, 30 Jan 2018 10:55:26 +0100
Subject: [PATCH 386/676] arm: imx: Add MODULE_ALIAS for cpufreq

Without this, the imx6q-cpufreq driver isn't loaded
automatically when built as a module

Tested on wandboard quad with a fedora 27 kernel rpm

Signed-off-by: Nicolas Chauvet <kwizart@gmail.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/imx6q-cpufreq.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c
index 741f22e5cee31..ff67859948b34 100644
--- a/drivers/cpufreq/imx6q-cpufreq.c
+++ b/drivers/cpufreq/imx6q-cpufreq.c
@@ -504,6 +504,7 @@ static struct platform_driver imx6q_cpufreq_platdrv = {
 };
 module_platform_driver(imx6q_cpufreq_platdrv);
 
+MODULE_ALIAS("platform:imx6q-cpufreq");
 MODULE_AUTHOR("Shawn Guo <shawn.guo@linaro.org>");
 MODULE_DESCRIPTION("Freescale i.MX6Q cpufreq driver");
 MODULE_LICENSE("GPL");

From 781198f1f373c3e350dbeb3af04a7d4c81c1b8d7 Mon Sep 17 00:00:00 2001
From: Simon Gaiser <simon@invisiblethingslab.com>
Date: Wed, 7 Feb 2018 21:47:40 +0100
Subject: [PATCH 387/676] xen: Fix {set,clear}_foreign_p2m_mapping on
 autotranslating guests

Commit 82616f9599a7 ("xen: remove tests for pvh mode in pure pv paths")
removed the check for autotranslation from {set,clear}_foreign_p2m_mapping
but those are called by grant-table.c also on PVH/HVM guests.

Cc: <stable@vger.kernel.org> # 4.14
Fixes: 82616f9599a7 ("xen: remove tests for pvh mode in pure pv paths")
Signed-off-by: Simon Gaiser <simon@invisiblethingslab.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 arch/x86/xen/p2m.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 13b4f19b91313..159a897151d64 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -694,6 +694,9 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
 	int i, ret = 0;
 	pte_t *pte;
 
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return 0;
+
 	if (kmap_ops) {
 		ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
 						kmap_ops, count);
@@ -736,6 +739,9 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
 {
 	int i, ret = 0;
 
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return 0;
+
 	for (i = 0; i < count; i++) {
 		unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i]));
 		unsigned long pfn = page_to_pfn(pages[i]);

From b7d99235473ad3a550f8eb05bd4469edadf1c8e6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 7 Feb 2018 20:27:12 -0800
Subject: [PATCH 388/676] nfp: bpf: fix immed relocation for larger offsets

Immed relocation is missing a shift which means for larger
offsets the lower and higher part of the address would be
ORed together.

Fixes: ce4ebfd859c3 ("nfp: bpf: add helpers for updating immediate instructions")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_asm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.c b/drivers/net/ethernet/netronome/nfp/nfp_asm.c
index 3f6952b66a497..1e597600c6938 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.c
@@ -107,7 +107,7 @@ u16 immed_get_value(u64 instr)
 	if (!unreg_is_imm(reg))
 		reg = FIELD_GET(OP_IMMED_B_SRC, instr);
 
-	return (reg & 0xff) | FIELD_GET(OP_IMMED_IMM, instr);
+	return (reg & 0xff) | FIELD_GET(OP_IMMED_IMM, instr) << 8;
 }
 
 void immed_set_value(u64 *instr, u16 immed)

From 0badd331491097cc3702d05a6dd0264a434712b2 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 7 Feb 2018 20:27:13 -0800
Subject: [PATCH 389/676] libbpf: complete list of strings for guessing program
 type

It seems that the type guessing feature for libbpf, based on the name of
the ELF section the program is located in, was inspired from
samples/bpf/prog_load.c, which was not used by any sample for loading
programs of certain types such as TC actions and classifiers, or
LWT-related types. As a consequence, libbpf is not able to guess the
type of such programs and to load them automatically if type is not
provided to the `bpf_load_prog()` function.

Add ELF section names associated to those eBPF program types so that
they can be loaded with e.g. bpftool as well.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 71ddc481f349d..c64840365433c 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1816,12 +1816,17 @@ static const struct {
 	BPF_PROG_SEC("socket",		BPF_PROG_TYPE_SOCKET_FILTER),
 	BPF_PROG_SEC("kprobe/",		BPF_PROG_TYPE_KPROBE),
 	BPF_PROG_SEC("kretprobe/",	BPF_PROG_TYPE_KPROBE),
+	BPF_PROG_SEC("classifier",	BPF_PROG_TYPE_SCHED_CLS),
+	BPF_PROG_SEC("action",		BPF_PROG_TYPE_SCHED_ACT),
 	BPF_PROG_SEC("tracepoint/",	BPF_PROG_TYPE_TRACEPOINT),
 	BPF_PROG_SEC("xdp",		BPF_PROG_TYPE_XDP),
 	BPF_PROG_SEC("perf_event",	BPF_PROG_TYPE_PERF_EVENT),
 	BPF_PROG_SEC("cgroup/skb",	BPF_PROG_TYPE_CGROUP_SKB),
 	BPF_PROG_SEC("cgroup/sock",	BPF_PROG_TYPE_CGROUP_SOCK),
 	BPF_PROG_SEC("cgroup/dev",	BPF_PROG_TYPE_CGROUP_DEVICE),
+	BPF_PROG_SEC("lwt_in",		BPF_PROG_TYPE_LWT_IN),
+	BPF_PROG_SEC("lwt_out",		BPF_PROG_TYPE_LWT_OUT),
+	BPF_PROG_SEC("lwt_xmit",	BPF_PROG_TYPE_LWT_XMIT),
 	BPF_PROG_SEC("sockops",		BPF_PROG_TYPE_SOCK_OPS),
 	BPF_PROG_SEC("sk_skb",		BPF_PROG_TYPE_SK_SKB),
 };

From 92426820f7616caebdf6ba665f749102a670b3d4 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 7 Feb 2018 20:27:14 -0800
Subject: [PATCH 390/676] tools: bpftool: exit doc Makefile early if rst2man is
 not available

If rst2man is not available on the system, running `make doc` from the
bpftool directory fails with an error message. However, it creates empty
manual pages (.8 files in this case). A subsequent call to `make
doc-install` would then succeed and install those empty man pages on the
system.

To prevent this, raise a Makefile error and exit immediately if rst2man
is not available before generating the pages from the rst documentation.

Fixes: ff69c21a85a4 ("tools: bpftool: add documentation")
Reported-by: Jason van Aaardt <jason.vanaardt@netronome.com>
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/Documentation/Makefile | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/bpf/bpftool/Documentation/Makefile b/tools/bpf/bpftool/Documentation/Makefile
index c462a928e03df..a9d47c1558bb1 100644
--- a/tools/bpf/bpftool/Documentation/Makefile
+++ b/tools/bpf/bpftool/Documentation/Makefile
@@ -23,7 +23,12 @@ DOC_MAN8 = $(addprefix $(OUTPUT),$(_DOC_MAN8))
 man: man8
 man8: $(DOC_MAN8)
 
+RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null)
+
 $(OUTPUT)%.8: %.rst
+ifndef RST2MAN_DEP
+	$(error "rst2man not found, but required to generate man pages")
+endif
 	$(QUIET_GEN)rst2man $< > $@
 
 clean:

From 2148481d5d3111e5e9aa5adc1905fa3539e548c8 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 7 Feb 2018 20:27:15 -0800
Subject: [PATCH 391/676] tools: bpftool: make syntax for program map update
 explicit in man page

Specify in the documentation that when using bpftool to update a map of
type BPF_MAP_TYPE_PROG_ARRAY, the syntax for the program used as a value
should use the "id|tag|pinned" keywords convention, as used with
"bpftool prog" commands.

Fixes: ff69c21a85a4 ("tools: bpftool: add documentation")
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/Documentation/bpftool-map.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index 0ab32b312aec0..457e868bd32f4 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -31,7 +31,8 @@ MAP COMMANDS
 |	**bpftool** **map help**
 |
 |	*MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
-|	*VALUE* := { *BYTES* | *MAP* | *PROGRAM* }
+|	*PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
+|	*VALUE* := { *BYTES* | *MAP* | *PROG* }
 |	*UPDATE_FLAGS* := { **any** | **exist** | **noexist** }
 
 DESCRIPTION

From 55f3538c4923e9dfca132e99ebec370e8094afda Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 7 Feb 2018 20:27:16 -0800
Subject: [PATCH 392/676] tools: bpftool: add bash completion for `bpftool prog
 load`

Add bash completion for bpftool command `prog load`. Completion for this
command is easy, as it only takes existing file paths as arguments.

Fixes: 49a086c201a9 ("bpftool: implement prog load command")
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/bpf/bpftool/bash-completion/bpftool | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 0137866bb8f69..17d246418348b 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -230,10 +230,14 @@ _bpftool()
                     fi
                     return 0
                     ;;
+                load)
+                    _filedir
+                    return 0
+                    ;;
                 *)
                     [[ $prev == $object ]] && \
-                        COMPREPLY=( $( compgen -W 'dump help pin show list' -- \
-                            "$cur" ) )
+                        COMPREPLY=( $( compgen -W 'dump help pin load \
+                            show list' -- "$cur" ) )
                     ;;
             esac
             ;;

From a827a164b98be2acba6a2e7559643ac9a5745086 Mon Sep 17 00:00:00 2001
From: Quentin Monnet <quentin.monnet@netronome.com>
Date: Wed, 7 Feb 2018 20:27:17 -0800
Subject: [PATCH 393/676] tools: bpftool: add bash completion for cgroup
 commands

Add bash completion for "bpftool cgroup" command family. While at it,
also fix the formatting of some keywords in the man page for cgroups.

Fixes: 5ccda64d38cc ("bpftool: implement cgroup bpf operations")
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 .../bpftool/Documentation/bpftool-cgroup.rst  |  4 +-
 tools/bpf/bpftool/bash-completion/bpftool     | 64 +++++++++++++++++--
 2 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
index 2fe2a1bdbe3e3..0e4e923235b6b 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
@@ -26,8 +26,8 @@ MAP COMMANDS
 |	**bpftool** **cgroup help**
 |
 |	*PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
-|	*ATTACH_TYPE* := { *ingress* | *egress* | *sock_create* | *sock_ops* | *device* }
-|	*ATTACH_FLAGS* := { *multi* | *override* }
+|	*ATTACH_TYPE* := { **ingress** | **egress** | **sock_create** | **sock_ops** | **device** }
+|	*ATTACH_FLAGS* := { **multi** | **override** }
 
 DESCRIPTION
 ===========
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index 17d246418348b..08719c54a614a 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -52,16 +52,24 @@ _bpftool_once_attr()
     done
 }
 
-# Takes a list of words in argument; adds them all to COMPREPLY if none of them
-# is already present on the command line. Returns no value.
-_bpftool_one_of_list()
+# Takes a list of words as argument; if any of those words is present on the
+# command line, return 0. Otherwise, return 1.
+_bpftool_search_list()
 {
     local w idx
     for w in $*; do
         for (( idx=3; idx < ${#words[@]}-1; idx++ )); do
-            [[ $w == ${words[idx]} ]] && return 1
+            [[ $w == ${words[idx]} ]] && return 0
         done
     done
+    return 1
+}
+
+# Takes a list of words in argument; adds them all to COMPREPLY if none of them
+# is already present on the command line. Returns no value.
+_bpftool_one_of_list()
+{
+    _bpftool_search_list $* && return 1
     COMPREPLY+=( $( compgen -W "$*" -- "$cur" ) )
 }
 
@@ -351,6 +359,54 @@ _bpftool()
                     ;;
             esac
             ;;
+        cgroup)
+            case $command in
+                show|list)
+                    _filedir
+                    return 0
+                    ;;
+                attach|detach)
+                    local ATTACH_TYPES='ingress egress sock_create sock_ops \
+                        device'
+                    local ATTACH_FLAGS='multi override'
+                    local PROG_TYPE='id pinned tag'
+                    case $prev in
+                        $command)
+                            _filedir
+                            return 0
+                            ;;
+                        ingress|egress|sock_create|sock_ops|device)
+                            COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \
+                                "$cur" ) )
+                            return 0
+                            ;;
+                        id)
+                            _bpftool_get_prog_ids
+                            return 0
+                            ;;
+                        *)
+                            if ! _bpftool_search_list "$ATTACH_TYPES"; then
+                                COMPREPLY=( $( compgen -W "$ATTACH_TYPES" -- \
+                                    "$cur" ) )
+                            elif [[ "$command" == "attach" ]]; then
+                                # We have an attach type on the command line,
+                                # but it is not the previous word, or
+                                # "id|pinned|tag" (we already checked for
+                                # that). This should only leave the case when
+                                # we need attach flags for "attach" commamnd.
+                                _bpftool_one_of_list "$ATTACH_FLAGS"
+                            fi
+                            return 0
+                            ;;
+                    esac
+                    ;;
+                *)
+                    [[ $prev == $object ]] && \
+                        COMPREPLY=( $( compgen -W 'help attach detach \
+                            show list' -- "$cur" ) )
+                    ;;
+            esac
+            ;;
     esac
 } &&
 complete -F _bpftool bpftool

From eff84b379089cd8b4e83599639c1f5f6e34ef7bf Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 24 Jan 2018 00:31:27 -0800
Subject: [PATCH 394/676] crypto: sha512-mb - initialize pending lengths
 correctly

The SHA-512 multibuffer code keeps track of the number of blocks pending
in each lane.  The minimum of these values is used to identify the next
lane that will be completed.  Unused lanes are set to a large number
(0xFFFFFFFF) so that they don't affect this calculation.

However, it was forgotten to set the lengths to this value in the
initial state, where all lanes are unused.  As a result it was possible
for sha512_mb_mgr_get_comp_job_avx2() to select an unused lane, causing
a NULL pointer dereference.  Specifically this could happen in the case
where ->update() was passed fewer than SHA512_BLOCK_SIZE bytes of data,
so it then called sha_complete_job() without having actually submitted
any blocks to the multi-buffer code.  This hit a NULL pointer
dereference if another task happened to have submitted blocks
concurrently to the same CPU and the flush timer had not yet expired.

Fix this by initializing sha512_mb_mgr->lens correctly.

As usual, this bug was found by syzkaller.

Fixes: 45691e2d9b18 ("crypto: sha512-mb - submit/flush routines for AVX2")
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: <stable@vger.kernel.org> # v4.8+
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c
index 36870b26067a7..d08805032f019 100644
--- a/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_mgr_init_avx2.c
@@ -57,10 +57,12 @@ void sha512_mb_mgr_init_avx2(struct sha512_mb_mgr *state)
 {
 	unsigned int j;
 
-	state->lens[0] = 0;
-	state->lens[1] = 1;
-	state->lens[2] = 2;
-	state->lens[3] = 3;
+	/* initially all lanes are unused */
+	state->lens[0] = 0xFFFFFFFF00000000;
+	state->lens[1] = 0xFFFFFFFF00000001;
+	state->lens[2] = 0xFFFFFFFF00000002;
+	state->lens[3] = 0xFFFFFFFF00000003;
+
 	state->unused_lanes = 0xFF03020100;
 	for (j = 0; j < 4; j++)
 		state->ldata[j].job_in_lane = NULL;

From 87a81dce53b1ea61acaeefa5191a0376a2d1d721 Mon Sep 17 00:00:00 2001
From: LEROY Christophe <christophe.leroy@c-s.fr>
Date: Fri, 26 Jan 2018 17:09:59 +0100
Subject: [PATCH 395/676] crypto: talitos - fix Kernel Oops on hashing an empty
 file

Performing the hash of an empty file leads to a kernel Oops

[   44.504600] Unable to handle kernel paging request for data at address 0x0000000c
[   44.512819] Faulting instruction address: 0xc02d2be8
[   44.524088] Oops: Kernel access of bad area, sig: 11 [#1]
[   44.529171] BE PREEMPT CMPC885
[   44.532232] CPU: 0 PID: 491 Comm: md5sum Not tainted 4.15.0-rc8-00211-g3a968610b6ea #81
[   44.540814] NIP:  c02d2be8 LR: c02d2984 CTR: 00000000
[   44.545812] REGS: c6813c90 TRAP: 0300   Not tainted  (4.15.0-rc8-00211-g3a968610b6ea)
[   44.554223] MSR:  00009032 <EE,ME,IR,DR,RI>  CR: 48222822  XER: 20000000
[   44.560855] DAR: 0000000c DSISR: c0000000
[   44.560855] GPR00: c02d28fc c6813d40 c6828000 c646fa40 00000001 00000001 00000001 00000000
[   44.560855] GPR08: 0000004c 00000000 c000bfcc 00000000 28222822 100280d4 00000000 10020008
[   44.560855] GPR16: 00000000 00000020 00000000 00000000 10024008 00000000 c646f9f0 c6179a10
[   44.560855] GPR24: 00000000 00000001 c62f0018 c6179a10 00000000 c6367a30 c62f0000 c646f9c0
[   44.598542] NIP [c02d2be8] ahash_process_req+0x448/0x700
[   44.603751] LR [c02d2984] ahash_process_req+0x1e4/0x700
[   44.608868] Call Trace:
[   44.611329] [c6813d40] [c02d28fc] ahash_process_req+0x15c/0x700 (unreliable)
[   44.618302] [c6813d90] [c02060c4] hash_recvmsg+0x11c/0x210
[   44.623716] [c6813db0] [c0331354] ___sys_recvmsg+0x98/0x138
[   44.629226] [c6813eb0] [c03332c0] __sys_recvmsg+0x40/0x84
[   44.634562] [c6813f10] [c03336c0] SyS_socketcall+0xb8/0x1d4
[   44.640073] [c6813f40] [c000d1ac] ret_from_syscall+0x0/0x38
[   44.645530] Instruction dump:
[   44.648465] 38c00001 7f63db78 4e800421 7c791b78 54690ffe 0f090000 80ff0190 2f870000
[   44.656122] 40befe50 2f990001 409e0210 813f01bc <8129000c> b39e003a 7d29c214 913e003c

This patch fixes that Oops by checking if src is NULL.

Fixes: 6a1e8d14156d4 ("crypto: talitos - making mapping helpers more generic")
Cc: <stable@vger.kernel.org>
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/talitos.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index 9c80e0cb16647..6882fa2f8badd 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -1138,6 +1138,10 @@ static int talitos_sg_map(struct device *dev, struct scatterlist *src,
 	struct talitos_private *priv = dev_get_drvdata(dev);
 	bool is_sec1 = has_ftr_sec1(priv);
 
+	if (!src) {
+		to_talitos_ptr(ptr, 0, 0, is_sec1);
+		return 1;
+	}
 	if (sg_count == 1) {
 		to_talitos_ptr(ptr, sg_dma_address(src) + offset, len, is_sec1);
 		return sg_count;

From 4767b9ad7d762876a5865a06465e13e139a01b6b Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Sat, 27 Jan 2018 09:18:32 +0000
Subject: [PATCH 396/676] crypto: sha3-generic - deal with oversize stack
 frames

As reported by kbuild test robot, the optimized SHA3 C implementation
compiles to mn10300 code that uses a disproportionate amount of stack
space, i.e.,

  crypto/sha3_generic.c: In function 'keccakf':
  crypto/sha3_generic.c:147:1: warning: the frame size of 1232 bytes is larger than 1024 bytes [-Wframe-larger-than=]

As kindly diagnosed by Arnd, this does not only occur when building for
the mn10300 architecture (which is what the report was about) but also
for h8300, and builds for other 32-bit architectures show an increase in
stack space utilization as well.

Given that SHA3 operates on 64-bit quantities, and keeps a state matrix
of 25 64-bit words, it is not surprising that 32-bit architectures with
few general purpose registers are impacted the most by this, and it is
therefore reasonable to implement a workaround that distinguishes between
32-bit and 64-bit architectures.

Arnd figured out that taking the round calculation out of the loop, and
inlining it explicitly but only on 64-bit architectures preserves most
of the performance gain achieved by the rewrite, and also gets rid of
the excessive use of stack space.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/sha3_generic.c | 218 +++++++++++++++++++++++-------------------
 1 file changed, 118 insertions(+), 100 deletions(-)

diff --git a/crypto/sha3_generic.c b/crypto/sha3_generic.c
index a965b9d805598..951c4eb70262c 100644
--- a/crypto/sha3_generic.c
+++ b/crypto/sha3_generic.c
@@ -20,6 +20,20 @@
 #include <crypto/sha3.h>
 #include <asm/unaligned.h>
 
+/*
+ * On some 32-bit architectures (mn10300 and h8300), GCC ends up using
+ * over 1 KB of stack if we inline the round calculation into the loop
+ * in keccakf(). On the other hand, on 64-bit architectures with plenty
+ * of [64-bit wide] general purpose registers, not inlining it severely
+ * hurts performance. So let's use 64-bitness as a heuristic to decide
+ * whether to inline or not.
+ */
+#ifdef CONFIG_64BIT
+#define SHA3_INLINE	inline
+#else
+#define SHA3_INLINE	noinline
+#endif
+
 #define KECCAK_ROUNDS 24
 
 static const u64 keccakf_rndc[24] = {
@@ -35,111 +49,115 @@ static const u64 keccakf_rndc[24] = {
 
 /* update the state with given number of rounds */
 
-static void __attribute__((__optimize__("O3"))) keccakf(u64 st[25])
+static SHA3_INLINE void keccakf_round(u64 st[25])
 {
 	u64 t[5], tt, bc[5];
-	int round;
 
-	for (round = 0; round < KECCAK_ROUNDS; round++) {
+	/* Theta */
+	bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
+	bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
+	bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
+	bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
+	bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
+
+	t[0] = bc[4] ^ rol64(bc[1], 1);
+	t[1] = bc[0] ^ rol64(bc[2], 1);
+	t[2] = bc[1] ^ rol64(bc[3], 1);
+	t[3] = bc[2] ^ rol64(bc[4], 1);
+	t[4] = bc[3] ^ rol64(bc[0], 1);
+
+	st[0] ^= t[0];
+
+	/* Rho Pi */
+	tt = st[1];
+	st[ 1] = rol64(st[ 6] ^ t[1], 44);
+	st[ 6] = rol64(st[ 9] ^ t[4], 20);
+	st[ 9] = rol64(st[22] ^ t[2], 61);
+	st[22] = rol64(st[14] ^ t[4], 39);
+	st[14] = rol64(st[20] ^ t[0], 18);
+	st[20] = rol64(st[ 2] ^ t[2], 62);
+	st[ 2] = rol64(st[12] ^ t[2], 43);
+	st[12] = rol64(st[13] ^ t[3], 25);
+	st[13] = rol64(st[19] ^ t[4],  8);
+	st[19] = rol64(st[23] ^ t[3], 56);
+	st[23] = rol64(st[15] ^ t[0], 41);
+	st[15] = rol64(st[ 4] ^ t[4], 27);
+	st[ 4] = rol64(st[24] ^ t[4], 14);
+	st[24] = rol64(st[21] ^ t[1],  2);
+	st[21] = rol64(st[ 8] ^ t[3], 55);
+	st[ 8] = rol64(st[16] ^ t[1], 45);
+	st[16] = rol64(st[ 5] ^ t[0], 36);
+	st[ 5] = rol64(st[ 3] ^ t[3], 28);
+	st[ 3] = rol64(st[18] ^ t[3], 21);
+	st[18] = rol64(st[17] ^ t[2], 15);
+	st[17] = rol64(st[11] ^ t[1], 10);
+	st[11] = rol64(st[ 7] ^ t[2],  6);
+	st[ 7] = rol64(st[10] ^ t[0],  3);
+	st[10] = rol64(    tt ^ t[1],  1);
+
+	/* Chi */
+	bc[ 0] = ~st[ 1] & st[ 2];
+	bc[ 1] = ~st[ 2] & st[ 3];
+	bc[ 2] = ~st[ 3] & st[ 4];
+	bc[ 3] = ~st[ 4] & st[ 0];
+	bc[ 4] = ~st[ 0] & st[ 1];
+	st[ 0] ^= bc[ 0];
+	st[ 1] ^= bc[ 1];
+	st[ 2] ^= bc[ 2];
+	st[ 3] ^= bc[ 3];
+	st[ 4] ^= bc[ 4];
+
+	bc[ 0] = ~st[ 6] & st[ 7];
+	bc[ 1] = ~st[ 7] & st[ 8];
+	bc[ 2] = ~st[ 8] & st[ 9];
+	bc[ 3] = ~st[ 9] & st[ 5];
+	bc[ 4] = ~st[ 5] & st[ 6];
+	st[ 5] ^= bc[ 0];
+	st[ 6] ^= bc[ 1];
+	st[ 7] ^= bc[ 2];
+	st[ 8] ^= bc[ 3];
+	st[ 9] ^= bc[ 4];
+
+	bc[ 0] = ~st[11] & st[12];
+	bc[ 1] = ~st[12] & st[13];
+	bc[ 2] = ~st[13] & st[14];
+	bc[ 3] = ~st[14] & st[10];
+	bc[ 4] = ~st[10] & st[11];
+	st[10] ^= bc[ 0];
+	st[11] ^= bc[ 1];
+	st[12] ^= bc[ 2];
+	st[13] ^= bc[ 3];
+	st[14] ^= bc[ 4];
+
+	bc[ 0] = ~st[16] & st[17];
+	bc[ 1] = ~st[17] & st[18];
+	bc[ 2] = ~st[18] & st[19];
+	bc[ 3] = ~st[19] & st[15];
+	bc[ 4] = ~st[15] & st[16];
+	st[15] ^= bc[ 0];
+	st[16] ^= bc[ 1];
+	st[17] ^= bc[ 2];
+	st[18] ^= bc[ 3];
+	st[19] ^= bc[ 4];
+
+	bc[ 0] = ~st[21] & st[22];
+	bc[ 1] = ~st[22] & st[23];
+	bc[ 2] = ~st[23] & st[24];
+	bc[ 3] = ~st[24] & st[20];
+	bc[ 4] = ~st[20] & st[21];
+	st[20] ^= bc[ 0];
+	st[21] ^= bc[ 1];
+	st[22] ^= bc[ 2];
+	st[23] ^= bc[ 3];
+	st[24] ^= bc[ 4];
+}
 
-		/* Theta */
-		bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
-		bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
-		bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
-		bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
-		bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
-
-		t[0] = bc[4] ^ rol64(bc[1], 1);
-		t[1] = bc[0] ^ rol64(bc[2], 1);
-		t[2] = bc[1] ^ rol64(bc[3], 1);
-		t[3] = bc[2] ^ rol64(bc[4], 1);
-		t[4] = bc[3] ^ rol64(bc[0], 1);
-
-		st[0] ^= t[0];
-
-		/* Rho Pi */
-		tt = st[1];
-		st[ 1] = rol64(st[ 6] ^ t[1], 44);
-		st[ 6] = rol64(st[ 9] ^ t[4], 20);
-		st[ 9] = rol64(st[22] ^ t[2], 61);
-		st[22] = rol64(st[14] ^ t[4], 39);
-		st[14] = rol64(st[20] ^ t[0], 18);
-		st[20] = rol64(st[ 2] ^ t[2], 62);
-		st[ 2] = rol64(st[12] ^ t[2], 43);
-		st[12] = rol64(st[13] ^ t[3], 25);
-		st[13] = rol64(st[19] ^ t[4],  8);
-		st[19] = rol64(st[23] ^ t[3], 56);
-		st[23] = rol64(st[15] ^ t[0], 41);
-		st[15] = rol64(st[ 4] ^ t[4], 27);
-		st[ 4] = rol64(st[24] ^ t[4], 14);
-		st[24] = rol64(st[21] ^ t[1],  2);
-		st[21] = rol64(st[ 8] ^ t[3], 55);
-		st[ 8] = rol64(st[16] ^ t[1], 45);
-		st[16] = rol64(st[ 5] ^ t[0], 36);
-		st[ 5] = rol64(st[ 3] ^ t[3], 28);
-		st[ 3] = rol64(st[18] ^ t[3], 21);
-		st[18] = rol64(st[17] ^ t[2], 15);
-		st[17] = rol64(st[11] ^ t[1], 10);
-		st[11] = rol64(st[ 7] ^ t[2],  6);
-		st[ 7] = rol64(st[10] ^ t[0],  3);
-		st[10] = rol64(    tt ^ t[1],  1);
-
-		/* Chi */
-		bc[ 0] = ~st[ 1] & st[ 2];
-		bc[ 1] = ~st[ 2] & st[ 3];
-		bc[ 2] = ~st[ 3] & st[ 4];
-		bc[ 3] = ~st[ 4] & st[ 0];
-		bc[ 4] = ~st[ 0] & st[ 1];
-		st[ 0] ^= bc[ 0];
-		st[ 1] ^= bc[ 1];
-		st[ 2] ^= bc[ 2];
-		st[ 3] ^= bc[ 3];
-		st[ 4] ^= bc[ 4];
-
-		bc[ 0] = ~st[ 6] & st[ 7];
-		bc[ 1] = ~st[ 7] & st[ 8];
-		bc[ 2] = ~st[ 8] & st[ 9];
-		bc[ 3] = ~st[ 9] & st[ 5];
-		bc[ 4] = ~st[ 5] & st[ 6];
-		st[ 5] ^= bc[ 0];
-		st[ 6] ^= bc[ 1];
-		st[ 7] ^= bc[ 2];
-		st[ 8] ^= bc[ 3];
-		st[ 9] ^= bc[ 4];
-
-		bc[ 0] = ~st[11] & st[12];
-		bc[ 1] = ~st[12] & st[13];
-		bc[ 2] = ~st[13] & st[14];
-		bc[ 3] = ~st[14] & st[10];
-		bc[ 4] = ~st[10] & st[11];
-		st[10] ^= bc[ 0];
-		st[11] ^= bc[ 1];
-		st[12] ^= bc[ 2];
-		st[13] ^= bc[ 3];
-		st[14] ^= bc[ 4];
-
-		bc[ 0] = ~st[16] & st[17];
-		bc[ 1] = ~st[17] & st[18];
-		bc[ 2] = ~st[18] & st[19];
-		bc[ 3] = ~st[19] & st[15];
-		bc[ 4] = ~st[15] & st[16];
-		st[15] ^= bc[ 0];
-		st[16] ^= bc[ 1];
-		st[17] ^= bc[ 2];
-		st[18] ^= bc[ 3];
-		st[19] ^= bc[ 4];
-
-		bc[ 0] = ~st[21] & st[22];
-		bc[ 1] = ~st[22] & st[23];
-		bc[ 2] = ~st[23] & st[24];
-		bc[ 3] = ~st[24] & st[20];
-		bc[ 4] = ~st[20] & st[21];
-		st[20] ^= bc[ 0];
-		st[21] ^= bc[ 1];
-		st[22] ^= bc[ 2];
-		st[23] ^= bc[ 3];
-		st[24] ^= bc[ 4];
+static void __attribute__((__optimize__("O3"))) keccakf(u64 st[25])
+{
+	int round;
 
+	for (round = 0; round < KECCAK_ROUNDS; round++) {
+		keccakf_round(st);
 		/* Iota */
 		st[0] ^= keccakf_rndc[round];
 	}

From df5d45aa08f848b79caf395211b222790534ccc7 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Thu, 1 Feb 2018 11:21:58 +0100
Subject: [PATCH 397/676] compiler-gcc.h: Introduce __optimize function
 attribute

Create a new function attribute __optimize, which allows to specify an
optimization level on a per-function basis.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/compiler-gcc.h | 4 ++++
 include/linux/compiler.h     | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 2272ded07496d..7bba8e28c5291 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -196,6 +196,10 @@
 #endif /* __CHECKER__ */
 #endif /* GCC_VERSION >= 40300 */
 
+#if GCC_VERSION >= 40400
+#define __optimize(level)	__attribute__((__optimize__(level)))
+#endif /* GCC_VERSION >= 40400 */
+
 #if GCC_VERSION >= 40500
 
 #ifndef __CHECKER__
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 188ed9f655174..cdc629f20e202 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -271,6 +271,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 
 #endif /* __ASSEMBLY__ */
 
+#ifndef __optimize
+# define __optimize(level)
+#endif
+
 /* Compile time object size, -1 for unknown */
 #ifndef __compiletime_object_size
 # define __compiletime_object_size(obj) -1

From d9afaaa4ff7af8b87d4a205e48cb8a6f666d7f01 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Thu, 1 Feb 2018 11:21:59 +0100
Subject: [PATCH 398/676] compiler-gcc.h: __nostackprotector needs gcc-4.4 and
 up
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Gcc versions before 4.4 do not recognize the __optimize__ compiler
attribute:

    warning: ‘__optimize__’ attribute directive ignored

Fixes: 7375ae3a0b79ea07 ("compiler-gcc.h: Introduce __nostackprotector function attribute")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/compiler-gcc.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 7bba8e28c5291..bf09213895f7a 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -167,8 +167,6 @@
 
 #if GCC_VERSION >= 40100
 # define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
-
-#define __nostackprotector	__attribute__((__optimize__("no-stack-protector")))
 #endif
 
 #if GCC_VERSION >= 40300
@@ -198,6 +196,7 @@
 
 #if GCC_VERSION >= 40400
 #define __optimize(level)	__attribute__((__optimize__(level)))
+#define __nostackprotector	__optimize("no-stack-protector")
 #endif /* GCC_VERSION >= 40400 */
 
 #if GCC_VERSION >= 40500

From ba916b6a0339ed6cc6441ad83c097ab795dbdbc5 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Thu, 1 Feb 2018 11:22:00 +0100
Subject: [PATCH 399/676] crypto: sha3-generic - Use __optimize to support old
 compilers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With gcc-4.1.2:

    crypto/sha3_generic.c:39: warning: ‘__optimize__’ attribute directive ignored

Use the newly introduced __optimize macro to fix this.

Fixes: 83dee2ce1ae791c3 ("crypto: sha3-generic - rewrite KECCAK transform to help the compiler optimize")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/sha3_generic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crypto/sha3_generic.c b/crypto/sha3_generic.c
index 951c4eb70262c..ded1487833033 100644
--- a/crypto/sha3_generic.c
+++ b/crypto/sha3_generic.c
@@ -152,7 +152,7 @@ static SHA3_INLINE void keccakf_round(u64 st[25])
 	st[24] ^= bc[ 4];
 }
 
-static void __attribute__((__optimize__("O3"))) keccakf(u64 st[25])
+static void __optimize("O3") keccakf(u64 st[25])
 {
 	int round;
 

From 225ece3e7dad4cfc44cca38ce7a3a80f255ea8f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Horia=20Geant=C4=83?= <horia.geanta@nxp.com>
Date: Mon, 5 Feb 2018 11:15:52 +0200
Subject: [PATCH 400/676] crypto: caam - fix endless loop when DECO acquire
 fails
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In case DECO0 cannot be acquired - i.e. run_descriptor_deco0() fails
with -ENODEV, caam_probe() enters an endless loop:

run_descriptor_deco0
	ret -ENODEV
	-> instantiate_rng
		-ENODEV, overwritten by -EAGAIN
		ret -EAGAIN
		-> caam_probe
			-EAGAIN results in endless loop

It turns out the error path in instantiate_rng() is incorrect,
the checks are done in the wrong order.

Cc: <stable@vger.kernel.org> # 3.13+
Fixes: 1005bccd7a4a6 ("crypto: caam - enable instantiation of all RNG4 state handles")
Reported-by: Bryan O'Donoghue <pure.logic@nexus-software.ie>
Suggested-by: Auer Lukas <lukas.auer@aisec.fraunhofer.de>
Signed-off-by: Horia Geantă <horia.geanta@nxp.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/caam/ctrl.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/crypto/caam/ctrl.c b/drivers/crypto/caam/ctrl.c
index 75d280cb2dc05..e843cf4103736 100644
--- a/drivers/crypto/caam/ctrl.c
+++ b/drivers/crypto/caam/ctrl.c
@@ -228,12 +228,16 @@ static int instantiate_rng(struct device *ctrldev, int state_handle_mask,
 		 * without any error (HW optimizations for later
 		 * CAAM eras), then try again.
 		 */
+		if (ret)
+			break;
+
 		rdsta_val = rd_reg32(&ctrl->r4tst[0].rdsta) & RDSTA_IFMASK;
 		if ((status && status != JRSTA_SSRC_JUMP_HALT_CC) ||
-		    !(rdsta_val & (1 << sh_idx)))
+		    !(rdsta_val & (1 << sh_idx))) {
 			ret = -EAGAIN;
-		if (ret)
 			break;
+		}
+
 		dev_info(ctrldev, "Instantiated RNG4 SH%d\n", sh_idx);
 		/* Clear the contents before recreating the descriptor */
 		memset(desc, 0x00, CAAM_CMD_SZ * 7);

From dd78c832ffaf86eb6434e56de4bc3bc31f03f771 Mon Sep 17 00:00:00 2001
From: Artem Savkov <artem.savkov@gmail.com>
Date: Tue, 6 Feb 2018 22:20:21 +0100
Subject: [PATCH 401/676] crypto: sun4i_ss_prng - fix return value of
 sun4i_ss_prng_generate

According to crypto/rng.h generate function should return 0 on success
and < 0 on error.

Fixes: b8ae5c7387ad ("crypto: sun4i-ss - support the Security System PRNG")
Signed-off-by: Artem Savkov <artem.savkov@gmail.com>
Acked-by: Corentin Labbe <clabbe.montjoie@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/sunxi-ss/sun4i-ss-prng.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/crypto/sunxi-ss/sun4i-ss-prng.c b/drivers/crypto/sunxi-ss/sun4i-ss-prng.c
index 0d01d16242527..5754e0b92fb0c 100644
--- a/drivers/crypto/sunxi-ss/sun4i-ss-prng.c
+++ b/drivers/crypto/sunxi-ss/sun4i-ss-prng.c
@@ -52,5 +52,5 @@ int sun4i_ss_prng_generate(struct crypto_rng *tfm, const u8 *src,
 
 	writel(0, ss->base + SS_CTL);
 	spin_unlock(&ss->slock);
-	return dlen;
+	return 0;
 }

From 2e7d1d61ea6c0f1c4da5eb82cafac750d55637a7 Mon Sep 17 00:00:00 2001
From: Artem Savkov <artem.savkov@gmail.com>
Date: Tue, 6 Feb 2018 22:20:22 +0100
Subject: [PATCH 402/676] crypto: sun4i_ss_prng - convert lock to _bh in
 sun4i_ss_prng_generate

Lockdep detects a possible deadlock in sun4i_ss_prng_generate() and
throws an "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage" warning.
Disabling softirqs to fix this.

Fixes: b8ae5c7387ad ("crypto: sun4i-ss - support the Security System PRNG")
Signed-off-by: Artem Savkov <artem.savkov@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/sunxi-ss/sun4i-ss-prng.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/crypto/sunxi-ss/sun4i-ss-prng.c b/drivers/crypto/sunxi-ss/sun4i-ss-prng.c
index 5754e0b92fb0c..63d636424161d 100644
--- a/drivers/crypto/sunxi-ss/sun4i-ss-prng.c
+++ b/drivers/crypto/sunxi-ss/sun4i-ss-prng.c
@@ -28,7 +28,7 @@ int sun4i_ss_prng_generate(struct crypto_rng *tfm, const u8 *src,
 	algt = container_of(alg, struct sun4i_ss_alg_template, alg.rng);
 	ss = algt->ss;
 
-	spin_lock(&ss->slock);
+	spin_lock_bh(&ss->slock);
 
 	writel(mode, ss->base + SS_CTL);
 
@@ -51,6 +51,6 @@ int sun4i_ss_prng_generate(struct crypto_rng *tfm, const u8 *src,
 	}
 
 	writel(0, ss->base + SS_CTL);
-	spin_unlock(&ss->slock);
+	spin_unlock_bh(&ss->slock);
 	return 0;
 }

From 9ce8b24aa96e983a540bb8244298a10322881ef4 Mon Sep 17 00:00:00 2001
From: Ryan Hsu <ryanhsu@codeaurora.org>
Date: Wed, 7 Feb 2018 15:51:23 +0200
Subject: [PATCH 403/676] Revert "ath10k: add sanity check to ie_len before
 parsing fw/board ie"

This reverts commit 9ed4f91628737c820af6a1815b65bc06bd31518f.

The commit introduced a regression that over read the ie with
the padding.

- the expected IE information

ath10k_pci 0000:03:00.0: found firmware features ie (1 B)
ath10k_pci 0000:03:00.0: Enabling feature bit: 6
ath10k_pci 0000:03:00.0: Enabling feature bit: 7
ath10k_pci 0000:03:00.0: features
ath10k_pci 0000:03:00.0: 00000000: c0 00 00 00 00 00 00 00

- the wrong IE with padding is read (0x77)

ath10k_pci 0000:03:00.0: found firmware features ie (4 B)
ath10k_pci 0000:03:00.0: Enabling feature bit: 6
ath10k_pci 0000:03:00.0: Enabling feature bit: 7
ath10k_pci 0000:03:00.0: Enabling feature bit: 8
ath10k_pci 0000:03:00.0: Enabling feature bit: 9
ath10k_pci 0000:03:00.0: Enabling feature bit: 10
ath10k_pci 0000:03:00.0: Enabling feature bit: 12
ath10k_pci 0000:03:00.0: Enabling feature bit: 13
ath10k_pci 0000:03:00.0: Enabling feature bit: 14
ath10k_pci 0000:03:00.0: Enabling feature bit: 16
ath10k_pci 0000:03:00.0: Enabling feature bit: 17
ath10k_pci 0000:03:00.0: Enabling feature bit: 18
ath10k_pci 0000:03:00.0: features
ath10k_pci 0000:03:00.0: 00000000: c0 77 07 00 00 00 00 00

Tested-by: Mike Lothian <mike@fireburn.co.uk>
Signed-off-by: Ryan Hsu <ryanhsu@codeaurora.org>
Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
---
 drivers/net/wireless/ath/ath10k/core.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
index 6fb282f76804c..f3ec13b80b20c 100644
--- a/drivers/net/wireless/ath/ath10k/core.c
+++ b/drivers/net/wireless/ath/ath10k/core.c
@@ -1305,10 +1305,7 @@ static int ath10k_core_fetch_board_data_api_n(struct ath10k *ar,
 		len -= sizeof(*hdr);
 		data = hdr->data;
 
-		/* jump over the padding */
-		ie_len = ALIGN(ie_len, 4);
-
-		if (len < ie_len) {
+		if (len < ALIGN(ie_len, 4)) {
 			ath10k_err(ar, "invalid length for board ie_id %d ie_len %zu len %zu\n",
 				   ie_id, ie_len, len);
 			ret = -EINVAL;
@@ -1347,6 +1344,9 @@ static int ath10k_core_fetch_board_data_api_n(struct ath10k *ar,
 			goto out;
 		}
 
+		/* jump over the padding */
+		ie_len = ALIGN(ie_len, 4);
+
 		len -= ie_len;
 		data += ie_len;
 	}
@@ -1477,9 +1477,6 @@ int ath10k_core_fetch_firmware_api_n(struct ath10k *ar, const char *name,
 		len -= sizeof(*hdr);
 		data += sizeof(*hdr);
 
-		/* jump over the padding */
-		ie_len = ALIGN(ie_len, 4);
-
 		if (len < ie_len) {
 			ath10k_err(ar, "invalid length for FW IE %d (%zu < %zu)\n",
 				   ie_id, len, ie_len);
@@ -1585,6 +1582,9 @@ int ath10k_core_fetch_firmware_api_n(struct ath10k *ar, const char *name,
 			break;
 		}
 
+		/* jump over the padding */
+		ie_len = ALIGN(ie_len, 4);
+
 		len -= ie_len;
 		data += ie_len;
 	}

From 1d9a090783bef19fe8cdec878620d22f05191316 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Fri, 26 Jan 2018 13:41:59 -0600
Subject: [PATCH 404/676] powerpc/numa: Invalidate numa_cpu_lookup_table on cpu
 remove

When DLPAR removing a CPU, the unmapping of the cpu from a node in
unmap_cpu_from_node() should also invalidate the CPUs entry in the
numa_cpu_lookup_table. There is not a guarantee that on a subsequent
DLPAR add of the CPU the associativity will be the same and thus
could be in a different node. Invalidating the entry in the
numa_cpu_lookup_table causes the associativity to be read from the
device tree at the time of the add.

The current behavior of not invalidating the CPUs entry in the
numa_cpu_lookup_table can result in scenarios where the the topology
layout of CPUs in the partition does not match the device tree
or the topology reported by the HMC.

This bug looks like it was introduced in 2004 in the commit titled
"ppc64: cpu hotplug notifier for numa", which is 6b15e4e87e32 in the
linux-fullhist tree. Hence tag it for all stable releases.

Cc: stable@vger.kernel.org
Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Reviewed-by: Tyrel Datwyler <tyreld@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/topology.h          | 5 +++++
 arch/powerpc/mm/numa.c                       | 5 -----
 arch/powerpc/platforms/pseries/hotplug-cpu.c | 2 ++
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 88187c285c70d..1c02e6900f785 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -44,6 +44,11 @@ extern int sysfs_add_device_to_node(struct device *dev, int nid);
 extern void sysfs_remove_device_from_node(struct device *dev, int nid);
 extern int numa_update_cpu_topology(bool cpus_locked);
 
+static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node)
+{
+	numa_cpu_lookup_table[cpu] = node;
+}
+
 static inline int early_cpu_to_node(int cpu)
 {
 	int nid;
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 314d19ab9385e..edd8d0bc9364f 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -143,11 +143,6 @@ static void reset_numa_cpu_lookup_table(void)
 		numa_cpu_lookup_table[cpu] = -1;
 }
 
-static void update_numa_cpu_lookup_table(unsigned int cpu, int node)
-{
-	numa_cpu_lookup_table[cpu] = node;
-}
-
 static void map_cpu_to_node(int cpu, int node)
 {
 	update_numa_cpu_lookup_table(cpu, node);
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index dceb51454d8d2..f78fd2068d56a 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -36,6 +36,7 @@
 #include <asm/xics.h>
 #include <asm/xive.h>
 #include <asm/plpar_wrappers.h>
+#include <asm/topology.h>
 
 #include "pseries.h"
 #include "offline_states.h"
@@ -331,6 +332,7 @@ static void pseries_remove_processor(struct device_node *np)
 			BUG_ON(cpu_online(cpu));
 			set_cpu_present(cpu, false);
 			set_hard_smp_processor_id(cpu, -1);
+			update_numa_cpu_lookup_table(cpu, -1);
 			break;
 		}
 		if (cpu >= nr_cpu_ids)

From 5c11d1e52d996749897a8616860b18a084c894f0 Mon Sep 17 00:00:00 2001
From: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Date: Tue, 6 Feb 2018 18:06:37 +0530
Subject: [PATCH 405/676] powerpc/64s: Fix MASKABLE_RELON_EXCEPTION_HV_OOL
 macro

Commit f14e953b191f ("powerpc/64s: Add support to take additional
parameter in MASKABLE_* macro") messed up MASKABLE_RELON_EXCEPTION_HV_OOL
macro by adding the wrong SOFTEN test which caused guest kernel crash
at boot. Patch to fix the macro to use SOFTEN_TEST_HV instead of
SOFTEN_NOTEST_HV.

Fixes: f14e953b191f ("powerpc/64s: Add support to take additional parameter in MASKABLE_* macro")
Reported-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Fix-Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Tested-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/exception-64s.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 176dfb73d42c0..471b2274fbeba 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -645,7 +645,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 					  EXC_HV, SOFTEN_TEST_HV, bitmask)
 
 #define MASKABLE_RELON_EXCEPTION_HV_OOL(vec, label, bitmask)		\
-	MASKABLE_EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_NOTEST_HV, vec, bitmask);\
+	MASKABLE_EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_TEST_HV, vec, bitmask);\
 	EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_HV)
 
 /*

From 6cc3f91bf69fc8c1719704607474f9b9df56f348 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Sat, 3 Feb 2018 17:17:50 +1000
Subject: [PATCH 406/676] powerpc/64s: Fix may_hard_irq_enable() for PMI soft
 masking

The soft IRQ masking code has to hard-disable interrupts in cases
where the exception is not cleared by the masked handler. External
interrupts used this approach for soft masking. Now recently PMU
interrupts do the same thing.

The soft IRQ masking code additionally allowed for interrupt handlers
to hard-enable interrupts after soft-disabling them. The idea is to
allow PMU interrupts through to profile interrupt handlers.

So when interrupts are being replayed when there is a pending
interrupt that requires hard-disabling, there is a test to prevent
those handlers from hard-enabling them if there is a pending external
interrupt. may_hard_irq_enable() handles this.

After f442d00480 ("powerpc/64s: Add support to mask perf interrupts
and replay them"), may_hard_irq_enable() could prematurely enable
MSR[EE] when a PMU exception exists, which would result in the
interrupt firing again while masked, and MSR[EE] being disabled again.

I haven't seen that this could cause a serious problem, but it's
more consistent to handle these soft-masked interrupts in the same
way. So introduce a define for all types of interrupts that require
MSR[EE] masking in their soft-disable handlers, and use that in
may_hard_irq_enable().

Fixes: f442d004806e ("powerpc/64s: Add support to mask perf interrupts and replay them")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/hw_irq.h    | 12 +++++++++++-
 arch/powerpc/kernel/exceptions-64e.S |  2 ++
 arch/powerpc/kernel/exceptions-64s.S |  6 +++---
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index 88e5e8f17e989..855e17d158b11 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -29,6 +29,16 @@
 #define PACA_IRQ_HMI		0x20
 #define PACA_IRQ_PMI		0x40
 
+/*
+ * Some soft-masked interrupts must be hard masked until they are replayed
+ * (e.g., because the soft-masked handler does not clear the exception).
+ */
+#ifdef CONFIG_PPC_BOOK3S
+#define PACA_IRQ_MUST_HARD_MASK	(PACA_IRQ_EE|PACA_IRQ_PMI)
+#else
+#define PACA_IRQ_MUST_HARD_MASK	(PACA_IRQ_EE)
+#endif
+
 /*
  * flags for paca->irq_soft_mask
  */
@@ -244,7 +254,7 @@ static inline bool lazy_irq_pending(void)
 static inline void may_hard_irq_enable(void)
 {
 	get_paca()->irq_happened &= ~PACA_IRQ_HARD_DIS;
-	if (!(get_paca()->irq_happened & PACA_IRQ_EE))
+	if (!(get_paca()->irq_happened & PACA_IRQ_MUST_HARD_MASK))
 		__hard_irq_enable();
 }
 
diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index ee832d344a5a2..9b6e653e501a1 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -943,6 +943,8 @@ kernel_dbg_exc:
 /*
  * An interrupt came in while soft-disabled; We mark paca->irq_happened
  * accordingly and if the interrupt is level sensitive, we hard disable
+ * hard disable (full_mask) corresponds to PACA_IRQ_MUST_HARD_MASK, so
+ * keep these in synch.
  */
 
 .macro masked_interrupt_book3e paca_irq full_mask
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 243d072a225aa..3ac87e53b3da0 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1426,7 +1426,7 @@ EXC_COMMON_BEGIN(soft_nmi_common)
  *   triggered and won't automatically refire.
  * - If it was a HMI we return immediately since we handled it in realmode
  *   and it won't refire.
- * - else we hard disable and return.
+ * - Else it is one of PACA_IRQ_MUST_HARD_MASK, so hard disable and return.
  * This is called with r10 containing the value to OR to the paca field.
  */
 #define MASKED_INTERRUPT(_H)				\
@@ -1441,8 +1441,8 @@ masked_##_H##interrupt:					\
 	ori	r10,r10,0xffff;				\
 	mtspr	SPRN_DEC,r10;				\
 	b	MASKED_DEC_HANDLER_LABEL;		\
-1:	andi.	r10,r10,(PACA_IRQ_DBELL|PACA_IRQ_HMI);	\
-	bne	2f;					\
+1:	andi.	r10,r10,PACA_IRQ_MUST_HARD_MASK;	\
+	beq	2f;					\
 	mfspr	r10,SPRN_##_H##SRR1;			\
 	xori	r10,r10,MSR_EE; /* clear MSR_EE */	\
 	mtspr	SPRN_##_H##SRR1,r10;			\

From dedab7f0d3137441a97fe7cf9b9ca5dbd20ca9a5 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 30 Jan 2018 15:11:44 +0000
Subject: [PATCH 407/676] ocxl: fix signed comparison with less than zero

Currently the comparison of used < 0 is always false because
uses is a size_t. Fix this by making used a ssize_t type.

Detected by Coccinelle:
drivers/misc/ocxl/file.c:320:6-10: WARNING: Unsigned expression
compared with zero: used < 0

Fixes: 5ef3166e8a32 ("ocxl: Driver code for 'generic' opencapi devices")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Acked-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/misc/ocxl/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c
index c90c1a578d2f1..1287e4430e6b2 100644
--- a/drivers/misc/ocxl/file.c
+++ b/drivers/misc/ocxl/file.c
@@ -277,7 +277,7 @@ static ssize_t afu_read(struct file *file, char __user *buf, size_t count,
 	struct ocxl_context *ctx = file->private_data;
 	struct ocxl_kernel_event_header header;
 	ssize_t rc;
-	size_t used = 0;
+	ssize_t used = 0;
 	DEFINE_WAIT(event_wait);
 
 	memset(&header, 0, sizeof(header));

From eeb715c3e995fbdda0cc05e61216c6c5609bce66 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 7 Feb 2018 11:20:02 +1000
Subject: [PATCH 408/676] powerpc/64s/radix: Boot-time NULL pointer protection
 using a guard-PID

This change restores and formalises the behaviour that access to NULL
or other user addresses by the kernel during boot should fault rather
than succeed and modify memory. This was inadvertently broken when
fixing another bug, because it was previously not well defined and
only worked by chance.

powerpc/64s/radix uses high address bits to select an address space
"quadrant", which determines which PID and LPID are used to translate
the rest of the address (effective PID, effective LPID). The kernel
mapping at 0xC... selects quadrant 3, which uses PID=0 and LPID=0. So
the kernel page tables are installed in the PID 0 process table entry.

An address at 0x0... selects quadrant 0, which uses PID=PIDR for
translating the rest of the address (that is, it uses the value of the
PIDR register as the effective PID). If PIDR=0, then the translation
is performed with the PID 0 process table entry page tables. This is
the kernel mapping, so we effectively get another copy of the kernel
address space at 0. A NULL pointer access will access physical memory
address 0.

To prevent duplicating the kernel address space in quadrant 0, this
patch allocates a guard PID containing no translations, and
initializes PIDR with this during boot, before the MMU is switched on.
Any kernel access to quadrant 0 will use this guard PID for
translation and find no valid mappings, and therefore fault.

After boot, this PID will be switchd away to user context PIDs, but
those contain user mappings (and usually NULL pointer protection)
rather than kernel mapping, which is much safer (and by design). It
may be in future this is tightened further, which the guard PID could
be used for.

Commit 371b8044 ("powerpc/64s: Initialize ISAv3 MMU registers before
setting partition table"), introduced this problem because it zeroes
PIDR at boot. However previously the value was inherited from firmware
or kexec, which is not robust and can be zero (e.g., mambo).

Fixes: 371b80447ff3 ("powerpc/64s: Initialize ISAv3 MMU registers before setting partition table")
Cc: stable@vger.kernel.org # v4.15+
Reported-by: Florian Weimer <fweimer@redhat.com>
Tested-by: Mauricio Faria de Oliveira <mauricfo@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/pgtable-radix.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 573a9a2ee4555..96e07d1f673d7 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -20,6 +20,7 @@
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
+#include <asm/mmu_context.h>
 #include <asm/dma.h>
 #include <asm/machdep.h>
 #include <asm/mmu.h>
@@ -333,6 +334,22 @@ static void __init radix_init_pgtable(void)
 		     "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
 	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
 	trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);
+
+	/*
+	 * The init_mm context is given the first available (non-zero) PID,
+	 * which is the "guard PID" and contains no page table. PIDR should
+	 * never be set to zero because that duplicates the kernel address
+	 * space at the 0x0... offset (quadrant 0)!
+	 *
+	 * An arbitrary PID that may later be allocated by the PID allocator
+	 * for userspace processes must not be used either, because that
+	 * would cause stale user mappings for that PID on CPUs outside of
+	 * the TLB invalidation scheme (because it won't be in mm_cpumask).
+	 *
+	 * So permanently carve out one PID for the purpose of a guard PID.
+	 */
+	init_mm.context.id = mmu_base_pid;
+	mmu_base_pid++;
 }
 
 static void __init radix_init_partition_table(void)
@@ -579,7 +596,8 @@ void __init radix__early_init_mmu(void)
 
 	radix_init_iamr();
 	radix_init_pgtable();
-
+	/* Switch to the guard PID before turning on MMU */
+	radix__switch_mmu_context(NULL, &init_mm);
 	if (cpu_has_feature(CPU_FTR_HVMODE))
 		tlbiel_all();
 }
@@ -604,6 +622,7 @@ void radix__early_init_mmu_secondary(void)
 	}
 	radix_init_iamr();
 
+	radix__switch_mmu_context(NULL, &init_mm);
 	if (cpu_has_feature(CPU_FTR_HVMODE))
 		tlbiel_all();
 }

From 4dd5f8a99e791a8c6500e3592f3ce81ae7edcde1 Mon Sep 17 00:00:00 2001
From: Balbir Singh <bsingharora@gmail.com>
Date: Wed, 7 Feb 2018 17:35:51 +1100
Subject: [PATCH 409/676] powerpc/mm/radix: Split linear mapping on hot-unplug

This patch splits the linear mapping if the hot-unplug range is
smaller than the mapping size. The code detects if the mapping needs
to be split into a smaller size and if so, uses the stop machine
infrastructure to clear the existing mapping and then remap the
remaining range using a smaller page size.

The code will skip any region of the mapping that overlaps with kernel
text and warn about it once. We don't want to remove a mapping where
the kernel text and the LMB we intend to remove overlap in the same
TLB mapping as it may affect the currently executing code.

I've tested these changes under a kvm guest with 2 vcpus, from a split
mapping point of view, some of the caveats mentioned above applied to
the testing I did.

Fixes: 4b5d62ca17a1 ("powerpc/mm: add radix__remove_section_mapping()")
Signed-off-by: Balbir Singh <bsingharora@gmail.com>
[mpe: Tweak change log to match updated behaviour]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/pgtable-radix.c | 95 +++++++++++++++++++++++++--------
 1 file changed, 74 insertions(+), 21 deletions(-)

diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 96e07d1f673d7..328ff9abc3334 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -17,6 +17,7 @@
 #include <linux/of_fdt.h>
 #include <linux/mm.h>
 #include <linux/string_helpers.h>
+#include <linux/stop_machine.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -685,6 +686,30 @@ static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
 	pud_clear(pud);
 }
 
+struct change_mapping_params {
+	pte_t *pte;
+	unsigned long start;
+	unsigned long end;
+	unsigned long aligned_start;
+	unsigned long aligned_end;
+};
+
+static int stop_machine_change_mapping(void *data)
+{
+	struct change_mapping_params *params =
+			(struct change_mapping_params *)data;
+
+	if (!data)
+		return -1;
+
+	spin_unlock(&init_mm.page_table_lock);
+	pte_clear(&init_mm, params->aligned_start, params->pte);
+	create_physical_mapping(params->aligned_start, params->start);
+	create_physical_mapping(params->end, params->aligned_end);
+	spin_lock(&init_mm.page_table_lock);
+	return 0;
+}
+
 static void remove_pte_table(pte_t *pte_start, unsigned long addr,
 			     unsigned long end)
 {
@@ -713,6 +738,52 @@ static void remove_pte_table(pte_t *pte_start, unsigned long addr,
 	}
 }
 
+/*
+ * clear the pte and potentially split the mapping helper
+ */
+static void split_kernel_mapping(unsigned long addr, unsigned long end,
+				unsigned long size, pte_t *pte)
+{
+	unsigned long mask = ~(size - 1);
+	unsigned long aligned_start = addr & mask;
+	unsigned long aligned_end = addr + size;
+	struct change_mapping_params params;
+	bool split_region = false;
+
+	if ((end - addr) < size) {
+		/*
+		 * We're going to clear the PTE, but not flushed
+		 * the mapping, time to remap and flush. The
+		 * effects if visible outside the processor or
+		 * if we are running in code close to the
+		 * mapping we cleared, we are in trouble.
+		 */
+		if (overlaps_kernel_text(aligned_start, addr) ||
+			overlaps_kernel_text(end, aligned_end)) {
+			/*
+			 * Hack, just return, don't pte_clear
+			 */
+			WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
+				  "text, not splitting\n", addr, end);
+			return;
+		}
+		split_region = true;
+	}
+
+	if (split_region) {
+		params.pte = pte;
+		params.start = addr;
+		params.end = end;
+		params.aligned_start = addr & ~(size - 1);
+		params.aligned_end = min_t(unsigned long, aligned_end,
+				(unsigned long)__va(memblock_end_of_DRAM()));
+		stop_machine(stop_machine_change_mapping, &params, NULL);
+		return;
+	}
+
+	pte_clear(&init_mm, addr, pte);
+}
+
 static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
 			     unsigned long end)
 {
@@ -728,13 +799,7 @@ static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
 			continue;
 
 		if (pmd_huge(*pmd)) {
-			if (!IS_ALIGNED(addr, PMD_SIZE) ||
-			    !IS_ALIGNED(next, PMD_SIZE)) {
-				WARN_ONCE(1, "%s: unaligned range\n", __func__);
-				continue;
-			}
-
-			pte_clear(&init_mm, addr, (pte_t *)pmd);
+			split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
 			continue;
 		}
 
@@ -759,13 +824,7 @@ static void remove_pud_table(pud_t *pud_start, unsigned long addr,
 			continue;
 
 		if (pud_huge(*pud)) {
-			if (!IS_ALIGNED(addr, PUD_SIZE) ||
-			    !IS_ALIGNED(next, PUD_SIZE)) {
-				WARN_ONCE(1, "%s: unaligned range\n", __func__);
-				continue;
-			}
-
-			pte_clear(&init_mm, addr, (pte_t *)pud);
+			split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
 			continue;
 		}
 
@@ -791,13 +850,7 @@ static void remove_pagetable(unsigned long start, unsigned long end)
 			continue;
 
 		if (pgd_huge(*pgd)) {
-			if (!IS_ALIGNED(addr, PGDIR_SIZE) ||
-			    !IS_ALIGNED(next, PGDIR_SIZE)) {
-				WARN_ONCE(1, "%s: unaligned range\n", __func__);
-				continue;
-			}
-
-			pte_clear(&init_mm, addr, (pte_t *)pgd);
+			split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
 			continue;
 		}
 

From aece34cd576c7625181b0488a8129c1e165355f7 Mon Sep 17 00:00:00 2001
From: Paul Burton <paul.burton@mips.com>
Date: Fri, 19 Jan 2018 16:40:48 +0100
Subject: [PATCH 410/676] dt-bindings: Document mti,mips-cpc binding

Document a binding for the MIPS Cluster Power Controller (CPC) that
allows the device tree to specify where the CPC registers are located.

Signed-off-by: Paul Burton <paul.burton@mips.com>
Signed-off-by: Aleksandar Markovic <aleksandar.markovic@mips.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Cc: linux-mips@linux-mips.org
Cc: devicetree@vger.kernel.org
Patchwork: https://patchwork.linux-mips.org/patch/18512/
Signed-off-by: James Hogan <jhogan@kernel.org>
---
 Documentation/devicetree/bindings/power/mti,mips-cpc.txt | 8 ++++++++
 MAINTAINERS                                              | 1 +
 2 files changed, 9 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/power/mti,mips-cpc.txt

diff --git a/Documentation/devicetree/bindings/power/mti,mips-cpc.txt b/Documentation/devicetree/bindings/power/mti,mips-cpc.txt
new file mode 100644
index 0000000000000..c6b82511ae8a0
--- /dev/null
+++ b/Documentation/devicetree/bindings/power/mti,mips-cpc.txt
@@ -0,0 +1,8 @@
+Binding for MIPS Cluster Power Controller (CPC).
+
+This binding allows a system to specify where the CPC registers are
+located.
+
+Required properties:
+compatible : Should be "mti,mips-cpc".
+regs: Should describe the address & size of the CPC register region.
diff --git a/MAINTAINERS b/MAINTAINERS
index 0e7561d6c2d77..ac100d38d117b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9112,6 +9112,7 @@ MIPS GENERIC PLATFORM
 M:	Paul Burton <paul.burton@mips.com>
 L:	linux-mips@linux-mips.org
 S:	Supported
+F:	Documentation/devicetree/bindings/power/mti,mips-cpc.txt
 F:	arch/mips/generic/
 F:	arch/mips/tools/generic-board-config.sh
 

From 791412dafbbfd860e78983d45cf71db603a82f67 Mon Sep 17 00:00:00 2001
From: Paul Burton <paul.burton@mips.com>
Date: Fri, 19 Jan 2018 16:40:49 +0100
Subject: [PATCH 411/676] MIPS: CPC: Map registers using DT in
 mips_cpc_default_phys_base()

Reading mips_cpc_base value from the DT allows each platform to
define it according to its needs. This is especially convenient
for MIPS_GENERIC kernel where this kind of information should be
determined in runtime.

Use mti,mips-cpc compatible string with just a reg property to
specify the register location for your platform.

Signed-off-by: Paul Burton <paul.burton@mips.com>
Signed-off-by: Miodrag Dinic <miodrag.dinic@mips.com>
Signed-off-by: Aleksandar Markovic <aleksandar.markovic@mips.com>
Cc: linux-mips@linux-mips.org
Cc: Ralf Baechle <ralf@linux-mips.org>
Patchwork: https://patchwork.linux-mips.org/patch/18513/
Signed-off-by: James Hogan <jhogan@kernel.org>
---
 arch/mips/kernel/mips-cpc.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/mips/kernel/mips-cpc.c b/arch/mips/kernel/mips-cpc.c
index 19c88d7700546..fcf9af492d602 100644
--- a/arch/mips/kernel/mips-cpc.c
+++ b/arch/mips/kernel/mips-cpc.c
@@ -10,6 +10,8 @@
 
 #include <linux/errno.h>
 #include <linux/percpu.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
 #include <linux/spinlock.h>
 
 #include <asm/mips-cps.h>
@@ -22,6 +24,17 @@ static DEFINE_PER_CPU_ALIGNED(unsigned long, cpc_core_lock_flags);
 
 phys_addr_t __weak mips_cpc_default_phys_base(void)
 {
+	struct device_node *cpc_node;
+	struct resource res;
+	int err;
+
+	cpc_node = of_find_compatible_node(of_root, NULL, "mti,mips-cpc");
+	if (cpc_node) {
+		err = of_address_to_resource(cpc_node, 0, &res);
+		if (!err)
+			return res.start;
+	}
+
 	return 0;
 }
 

From e3ac6c0737e2b17bf11210e3fd66565e9b818b87 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 7 Feb 2018 20:55:22 -0800
Subject: [PATCH 412/676] nfp: bpf: require ETH table

Upcoming changes will require all netdevs supporting TC offloads
to have a full struct nfp_port.  Require those for BPF offload.
The operation without management FW reporting information about
Ethernet ports is something we only support for very old and very
basic NIC firmwares anyway.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Tested-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index 322027792fe81..61898dda11cf5 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -35,6 +35,7 @@
 
 #include "../nfpcore/nfp_cpp.h"
 #include "../nfpcore/nfp_nffw.h"
+#include "../nfpcore/nfp_nsp.h"
 #include "../nfp_app.h"
 #include "../nfp_main.h"
 #include "../nfp_net.h"
@@ -87,9 +88,20 @@ static const char *nfp_bpf_extra_cap(struct nfp_app *app, struct nfp_net *nn)
 static int
 nfp_bpf_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, unsigned int id)
 {
+	struct nfp_pf *pf = app->pf;
 	struct nfp_bpf_vnic *bv;
 	int err;
 
+	if (!pf->eth_tbl) {
+		nfp_err(pf->cpp, "No ETH table\n");
+		return -EINVAL;
+	}
+	if (pf->max_data_vnics != pf->eth_tbl->count) {
+		nfp_err(pf->cpp, "ETH entries don't match vNICs (%d vs %d)\n",
+			pf->max_data_vnics, pf->eth_tbl->count);
+		return -EINVAL;
+	}
+
 	bv = kzalloc(sizeof(*bv), GFP_KERNEL);
 	if (!bv)
 		return -ENOMEM;

From 0b9de4ca853b6ba2c92ff0b4602281001b166639 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 7 Feb 2018 20:55:23 -0800
Subject: [PATCH 413/676] nfp: don't advertise hw-tc-offload on non-port
 netdevs

nfp_port is a structure which represents an ASIC port, both
PCIe vNIC (on a PF or a VF) or the external MAC port.  vNIC
netdev (struct nfp_net) and pure representor netdev (struct
nfp_repr) both have a pointer to this structure.  nfp_reprs
always have a port associated.  nfp_nets, however, only represent
a device port in legacy mode, where they are considered the
MAC port. In switchdev mode they are just the CPU's side of
the PCIe link.

By definition TC offloads only apply to device ports.  Don't
set the flag on vNICs without a port (i.e. in switchdev mode).

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Tested-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index c0fd351c86b10..fe77ea8b656c0 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3734,7 +3734,7 @@ static void nfp_net_netdev_init(struct nfp_net *nn)
 
 	netdev->features = netdev->hw_features;
 
-	if (nfp_app_has_tc(nn->app))
+	if (nfp_app_has_tc(nn->app) && nn->port)
 		netdev->hw_features |= NETIF_F_HW_TC;
 
 	/* Advertise but disable TSO by default. */

From d692403e5cf8008f31f5664a6f3ce3e65d54f458 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 7 Feb 2018 20:55:24 -0800
Subject: [PATCH 414/676] nfp: forbid disabling hw-tc-offload on representors
 while offload active

All netdevs which can accept TC offloads must implement
.ndo_set_features().  nfp_reprs currently do not do that, which
means hw-tc-offload can be turned on and off even when offloads
are active.

Whether the offloads are active is really a question to nfp_ports,
so remove the per-app tc_busy callback indirection thing, and
simply count the number of offloaded items in nfp_port structure.

Fixes: 8a2768732a4d ("nfp: provide infrastructure for offloading flower based TC filters")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Tested-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.c  |  9 +--------
 .../ethernet/netronome/nfp/flower/offload.c    |  4 ++++
 drivers/net/ethernet/netronome/nfp/nfp_app.h   |  9 ---------
 .../ethernet/netronome/nfp/nfp_net_common.c    |  7 +++----
 .../net/ethernet/netronome/nfp/nfp_net_repr.c  |  1 +
 drivers/net/ethernet/netronome/nfp/nfp_port.c  | 18 ++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/nfp_port.h  |  6 ++++++
 7 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index 61898dda11cf5..34e98aa6b9562 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -182,6 +182,7 @@ static int nfp_bpf_setup_tc_block_cb(enum tc_setup_type type,
 		return err;
 
 	bv->tc_prog = cls_bpf->prog;
+	nn->port->tc_offload_cnt = !!bv->tc_prog;
 	return 0;
 }
 
@@ -219,13 +220,6 @@ static int nfp_bpf_setup_tc(struct nfp_app *app, struct net_device *netdev,
 	}
 }
 
-static bool nfp_bpf_tc_busy(struct nfp_app *app, struct nfp_net *nn)
-{
-	struct nfp_bpf_vnic *bv = nn->app_priv;
-
-	return !!bv->tc_prog;
-}
-
 static int
 nfp_bpf_change_mtu(struct nfp_app *app, struct net_device *netdev, int new_mtu)
 {
@@ -429,7 +423,6 @@ const struct nfp_app_type app_bpf = {
 	.ctrl_msg_rx	= nfp_bpf_ctrl_msg_rx,
 
 	.setup_tc	= nfp_bpf_setup_tc,
-	.tc_busy	= nfp_bpf_tc_busy,
 	.bpf		= nfp_ndo_bpf,
 	.xdp_offload	= nfp_bpf_xdp_offload,
 };
diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index 08c4c6dc5f7f2..eb5c13dea8f59 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -349,6 +349,7 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev,
 		       struct tc_cls_flower_offload *flow, bool egress)
 {
 	enum nfp_flower_tun_type tun_type = NFP_FL_TUNNEL_NONE;
+	struct nfp_port *port = nfp_port_from_netdev(netdev);
 	struct nfp_flower_priv *priv = app->priv;
 	struct nfp_fl_payload *flow_pay;
 	struct nfp_fl_key_ls *key_layer;
@@ -390,6 +391,7 @@ nfp_flower_add_offload(struct nfp_app *app, struct net_device *netdev,
 	INIT_HLIST_NODE(&flow_pay->link);
 	flow_pay->tc_flower_cookie = flow->cookie;
 	hash_add_rcu(priv->flow_table, &flow_pay->link, flow->cookie);
+	port->tc_offload_cnt++;
 
 	/* Deallocate flow payload when flower rule has been destroyed. */
 	kfree(key_layer);
@@ -421,6 +423,7 @@ static int
 nfp_flower_del_offload(struct nfp_app *app, struct net_device *netdev,
 		       struct tc_cls_flower_offload *flow)
 {
+	struct nfp_port *port = nfp_port_from_netdev(netdev);
 	struct nfp_fl_payload *nfp_flow;
 	int err;
 
@@ -442,6 +445,7 @@ nfp_flower_del_offload(struct nfp_app *app, struct net_device *netdev,
 
 err_free_flow:
 	hash_del_rcu(&nfp_flow->link);
+	port->tc_offload_cnt--;
 	kfree(nfp_flow->action_data);
 	kfree(nfp_flow->mask_data);
 	kfree(nfp_flow->unmasked_data);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h
index 437964afa8eef..20546ae679090 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_app.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h
@@ -92,7 +92,6 @@ extern const struct nfp_app_type app_flower;
  * @stop:	stop application logic
  * @ctrl_msg_rx:    control message handler
  * @setup_tc:	setup TC ndo
- * @tc_busy:	TC HW offload busy (rules loaded)
  * @bpf:	BPF ndo offload-related calls
  * @xdp_offload:    offload an XDP program
  * @eswitch_mode_get:    get SR-IOV eswitch mode
@@ -135,7 +134,6 @@ struct nfp_app_type {
 
 	int (*setup_tc)(struct nfp_app *app, struct net_device *netdev,
 			enum tc_setup_type type, void *type_data);
-	bool (*tc_busy)(struct nfp_app *app, struct nfp_net *nn);
 	int (*bpf)(struct nfp_app *app, struct nfp_net *nn,
 		   struct netdev_bpf *xdp);
 	int (*xdp_offload)(struct nfp_app *app, struct nfp_net *nn,
@@ -301,13 +299,6 @@ static inline bool nfp_app_has_tc(struct nfp_app *app)
 	return app && app->type->setup_tc;
 }
 
-static inline bool nfp_app_tc_busy(struct nfp_app *app, struct nfp_net *nn)
-{
-	if (!app || !app->type->tc_busy)
-		return false;
-	return app->type->tc_busy(app, nn);
-}
-
 static inline int nfp_app_setup_tc(struct nfp_app *app,
 				   struct net_device *netdev,
 				   enum tc_setup_type type, void *type_data)
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index fe77ea8b656c0..19e989239af70 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3210,10 +3210,9 @@ static int nfp_net_set_features(struct net_device *netdev,
 			new_ctrl &= ~NFP_NET_CFG_CTRL_GATHER;
 	}
 
-	if (changed & NETIF_F_HW_TC && nfp_app_tc_busy(nn->app, nn)) {
-		nn_err(nn, "Cannot disable HW TC offload while in use\n");
-		return -EBUSY;
-	}
+	err = nfp_port_set_features(netdev, features);
+	if (err)
+		return err;
 
 	nn_dbg(nn, "Feature change 0x%llx -> 0x%llx (changed=0x%llx)\n",
 	       netdev->features, features, changed);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
index f67da6bde9da3..619570524d2a4 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
@@ -265,6 +265,7 @@ const struct net_device_ops nfp_repr_netdev_ops = {
 	.ndo_set_vf_spoofchk	= nfp_app_set_vf_spoofchk,
 	.ndo_get_vf_config	= nfp_app_get_vf_config,
 	.ndo_set_vf_link_state	= nfp_app_set_vf_link_state,
+	.ndo_set_features	= nfp_port_set_features,
 };
 
 static void nfp_repr_clean(struct nfp_repr *repr)
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.c b/drivers/net/ethernet/netronome/nfp/nfp_port.c
index 34a6e035fe9a2..7bd8be5c833b0 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_port.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_port.c
@@ -32,6 +32,7 @@
  */
 
 #include <linux/lockdep.h>
+#include <linux/netdevice.h>
 #include <net/switchdev.h>
 
 #include "nfpcore/nfp_cpp.h"
@@ -100,6 +101,23 @@ int nfp_port_setup_tc(struct net_device *netdev, enum tc_setup_type type,
 	return nfp_app_setup_tc(port->app, netdev, type, type_data);
 }
 
+int nfp_port_set_features(struct net_device *netdev, netdev_features_t features)
+{
+	struct nfp_port *port;
+
+	port = nfp_port_from_netdev(netdev);
+	if (!port)
+		return 0;
+
+	if ((netdev->features & NETIF_F_HW_TC) > (features & NETIF_F_HW_TC) &&
+	    port->tc_offload_cnt) {
+		netdev_err(netdev, "Cannot disable HW TC offload while offloads active\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
 struct nfp_port *
 nfp_port_from_id(struct nfp_pf *pf, enum nfp_port_type type, unsigned int id)
 {
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.h b/drivers/net/ethernet/netronome/nfp/nfp_port.h
index 21bd4aa326468..fa7e669a969c6 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_port.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_port.h
@@ -72,6 +72,8 @@ enum nfp_port_flags {
  * @netdev:	backpointer to associated netdev
  * @type:	what port type does the entity represent
  * @flags:	port flags
+ * @tc_offload_cnt:	number of active TC offloads, how offloads are counted
+ *			is not defined, use as a boolean
  * @app:	backpointer to the app structure
  * @dl_port:	devlink port structure
  * @eth_id:	for %NFP_PORT_PHYS_PORT port ID in NFP enumeration scheme
@@ -87,6 +89,7 @@ struct nfp_port {
 	enum nfp_port_type type;
 
 	unsigned long flags;
+	unsigned long tc_offload_cnt;
 
 	struct nfp_app *app;
 
@@ -121,6 +124,9 @@ static inline bool nfp_port_is_vnic(const struct nfp_port *port)
 	return port->type == NFP_PORT_PF_PORT || port->type == NFP_PORT_VF_PORT;
 }
 
+int
+nfp_port_set_features(struct net_device *netdev, netdev_features_t features);
+
 struct nfp_port *nfp_port_from_netdev(struct net_device *netdev);
 struct nfp_port *
 nfp_port_from_id(struct nfp_pf *pf, enum nfp_port_type type, unsigned int id);

From 0d592e52fbbda26155b686dd93878cf774a3ab0e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 7 Feb 2018 20:55:25 -0800
Subject: [PATCH 415/676] nfp: limit the number of TSO segments

Most FWs limit the number of TSO segments a frame can produce
to 64.  This is for fairness and efficiency (of FW datapath)
reasons.  If a frame with larger number of segments is submitted
the FW will drop it.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 ++
 drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h   | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 19e989239af70..a05be0ab27134 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3750,6 +3750,8 @@ static void nfp_net_netdev_init(struct nfp_net *nn)
 	netdev->min_mtu = ETH_MIN_MTU;
 	netdev->max_mtu = nn->max_mtu;
 
+	netdev->gso_max_segs = NFP_NET_LSO_MAX_SEGS;
+
 	netif_carrier_off(netdev);
 
 	nfp_net_set_ethtool_ops(netdev);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h
index eeecef2caac6f..4499a73330784 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h
@@ -59,9 +59,12 @@
 #define NFP_NET_RX_OFFSET               32
 
 /**
- * Maximum header size supported for LSO frames
+ * LSO parameters
+ * %NFP_NET_LSO_MAX_HDR_SZ:	Maximum header size supported for LSO frames
+ * %NFP_NET_LSO_MAX_SEGS:	Maximum number of segments LSO frame can produce
  */
 #define NFP_NET_LSO_MAX_HDR_SZ		255
+#define NFP_NET_LSO_MAX_SEGS		64
 
 /**
  * Prepend field types

From 1a5e8e35000577cb9100d22daa8b5ebcfa2be9b2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 7 Feb 2018 20:55:26 -0800
Subject: [PATCH 416/676] nfp: populate MODULE_VERSION

DKMS and similar out-of-tree module replacement services use
module version to make sure the out-of-tree software is not
older than the module shipped with the kernel.  We use the
kernel version in ethtool -i output, put it into MODULE_VERSION
as well.

Reported-by: Jan Gutter <jan.gutter@netronome.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c
index cc570bb6563c6..ab301d56430bc 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c
@@ -649,3 +649,4 @@ MODULE_FIRMWARE("netronome/nic_AMDA0099-0001_2x25.nffw");
 MODULE_AUTHOR("Netronome Systems <oss-drivers@netronome.com>");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("The Netronome Flow Processor (NFP) driver.");
+MODULE_VERSION(UTS_RELEASE);

From 7b6586562708d2b3a04fe49f217ddbadbbbb0546 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Mon, 5 Feb 2018 22:05:31 -0500
Subject: [PATCH 417/676] ftrace: Remove incorrect setting of glob search field

__unregister_ftrace_function_probe() will incorrectly parse the glob filter
because it resets the search variable that was setup by filter_parse_regex().

Al Viro reported this:

    After that call of filter_parse_regex() we could have func_g.search not
    equal to glob only if glob started with '!' or '*'.  In the former case
    we would've buggered off with -EINVAL (not = 1).  In the latter we
    would've set func_g.search equal to glob + 1, calculated the length of
    that thing in func_g.len and proceeded to reset func_g.search back to
    glob.

    Suppose the glob is e.g. *foo*.  We end up with
	    func_g.type = MATCH_MIDDLE_ONLY;
	    func_g.len = 3;
	    func_g.search = "*foo";
    Feeding that to ftrace_match_record() will not do anything sane - we
    will be looking for names containing "*foo" (->len is ignored for that
    one).

Link: http://lkml.kernel.org/r/20180127031706.GE13338@ZenIV.linux.org.uk

Cc: stable@vger.kernel.org
Fixes: 3ba009297149f ("ftrace: Introduce ftrace_glob structure")
Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Reported-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index dabd9d167d425..eac9ce2c57a2e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4456,7 +4456,6 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
 		func_g.type = filter_parse_regex(glob, strlen(glob),
 						 &func_g.search, &not);
 		func_g.len = strlen(func_g.search);
-		func_g.search = glob;
 
 		/* we do not support '!' for function probes */
 		if (WARN_ON(not))

From 07234021410bbc27b7c86c18de98616c29fbe667 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Mon, 5 Feb 2018 22:18:11 -0500
Subject: [PATCH 418/676] tracing: Fix parsing of globs with a wildcard at the
 beginning

Al Viro reported:

    For substring - sure, but what about something like "*a*b" and "a*b"?
    AFAICS, filter_parse_regex() ends up with identical results in both
    cases - MATCH_GLOB and *search = "a*b".  And no way for the caller
    to tell one from another.

Testing this with the following:

 # cd /sys/kernel/tracing
 # echo '*raw*lock' > set_ftrace_filter
 bash: echo: write error: Invalid argument

With this patch:

 # echo '*raw*lock' > set_ftrace_filter
 # cat set_ftrace_filter
_raw_read_trylock
_raw_write_trylock
_raw_read_unlock
_raw_spin_unlock
_raw_write_unlock
_raw_spin_trylock
_raw_spin_lock
_raw_write_lock
_raw_read_lock

Al recommended not setting the search buffer to skip the first '*' unless we
know we are not using MATCH_GLOB. This implements his suggested logic.

Link: http://lkml.kernel.org/r/20180127170748.GF13338@ZenIV.linux.org.uk

Cc: stable@vger.kernel.org
Fixes: 60f1d5e3bac44 ("ftrace: Support full glob matching")
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Reported-by: Al Viro <viro@ZenIV.linux.org.uk>
Suggsted-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 61e7f0678d335..a764aec3c9a17 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -400,7 +400,6 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
 	for (i = 0; i < len; i++) {
 		if (buff[i] == '*') {
 			if (!i) {
-				*search = buff + 1;
 				type = MATCH_END_ONLY;
 			} else if (i == len - 1) {
 				if (type == MATCH_END_ONLY)
@@ -410,14 +409,14 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
 				buff[i] = 0;
 				break;
 			} else {	/* pattern continues, use full glob */
-				type = MATCH_GLOB;
-				break;
+				return MATCH_GLOB;
 			}
 		} else if (strchr("[?\\", buff[i])) {
-			type = MATCH_GLOB;
-			break;
+			return MATCH_GLOB;
 		}
 	}
+	if (buff[0] == '*')
+		*search = buff + 1;
 
 	return type;
 }

From df9d36d9e8adefe0975c7b5888967651b3db3af3 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 6 Feb 2018 17:10:12 -0500
Subject: [PATCH 419/676] selftests/ftrace: Have reset_ftrace_filter handle
 modules

If a function probe in set_ftrace_filter belongs to a module, it will
contain the module name. Like:

 wmi_query_block [wmi]:traceoff:unlimited

But writing:

 '!wmi_query_block [wmi]:traceoff' > set_ftrace_filter

will cause an error. We still need to write:

 '!wmi_query_block:traceoff' > set_ftrace_filter

Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 tools/testing/selftests/ftrace/test.d/functions | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions
index f2019b37370d3..e7c4c7b752a25 100644
--- a/tools/testing/selftests/ftrace/test.d/functions
+++ b/tools/testing/selftests/ftrace/test.d/functions
@@ -37,17 +37,18 @@ reset_ftrace_filter() { # reset all triggers in set_ftrace_filter
 	if [ "$tr" = "" ]; then
 	    continue
 	fi
+	name=`echo $t | cut -d: -f1 | cut -d' ' -f1`
 	if [ $tr = "enable_event" -o $tr = "disable_event" ]; then
-	    tr=`echo $t | cut -d: -f1-4`
+	    tr=`echo $t | cut -d: -f2-4`
 	    limit=`echo $t | cut -d: -f5`
 	else
-	    tr=`echo $t | cut -d: -f1-2`
+	    tr=`echo $t | cut -d: -f2`
 	    limit=`echo $t | cut -d: -f3`
 	fi
 	if [ "$limit" != "unlimited" ]; then
 	    tr="$tr:$limit"
 	fi
-	echo "!$tr" > set_ftrace_filter
+	echo "!$name:$tr" > set_ftrace_filter
     done
 }
 

From 0787ce336075ac0fa2d356de4e3c2a2488e851a6 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 6 Feb 2018 17:15:02 -0500
Subject: [PATCH 420/676] selftests/ftrace: Have reset_ftrace_filter handle
 multiple instances

If a probe is attached to a static function that is in multiple files with
the same name, removing it by name will remove all instances:

 # grep jump_label_unlock set_ftrace_filter
jump_label_unlock:traceoff:unlimited
jump_label_unlock:traceoff:unlimited

 # echo '!jump_label_unlock:traceoff' >> set_ftrace_filter
 # grep jump_label_unlock set_ftrace_filter
 #

But the loop in reset_ftrace_filter will try to remove multiple instances
multiple times. If this happens the second time will error and cause the
test to fail.

At each iteration of the loop, check to see if the probe being removed still
exists.

Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 tools/testing/selftests/ftrace/test.d/functions | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions
index e7c4c7b752a25..df3dd7fe5f9b2 100644
--- a/tools/testing/selftests/ftrace/test.d/functions
+++ b/tools/testing/selftests/ftrace/test.d/functions
@@ -37,6 +37,9 @@ reset_ftrace_filter() { # reset all triggers in set_ftrace_filter
 	if [ "$tr" = "" ]; then
 	    continue
 	fi
+	if ! grep -q "$t" set_ftrace_filter; then
+		continue;
+	fi
 	name=`echo $t | cut -d: -f1 | cut -d' ' -f1`
 	if [ $tr = "enable_event" -o $tr = "disable_event" ]; then
 	    tr=`echo $t | cut -d: -f2-4`

From 97fe22adf33f06519bfdf7dad33bcd562e366c8f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 6 Feb 2018 17:19:03 -0500
Subject: [PATCH 421/676] selftests/ftrace: Add some missing glob checks

Al Viro discovered a bug in the glob ftrace filtering code where "*a*b" is
treated the same as "a*b", and functions that would be selected by "*a*b"
but not "a*b" are not selected with "*a*b".

Add tests for patterns "*a*b" and "a*b*" to the glob selftest.

Link: http://lkml.kernel.org/r/20180127170748.GF13338@ZenIV.linux.org.uk

Cc: Shuah Khan <shuah@kernel.org>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 .../selftests/ftrace/test.d/ftrace/func-filter-glob.tc      | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-glob.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-glob.tc
index 589d52b211b7c..27a54a17da65d 100644
--- a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-glob.tc
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-glob.tc
@@ -29,6 +29,12 @@ ftrace_filter_check '*schedule*' '^.*schedule.*$'
 # filter by *, end match
 ftrace_filter_check 'schedule*' '^schedule.*$'
 
+# filter by *mid*end
+ftrace_filter_check '*aw*lock' '.*aw.*lock$'
+
+# filter by start*mid*
+ftrace_filter_check 'mutex*try*' '^mutex.*try.*'
+
 # Advanced full-glob matching feature is recently supported.
 # Skip the tests if we are sure the kernel does not support it.
 if grep -q 'accepts: .* glob-matching-pattern' README ; then

From 878cb3fb06c67e8f1a452346e0bc6bb85f29b0a0 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 6 Feb 2018 17:28:36 -0500
Subject: [PATCH 422/676] selftests/ftrace: Add more tests for removing of
 function probes

Al Viro discovered a bug in the removing of function probes where if it had
a '*' at the beginning, it would fail to find any matches. That is, because
it reset the glob search string to the the initial string with a "MATCH_END"
type, instead of skipping the wildcard "*" it included it, where it would
not match any functions because "*" was being treated as a normal character
and not a wildcard one.

Link: http://lkml.kernel.org/r/20180127031706.GE13338@ZenIV.linux.org.uk

Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 .../test.d/ftrace/func_set_ftrace_file.tc     | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc
index 0f3f92622e335..68e7a48f5828e 100644
--- a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc
@@ -128,6 +128,43 @@ if check_set_ftrace_filter "$FUNC1" "$FUNC2" ; then
     fail "Expected $FUNC1 and $FUNC2"
 fi
 
+test_actual() { # Compares $TMPDIR/expected with set_ftrace_filter
+    cat set_ftrace_filter | grep -v '#' | cut -d' ' -f1 | cut -d':' -f1 | sort -u > $TMPDIR/actual
+    DIFF=`diff $TMPDIR/actual $TMPDIR/expected`
+    test -z "$DIFF"
+}
+
+# Set traceoff trigger for all fuctions with "lock" in their name
+cat available_filter_functions | cut -d' ' -f1 |  grep 'lock' | sort -u > $TMPDIR/expected
+echo '*lock*:traceoff' > set_ftrace_filter
+test_actual
+
+# now remove all with 'try' in it, and end with lock
+grep -v 'try.*lock$' $TMPDIR/expected > $TMPDIR/expected2
+mv $TMPDIR/expected2 $TMPDIR/expected
+echo '!*try*lock:traceoff' >> set_ftrace_filter
+test_actual
+
+# remove all that start with "m" and end with "lock"
+grep -v '^m.*lock$' $TMPDIR/expected > $TMPDIR/expected2
+mv $TMPDIR/expected2 $TMPDIR/expected
+echo '!m*lock:traceoff' >> set_ftrace_filter
+test_actual
+
+# remove all that start with "c" and have "unlock"
+grep -v '^c.*unlock' $TMPDIR/expected > $TMPDIR/expected2
+mv $TMPDIR/expected2 $TMPDIR/expected
+echo '!c*unlock*:traceoff' >> set_ftrace_filter
+test_actual
+
+# clear all the rest
+> $TMPDIR/expected
+echo '!*:traceoff' >> set_ftrace_filter
+test_actual
+
+rm $TMPDIR/expected
+rm $TMPDIR/actual
+
 do_reset
 
 exit 0

From ad6a0a52e6de3d1161b7999c7903db906ba4cf79 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Wed, 31 Jan 2018 18:31:24 +0200
Subject: [PATCH 423/676] nvme: rename NVME_CTRL_RECONNECTING state to
 NVME_CTRL_CONNECTING

In pci transport, this state is used to mark the initialization
process. This should be also used in other transports as well.

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/core.c    | 10 +++++-----
 drivers/nvme/host/fabrics.h |  9 +++++----
 drivers/nvme/host/fc.c      | 14 +++++++-------
 drivers/nvme/host/nvme.h    |  2 +-
 drivers/nvme/host/pci.c     |  8 ++++----
 drivers/nvme/host/rdma.c    |  6 +++---
 6 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f431c32774f36..1033de4136e0e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -265,7 +265,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 	switch (new_state) {
 	case NVME_CTRL_ADMIN_ONLY:
 		switch (old_state) {
-		case NVME_CTRL_RECONNECTING:
+		case NVME_CTRL_CONNECTING:
 			changed = true;
 			/* FALLTHRU */
 		default:
@@ -276,7 +276,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 		switch (old_state) {
 		case NVME_CTRL_NEW:
 		case NVME_CTRL_RESETTING:
-		case NVME_CTRL_RECONNECTING:
+		case NVME_CTRL_CONNECTING:
 			changed = true;
 			/* FALLTHRU */
 		default:
@@ -294,7 +294,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 			break;
 		}
 		break;
-	case NVME_CTRL_RECONNECTING:
+	case NVME_CTRL_CONNECTING:
 		switch (old_state) {
 		case NVME_CTRL_LIVE:
 		case NVME_CTRL_RESETTING:
@@ -309,7 +309,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 		case NVME_CTRL_LIVE:
 		case NVME_CTRL_ADMIN_ONLY:
 		case NVME_CTRL_RESETTING:
-		case NVME_CTRL_RECONNECTING:
+		case NVME_CTRL_CONNECTING:
 			changed = true;
 			/* FALLTHRU */
 		default:
@@ -2687,7 +2687,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev,
 		[NVME_CTRL_LIVE]	= "live",
 		[NVME_CTRL_ADMIN_ONLY]	= "only-admin",
 		[NVME_CTRL_RESETTING]	= "resetting",
-		[NVME_CTRL_RECONNECTING]= "reconnecting",
+		[NVME_CTRL_CONNECTING]	= "connecting",
 		[NVME_CTRL_DELETING]	= "deleting",
 		[NVME_CTRL_DEAD]	= "dead",
 	};
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 25b19f722f5b2..a3145d90c1d2c 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -171,13 +171,14 @@ static inline blk_status_t nvmf_check_init_req(struct nvme_ctrl *ctrl,
 	    cmd->common.opcode != nvme_fabrics_command ||
 	    cmd->fabrics.fctype != nvme_fabrics_type_connect) {
 		/*
-		 * Reconnecting state means transport disruption, which can take
-		 * a long time and even might fail permanently, fail fast to
-		 * give upper layers a chance to failover.
+		 * Connecting state means transport disruption or initial
+		 * establishment, which can take a long time and even might
+		 * fail permanently, fail fast to give upper layers a chance
+		 * to failover.
 		 * Deleting state means that the ctrl will never accept commands
 		 * again, fail it permanently.
 		 */
-		if (ctrl->state == NVME_CTRL_RECONNECTING ||
+		if (ctrl->state == NVME_CTRL_CONNECTING ||
 		    ctrl->state == NVME_CTRL_DELETING) {
 			nvme_req(rq)->status = NVME_SC_ABORT_REQ;
 			return BLK_STS_IOERR;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index b856d7c919d29..e2df22d56b2ab 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -532,7 +532,7 @@ nvme_fc_resume_controller(struct nvme_fc_ctrl *ctrl)
 {
 	switch (ctrl->ctrl.state) {
 	case NVME_CTRL_NEW:
-	case NVME_CTRL_RECONNECTING:
+	case NVME_CTRL_CONNECTING:
 		/*
 		 * As all reconnects were suppressed, schedule a
 		 * connect.
@@ -777,7 +777,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
 		}
 		break;
 
-	case NVME_CTRL_RECONNECTING:
+	case NVME_CTRL_CONNECTING:
 		/*
 		 * The association has already been terminated and the
 		 * controller is attempting reconnects.  No need to do anything
@@ -1722,7 +1722,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 	if (status &&
 	    (blk_queue_dying(rq->q) ||
 	     ctrl->ctrl.state == NVME_CTRL_NEW ||
-	     ctrl->ctrl.state == NVME_CTRL_RECONNECTING))
+	     ctrl->ctrl.state == NVME_CTRL_CONNECTING))
 		status |= cpu_to_le16(NVME_SC_DNR << 1);
 
 	if (__nvme_fc_fcpop_chk_teardowns(ctrl, op))
@@ -2943,7 +2943,7 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
 	unsigned long recon_delay = ctrl->ctrl.opts->reconnect_delay * HZ;
 	bool recon = true;
 
-	if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING)
+	if (ctrl->ctrl.state != NVME_CTRL_CONNECTING)
 		return;
 
 	if (portptr->port_state == FC_OBJSTATE_ONLINE)
@@ -2991,10 +2991,10 @@ nvme_fc_reset_ctrl_work(struct work_struct *work)
 	/* will block will waiting for io to terminate */
 	nvme_fc_delete_association(ctrl);
 
-	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
 		dev_err(ctrl->ctrl.device,
 			"NVME-FC{%d}: error_recovery: Couldn't change state "
-			"to RECONNECTING\n", ctrl->cnum);
+			"to CONNECTING\n", ctrl->cnum);
 		return;
 	}
 
@@ -3195,7 +3195,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 	 * transport errors (frame drop, LS failure) inherently must kill
 	 * the association. The transport is coded so that any command used
 	 * to create the association (prior to a LIVE state transition
-	 * while NEW or RECONNECTING) will fail if it completes in error or
+	 * while NEW or CONNECTING) will fail if it completes in error or
 	 * times out.
 	 *
 	 * As such: as the connect request was mostly likely due to a
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 8e4550fa08f8b..27e31c00b306f 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -123,7 +123,7 @@ enum nvme_ctrl_state {
 	NVME_CTRL_LIVE,
 	NVME_CTRL_ADMIN_ONLY,    /* Only admin queue live */
 	NVME_CTRL_RESETTING,
-	NVME_CTRL_RECONNECTING,
+	NVME_CTRL_CONNECTING,
 	NVME_CTRL_DELETING,
 	NVME_CTRL_DEAD,
 };
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6fe7af00a1f42..ab9c19525fa80 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1141,7 +1141,7 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
 	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
 	switch (dev->ctrl.state) {
 	case NVME_CTRL_RESETTING:
-	case NVME_CTRL_RECONNECTING:
+	case NVME_CTRL_CONNECTING:
 		return false;
 	default:
 		break;
@@ -2288,12 +2288,12 @@ static void nvme_reset_work(struct work_struct *work)
 		nvme_dev_disable(dev, false);
 
 	/*
-	 * Introduce RECONNECTING state from nvme-fc/rdma transports to mark the
+	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
 	 * initializing procedure here.
 	 */
-	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RECONNECTING)) {
+	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
 		dev_warn(dev->ctrl.device,
-			"failed to mark controller RECONNECTING\n");
+			"failed to mark controller CONNECTING\n");
 		goto out;
 	}
 
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 2bc059f7d73c7..050eaa24cc7d9 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -887,7 +887,7 @@ static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
 static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
 {
 	/* If we are resetting/deleting then do nothing */
-	if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) {
+	if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) {
 		WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW ||
 			ctrl->ctrl.state == NVME_CTRL_LIVE);
 		return;
@@ -973,7 +973,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
 	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
 	nvme_start_queues(&ctrl->ctrl);
 
-	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
 		/* state change failure should never happen */
 		WARN_ON_ONCE(1);
 		return;
@@ -1756,7 +1756,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
 	nvme_stop_ctrl(&ctrl->ctrl);
 	nvme_rdma_shutdown_ctrl(ctrl, false);
 
-	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
 		/* state change failure should never happen */
 		WARN_ON_ONCE(1);
 		return;

From b754a32c66772e6510acd92237aadd4cf227ae39 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Wed, 31 Jan 2018 18:31:25 +0200
Subject: [PATCH 424/676] nvme-rdma: use NVME_CTRL_CONNECTING state to mark
 init process

In order to avoid concurrent error recovery during initialization
process (allowed by the NVME_CTRL_NEW --> NVME_CTRL_RESETTING transition)
we must mark the ctrl as CONNECTING before initial connection
establisment.

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/core.c | 1 +
 drivers/nvme/host/rdma.c | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1033de4136e0e..86dca2919e191 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -296,6 +296,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 		break;
 	case NVME_CTRL_CONNECTING:
 		switch (old_state) {
+		case NVME_CTRL_NEW:
 		case NVME_CTRL_LIVE:
 		case NVME_CTRL_RESETTING:
 			changed = true;
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 050eaa24cc7d9..5e2cc4f0d207f 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1942,6 +1942,9 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
 	if (!ctrl->queues)
 		goto out_uninit_ctrl;
 
+	changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING);
+	WARN_ON_ONCE(!changed);
+
 	ret = nvme_rdma_configure_admin_queue(ctrl, true);
 	if (ret)
 		goto out_kfree_queues;

From 3096a739d2ccbfd6a626e388228a16558f76d79d Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Wed, 31 Jan 2018 18:31:26 +0200
Subject: [PATCH 425/676] nvme: delete NVME_CTRL_LIVE --> NVME_CTRL_CONNECTING
 transition

There is no logical reason to move from live state to connecting
state. In case of initial connection establishment, the transition
should be NVME_CTRL_NEW --> NVME_CTRL_CONNECTING --> NVME_CTRL_LIVE.
In case of error recovery or reset, the transition should be
NVME_CTRL_LIVE --> NVME_CTRL_RESETTING --> NVME_CTRL_CONNECTING -->
NVME_CTRL_LIVE.

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/core.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 86dca2919e191..1f9278364196e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -297,7 +297,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 	case NVME_CTRL_CONNECTING:
 		switch (old_state) {
 		case NVME_CTRL_NEW:
-		case NVME_CTRL_LIVE:
 		case NVME_CTRL_RESETTING:
 			changed = true;
 			/* FALLTHRU */

From 8cb6af7b3a6d47f95ecb461a3f8d39cf6a64e4ae Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Wed, 31 Jan 2018 17:01:58 -0700
Subject: [PATCH 426/676] nvme: Fix discard buffer overrun

This patch checks the discard range array bounds before setting it in
case the driver gets a badly formed request.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/core.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1f9278364196e..2fd8688cfa474 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -518,9 +518,11 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 		u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
 		u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
 
-		range[n].cattr = cpu_to_le32(0);
-		range[n].nlb = cpu_to_le32(nlb);
-		range[n].slba = cpu_to_le64(slba);
+		if (n < segments) {
+			range[n].cattr = cpu_to_le32(0);
+			range[n].nlb = cpu_to_le32(nlb);
+			range[n].slba = cpu_to_le64(slba);
+		}
 		n++;
 	}
 

From 8c2f826dc36314059ac146c78d3bf8056b626446 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 8 Feb 2018 15:59:07 +0000
Subject: [PATCH 427/676] rxrpc: Don't put crypto buffers on the stack

Don't put buffers of data to be handed to crypto on the stack as this may
cause an assertion failure in the kernel (see below).  Fix this by using an
kmalloc'd buffer instead.

kernel BUG at ./include/linux/scatterlist.h:147!
...
RIP: 0010:rxkad_encrypt_response.isra.6+0x191/0x1b0 [rxrpc]
RSP: 0018:ffffbe2fc06cfca8 EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff989277d59900 RCX: 0000000000000028
RDX: 0000259dc06cfd88 RSI: 0000000000000025 RDI: ffffbe30406cfd88
RBP: ffffbe2fc06cfd60 R08: ffffbe2fc06cfd08 R09: ffffbe2fc06cfd08
R10: 0000000000000000 R11: 0000000000000000 R12: 1ffff7c5f80d9f95
R13: ffffbe2fc06cfd88 R14: ffff98927a3f7aa0 R15: ffffbe2fc06cfd08
FS:  0000000000000000(0000) GS:ffff98927fc00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000055b1ff28f0f8 CR3: 000000001b412003 CR4: 00000000003606f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 rxkad_respond_to_challenge+0x297/0x330 [rxrpc]
 rxrpc_process_connection+0xd1/0x690 [rxrpc]
 ? process_one_work+0x1c3/0x680
 ? __lock_is_held+0x59/0xa0
 process_one_work+0x249/0x680
 worker_thread+0x3a/0x390
 ? process_one_work+0x680/0x680
 kthread+0x121/0x140
 ? kthread_create_worker_on_cpu+0x70/0x70
 ret_from_fork+0x3a/0x50

Reported-by: Jonathan Billings <jsbillings@jsbillings.org>
Reported-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Jonathan Billings <jsbillings@jsbillings.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rxrpc/conn_event.c |  1 +
 net/rxrpc/rxkad.c      | 92 +++++++++++++++++++++++-------------------
 2 files changed, 52 insertions(+), 41 deletions(-)

diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 4ca11be6be3ca..b1dfae1074310 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -460,6 +460,7 @@ void rxrpc_process_connection(struct work_struct *work)
 		case -EKEYEXPIRED:
 		case -EKEYREJECTED:
 			goto protocol_error;
+		case -ENOMEM:
 		case -EAGAIN:
 			goto requeue_and_leave;
 		case -ECONNABORTED:
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index c38b3a1de56c1..77cb23c7bd0a8 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -773,8 +773,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
 {
 	const struct rxrpc_key_token *token;
 	struct rxkad_challenge challenge;
-	struct rxkad_response resp
-		__attribute__((aligned(8))); /* must be aligned for crypto */
+	struct rxkad_response *resp;
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
 	const char *eproto;
 	u32 version, nonce, min_level, abort_code;
@@ -818,26 +817,29 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
 	token = conn->params.key->payload.data[0];
 
 	/* build the response packet */
-	memset(&resp, 0, sizeof(resp));
-
-	resp.version			= htonl(RXKAD_VERSION);
-	resp.encrypted.epoch		= htonl(conn->proto.epoch);
-	resp.encrypted.cid		= htonl(conn->proto.cid);
-	resp.encrypted.securityIndex	= htonl(conn->security_ix);
-	resp.encrypted.inc_nonce	= htonl(nonce + 1);
-	resp.encrypted.level		= htonl(conn->params.security_level);
-	resp.kvno			= htonl(token->kad->kvno);
-	resp.ticket_len			= htonl(token->kad->ticket_len);
-
-	resp.encrypted.call_id[0] = htonl(conn->channels[0].call_counter);
-	resp.encrypted.call_id[1] = htonl(conn->channels[1].call_counter);
-	resp.encrypted.call_id[2] = htonl(conn->channels[2].call_counter);
-	resp.encrypted.call_id[3] = htonl(conn->channels[3].call_counter);
+	resp = kzalloc(sizeof(struct rxkad_response), GFP_NOFS);
+	if (!resp)
+		return -ENOMEM;
+
+	resp->version			= htonl(RXKAD_VERSION);
+	resp->encrypted.epoch		= htonl(conn->proto.epoch);
+	resp->encrypted.cid		= htonl(conn->proto.cid);
+	resp->encrypted.securityIndex	= htonl(conn->security_ix);
+	resp->encrypted.inc_nonce	= htonl(nonce + 1);
+	resp->encrypted.level		= htonl(conn->params.security_level);
+	resp->kvno			= htonl(token->kad->kvno);
+	resp->ticket_len		= htonl(token->kad->ticket_len);
+	resp->encrypted.call_id[0]	= htonl(conn->channels[0].call_counter);
+	resp->encrypted.call_id[1]	= htonl(conn->channels[1].call_counter);
+	resp->encrypted.call_id[2]	= htonl(conn->channels[2].call_counter);
+	resp->encrypted.call_id[3]	= htonl(conn->channels[3].call_counter);
 
 	/* calculate the response checksum and then do the encryption */
-	rxkad_calc_response_checksum(&resp);
-	rxkad_encrypt_response(conn, &resp, token->kad);
-	return rxkad_send_response(conn, &sp->hdr, &resp, token->kad);
+	rxkad_calc_response_checksum(resp);
+	rxkad_encrypt_response(conn, resp, token->kad);
+	ret = rxkad_send_response(conn, &sp->hdr, resp, token->kad);
+	kfree(resp);
+	return ret;
 
 protocol_error:
 	trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto);
@@ -1048,8 +1050,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
 				 struct sk_buff *skb,
 				 u32 *_abort_code)
 {
-	struct rxkad_response response
-		__attribute__((aligned(8))); /* must be aligned for crypto */
+	struct rxkad_response *response;
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
 	struct rxrpc_crypt session_key;
 	const char *eproto;
@@ -1061,17 +1062,22 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
 
 	_enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key));
 
+	ret = -ENOMEM;
+	response = kzalloc(sizeof(struct rxkad_response), GFP_NOFS);
+	if (!response)
+		goto temporary_error;
+
 	eproto = tracepoint_string("rxkad_rsp_short");
 	abort_code = RXKADPACKETSHORT;
 	if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header),
-			  &response, sizeof(response)) < 0)
+			  response, sizeof(*response)) < 0)
 		goto protocol_error;
-	if (!pskb_pull(skb, sizeof(response)))
+	if (!pskb_pull(skb, sizeof(*response)))
 		BUG();
 
-	version = ntohl(response.version);
-	ticket_len = ntohl(response.ticket_len);
-	kvno = ntohl(response.kvno);
+	version = ntohl(response->version);
+	ticket_len = ntohl(response->ticket_len);
+	kvno = ntohl(response->kvno);
 	_proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }",
 	       sp->hdr.serial, version, kvno, ticket_len);
 
@@ -1105,31 +1111,31 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
 	ret = rxkad_decrypt_ticket(conn, skb, ticket, ticket_len, &session_key,
 				   &expiry, _abort_code);
 	if (ret < 0)
-		goto temporary_error_free;
+		goto temporary_error_free_resp;
 
 	/* use the session key from inside the ticket to decrypt the
 	 * response */
-	rxkad_decrypt_response(conn, &response, &session_key);
+	rxkad_decrypt_response(conn, response, &session_key);
 
 	eproto = tracepoint_string("rxkad_rsp_param");
 	abort_code = RXKADSEALEDINCON;
-	if (ntohl(response.encrypted.epoch) != conn->proto.epoch)
+	if (ntohl(response->encrypted.epoch) != conn->proto.epoch)
 		goto protocol_error_free;
-	if (ntohl(response.encrypted.cid) != conn->proto.cid)
+	if (ntohl(response->encrypted.cid) != conn->proto.cid)
 		goto protocol_error_free;
-	if (ntohl(response.encrypted.securityIndex) != conn->security_ix)
+	if (ntohl(response->encrypted.securityIndex) != conn->security_ix)
 		goto protocol_error_free;
-	csum = response.encrypted.checksum;
-	response.encrypted.checksum = 0;
-	rxkad_calc_response_checksum(&response);
+	csum = response->encrypted.checksum;
+	response->encrypted.checksum = 0;
+	rxkad_calc_response_checksum(response);
 	eproto = tracepoint_string("rxkad_rsp_csum");
-	if (response.encrypted.checksum != csum)
+	if (response->encrypted.checksum != csum)
 		goto protocol_error_free;
 
 	spin_lock(&conn->channel_lock);
 	for (i = 0; i < RXRPC_MAXCALLS; i++) {
 		struct rxrpc_call *call;
-		u32 call_id = ntohl(response.encrypted.call_id[i]);
+		u32 call_id = ntohl(response->encrypted.call_id[i]);
 
 		eproto = tracepoint_string("rxkad_rsp_callid");
 		if (call_id > INT_MAX)
@@ -1153,12 +1159,12 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
 
 	eproto = tracepoint_string("rxkad_rsp_seq");
 	abort_code = RXKADOUTOFSEQUENCE;
-	if (ntohl(response.encrypted.inc_nonce) != conn->security_nonce + 1)
+	if (ntohl(response->encrypted.inc_nonce) != conn->security_nonce + 1)
 		goto protocol_error_free;
 
 	eproto = tracepoint_string("rxkad_rsp_level");
 	abort_code = RXKADLEVELFAIL;
-	level = ntohl(response.encrypted.level);
+	level = ntohl(response->encrypted.level);
 	if (level > RXRPC_SECURITY_ENCRYPT)
 		goto protocol_error_free;
 	conn->params.security_level = level;
@@ -1168,9 +1174,10 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
 	 * as for a client connection */
 	ret = rxrpc_get_server_data_key(conn, &session_key, expiry, kvno);
 	if (ret < 0)
-		goto temporary_error_free;
+		goto temporary_error_free_ticket;
 
 	kfree(ticket);
+	kfree(response);
 	_leave(" = 0");
 	return 0;
 
@@ -1179,12 +1186,15 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
 protocol_error_free:
 	kfree(ticket);
 protocol_error:
+	kfree(response);
 	trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto);
 	*_abort_code = abort_code;
 	return -EPROTO;
 
-temporary_error_free:
+temporary_error_free_ticket:
 	kfree(ticket);
+temporary_error_free_resp:
+	kfree(response);
 temporary_error:
 	/* Ignore the response packet if we got a temporary error such as
 	 * ENOMEM.  We just want to send the challenge again.  Note that we

From cb9f7a9a5c96a773bbc9c70660dc600cfff82f82 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Tue, 6 Feb 2018 14:48:32 +0100
Subject: [PATCH 428/676] netlink: ensure to loop over all netns in
 genlmsg_multicast_allns()

Nowadays, nlmsg_multicast() returns only 0 or -ESRCH but this was not the
case when commit 134e63756d5f was pushed.
However, there was no reason to stop the loop if a netns does not have
listeners.
Returns -ESRCH only if there was no listeners in all netns.

To avoid having the same problem in the future, I didn't take the
assumption that nlmsg_multicast() returns only 0 or -ESRCH.

Fixes: 134e63756d5f ("genetlink: make netns aware")
CC: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/genetlink.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index d444daf1ac045..6f02499ef0072 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -1081,6 +1081,7 @@ static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group,
 {
 	struct sk_buff *tmp;
 	struct net *net, *prev = NULL;
+	bool delivered = false;
 	int err;
 
 	for_each_net_rcu(net) {
@@ -1092,14 +1093,21 @@ static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group,
 			}
 			err = nlmsg_multicast(prev->genl_sock, tmp,
 					      portid, group, flags);
-			if (err)
+			if (!err)
+				delivered = true;
+			else if (err != -ESRCH)
 				goto error;
 		}
 
 		prev = net;
 	}
 
-	return nlmsg_multicast(prev->genl_sock, skb, portid, group, flags);
+	err = nlmsg_multicast(prev->genl_sock, skb, portid, group, flags);
+	if (!err)
+		delivered = true;
+	else if (err != -ESRCH)
+		goto error;
+	return delivered ? 0 : -ESRCH;
  error:
 	kfree_skb(skb);
 	return err;

From cb67ab2cd2b8abd9650292c986c79901e3073a59 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 6 Feb 2018 09:34:42 +0900
Subject: [PATCH 429/676] kconfig: do not write choice values when their
 dependency becomes n

"# CONFIG_... is not set" for choice values are wrongly written into
the .config file if they are once visible, then become invisible later.

  Test case
  ---------

---------------------------(Kconfig)----------------------------
config A
	bool "A"

choice
	prompt "Choice ?"
	depends on A

config CHOICE_B
	bool "Choice B"

config CHOICE_C
	bool "Choice C"

endchoice
----------------------------------------------------------------

---------------------------(.config)----------------------------
CONFIG_A=y
----------------------------------------------------------------

With the Kconfig and .config above,

  $ make config
  scripts/kconfig/conf  --oldaskconfig Kconfig
  *
  * Linux Kernel Configuration
  *
  A (A) [Y/n] n
  #
  # configuration written to .config
  #
  $ cat .config
  #
  # Automatically generated file; DO NOT EDIT.
  # Linux Kernel Configuration
  #
  # CONFIG_A is not set
  # CONFIG_CHOICE_B is not set
  # CONFIG_CHOICE_C is not set

Here,

  # CONFIG_CHOICE_B is not set
  # CONFIG_CHOICE_C is not set

should not be written into the .config file because their dependency
"depends on A" is unmet.

Currently, there is no code that clears SYMBOL_WRITE of choice values.

Clear SYMBOL_WRITE for all symbols in sym_calc_value(), then set it
again after calculating visibility.  To simplify the logic, set the
flag if they have non-n visibility, regardless of types, and regardless
of whether they are choice values or not.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Reviewed-by: Ulf Magnusson <ulfalizer@gmail.com>
---
 scripts/kconfig/symbol.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/scripts/kconfig/symbol.c b/scripts/kconfig/symbol.c
index c9123ed2b791a..13f7fdfe328dc 100644
--- a/scripts/kconfig/symbol.c
+++ b/scripts/kconfig/symbol.c
@@ -371,11 +371,13 @@ void sym_calc_value(struct symbol *sym)
 		sym->curr.tri = no;
 		return;
 	}
-	if (!sym_is_choice_value(sym))
-		sym->flags &= ~SYMBOL_WRITE;
+	sym->flags &= ~SYMBOL_WRITE;
 
 	sym_calc_visibility(sym);
 
+	if (sym->visible != no)
+		sym->flags |= SYMBOL_WRITE;
+
 	/* set default if recursively called */
 	sym->curr = newval;
 
@@ -390,7 +392,6 @@ void sym_calc_value(struct symbol *sym)
 				/* if the symbol is visible use the user value
 				 * if available, otherwise try the default value
 				 */
-				sym->flags |= SYMBOL_WRITE;
 				if (sym_has_value(sym)) {
 					newval.tri = EXPR_AND(sym->def[S_DEF_USER].tri,
 							      sym->visible);
@@ -433,12 +434,9 @@ void sym_calc_value(struct symbol *sym)
 	case S_STRING:
 	case S_HEX:
 	case S_INT:
-		if (sym->visible != no) {
-			sym->flags |= SYMBOL_WRITE;
-			if (sym_has_value(sym)) {
-				newval.val = sym->def[S_DEF_USER].val;
-				break;
-			}
+		if (sym->visible != no && sym_has_value(sym)) {
+			newval.val = sym->def[S_DEF_USER].val;
+			break;
 		}
 		prop = sym_get_default_prop(sym);
 		if (prop) {

From 4f208f392103e8d5bcc7558dba69348b27d8ce42 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 6 Feb 2018 09:34:43 +0900
Subject: [PATCH 430/676] kconfig: show '?' prompt even if no help text is
 available

'make config', 'make oldconfig', etc. always receive '?' as a valid
input and show useful information even if no help text is available.

------------------------>8------------------------
foo (FOO) [N/y] (NEW) ?

There is no help available for this option.
Symbol: FOO [=n]
Type  : bool
Prompt: foo
  Defined at Kconfig:1
------------------------>8------------------------

However, '?' is not shown in the prompt if its help text is missing.
Let's show '?' all the time so that the prompt and the behavior match.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Reviewed-by: Ulf Magnusson <ulfalizer@gmail.com>
---
 scripts/kconfig/conf.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/scripts/kconfig/conf.c b/scripts/kconfig/conf.c
index 307bc3f7b945c..fc446acbf6d6f 100644
--- a/scripts/kconfig/conf.c
+++ b/scripts/kconfig/conf.c
@@ -199,9 +199,7 @@ static int conf_sym(struct menu *menu)
 			printf("/m");
 		if (oldval != yes && sym_tristate_within_range(sym, yes))
 			printf("/y");
-		if (menu_has_help(menu))
-			printf("/?");
-		printf("] ");
+		printf("/?] ");
 		if (!conf_askvalue(sym, sym_get_string_value(sym)))
 			return 0;
 		strip(line);
@@ -303,10 +301,7 @@ static int conf_choice(struct menu *menu)
 			printf("[1]: 1\n");
 			goto conf_childs;
 		}
-		printf("[1-%d", cnt);
-		if (menu_has_help(menu))
-			printf("?");
-		printf("]: ");
+		printf("[1-%d?]: ", cnt);
 		switch (input_mode) {
 		case oldconfig:
 		case silentoldconfig:

From cd58a91def2accba1a07960889e89a63d372119a Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 6 Feb 2018 09:34:45 +0900
Subject: [PATCH 431/676] kconfig: remove 'config*' pattern from .gitignnore

I could not figure out why this pattern should be ignored.
Checking commit 1e65174a3378 ("Add some basic .gitignore files")
did not help.

Let's remove this pattern, then see if it is really needed.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Reviewed-by: Ulf Magnusson <ulfalizer@gmail.com>
---
 scripts/kconfig/.gitignore | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scripts/kconfig/.gitignore b/scripts/kconfig/.gitignore
index 51f1c877b543d..a76856e559c09 100644
--- a/scripts/kconfig/.gitignore
+++ b/scripts/kconfig/.gitignore
@@ -1,7 +1,6 @@
 #
 # Generated files
 #
-config*
 *.lex.c
 *.tab.c
 *.tab.h

From d2a04648a5dbc3d1d043b35257364f0197d4d868 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 8 Feb 2018 14:56:39 +0900
Subject: [PATCH 432/676] kconfig: remove check_stdin()

Except silentoldconfig, valid_stdin is 1, so check_stdin() is no-op.

oldconfig and silentoldconfig work almost in the same way except that
the latter generates additional files under include/.  Both ask users
for input for new symbols.

I do not know why only silentoldconfig requires stdio be tty.

  $ rm -f .config; touch .config
  $ yes "" | make oldconfig > stdout
  $ rm -f .config; touch .config
  $ yes "" | make silentoldconfig > stdout
  make[1]: *** [silentoldconfig] Error 1
  make: *** [silentoldconfig] Error 2
  $ tail -n 4 stdout
  Console input/output is redirected. Run 'make oldconfig' to update configuration.

  scripts/kconfig/Makefile:40: recipe for target 'silentoldconfig' failed
  Makefile:507: recipe for target 'silentoldconfig' failed

Redirection is useful, for example, for testing where we want to give
particular key inputs from a test file, then check the result.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Reviewed-by: Ulf Magnusson <ulfalizer@gmail.com>
---
 scripts/kconfig/conf.c | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/scripts/kconfig/conf.c b/scripts/kconfig/conf.c
index fc446acbf6d6f..92111a0776805 100644
--- a/scripts/kconfig/conf.c
+++ b/scripts/kconfig/conf.c
@@ -39,7 +39,6 @@ static enum input_mode input_mode = oldaskconfig;
 
 static int indent = 1;
 static int tty_stdio;
-static int valid_stdin = 1;
 static int sync_kconfig;
 static int conf_cnt;
 static char line[PATH_MAX];
@@ -72,16 +71,6 @@ static void strip(char *str)
 		*p-- = 0;
 }
 
-static void check_stdin(void)
-{
-	if (!valid_stdin) {
-		printf(_("aborted!\n\n"));
-		printf(_("Console input/output is redirected. "));
-		printf(_("Run 'make oldconfig' to update configuration.\n\n"));
-		exit(1);
-	}
-}
-
 /* Helper function to facilitate fgets() by Jean Sacren. */
 static void xfgets(char *str, int size, FILE *in)
 {
@@ -113,7 +102,6 @@ static int conf_askvalue(struct symbol *sym, const char *def)
 			printf("%s\n", def);
 			return 0;
 		}
-		check_stdin();
 		/* fall through */
 	case oldaskconfig:
 		fflush(stdout);
@@ -310,7 +298,6 @@ static int conf_choice(struct menu *menu)
 				printf("%d\n", cnt);
 				break;
 			}
-			check_stdin();
 			/* fall through */
 		case oldaskconfig:
 			fflush(stdout);
@@ -645,7 +632,6 @@ int main(int ac, char **av)
 				return 1;
 			}
 		}
-		valid_stdin = tty_stdio;
 	}
 
 	switch (input_mode) {

From f3ff6fb5db68bcd460e9880d5fb4902520dd645b Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 8 Feb 2018 14:56:40 +0900
Subject: [PATCH 433/676] kconfig: echo stdin to stdout if either is redirected

If stdio is not tty, conf_askvalue() puts additional new line to
prevent prompts from being concatenated into a single line.  This
care is missing in conf_choice(), so a 'choice' prompt and the next
prompt are shown in the same line.

Move the code into xfgets() to cater to all cases.  To improve this
more, let's echo stdin to stdout.  This clarifies what keys were
input from stdio and the stdout looks like as if it were from tty.

I removed the isatty(2) check since stderr is unrelated here.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Reviewed-by: Ulf Magnusson <ulfalizer@gmail.com>
---
 scripts/kconfig/conf.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/scripts/kconfig/conf.c b/scripts/kconfig/conf.c
index 92111a0776805..fe59f6df4b45b 100644
--- a/scripts/kconfig/conf.c
+++ b/scripts/kconfig/conf.c
@@ -76,6 +76,9 @@ static void xfgets(char *str, int size, FILE *in)
 {
 	if (!fgets(str, size, in))
 		fprintf(stderr, "\nError in reading or end of file.\n");
+
+	if (!tty_stdio)
+		printf("%s", str);
 }
 
 static int conf_askvalue(struct symbol *sym, const char *def)
@@ -106,8 +109,6 @@ static int conf_askvalue(struct symbol *sym, const char *def)
 	case oldaskconfig:
 		fflush(stdout);
 		xfgets(line, sizeof(line), stdin);
-		if (!tty_stdio)
-			printf("\n");
 		return 1;
 	default:
 		break;
@@ -490,7 +491,7 @@ int main(int ac, char **av)
 	bindtextdomain(PACKAGE, LOCALEDIR);
 	textdomain(PACKAGE);
 
-	tty_stdio = isatty(0) && isatty(1) && isatty(2);
+	tty_stdio = isatty(0) && isatty(1);
 
 	while ((opt = getopt_long(ac, av, "s", long_opts, NULL)) != -1) {
 		if (opt == 's') {

From 9e3e10c725360b9d07018cfcd5b7b6b7d325fae5 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 6 Feb 2018 09:34:41 +0900
Subject: [PATCH 434/676] kconfig: send error messages to stderr

These messages should be directed to stderr.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Reviewed-by: Ulf Magnusson <ulfalizer@gmail.com>
---
 scripts/kconfig/conf.c   | 10 ++++++----
 scripts/kconfig/expr.c   |  4 ++--
 scripts/kconfig/symbol.c |  2 +-
 scripts/kconfig/zconf.l  | 27 +++++++++++++++------------
 4 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/scripts/kconfig/conf.c b/scripts/kconfig/conf.c
index fe59f6df4b45b..822dc51923d66 100644
--- a/scripts/kconfig/conf.c
+++ b/scripts/kconfig/conf.c
@@ -548,7 +548,7 @@ int main(int ac, char **av)
 		}
 	}
 	if (ac == optind) {
-		printf(_("%s: Kconfig file missing\n"), av[0]);
+		fprintf(stderr, _("%s: Kconfig file missing\n"), av[0]);
 		conf_usage(progname);
 		exit(1);
 	}
@@ -573,9 +573,11 @@ int main(int ac, char **av)
 		if (!defconfig_file)
 			defconfig_file = conf_get_default_confname();
 		if (conf_read(defconfig_file)) {
-			printf(_("***\n"
-				"*** Can't find default configuration \"%s\"!\n"
-				"***\n"), defconfig_file);
+			fprintf(stderr,
+				_("***\n"
+				  "*** Can't find default configuration \"%s\"!\n"
+				  "***\n"),
+				defconfig_file);
 			exit(1);
 		}
 		break;
diff --git a/scripts/kconfig/expr.c b/scripts/kconfig/expr.c
index 2ba332b3fed78..d45381986ac76 100644
--- a/scripts/kconfig/expr.c
+++ b/scripts/kconfig/expr.c
@@ -94,7 +94,7 @@ struct expr *expr_copy(const struct expr *org)
 		e->right.expr = expr_copy(org->right.expr);
 		break;
 	default:
-		printf("can't copy type %d\n", e->type);
+		fprintf(stderr, "can't copy type %d\n", e->type);
 		free(e);
 		e = NULL;
 		break;
@@ -127,7 +127,7 @@ void expr_free(struct expr *e)
 		expr_free(e->right.expr);
 		break;
 	default:
-		printf("how to free type %d?\n", e->type);
+		fprintf(stderr, "how to free type %d?\n", e->type);
 		break;
 	}
 	free(e);
diff --git a/scripts/kconfig/symbol.c b/scripts/kconfig/symbol.c
index 13f7fdfe328dc..c4409ee7fee20 100644
--- a/scripts/kconfig/symbol.c
+++ b/scripts/kconfig/symbol.c
@@ -1221,7 +1221,7 @@ static struct symbol *sym_check_expr_deps(struct expr *e)
 	default:
 		break;
 	}
-	printf("Oops! How to check %d?\n", e->type);
+	fprintf(stderr, "Oops! How to check %d?\n", e->type);
 	return NULL;
 }
 
diff --git a/scripts/kconfig/zconf.l b/scripts/kconfig/zconf.l
index 07e074dc68a17..0ba4900050c13 100644
--- a/scripts/kconfig/zconf.l
+++ b/scripts/kconfig/zconf.l
@@ -184,7 +184,9 @@ n	[A-Za-z0-9_-]
 			append_string(yytext, 1);
 	}
 	\n	{
-		printf("%s:%d:warning: multi-line strings not supported\n", zconf_curname(), zconf_lineno());
+		fprintf(stderr,
+			"%s:%d:warning: multi-line strings not supported\n",
+			zconf_curname(), zconf_lineno());
 		current_file->lineno++;
 		BEGIN(INITIAL);
 		return T_EOL;
@@ -294,7 +296,7 @@ void zconf_initscan(const char *name)
 {
 	yyin = zconf_fopen(name);
 	if (!yyin) {
-		printf("can't find file %s\n", name);
+		fprintf(stderr, "can't find file %s\n", name);
 		exit(1);
 	}
 
@@ -315,8 +317,8 @@ void zconf_nextfile(const char *name)
 	current_buf->state = YY_CURRENT_BUFFER;
 	yyin = zconf_fopen(file->name);
 	if (!yyin) {
-		printf("%s:%d: can't open file \"%s\"\n",
-		    zconf_curname(), zconf_lineno(), file->name);
+		fprintf(stderr, "%s:%d: can't open file \"%s\"\n",
+			zconf_curname(), zconf_lineno(), file->name);
 		exit(1);
 	}
 	yy_switch_to_buffer(yy_create_buffer(yyin, YY_BUF_SIZE));
@@ -325,20 +327,21 @@ void zconf_nextfile(const char *name)
 
 	for (iter = current_file->parent; iter; iter = iter->parent ) {
 		if (!strcmp(current_file->name,iter->name) ) {
-			printf("%s:%d: recursive inclusion detected. "
-			       "Inclusion path:\n  current file : '%s'\n",
-			       zconf_curname(), zconf_lineno(),
-			       zconf_curname());
+			fprintf(stderr,
+				"%s:%d: recursive inclusion detected. "
+				"Inclusion path:\n  current file : '%s'\n",
+				zconf_curname(), zconf_lineno(),
+				zconf_curname());
 			iter = current_file->parent;
 			while (iter && \
 			       strcmp(iter->name,current_file->name)) {
-				printf("  included from: '%s:%d'\n",
-				       iter->name, iter->lineno-1);
+				fprintf(stderr, "  included from: '%s:%d'\n",
+					iter->name, iter->lineno-1);
 				iter = iter->parent;
 			}
 			if (iter)
-				printf("  included from: '%s:%d'\n",
-				       iter->name, iter->lineno+1);
+				fprintf(stderr, "  included from: '%s:%d'\n",
+					iter->name, iter->lineno+1);
 			exit(1);
 		}
 	}

From 762c330d670e3d4b795cf7a8d761866fdd1eef49 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 7 Feb 2018 17:14:46 +0800
Subject: [PATCH 435/676] tuntap: add missing xdp flush

When using devmap to redirect packets between interfaces,
xdp_do_flush() is usually a must to flush any batched
packets. Unfortunately this is missed in current tuntap
implementation.

Unlike most hardware driver which did XDP inside NAPI loop and call
xdp_do_flush() at then end of each round of poll. TAP did it in the
context of process e.g tun_get_user(). So fix this by count the
pending redirected packets and flush when it exceeds NAPI_POLL_WEIGHT
or MSG_MORE was cleared by sendmsg() caller.

With this fix, xdp_redirect_map works again between two TAPs.

Fixes: 761876c857cb ("tap: XDP support")
Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tun.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 0dc66e4fbb2c6..17e496b88f812 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -181,6 +181,7 @@ struct tun_file {
 	struct tun_struct *detached;
 	struct ptr_ring tx_ring;
 	struct xdp_rxq_info xdp_rxq;
+	int xdp_pending_pkts;
 };
 
 struct tun_flow_entry {
@@ -1665,6 +1666,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 		case XDP_REDIRECT:
 			get_page(alloc_frag->page);
 			alloc_frag->offset += buflen;
+			++tfile->xdp_pending_pkts;
 			err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
 			if (err)
 				goto err_redirect;
@@ -1986,6 +1988,11 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	result = tun_get_user(tun, tfile, NULL, from,
 			      file->f_flags & O_NONBLOCK, false);
 
+	if (tfile->xdp_pending_pkts) {
+		tfile->xdp_pending_pkts = 0;
+		xdp_do_flush_map();
+	}
+
 	tun_put(tun);
 	return result;
 }
@@ -2322,6 +2329,13 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 	ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
 			   m->msg_flags & MSG_DONTWAIT,
 			   m->msg_flags & MSG_MORE);
+
+	if (tfile->xdp_pending_pkts >= NAPI_POLL_WEIGHT ||
+	    !(m->msg_flags & MSG_MORE)) {
+		tfile->xdp_pending_pkts = 0;
+		xdp_do_flush_map();
+	}
+
 	tun_put(tun);
 	return ret;
 }
@@ -3153,6 +3167,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 	sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);
 
 	memset(&tfile->tx_ring, 0, sizeof(tfile->tx_ring));
+	tfile->xdp_pending_pkts = 0;
 
 	return 0;
 }

From 4ff66cae7f10b65b028dc3bdaaad9cc2989ef6ae Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Wed, 7 Feb 2018 13:53:20 +0100
Subject: [PATCH 436/676] rtnetlink: require unique netns identifier

Since we've added support for IFLA_IF_NETNSID for RTM_{DEL,GET,SET,NEW}LINK
it is possible for userspace to send us requests with three different
properties to identify a target network namespace. This affects at least
RTM_{NEW,SET}LINK. Each of them could potentially refer to a different
network namespace which is confusing. For legacy reasons the kernel will
pick the IFLA_NET_NS_PID property first and then look for the
IFLA_NET_NS_FD property but there is no reason to extend this type of
behavior to network namespace ids. The regression potential is quite
minimal since the rtnetlink requests in question either won't allow
IFLA_IF_NETNSID requests before 4.16 is out (RTM_{NEW,SET}LINK) or don't
support IFLA_NET_NS_{PID,FD} (RTM_{DEL,GET}LINK) in the first place.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Acked-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 56af8e41abfcb..bc290413a49d5 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1951,6 +1951,38 @@ static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb,
 	return net;
 }
 
+/* Verify that rtnetlink requests do not pass additional properties
+ * potentially referring to different network namespaces.
+ */
+static int rtnl_ensure_unique_netns(struct nlattr *tb[],
+				    struct netlink_ext_ack *extack,
+				    bool netns_id_only)
+{
+
+	if (netns_id_only) {
+		if (!tb[IFLA_NET_NS_PID] && !tb[IFLA_NET_NS_FD])
+			return 0;
+
+		NL_SET_ERR_MSG(extack, "specified netns attribute not supported");
+		return -EOPNOTSUPP;
+	}
+
+	if (tb[IFLA_IF_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]))
+		goto invalid_attr;
+
+	if (tb[IFLA_NET_NS_PID] && (tb[IFLA_IF_NETNSID] || tb[IFLA_NET_NS_FD]))
+		goto invalid_attr;
+
+	if (tb[IFLA_NET_NS_FD] && (tb[IFLA_IF_NETNSID] || tb[IFLA_NET_NS_PID]))
+		goto invalid_attr;
+
+	return 0;
+
+invalid_attr:
+	NL_SET_ERR_MSG(extack, "multiple netns identifying attributes specified");
+	return -EINVAL;
+}
+
 static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
 {
 	if (dev) {
@@ -2553,6 +2585,10 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		goto errout;
 
+	err = rtnl_ensure_unique_netns(tb, extack, false);
+	if (err < 0)
+		goto errout;
+
 	if (tb[IFLA_IFNAME])
 		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
 	else
@@ -2649,6 +2685,10 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		return err;
 
+	err = rtnl_ensure_unique_netns(tb, extack, true);
+	if (err < 0)
+		return err;
+
 	if (tb[IFLA_IFNAME])
 		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
 
@@ -2802,6 +2842,10 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		return err;
 
+	err = rtnl_ensure_unique_netns(tb, extack, false);
+	if (err < 0)
+		return err;
+
 	if (tb[IFLA_IFNAME])
 		nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
 	else
@@ -3045,6 +3089,10 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		return err;
 
+	err = rtnl_ensure_unique_netns(tb, extack, true);
+	if (err < 0)
+		return err;
+
 	if (tb[IFLA_IF_NETNSID]) {
 		netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
 		tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid);

From 583133b35ecaea84590e92e2732d1a472747ca7d Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Wed, 7 Feb 2018 10:17:29 -0600
Subject: [PATCH 437/676] atm: he: use 64-bit arithmetic instead of 32-bit

Add suffix ULL to constants 272, 204, 136 and 68 in order to give the
compiler complete information about the proper arithmetic to use.
Notice that these constants are used in contexts that expect
expressions of type unsigned long long (64 bits, unsigned).

The following expressions are currently being evaluated using 32-bit
arithmetic:

272 * mult
204 * mult
136 * mult
68 * mult

Addresses-Coverity-ID: 201058
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/atm/he.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/atm/he.c b/drivers/atm/he.c
index e58538c293777..29f102dcfec49 100644
--- a/drivers/atm/he.c
+++ b/drivers/atm/he.c
@@ -738,13 +738,13 @@ static int he_init_cs_block_rcm(struct he_dev *he_dev)
 #else
 		/* this is pretty, but avoids _divdu3 and is mostly correct */
 		mult = he_dev->atm_dev->link_rate / ATM_OC3_PCR;
-		if (rate_cps > (272 * mult))
+		if (rate_cps > (272ULL * mult))
 			buf = 4;
-		else if (rate_cps > (204 * mult))
+		else if (rate_cps > (204ULL * mult))
 			buf = 3;
-		else if (rate_cps > (136 * mult))
+		else if (rate_cps > (136ULL * mult))
 			buf = 2;
-		else if (rate_cps > (68 * mult))
+		else if (rate_cps > (68ULL * mult))
 			buf = 1;
 		else
 			buf = 0;

From ec95dffa408f0c24c0b358f3723c6ba262190965 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 7 Feb 2018 13:00:24 -0600
Subject: [PATCH 438/676] ibmvnic: queue reset when CRQ gets closed during
 reset

While handling a driver reset we get a H_CLOSED return trying
to send a CRQ event. When this occurs we need to queue up another
reset attempt. Without doing this we see instances where the driver
is left in a closed state because the reset failed and there is no
further attempts to reset the driver.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 8dabc9d9dfa60..dd4a2946e1da1 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -2914,8 +2914,12 @@ static int ibmvnic_send_crq(struct ibmvnic_adapter *adapter,
 				cpu_to_be64(u64_crq[1]));
 
 	if (rc) {
-		if (rc == H_CLOSED)
+		if (rc == H_CLOSED) {
 			dev_warn(dev, "CRQ Queue closed\n");
+			if (adapter->resetting)
+				ibmvnic_reset(adapter, VNIC_RESET_FATAL);
+		}
+
 		dev_warn(dev, "Send error (rc=%d)\n", rc);
 	}
 

From e728789c52afccc1275cba1dd812f03abe16ea3c Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre <malat@debian.org>
Date: Wed, 7 Feb 2018 20:35:00 +0100
Subject: [PATCH 439/676] net: Extra '_get' in declaration of
 arch_get_platform_mac_address
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In commit c7f5d105495a ("net: Add eth_platform_get_mac_address() helper."),
two declarations were added:

  int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
  unsigned char *arch_get_platform_get_mac_address(void);

An extra '_get' was introduced in arch_get_platform_get_mac_address, remove
it. Fix compile warning using W=1:

  CC      net/ethernet/eth.o
net/ethernet/eth.c:523:24: warning: no previous prototype for ‘arch_get_platform_mac_address’ [-Wmissing-prototypes]
 unsigned char * __weak arch_get_platform_mac_address(void)
                        ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  AR      net/ethernet/built-in.o

Signed-off-by: Mathieu Malaterre <malat@debian.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 263dbcad22fca..79563840c295c 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -31,7 +31,7 @@
 #ifdef __KERNEL__
 struct device;
 int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
-unsigned char *arch_get_platform_get_mac_address(void);
+unsigned char *arch_get_platform_mac_address(void);
 u32 eth_get_headlen(void *data, unsigned int max_len);
 __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
 extern const struct header_ops eth_header_ops;

From 79a8a642bf05cd0dced20621f6fef9d884124abd Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 7 Feb 2018 17:44:38 -0800
Subject: [PATCH 440/676] net: Whitelist the skbuff_head_cache "cb" field

Most callers of put_cmsg() use a "sizeof(foo)" for the length argument.
Within put_cmsg(), a copy_to_user() call is made with a dynamic size, as a
result of the cmsg header calculations. This means that hardened usercopy
will examine the copy, even though it was technically a fixed size and
should be implicitly whitelisted. All the put_cmsg() calls being built
from values in skbuff_head_cache are coming out of the protocol-defined
"cb" field, so whitelist this field entirely instead of creating per-use
bounce buffers, for which there are concerns about performance.

Original report was:

Bad or missing usercopy whitelist? Kernel memory exposure attempt detected from SLAB object 'skbuff_head_cache' (offset 64, size 16)!
WARNING: CPU: 0 PID: 3663 at mm/usercopy.c:81 usercopy_warn+0xdb/0x100 mm/usercopy.c:76
...
 __check_heap_object+0x89/0xc0 mm/slab.c:4426
 check_heap_object mm/usercopy.c:236 [inline]
 __check_object_size+0x272/0x530 mm/usercopy.c:259
 check_object_size include/linux/thread_info.h:112 [inline]
 check_copy_size include/linux/thread_info.h:143 [inline]
 copy_to_user include/linux/uaccess.h:154 [inline]
 put_cmsg+0x233/0x3f0 net/core/scm.c:242
 sock_recv_errqueue+0x200/0x3e0 net/core/sock.c:2913
 packet_recvmsg+0xb2e/0x17a0 net/packet/af_packet.c:3296
 sock_recvmsg_nosec net/socket.c:803 [inline]
 sock_recvmsg+0xc9/0x110 net/socket.c:810
 ___sys_recvmsg+0x2a4/0x640 net/socket.c:2179
 __sys_recvmmsg+0x2a9/0xaf0 net/socket.c:2287
 SYSC_recvmmsg net/socket.c:2368 [inline]
 SyS_recvmmsg+0xc4/0x160 net/socket.c:2352
 entry_SYSCALL_64_fastpath+0x29/0xa0

Reported-by: syzbot+e2d6cfb305e9f3911dea@syzkaller.appspotmail.com
Fixes: 6d07d1cd300f ("usercopy: Restrict non-usercopy caches to size 0")
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8c61c27c1b288..09bd89c90a71c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3894,10 +3894,12 @@ EXPORT_SYMBOL_GPL(skb_gro_receive);
 
 void __init skb_init(void)
 {
-	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
+	skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache",
 					      sizeof(struct sk_buff),
 					      0,
 					      SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+					      offsetof(struct sk_buff, cb),
+					      sizeof_field(struct sk_buff, cb),
 					      NULL);
 	skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
 						sizeof(struct sk_buff_fclones),

From ebeeb1ad9b8adcc37c2ec21a96f39e9d35199b46 Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Sat, 3 Feb 2018 04:26:51 -0800
Subject: [PATCH 441/676] rds: tcp: use rds_destroy_pending() to synchronize
 netns/module teardown and rds connection/workq management

An rds_connection can get added during netns deletion between lines 528
and 529 of

  506 static void rds_tcp_kill_sock(struct net *net)
  :
  /* code to pull out all the rds_connections that should be destroyed */
  :
  528         spin_unlock_irq(&rds_tcp_conn_lock);
  529         list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node)
  530                 rds_conn_destroy(tc->t_cpath->cp_conn);

Such an rds_connection would miss out the rds_conn_destroy()
loop (that cancels all pending work) and (if it was scheduled
after netns deletion) could trigger the use-after-free.

A similar race-window exists for the module unload path
in rds_tcp_exit -> rds_tcp_destroy_conns

Concurrency with netns deletion (rds_tcp_kill_sock()) must be handled
by checking check_net() before enqueuing new work or adding new
connections.

Concurrency with module-unload is handled by maintaining a module
specific flag that is set at the start of the module exit function,
and must be checked before enqueuing new work or adding new connections.

This commit refactors existing RDS_DESTROY_PENDING checks added by
commit 3db6e0d172c9 ("rds: use RCU to synchronize work-enqueue with
connection teardown") and consolidates all the concurrency checks
listed above into the function rds_destroy_pending().

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/cong.c        |  2 +-
 net/rds/connection.c  | 15 +++++++++------
 net/rds/ib.c          | 17 +++++++++++++++++
 net/rds/ib_cm.c       |  1 +
 net/rds/rds.h         |  7 +++++++
 net/rds/send.c        | 10 +++++-----
 net/rds/tcp.c         | 42 ++++++++++++++++++++++++++++++------------
 net/rds/tcp_connect.c |  2 +-
 net/rds/tcp_recv.c    |  2 +-
 net/rds/tcp_send.c    |  2 +-
 net/rds/threads.c     |  6 +++---
 11 files changed, 76 insertions(+), 30 deletions(-)

diff --git a/net/rds/cong.c b/net/rds/cong.c
index 8d19fd25dce36..63da9d2f142d1 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -223,7 +223,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
 
 		rcu_read_lock();
 		if (!test_and_set_bit(0, &conn->c_map_queued) &&
-		    !test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) {
+		    !rds_destroy_pending(cp->cp_conn)) {
 			rds_stats_inc(s_cong_update_queued);
 			/* We cannot inline the call to rds_send_xmit() here
 			 * for two reasons (both pertaining to a TCP transport):
diff --git a/net/rds/connection.c b/net/rds/connection.c
index b10c0ef36d8d4..94e190febfddd 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -220,8 +220,13 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 				     is_outgoing);
 		conn->c_path[i].cp_index = i;
 	}
-	ret = trans->conn_alloc(conn, gfp);
+	rcu_read_lock();
+	if (rds_destroy_pending(conn))
+		ret = -ENETDOWN;
+	else
+		ret = trans->conn_alloc(conn, gfp);
 	if (ret) {
+		rcu_read_unlock();
 		kfree(conn->c_path);
 		kmem_cache_free(rds_conn_slab, conn);
 		conn = ERR_PTR(ret);
@@ -283,6 +288,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 		}
 	}
 	spin_unlock_irqrestore(&rds_conn_lock, flags);
+	rcu_read_unlock();
 
 out:
 	return conn;
@@ -382,13 +388,10 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp)
 {
 	struct rds_message *rm, *rtmp;
 
-	set_bit(RDS_DESTROY_PENDING, &cp->cp_flags);
-
 	if (!cp->cp_transport_data)
 		return;
 
 	/* make sure lingering queued work won't try to ref the conn */
-	synchronize_rcu();
 	cancel_delayed_work_sync(&cp->cp_send_w);
 	cancel_delayed_work_sync(&cp->cp_recv_w);
 
@@ -691,7 +694,7 @@ void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy)
 	atomic_set(&cp->cp_state, RDS_CONN_ERROR);
 
 	rcu_read_lock();
-	if (!destroy && test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) {
+	if (!destroy && rds_destroy_pending(cp->cp_conn)) {
 		rcu_read_unlock();
 		return;
 	}
@@ -714,7 +717,7 @@ EXPORT_SYMBOL_GPL(rds_conn_drop);
 void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
 {
 	rcu_read_lock();
-	if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) {
+	if (rds_destroy_pending(cp->cp_conn)) {
 		rcu_read_unlock();
 		return;
 	}
diff --git a/net/rds/ib.c b/net/rds/ib.c
index ff0c98096af1c..50a88f3e7e393 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -48,6 +48,7 @@
 static unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE;
 static unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE;
 unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
+static atomic_t rds_ib_unloading;
 
 module_param(rds_ib_mr_1m_pool_size, int, 0444);
 MODULE_PARM_DESC(rds_ib_mr_1m_pool_size, " Max number of 1M mr per HCA");
@@ -378,8 +379,23 @@ static void rds_ib_unregister_client(void)
 	flush_workqueue(rds_wq);
 }
 
+static void rds_ib_set_unloading(void)
+{
+	atomic_set(&rds_ib_unloading, 1);
+}
+
+static bool rds_ib_is_unloading(struct rds_connection *conn)
+{
+	struct rds_conn_path *cp = &conn->c_path[0];
+
+	return (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags) ||
+		atomic_read(&rds_ib_unloading) != 0);
+}
+
 void rds_ib_exit(void)
 {
+	rds_ib_set_unloading();
+	synchronize_rcu();
 	rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
 	rds_ib_unregister_client();
 	rds_ib_destroy_nodev_conns();
@@ -413,6 +429,7 @@ struct rds_transport rds_ib_transport = {
 	.flush_mrs		= rds_ib_flush_mrs,
 	.t_owner		= THIS_MODULE,
 	.t_name			= "infiniband",
+	.t_unloading		= rds_ib_is_unloading,
 	.t_type			= RDS_TRANS_IB
 };
 
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 80fb6f63e768d..eea1d8611b205 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -117,6 +117,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 			  &conn->c_laddr, &conn->c_faddr,
 			  RDS_PROTOCOL_MAJOR(conn->c_version),
 			  RDS_PROTOCOL_MINOR(conn->c_version));
+		set_bit(RDS_DESTROY_PENDING, &conn->c_path[0].cp_flags);
 		rds_conn_destroy(conn);
 		return;
 	} else {
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 374ae83b60d48..7301b9b01890e 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -518,6 +518,7 @@ struct rds_transport {
 	void (*sync_mr)(void *trans_private, int direction);
 	void (*free_mr)(void *trans_private, int invalidate);
 	void (*flush_mrs)(void);
+	bool (*t_unloading)(struct rds_connection *conn);
 };
 
 struct rds_sock {
@@ -862,6 +863,12 @@ static inline void rds_mr_put(struct rds_mr *mr)
 		__rds_put_mr_final(mr);
 }
 
+static inline bool rds_destroy_pending(struct rds_connection *conn)
+{
+	return !check_net(rds_conn_net(conn)) ||
+	       (conn->c_trans->t_unloading && conn->c_trans->t_unloading(conn));
+}
+
 /* stats.c */
 DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
 #define rds_stats_inc_which(which, member) do {		\
diff --git a/net/rds/send.c b/net/rds/send.c
index d3e32d1f3c7d6..b1b0022b8370b 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -162,7 +162,7 @@ int rds_send_xmit(struct rds_conn_path *cp)
 		goto out;
 	}
 
-	if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) {
+	if (rds_destroy_pending(cp->cp_conn)) {
 		release_in_xmit(cp);
 		ret = -ENETUNREACH; /* dont requeue send work */
 		goto out;
@@ -444,7 +444,7 @@ int rds_send_xmit(struct rds_conn_path *cp)
 			if (batch_count < send_batch_count)
 				goto restart;
 			rcu_read_lock();
-			if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+			if (rds_destroy_pending(cp->cp_conn))
 				ret = -ENETUNREACH;
 			else
 				queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
@@ -1162,7 +1162,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 	else
 		cpath = &conn->c_path[0];
 
-	if (test_bit(RDS_DESTROY_PENDING, &cpath->cp_flags)) {
+	if (rds_destroy_pending(conn)) {
 		ret = -EAGAIN;
 		goto out;
 	}
@@ -1209,7 +1209,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 	if (ret == -ENOMEM || ret == -EAGAIN) {
 		ret = 0;
 		rcu_read_lock();
-		if (test_bit(RDS_DESTROY_PENDING, &cpath->cp_flags))
+		if (rds_destroy_pending(cpath->cp_conn))
 			ret = -ENETUNREACH;
 		else
 			queue_delayed_work(rds_wq, &cpath->cp_send_w, 1);
@@ -1295,7 +1295,7 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
 
 	/* schedule the send work on rds_wq */
 	rcu_read_lock();
-	if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+	if (!rds_destroy_pending(cp->cp_conn))
 		queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
 	rcu_read_unlock();
 
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 9920d2f84eff8..44c4652721af2 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -49,6 +49,7 @@ static unsigned int rds_tcp_tc_count;
 /* Track rds_tcp_connection structs so they can be cleaned up */
 static DEFINE_SPINLOCK(rds_tcp_conn_lock);
 static LIST_HEAD(rds_tcp_conn_list);
+static atomic_t rds_tcp_unloading = ATOMIC_INIT(0);
 
 static struct kmem_cache *rds_tcp_conn_slab;
 
@@ -274,14 +275,13 @@ static int rds_tcp_laddr_check(struct net *net, __be32 addr)
 static void rds_tcp_conn_free(void *arg)
 {
 	struct rds_tcp_connection *tc = arg;
-	unsigned long flags;
 
 	rdsdebug("freeing tc %p\n", tc);
 
-	spin_lock_irqsave(&rds_tcp_conn_lock, flags);
+	spin_lock_bh(&rds_tcp_conn_lock);
 	if (!tc->t_tcp_node_detached)
 		list_del(&tc->t_tcp_node);
-	spin_unlock_irqrestore(&rds_tcp_conn_lock, flags);
+	spin_unlock_bh(&rds_tcp_conn_lock);
 
 	kmem_cache_free(rds_tcp_conn_slab, tc);
 }
@@ -296,7 +296,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 		tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
 		if (!tc) {
 			ret = -ENOMEM;
-			break;
+			goto fail;
 		}
 		mutex_init(&tc->t_conn_path_lock);
 		tc->t_sock = NULL;
@@ -306,14 +306,19 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 
 		conn->c_path[i].cp_transport_data = tc;
 		tc->t_cpath = &conn->c_path[i];
+		tc->t_tcp_node_detached = true;
 
-		spin_lock_irq(&rds_tcp_conn_lock);
-		tc->t_tcp_node_detached = false;
-		list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
-		spin_unlock_irq(&rds_tcp_conn_lock);
 		rdsdebug("rds_conn_path [%d] tc %p\n", i,
 			 conn->c_path[i].cp_transport_data);
 	}
+	spin_lock_bh(&rds_tcp_conn_lock);
+	for (i = 0; i < RDS_MPATH_WORKERS; i++) {
+		tc = conn->c_path[i].cp_transport_data;
+		tc->t_tcp_node_detached = false;
+		list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
+	}
+	spin_unlock_bh(&rds_tcp_conn_lock);
+fail:
 	if (ret) {
 		for (j = 0; j < i; j++)
 			rds_tcp_conn_free(conn->c_path[j].cp_transport_data);
@@ -332,6 +337,16 @@ static bool list_has_conn(struct list_head *list, struct rds_connection *conn)
 	return false;
 }
 
+static void rds_tcp_set_unloading(void)
+{
+	atomic_set(&rds_tcp_unloading, 1);
+}
+
+static bool rds_tcp_is_unloading(struct rds_connection *conn)
+{
+	return atomic_read(&rds_tcp_unloading) != 0;
+}
+
 static void rds_tcp_destroy_conns(void)
 {
 	struct rds_tcp_connection *tc, *_tc;
@@ -370,6 +385,7 @@ struct rds_transport rds_tcp_transport = {
 	.t_type			= RDS_TRANS_TCP,
 	.t_prefer_loopback	= 1,
 	.t_mp_capable		= 1,
+	.t_unloading		= rds_tcp_is_unloading,
 };
 
 static unsigned int rds_tcp_netid;
@@ -513,7 +529,7 @@ static void rds_tcp_kill_sock(struct net *net)
 
 	rtn->rds_tcp_listen_sock = NULL;
 	rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w);
-	spin_lock_irq(&rds_tcp_conn_lock);
+	spin_lock_bh(&rds_tcp_conn_lock);
 	list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
 		struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
 
@@ -526,7 +542,7 @@ static void rds_tcp_kill_sock(struct net *net)
 			tc->t_tcp_node_detached = true;
 		}
 	}
-	spin_unlock_irq(&rds_tcp_conn_lock);
+	spin_unlock_bh(&rds_tcp_conn_lock);
 	list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node)
 		rds_conn_destroy(tc->t_cpath->cp_conn);
 }
@@ -574,7 +590,7 @@ static void rds_tcp_sysctl_reset(struct net *net)
 {
 	struct rds_tcp_connection *tc, *_tc;
 
-	spin_lock_irq(&rds_tcp_conn_lock);
+	spin_lock_bh(&rds_tcp_conn_lock);
 	list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
 		struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
 
@@ -584,7 +600,7 @@ static void rds_tcp_sysctl_reset(struct net *net)
 		/* reconnect with new parameters */
 		rds_conn_path_drop(tc->t_cpath, false);
 	}
-	spin_unlock_irq(&rds_tcp_conn_lock);
+	spin_unlock_bh(&rds_tcp_conn_lock);
 }
 
 static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
@@ -607,6 +623,8 @@ static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
 
 static void rds_tcp_exit(void)
 {
+	rds_tcp_set_unloading();
+	synchronize_rcu();
 	rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
 	unregister_pernet_subsys(&rds_tcp_net_ops);
 	if (unregister_netdevice_notifier(&rds_tcp_dev_notifier))
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 534c67aeb20f8..d999e70756457 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -170,7 +170,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp)
 		 cp->cp_conn, tc, sock);
 
 	if (sock) {
-		if (test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+		if (rds_destroy_pending(cp->cp_conn))
 			rds_tcp_set_linger(sock);
 		sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
 		lock_sock(sock->sk);
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index dd707b9e73e57..b9fbd2ee74efe 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -323,7 +323,7 @@ void rds_tcp_data_ready(struct sock *sk)
 
 	if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) {
 		rcu_read_lock();
-		if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+		if (!rds_destroy_pending(cp->cp_conn))
 			queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
 		rcu_read_unlock();
 	}
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 16f65744d9844..7df869d37afd4 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -204,7 +204,7 @@ void rds_tcp_write_space(struct sock *sk)
 
 	rcu_read_lock();
 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf &&
-	    !test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+	    !rds_destroy_pending(cp->cp_conn))
 		queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
 	rcu_read_unlock();
 
diff --git a/net/rds/threads.c b/net/rds/threads.c
index eb76db1360b00..c52861d77a596 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -88,7 +88,7 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
 	cp->cp_reconnect_jiffies = 0;
 	set_bit(0, &cp->cp_conn->c_map_queued);
 	rcu_read_lock();
-	if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags)) {
+	if (!rds_destroy_pending(cp->cp_conn)) {
 		queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
 		queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
 	}
@@ -138,7 +138,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
 	if (cp->cp_reconnect_jiffies == 0) {
 		cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
 		rcu_read_lock();
-		if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+		if (!rds_destroy_pending(cp->cp_conn))
 			queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
 		rcu_read_unlock();
 		return;
@@ -149,7 +149,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
 		 rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies,
 		 conn, &conn->c_laddr, &conn->c_faddr);
 	rcu_read_lock();
-	if (!test_bit(RDS_DESTROY_PENDING, &cp->cp_flags))
+	if (!rds_destroy_pending(cp->cp_conn))
 		queue_delayed_work(rds_wq, &cp->cp_conn_w,
 				   rand % cp->cp_reconnect_jiffies);
 	rcu_read_unlock();

From 3968523f855050b8195134da951b87c20bd66130 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 7 Feb 2018 22:34:24 -0800
Subject: [PATCH 442/676] mpls, nospec: Sanitize array index in mpls_label_ok()

mpls_label_ok() validates that the 'platform_label' array index from a
userspace netlink message payload is valid. Under speculation the
mpls_label_ok() result may not resolve in the CPU pipeline until after
the index is used to access an array element. Sanitize the index to zero
to prevent userspace-controlled arbitrary out-of-bounds speculation, a
precursor for a speculative execution side channel vulnerability.

Cc: <stable@vger.kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mpls/af_mpls.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 5dce8336d33f7..e545a3c9365f8 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -8,6 +8,7 @@
 #include <linux/ipv6.h>
 #include <linux/mpls.h>
 #include <linux/netconf.h>
+#include <linux/nospec.h>
 #include <linux/vmalloc.h>
 #include <linux/percpu.h>
 #include <net/ip.h>
@@ -935,24 +936,27 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg,
 	return err;
 }
 
-static bool mpls_label_ok(struct net *net, unsigned int index,
+static bool mpls_label_ok(struct net *net, unsigned int *index,
 			  struct netlink_ext_ack *extack)
 {
+	bool is_ok = true;
+
 	/* Reserved labels may not be set */
-	if (index < MPLS_LABEL_FIRST_UNRESERVED) {
+	if (*index < MPLS_LABEL_FIRST_UNRESERVED) {
 		NL_SET_ERR_MSG(extack,
 			       "Invalid label - must be MPLS_LABEL_FIRST_UNRESERVED or higher");
-		return false;
+		is_ok = false;
 	}
 
 	/* The full 20 bit range may not be supported. */
-	if (index >= net->mpls.platform_labels) {
+	if (is_ok && *index >= net->mpls.platform_labels) {
 		NL_SET_ERR_MSG(extack,
 			       "Label >= configured maximum in platform_labels");
-		return false;
+		is_ok = false;
 	}
 
-	return true;
+	*index = array_index_nospec(*index, net->mpls.platform_labels);
+	return is_ok;
 }
 
 static int mpls_route_add(struct mpls_route_config *cfg,
@@ -975,7 +979,7 @@ static int mpls_route_add(struct mpls_route_config *cfg,
 		index = find_free_label(net);
 	}
 
-	if (!mpls_label_ok(net, index, extack))
+	if (!mpls_label_ok(net, &index, extack))
 		goto errout;
 
 	/* Append makes no sense with mpls */
@@ -1052,7 +1056,7 @@ static int mpls_route_del(struct mpls_route_config *cfg,
 
 	index = cfg->rc_label;
 
-	if (!mpls_label_ok(net, index, extack))
+	if (!mpls_label_ok(net, &index, extack))
 		goto errout;
 
 	mpls_route_update(net, index, NULL, &cfg->rc_nlinfo);
@@ -1810,7 +1814,7 @@ static int rtm_to_route_config(struct sk_buff *skb,
 				goto errout;
 
 			if (!mpls_label_ok(cfg->rc_nlinfo.nl_net,
-					   cfg->rc_label, extack))
+					   &cfg->rc_label, extack))
 				goto errout;
 			break;
 		}
@@ -2137,7 +2141,7 @@ static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh,
 			goto errout;
 		}
 
-		if (!mpls_label_ok(net, in_label, extack)) {
+		if (!mpls_label_ok(net, &in_label, extack)) {
 			err = -EINVAL;
 			goto errout;
 		}

From eb53f7af6f15285e2f6ada97285395343ce9f433 Mon Sep 17 00:00:00 2001
From: Ivan Vecera <ivecera@redhat.com>
Date: Thu, 8 Feb 2018 16:10:39 +0100
Subject: [PATCH 443/676] net/sched: cls_u32: fix cls_u32 on filter replace

The following sequence is currently broken:

 # tc qdisc add dev foo ingress
 # tc filter replace dev foo protocol all ingress \
   u32 match u8 0 0 action mirred egress mirror dev bar1
 # tc filter replace dev foo protocol all ingress \
   handle 800::800 pref 49152 \
   u32 match u8 0 0 action mirred egress mirror dev bar2
 Error: cls_u32: Key node flags do not match passed flags.
 We have an error talking to the kernel, -1

The error comes from u32_change() when comparing new and
existing flags. The existing ones always contains one of
TCA_CLS_FLAGS_{,NOT}_IN_HW flag depending on offloading state.
These flags cannot be passed from userspace so the condition
(n->flags != flags) in u32_change() always fails.

Fix the condition so the flags TCA_CLS_FLAGS_NOT_IN_HW and
TCA_CLS_FLAGS_IN_HW are not taken into account.

Fixes: 24d3dc6d27ea ("net/sched: cls_u32: Reflect HW offload status")
Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_u32.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 6311a548046b5..c75e68e839c7e 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -955,7 +955,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 			return -EINVAL;
 		}
 
-		if (n->flags != flags) {
+		if ((n->flags ^ flags) &
+		    ~(TCA_CLS_FLAGS_IN_HW | TCA_CLS_FLAGS_NOT_IN_HW)) {
 			NL_SET_ERR_MSG_MOD(extack, "Key node flags do not match passed flags");
 			return -EINVAL;
 		}

From 55b3280d1e471795c08dbbe17325720a843e104c Mon Sep 17 00:00:00 2001
From: Hoang Le <hoang.h.le@dektek.com.au>
Date: Thu, 8 Feb 2018 17:16:25 +0100
Subject: [PATCH 444/676] tipc: fix skb truesize/datasize ratio control

In commit d618d09a68e4 ("tipc: enforce valid ratio between skb truesize
and contents") we introduced a test for ensuring that the condition
truesize/datasize <= 4 is true for a received buffer. Unfortunately this
test has two problems.

- Because of the integer arithmetics the test
  if (skb->truesize / buf_roundup_len(skb) > 4) will miss all
  ratios [4 < ratio < 5], which was not the intention.
- The buffer returned by skb_copy() inherits skb->truesize of the
  original buffer, which doesn't help the situation at all.

In this commit, we change the ratio condition and replace skb_copy()
with a call to skb_copy_expand() to finally get this right.

Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/msg.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 55d8ba92291d2..4e1c6f6450bb9 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -208,8 +208,8 @@ bool tipc_msg_validate(struct sk_buff **_skb)
 	int msz, hsz;
 
 	/* Ensure that flow control ratio condition is satisfied */
-	if (unlikely(skb->truesize / buf_roundup_len(skb) > 4)) {
-		skb = skb_copy(skb, GFP_ATOMIC);
+	if (unlikely(skb->truesize / buf_roundup_len(skb) >= 4)) {
+		skb = skb_copy_expand(skb, BUF_HEADROOM, 0, GFP_ATOMIC);
 		if (!skb)
 			return false;
 		kfree_skb(*_skb);

From 88c991a91729b402bbfbf247fdba16ac21c369ab Mon Sep 17 00:00:00 2001
From: Dean Nelson <dnelson@redhat.com>
Date: Thu, 8 Feb 2018 14:21:05 -0500
Subject: [PATCH 445/676] net: thunder: change q_len's type to handle max ring
 size

The Cavium thunder nicvf driver supports rx/tx rings of up to 65536 entries per.
The number of entires are stored in the q_len member of struct q_desc_mem. The
problem is that q_len being a u16, results in 65536 becoming 0.

In getting pointers to descriptors in the rings, the driver uses q_len minus 1
as a mask after incrementing the pointer, in order to go back to the beginning
and not go past the end of the ring.

With the q_len set to 0 the mask is no longer correct and the driver does go
beyond the end of the ring, causing various ills. Usually the first thing that
shows up is a "NETDEV WATCHDOG: enP2p1s0f1 (nicvf): transmit queue 7 timed out"
warning.

This patch remedies the problem by changing q_len to a u32.

Signed-off-by: Dean Nelson <dnelson@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cavium/thunder/nicvf_queues.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
index 7d1e4e2aaad0c..ce1eed7a6d63b 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
@@ -213,7 +213,7 @@ struct rx_tx_queue_stats {
 struct q_desc_mem {
 	dma_addr_t	dma;
 	u64		size;
-	u16		q_len;
+	u32		q_len;
 	dma_addr_t	phys_base;
 	void		*base;
 	void		*unalign_base;

From 08f5138512180a479ce6b9d23b825c9f4cd3be77 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 8 Feb 2018 21:01:48 +0100
Subject: [PATCH 446/676] net: phy: fix phy_start to consider
 PHY_IGNORE_INTERRUPT

This condition wasn't adjusted when PHY_IGNORE_INTERRUPT (-2) was added
long ago. In case of PHY_IGNORE_INTERRUPT the MAC interrupt indicates
also PHY state changes and we should do what the symbol says.

Fixes: 84a527a41f38 ("net: phylib: fix interrupts re-enablement in phy_start")
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index f3313a1295312..e3e29c2b028b5 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -822,7 +822,7 @@ void phy_start(struct phy_device *phydev)
 		phy_resume(phydev);
 
 		/* make sure interrupts are re-enabled for the PHY */
-		if (phydev->irq != PHY_POLL) {
+		if (phy_interrupt_is_valid(phydev)) {
 			err = phy_enable_interrupts(phydev);
 			if (err < 0)
 				break;

From f515f86b34b2e7d4b24cc9b7375c9e749895088e Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <aglo@umich.edu>
Date: Thu, 29 Jun 2017 09:25:36 -0400
Subject: [PATCH 447/676] fix parallelism for rpc tasks

Hi folks,

On a multi-core machine, is it expected that we can have parallel RPCs
handled by each of the per-core workqueue?

In testing a read workload, observing via "top" command that a single
"kworker" thread is running servicing the requests (no parallelism).
It's more prominent while doing these operations over krb5p mount.

What has been suggested by Bruce is to try this and in my testing I
see then the read workload spread among all the kworker threads.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 25e6051e97f29..d9db2eab3a8df 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -1104,7 +1104,7 @@ static int rpciod_start(void)
 	 * Create the rpciod thread and wait for it to start.
 	 */
 	dprintk("RPC:       creating workqueue rpciod\n");
-	wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0);
+	wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
 	if (!wq)
 		goto out_failed;
 	rpciod_workqueue = wq;

From 6e59de2048eb375a9bfcd39461ef841cd2a78962 Mon Sep 17 00:00:00 2001
From: Kai-Heng Feng <kai.heng.feng@canonical.com>
Date: Thu, 8 Feb 2018 17:46:01 +0800
Subject: [PATCH 448/676] drm/amdgpu: add new device to use atpx quirk

The affected system (0x0813) is pretty similar to another one (0x0812),
it also needs to use ATPX power control.

Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c
index e2c3c5ec42d15..c53095b3b0fb9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c
@@ -568,6 +568,7 @@ static const struct amdgpu_px_quirk amdgpu_px_quirk_list[] = {
 	/* HG _PR3 doesn't seem to work on this A+A weston board */
 	{ 0x1002, 0x6900, 0x1002, 0x0124, AMDGPU_PX_QUIRK_FORCE_ATPX },
 	{ 0x1002, 0x6900, 0x1028, 0x0812, AMDGPU_PX_QUIRK_FORCE_ATPX },
+	{ 0x1002, 0x6900, 0x1028, 0x0813, AMDGPU_PX_QUIRK_FORCE_ATPX },
 	{ 0, 0, 0, 0, 0 },
 };
 

From 8c88181ed452707f3815827225517b1964463b95 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 8 Feb 2018 12:48:12 +0100
Subject: [PATCH 449/676] bpf: Sync kernel ABI header with tooling header for
 bpf_common.h

I recently fixed up a lot of commits that forgot to keep the tooling
headers in sync.  And then I forgot to do the same thing in commit
cb5f7334d479 ("bpf: add comments to BPF ld/ldx sizes"). Let correct
that before people notice ;-).

Lawrence did partly fix/sync this for bpf.h in commit d6d4f60c3a09
("bpf: add selftest for tcpbpf").

Fixes: cb5f7334d479 ("bpf: add comments to BPF ld/ldx sizes")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/include/uapi/linux/bpf_common.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tools/include/uapi/linux/bpf_common.h b/tools/include/uapi/linux/bpf_common.h
index 18be90725ab04..ee97668bdadb3 100644
--- a/tools/include/uapi/linux/bpf_common.h
+++ b/tools/include/uapi/linux/bpf_common.h
@@ -15,9 +15,10 @@
 
 /* ld/ldx fields */
 #define BPF_SIZE(code)  ((code) & 0x18)
-#define		BPF_W		0x00
-#define		BPF_H		0x08
-#define		BPF_B		0x10
+#define		BPF_W		0x00 /* 32-bit */
+#define		BPF_H		0x08 /* 16-bit */
+#define		BPF_B		0x10 /*  8-bit */
+/* eBPF		BPF_DW		0x18    64-bit */
 #define BPF_MODE(code)  ((code) & 0xe0)
 #define		BPF_IMM		0x00
 #define		BPF_ABS		0x20

From 077c066a6c5445423147496e6c9b9a2c2d2ee762 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 8 Feb 2018 12:48:17 +0100
Subject: [PATCH 450/676] tools/libbpf: improve the pr_debug statements to
 contain section numbers

While debugging a bpf ELF loading issue, I needed to correlate the
ELF section number with the failed relocation section reference.
Thus, add section numbers/index to the pr_debug.

In debug mode, also print section that were skipped.  This helped
me identify that a section (.eh_frame) was skipped, and this was
the reason the relocation section (.rel.eh_frame) could not find
that section number.

The section numbers corresponds to the readelf tools Section Headers [Nr].

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index c64840365433c..0c794f7fc0506 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -319,8 +319,8 @@ bpf_program__init(void *data, size_t size, char *section_name, int idx,
 
 	prog->section_name = strdup(section_name);
 	if (!prog->section_name) {
-		pr_warning("failed to alloc name for prog under section %s\n",
-			   section_name);
+		pr_warning("failed to alloc name for prog under section(%d) %s\n",
+			   idx, section_name);
 		goto errout;
 	}
 
@@ -763,29 +763,29 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
 
 		idx++;
 		if (gelf_getshdr(scn, &sh) != &sh) {
-			pr_warning("failed to get section header from %s\n",
-				   obj->path);
+			pr_warning("failed to get section(%d) header from %s\n",
+				   idx, obj->path);
 			err = -LIBBPF_ERRNO__FORMAT;
 			goto out;
 		}
 
 		name = elf_strptr(elf, ep->e_shstrndx, sh.sh_name);
 		if (!name) {
-			pr_warning("failed to get section name from %s\n",
-				   obj->path);
+			pr_warning("failed to get section(%d) name from %s\n",
+				   idx, obj->path);
 			err = -LIBBPF_ERRNO__FORMAT;
 			goto out;
 		}
 
 		data = elf_getdata(scn, 0);
 		if (!data) {
-			pr_warning("failed to get section data from %s(%s)\n",
-				   name, obj->path);
+			pr_warning("failed to get section(%d) data from %s(%s)\n",
+				   idx, name, obj->path);
 			err = -LIBBPF_ERRNO__FORMAT;
 			goto out;
 		}
-		pr_debug("section %s, size %ld, link %d, flags %lx, type=%d\n",
-			 name, (unsigned long)data->d_size,
+		pr_debug("section(%d) %s, size %ld, link %d, flags %lx, type=%d\n",
+			 idx, name, (unsigned long)data->d_size,
 			 (int)sh.sh_link, (unsigned long)sh.sh_flags,
 			 (int)sh.sh_type);
 
@@ -840,6 +840,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
 				obj->efile.reloc[n].shdr = sh;
 				obj->efile.reloc[n].data = data;
 			}
+		} else {
+			pr_debug("skip section(%d) %s\n", idx, name);
 		}
 		if (err)
 			goto out;
@@ -1119,8 +1121,7 @@ static int bpf_object__collect_reloc(struct bpf_object *obj)
 
 		prog = bpf_object__find_prog_by_idx(obj, idx);
 		if (!prog) {
-			pr_warning("relocation failed: no %d section\n",
-				   idx);
+			pr_warning("relocation failed: no section(%d)\n", idx);
 			return -LIBBPF_ERRNO__RELOC;
 		}
 

From 864db336c677c288d81524c30a656804785902f9 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 8 Feb 2018 12:48:22 +0100
Subject: [PATCH 451/676] selftests/bpf: add test program for loading BPF ELF
 files

V2: Moved program into selftests/bpf from tools/libbpf

This program can be used on its own for testing/debugging if a
BPF ELF-object file can be loaded with libbpf (from tools/lib/bpf).

If something is wrong with the ELF object, the program have
a --debug mode that will display the ELF sections and especially
the skipped sections.  This allows for quickly identifying the
problematic ELF section number, which can be corrolated with the
readelf tool.

The program signal error via return codes, and also have
a --quiet mode, which is practical for use in scripts like
selftests/bpf.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/Makefile          |   2 +-
 .../testing/selftests/bpf/test_libbpf_open.c  | 150 ++++++++++++++++++
 2 files changed, 151 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_libbpf_open.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 566d6adc172a6..3f48da0866ba2 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -20,7 +20,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
 	sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
 	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
-	sample_map_ret0.o test_tcpbpf_kern.o
+	sample_map_ret0.o test_tcpbpf_kern.o test_libbpf_open
 
 TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \
 	test_offload.py
diff --git a/tools/testing/selftests/bpf/test_libbpf_open.c b/tools/testing/selftests/bpf/test_libbpf_open.c
new file mode 100644
index 0000000000000..8fcd1c076add0
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_libbpf_open.c
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc.
+ */
+static const char *__doc__ =
+	"Libbpf test program for loading BPF ELF object files";
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdarg.h>
+#include <bpf/libbpf.h>
+#include <getopt.h>
+
+static const struct option long_options[] = {
+	{"help",	no_argument,		NULL, 'h' },
+	{"debug",	no_argument,		NULL, 'D' },
+	{"quiet",	no_argument,		NULL, 'q' },
+	{0, 0, NULL,  0 }
+};
+
+static void usage(char *argv[])
+{
+	int i;
+
+	printf("\nDOCUMENTATION:\n%s\n\n", __doc__);
+	printf(" Usage: %s (options-see-below) BPF_FILE\n", argv[0]);
+	printf(" Listing options:\n");
+	for (i = 0; long_options[i].name != 0; i++) {
+		printf(" --%-12s", long_options[i].name);
+		printf(" short-option: -%c",
+		       long_options[i].val);
+		printf("\n");
+	}
+	printf("\n");
+}
+
+#define DEFINE_PRINT_FN(name, enabled) \
+static int libbpf_##name(const char *fmt, ...)  	\
+{							\
+        va_list args;					\
+        int ret;					\
+							\
+        va_start(args, fmt);				\
+	if (enabled) {					\
+		fprintf(stderr, "[" #name "] ");	\
+		ret = vfprintf(stderr, fmt, args);	\
+	}						\
+        va_end(args);					\
+        return ret;					\
+}
+DEFINE_PRINT_FN(warning, 1)
+DEFINE_PRINT_FN(info, 1)
+DEFINE_PRINT_FN(debug, 1)
+
+#define EXIT_FAIL_LIBBPF EXIT_FAILURE
+#define EXIT_FAIL_OPTION 2
+
+int test_walk_progs(struct bpf_object *obj, bool verbose)
+{
+	struct bpf_program *prog;
+	int cnt = 0;
+
+	bpf_object__for_each_program(prog, obj) {
+		cnt++;
+		if (verbose)
+			printf("Prog (count:%d) section_name: %s\n", cnt,
+			       bpf_program__title(prog, false));
+	}
+	return 0;
+}
+
+int test_walk_maps(struct bpf_object *obj, bool verbose)
+{
+	struct bpf_map *map;
+	int cnt = 0;
+
+	bpf_map__for_each(map, obj) {
+		cnt++;
+		if (verbose)
+			printf("Map (count:%d) name: %s\n", cnt,
+			       bpf_map__name(map));
+	}
+	return 0;
+}
+
+int test_open_file(char *filename, bool verbose)
+{
+	struct bpf_object *bpfobj = NULL;
+	long err;
+
+	if (verbose)
+		printf("Open BPF ELF-file with libbpf: %s\n", filename);
+
+	/* Load BPF ELF object file and check for errors */
+	bpfobj = bpf_object__open(filename);
+	err = libbpf_get_error(bpfobj);
+	if (err) {
+		char err_buf[128];
+		libbpf_strerror(err, err_buf, sizeof(err_buf));
+		if (verbose)
+			printf("Unable to load eBPF objects in file '%s': %s\n",
+			       filename, err_buf);
+		return EXIT_FAIL_LIBBPF;
+	}
+	test_walk_progs(bpfobj, verbose);
+	test_walk_maps(bpfobj, verbose);
+
+	if (verbose)
+		printf("Close BPF ELF-file with libbpf: %s\n",
+		       bpf_object__name(bpfobj));
+	bpf_object__close(bpfobj);
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	char filename[1024] = { 0 };
+	bool verbose = 1;
+	int longindex = 0;
+	int opt;
+
+	libbpf_set_print(libbpf_warning, libbpf_info, NULL);
+
+	/* Parse commands line args */
+	while ((opt = getopt_long(argc, argv, "hDq",
+				  long_options, &longindex)) != -1) {
+		switch (opt) {
+		case 'D':
+			libbpf_set_print(libbpf_warning, libbpf_info,
+					 libbpf_debug);
+			break;
+		case 'q': /* Use in scripting mode */
+			verbose = 0;
+			break;
+		case 'h':
+		default:
+			usage(argv);
+			return EXIT_FAIL_OPTION;
+		}
+	}
+	if (optind >= argc) {
+		usage(argv);
+		printf("ERROR: Expected BPF_FILE argument after options\n");
+		return EXIT_FAIL_OPTION;
+	}
+	snprintf(filename, sizeof(filename), "%s", argv[optind]);
+
+	return test_open_file(filename, verbose);
+}

From f09b2e382e9a7053e3ae6f2fb6535efd5760cf5d Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 8 Feb 2018 12:48:27 +0100
Subject: [PATCH 452/676] selftests/bpf: add selftest that use test_libbpf_open

This script test_libbpf.sh will be part of the 'make run_tests'
invocation, but can also be invoked manually in this directory,
and a verbose mode can be enabled via setting the environment
variable $VERBOSE like:

 $ VERBOSE=yes ./test_libbpf.sh

The script contains some tests that are commented out, as they
currently fail.  They are reminders about what we need to improve
for the libbpf loader library.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/Makefile       | 14 ++++++-
 tools/testing/selftests/bpf/test_libbpf.sh | 49 ++++++++++++++++++++++
 2 files changed, 61 insertions(+), 2 deletions(-)
 create mode 100755 tools/testing/selftests/bpf/test_libbpf.sh

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 3f48da0866ba2..5c43c187f27c1 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -13,6 +13,7 @@ endif
 CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include
 LDLIBS += -lcap -lelf -lrt -lpthread
 
+# Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
 	test_align test_verifier_log test_dev_cgroup test_tcpbpf_user
 
@@ -20,17 +21,26 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o     \
 	sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
 	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
-	sample_map_ret0.o test_tcpbpf_kern.o test_libbpf_open
+	sample_map_ret0.o test_tcpbpf_kern.o
 
-TEST_PROGS := test_kmod.sh test_xdp_redirect.sh test_xdp_meta.sh \
+# Order correspond to 'make run_tests' order
+TEST_PROGS := test_kmod.sh \
+	test_libbpf.sh \
+	test_xdp_redirect.sh \
+	test_xdp_meta.sh \
 	test_offload.py
 
+# Compile but not part of 'make run_tests'
+TEST_GEN_PROGS_EXTENDED = test_libbpf_open
+
 include ../lib.mk
 
 BPFOBJ := $(OUTPUT)/libbpf.a cgroup_helpers.c
 
 $(TEST_GEN_PROGS): $(BPFOBJ)
 
+$(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/libbpf.a
+
 .PHONY: force
 
 # force a rebuild of BPFOBJ when its dependencies are updated
diff --git a/tools/testing/selftests/bpf/test_libbpf.sh b/tools/testing/selftests/bpf/test_libbpf.sh
new file mode 100755
index 0000000000000..d97dc914cd496
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_libbpf.sh
@@ -0,0 +1,49 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+export TESTNAME=test_libbpf
+
+# Determine selftest success via shell exit code
+exit_handler()
+{
+	if (( $? == 0 )); then
+		echo "selftests: $TESTNAME [PASS]";
+	else
+		echo "$TESTNAME: failed at file $LAST_LOADED" 1>&2
+		echo "selftests: $TESTNAME [FAILED]";
+	fi
+}
+
+libbpf_open_file()
+{
+	LAST_LOADED=$1
+	if [ -n "$VERBOSE" ]; then
+	    ./test_libbpf_open $1
+	else
+	    ./test_libbpf_open --quiet $1
+	fi
+}
+
+# Exit script immediately (well catched by trap handler) if any
+# program/thing exits with a non-zero status.
+set -e
+
+# (Use 'trap -l' to list meaning of numbers)
+trap exit_handler 0 2 3 6 9
+
+libbpf_open_file test_l4lb.o
+
+# TODO: fix libbpf to load noinline functions
+# [warning] libbpf: incorrect bpf_call opcode
+#libbpf_open_file test_l4lb_noinline.o
+
+# TODO: fix test_xdp_meta.c to load with libbpf
+# [warning] libbpf: test_xdp_meta.o doesn't provide kernel version
+#libbpf_open_file test_xdp_meta.o
+
+# TODO: fix libbpf to handle .eh_frame
+# [warning] libbpf: relocation failed: no section(10)
+#libbpf_open_file ../../../../samples/bpf/tracex3_kern.o
+
+# Success
+exit 0

From e3d91b0ca523d53158f435a3e13df7f0cb360ea2 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 8 Feb 2018 12:48:32 +0100
Subject: [PATCH 453/676] tools/libbpf: handle issues with bpf ELF objects
 containing .eh_frames

V3: More generic skipping of relo-section (suggested by Daniel)

If clang >= 4.0.1 is missing the option '-target bpf', it will cause
llc/llvm to create two ELF sections for "Exception Frames", with
section names '.eh_frame' and '.rel.eh_frame'.

The BPF ELF loader library libbpf fails when loading files with these
sections.  The other in-kernel BPF ELF loader in samples/bpf/bpf_load.c,
handle this gracefully. And iproute2 loader also seems to work with these
"eh" sections.

The issue in libbpf is caused by bpf_object__elf_collect() skipping
some sections, and later when performing relocation it will be
pointing to a skipped section, as these sections cannot be found by
bpf_object__find_prog_by_idx() in bpf_object__collect_reloc().

This is a general issue that also occurs for other sections, like
debug sections which are also skipped and can have relo section.

As suggested by Daniel.  To avoid keeping state about all skipped
sections, instead perform a direct qlookup in the ELF object.  Lookup
the section that the relo-section points to and check if it contains
executable machine instructions (denoted by the sh_flags
SHF_EXECINSTR).  Use this check to also skip irrelevant relo-sections.

Note, for samples/bpf/ the '-target bpf' parameter to clang cannot be used
due to incompatibility with asm embedded headers, that some of the samples
include. This is explained in more details by Yonghong Song in bpf_devel_QA.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/lib/bpf/libbpf.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 0c794f7fc0506..97073d649c1a0 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -742,6 +742,24 @@ bpf_object__init_maps(struct bpf_object *obj)
 	return 0;
 }
 
+static bool section_have_execinstr(struct bpf_object *obj, int idx)
+{
+	Elf_Scn *scn;
+	GElf_Shdr sh;
+
+	scn = elf_getscn(obj->efile.elf, idx);
+	if (!scn)
+		return false;
+
+	if (gelf_getshdr(scn, &sh) != &sh)
+		return false;
+
+	if (sh.sh_flags & SHF_EXECINSTR)
+		return true;
+
+	return false;
+}
+
 static int bpf_object__elf_collect(struct bpf_object *obj)
 {
 	Elf *elf = obj->efile.elf;
@@ -825,6 +843,14 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
 		} else if (sh.sh_type == SHT_REL) {
 			void *reloc = obj->efile.reloc;
 			int nr_reloc = obj->efile.nr_reloc + 1;
+			int sec = sh.sh_info; /* points to other section */
+
+			/* Only do relo for section with exec instructions */
+			if (!section_have_execinstr(obj, sec)) {
+				pr_debug("skip relo %s(%d) for section(%d)\n",
+					 name, idx, sec);
+				continue;
+			}
 
 			reloc = realloc(reloc,
 					sizeof(*obj->efile.reloc) * nr_reloc);

From b81e830c9ad041aba81b495d9386dc31aed3f1e6 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Wed, 7 Feb 2018 10:12:04 +0100
Subject: [PATCH 454/676] platform/mellanox: mlxreg-hotplug: Fix uninitialized
 variable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With gcc-4.1.2:

    drivers/platform/mellanox/mlxreg-hotplug.c: In function ‘mlxreg_hotplug_health_work_helper’:
    drivers/platform/mellanox/mlxreg-hotplug.c:347: warning: ‘ret’ is used uninitialized in this function

Indeed, if mlxreg_core_item.count is zero, ret is used uninitialized.

While this is unlikely to happen (it is set to ARRAY_SIZE(...) in x86
board files), this is done in another source file, so fix this by
preinitializing ret to zero.

Fixes: c6acad68eb2dbffd ("platform/mellanox: mlxreg-hotplug: Modify to use a regmap interface")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Vadim Pasternak <vadimp@mellanox.com>
Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org>
---
 drivers/platform/mellanox/mlxreg-hotplug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/platform/mellanox/mlxreg-hotplug.c b/drivers/platform/mellanox/mlxreg-hotplug.c
index 0dfa1ca0d05b0..313cf8ad77bf6 100644
--- a/drivers/platform/mellanox/mlxreg-hotplug.c
+++ b/drivers/platform/mellanox/mlxreg-hotplug.c
@@ -300,7 +300,7 @@ mlxreg_hotplug_health_work_helper(struct mlxreg_hotplug_priv_data *priv,
 {
 	struct mlxreg_core_data *data = item->data;
 	u32 regval;
-	int i, ret;
+	int i, ret = 0;
 
 	for (i = 0; i < item->count; i++, data++) {
 		/* Mask event. */

From ba814fdd0eac312be9cf204066760ef8d0f57221 Mon Sep 17 00:00:00 2001
From: Vadim Pasternak <vadimp@mellanox.com>
Date: Fri, 2 Feb 2018 08:45:45 +0000
Subject: [PATCH 455/676] platform/x86: mlx-platform: Use defines for bus
 assignment

Add defines for the bus IDs, used for hotplug device topology to improve
code readability. Defines added for FAN and power units.

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org>
---
 drivers/platform/x86/mlx-platform.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/drivers/platform/x86/mlx-platform.c b/drivers/platform/x86/mlx-platform.c
index 27de29961f5e1..60bf758ff3820 100644
--- a/drivers/platform/x86/mlx-platform.c
+++ b/drivers/platform/x86/mlx-platform.c
@@ -89,6 +89,13 @@
 /* Number of LPC attached MUX platform devices */
 #define MLXPLAT_CPLD_LPC_MUX_DEVS		2
 
+/* Hotplug devices adapter numbers */
+#define MLXPLAT_CPLD_PSU_DEFAULT_NR		10
+#define MLXPLAT_CPLD_FAN1_DEFAULT_NR		11
+#define MLXPLAT_CPLD_FAN2_DEFAULT_NR		12
+#define MLXPLAT_CPLD_FAN3_DEFAULT_NR		13
+#define MLXPLAT_CPLD_FAN4_DEFAULT_NR		14
+
 /* mlxplat_priv - platform private data
  * @pdev_i2c - i2c controller platform device
  * @pdev_mux - array of mux platform devices
@@ -190,14 +197,14 @@ static struct mlxreg_core_data mlxplat_mlxcpld_default_psu_items_data[] = {
 		.reg = MLXPLAT_CPLD_LPC_REG_PSU_OFFSET,
 		.mask = BIT(0),
 		.hpdev.brdinfo = &mlxplat_mlxcpld_psu[0],
-		.hpdev.nr = 10,
+		.hpdev.nr = MLXPLAT_CPLD_PSU_DEFAULT_NR,
 	},
 	{
 		.label = "psu2",
 		.reg = MLXPLAT_CPLD_LPC_REG_PSU_OFFSET,
 		.mask = BIT(1),
 		.hpdev.brdinfo = &mlxplat_mlxcpld_psu[1],
-		.hpdev.nr = 10,
+		.hpdev.nr = MLXPLAT_CPLD_PSU_DEFAULT_NR,
 	},
 };
 
@@ -207,14 +214,14 @@ static struct mlxreg_core_data mlxplat_mlxcpld_default_pwr_items_data[] = {
 		.reg = MLXPLAT_CPLD_LPC_REG_PWR_OFFSET,
 		.mask = BIT(0),
 		.hpdev.brdinfo = &mlxplat_mlxcpld_pwr[0],
-		.hpdev.nr = 10,
+		.hpdev.nr = MLXPLAT_CPLD_PSU_DEFAULT_NR,
 	},
 	{
 		.label = "pwr2",
 		.reg = MLXPLAT_CPLD_LPC_REG_PWR_OFFSET,
 		.mask = BIT(1),
 		.hpdev.brdinfo = &mlxplat_mlxcpld_pwr[1],
-		.hpdev.nr = 10,
+		.hpdev.nr = MLXPLAT_CPLD_PSU_DEFAULT_NR,
 	},
 };
 
@@ -224,28 +231,28 @@ static struct mlxreg_core_data mlxplat_mlxcpld_default_fan_items_data[] = {
 		.reg = MLXPLAT_CPLD_LPC_REG_FAN_OFFSET,
 		.mask = BIT(0),
 		.hpdev.brdinfo = &mlxplat_mlxcpld_fan[0],
-		.hpdev.nr = 11,
+		.hpdev.nr = MLXPLAT_CPLD_FAN1_DEFAULT_NR,
 	},
 	{
 		.label = "fan2",
 		.reg = MLXPLAT_CPLD_LPC_REG_FAN_OFFSET,
 		.mask = BIT(1),
 		.hpdev.brdinfo = &mlxplat_mlxcpld_fan[1],
-		.hpdev.nr = 12,
+		.hpdev.nr = MLXPLAT_CPLD_FAN2_DEFAULT_NR,
 	},
 	{
 		.label = "fan3",
 		.reg = MLXPLAT_CPLD_LPC_REG_FAN_OFFSET,
 		.mask = BIT(2),
 		.hpdev.brdinfo = &mlxplat_mlxcpld_fan[2],
-		.hpdev.nr = 13,
+		.hpdev.nr = MLXPLAT_CPLD_FAN3_DEFAULT_NR,
 	},
 	{
 		.label = "fan4",
 		.reg = MLXPLAT_CPLD_LPC_REG_FAN_OFFSET,
 		.mask = BIT(3),
 		.hpdev.brdinfo = &mlxplat_mlxcpld_fan[3],
-		.hpdev.nr = 14,
+		.hpdev.nr = MLXPLAT_CPLD_FAN4_DEFAULT_NR,
 	},
 };
 

From 1778567a2063c3027f3ab03748fadab5840eba33 Mon Sep 17 00:00:00 2001
From: Vadim Pasternak <vadimp@mellanox.com>
Date: Fri, 2 Feb 2018 08:45:46 +0000
Subject: [PATCH 456/676] platform/x86: mlx-platform: Add define for the
 negative bus

Add define for the negative bus ID in order to use it in case no hotplug
device is associated with the hotplug interrupt signal. In this case,
the signal will be handled by the mlxreg-hotplug driver, but no device
will be associated with the signal.

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org>
---
 drivers/platform/x86/mlx-platform.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/platform/x86/mlx-platform.c b/drivers/platform/x86/mlx-platform.c
index 60bf758ff3820..2e9f9e4302b2c 100644
--- a/drivers/platform/x86/mlx-platform.c
+++ b/drivers/platform/x86/mlx-platform.c
@@ -90,6 +90,7 @@
 #define MLXPLAT_CPLD_LPC_MUX_DEVS		2
 
 /* Hotplug devices adapter numbers */
+#define MLXPLAT_CPLD_NR_NONE			-1
 #define MLXPLAT_CPLD_PSU_DEFAULT_NR		10
 #define MLXPLAT_CPLD_FAN1_DEFAULT_NR		11
 #define MLXPLAT_CPLD_FAN2_DEFAULT_NR		12

From 6016f7d54bc49e3c570d5c7adbb463fe87e19dd7 Mon Sep 17 00:00:00 2001
From: Vadim Pasternak <vadimp@mellanox.com>
Date: Fri, 2 Feb 2018 08:45:47 +0000
Subject: [PATCH 457/676] platform/x86: mlx-platform: Fix power cable setting
 for msn21xx family

Add dedicated structure with power cable setting for Mellanox msn21xx
family. These systems do not have a physical device for the power unit
controller. When the power cable is inserted or removed, the relevant
interrupt signal is handled, the status is updated, but no device is
associated with the signal.

Add definition for interrupt low aggregation signal. On system from
msn21xx family, low aggregation mask should be removed in order to allow
signal to hit CPU.

Fixes: 6613d18e9038 ("platform/x86: mlx-platform: Move module from arch/x86")
Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org>
---
 drivers/platform/x86/mlx-platform.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/drivers/platform/x86/mlx-platform.c b/drivers/platform/x86/mlx-platform.c
index 2e9f9e4302b2c..e87fe34eadf5b 100644
--- a/drivers/platform/x86/mlx-platform.c
+++ b/drivers/platform/x86/mlx-platform.c
@@ -77,6 +77,8 @@
 #define MLXPLAT_CPLD_AGGR_FAN_MASK_DEF	0x40
 #define MLXPLAT_CPLD_AGGR_MASK_DEF	(MLXPLAT_CPLD_AGGR_PSU_MASK_DEF | \
 					 MLXPLAT_CPLD_AGGR_FAN_MASK_DEF)
+#define MLXPLAT_CPLD_AGGR_MASK_NG_DEF	0x04
+#define MLXPLAT_CPLD_LOW_AGGR_MASK_LOW	0xc0
 #define MLXPLAT_CPLD_AGGR_MASK_MSN21XX	0x04
 #define MLXPLAT_CPLD_PSU_MASK		GENMASK(1, 0)
 #define MLXPLAT_CPLD_PWR_MASK		GENMASK(1, 0)
@@ -295,14 +297,29 @@ struct mlxreg_core_hotplug_platform_data mlxplat_mlxcpld_default_data = {
 	.mask = MLXPLAT_CPLD_AGGR_MASK_DEF,
 };
 
+static struct mlxreg_core_data mlxplat_mlxcpld_msn21xx_pwr_items_data[] = {
+	{
+		.label = "pwr1",
+		.reg = MLXPLAT_CPLD_LPC_REG_PWR_OFFSET,
+		.mask = BIT(0),
+		.hpdev.nr = MLXPLAT_CPLD_NR_NONE,
+	},
+	{
+		.label = "pwr2",
+		.reg = MLXPLAT_CPLD_LPC_REG_PWR_OFFSET,
+		.mask = BIT(1),
+		.hpdev.nr = MLXPLAT_CPLD_NR_NONE,
+	},
+};
+
 /* Platform hotplug MSN21xx system family data */
 static struct mlxreg_core_item mlxplat_mlxcpld_msn21xx_items[] = {
 	{
-		.data = mlxplat_mlxcpld_default_pwr_items_data,
+		.data = mlxplat_mlxcpld_msn21xx_pwr_items_data,
 		.aggr_mask = MLXPLAT_CPLD_AGGR_PWR_MASK_DEF,
 		.reg = MLXPLAT_CPLD_LPC_REG_PWR_OFFSET,
 		.mask = MLXPLAT_CPLD_PWR_MASK,
-		.count = ARRAY_SIZE(mlxplat_mlxcpld_pwr),
+		.count = ARRAY_SIZE(mlxplat_mlxcpld_msn21xx_pwr_items_data),
 		.inversed = 0,
 		.health = false,
 	},
@@ -314,6 +331,8 @@ struct mlxreg_core_hotplug_platform_data mlxplat_mlxcpld_msn21xx_data = {
 	.counter = ARRAY_SIZE(mlxplat_mlxcpld_msn21xx_items),
 	.cell = MLXPLAT_CPLD_LPC_REG_AGGR_OFFSET,
 	.mask = MLXPLAT_CPLD_AGGR_MASK_DEF,
+	.cell_low = MLXPLAT_CPLD_LPC_REG_AGGRLO_OFFSET,
+	.mask_low = MLXPLAT_CPLD_LOW_AGGR_MASK_LOW,
 };
 
 static bool mlxplat_mlxcpld_writeable_reg(struct device *dev, unsigned int reg)

From 05f2bb0313a2855e491dadfc8319b7da261d7074 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 7 Feb 2018 19:49:54 +1100
Subject: [PATCH 458/676] KVM: PPC: Book3S HV: Fix handling of secondary HPTEG
 in HPT resizing code

This fixes the computation of the HPTE index to use when the HPT
resizing code encounters a bolted HPTE which is stored in its
secondary HPTE group.  The code inverts the HPTE group number, which
is correct, but doesn't then mask it with new_hash_mask.  As a result,
new_pteg will be effectively negative, resulting in new_hptep
pointing before the new HPT, which will corrupt memory.

In addition, this removes two BUG_ON statements.  The condition that
the BUG_ONs were testing -- that we have computed the hash value
incorrectly -- has never been observed in testing, and if it did
occur, would only affect the guest, not the host.  Given that
BUG_ON should only be used in conditions where the kernel (i.e.
the host kernel, in this case) can't possibly continue execution,
it is not appropriate here.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 966097232d214..d19649960bbf9 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -1329,12 +1329,8 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
 	}
 
 	new_pteg = hash & new_hash_mask;
-	if (vpte & HPTE_V_SECONDARY) {
-		BUG_ON(~pteg != (hash & old_hash_mask));
-		new_pteg = ~new_pteg;
-	} else {
-		BUG_ON(pteg != (hash & old_hash_mask));
-	}
+	if (vpte & HPTE_V_SECONDARY)
+		new_pteg = ~hash & new_hash_mask;
 
 	new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP);
 	new_hptep = (__be64 *)(new->virt + (new_idx << 4));

From 790a9df5fbef982f2a6992194fe497dd2b794a3d Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 2 Feb 2018 14:29:08 +1100
Subject: [PATCH 459/676] KVM: PPC: Book3S HV: Make HPT resizing work on POWER9

This adds code to enable the HPT resizing code to work on POWER9,
which uses a slightly modified HPT entry format compared to POWER8.
On POWER9, we convert HPTEs read from the HPT from the new format to
the old format so that the rest of the HPT resizing code can work as
before.  HPTEs written to the new HPT are converted to the new format
as the last step before writing them into the new HPT.

This takes out the checks added by commit bcd3bb63dbc8 ("KVM: PPC:
Book3S HV: Disable HPT resizing on POWER9 for now", 2017-02-18),
now that HPT resizing works on POWER9.

On POWER9, when we pivot to the new HPT, we now call
kvmppc_setup_partition_table() to update the partition table in order
to make the hardware use the new HPT.

[paulus@ozlabs.org - added kvmppc_setup_partition_table() call,
 wrote commit message.]

Tested-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c | 30 ++++++++++++++++++++++-------
 arch/powerpc/kvm/powerpc.c          |  3 +--
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index d19649960bbf9..cb34be7d1a49f 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -1261,6 +1261,11 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
 		/* Nothing to do */
 		goto out;
 
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		rpte = be64_to_cpu(hptep[1]);
+		vpte = hpte_new_to_old_v(vpte, rpte);
+	}
+
 	/* Unmap */
 	rev = &old->rev[idx];
 	guest_rpte = rev->guest_rpte;
@@ -1290,7 +1295,6 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
 
 	/* Reload PTE after unmap */
 	vpte = be64_to_cpu(hptep[0]);
-
 	BUG_ON(vpte & HPTE_V_VALID);
 	BUG_ON(!(vpte & HPTE_V_ABSENT));
 
@@ -1299,6 +1303,12 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
 		goto out;
 
 	rpte = be64_to_cpu(hptep[1]);
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		vpte = hpte_new_to_old_v(vpte, rpte);
+		rpte = hpte_new_to_old_r(rpte);
+	}
+
 	pshift = kvmppc_hpte_base_page_shift(vpte, rpte);
 	avpn = HPTE_V_AVPN_VAL(vpte) & ~(((1ul << pshift) - 1) >> 23);
 	pteg = idx / HPTES_PER_GROUP;
@@ -1336,6 +1346,10 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
 	new_hptep = (__be64 *)(new->virt + (new_idx << 4));
 
 	replace_vpte = be64_to_cpu(new_hptep[0]);
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		unsigned long replace_rpte = be64_to_cpu(new_hptep[1]);
+		replace_vpte = hpte_new_to_old_v(replace_vpte, replace_rpte);
+	}
 
 	if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
 		BUG_ON(new->order >= old->order);
@@ -1351,6 +1365,11 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
 		/* Discard the previous HPTE */
 	}
 
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		rpte = hpte_old_to_new_r(vpte, rpte);
+		vpte = hpte_old_to_new_v(vpte);
+	}
+
 	new_hptep[1] = cpu_to_be64(rpte);
 	new->rev[new_idx].guest_rpte = guest_rpte;
 	/* No need for a barrier, since new HPT isn't active */
@@ -1368,12 +1387,6 @@ static int resize_hpt_rehash(struct kvm_resize_hpt *resize)
 	unsigned  long i;
 	int rc;
 
-	/*
-	 * resize_hpt_rehash_hpte() doesn't handle the new-format HPTEs
-	 * that POWER9 uses, and could well hit a BUG_ON on POWER9.
-	 */
-	if (cpu_has_feature(CPU_FTR_ARCH_300))
-		return -EIO;
 	for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) {
 		rc = resize_hpt_rehash_hpte(resize, i);
 		if (rc != 0)
@@ -1404,6 +1417,9 @@ static void resize_hpt_pivot(struct kvm_resize_hpt *resize)
 
 	synchronize_srcu_expedited(&kvm->srcu);
 
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		kvmppc_setup_partition_table(kvm);
+
 	resize_hpt_debug(resize, "resize_hpt_pivot() done\n");
 }
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 77eb25abc6016..cf86aeb43fcfb 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -633,8 +633,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = 1;
 		break;
 	case KVM_CAP_SPAPR_RESIZE_HPT:
-		/* Disable this on POWER9 until code handles new HPTE format */
-		r = !!hv_enabled && !cpu_has_feature(CPU_FTR_ARCH_300);
+		r = !!hv_enabled;
 		break;
 #endif
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE

From d20fe50a7b3c8f936f7347e9cab2e9dd89c1199d Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 8 Feb 2018 18:38:53 +0100
Subject: [PATCH 460/676] KVM: PPC: Book3S HV: Branch inside feature section

We ended up with code that did a conditional branch inside a feature
section to code outside of the feature section. Depending on how the
object file gets organized, that might mean we exceed the 14bit
relocation limit for conditional branches:

  arch/powerpc/kvm/built-in.o:arch/powerpc/kvm/book3s_hv_rmhandlers.S:416:(__ftr_alt_97+0x8): relocation truncated to fit: R_PPC64_REL14 against `.text'+1ca4

So instead of doing a conditional branch outside of the feature section,
let's just jump at the end of the same, making the branch very short.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index a7a20b85d8eb8..7d1459e77de2f 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -413,10 +413,11 @@ FTR_SECTION_ELSE
 	/* On P9 we use the split_info for coordinating LPCR changes */
 	lwz	r4, KVM_SPLIT_DO_SET(r6)
 	cmpwi	r4, 0
-	beq	63f
+	beq	1f
 	mr	r3, r6
 	bl	kvmhv_p9_set_lpcr
 	nop
+1:
 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 63:
 	/* Order load of vcpu after load of vcore */

From 09f984961c137c4b252c368adab7e1c9f035fa59 Mon Sep 17 00:00:00 2001
From: Jose Ricardo Ziviani <joserz@linux.vnet.ibm.com>
Date: Sat, 3 Feb 2018 18:24:26 -0200
Subject: [PATCH 461/676] KVM: PPC: Book3S: Add MMIO emulation for VMX
 instructions

This patch provides the MMIO load/store vector indexed
X-Form emulation.

Instructions implemented:
lvx: the quadword in storage addressed by the result of EA &
0xffff_ffff_ffff_fff0 is loaded into VRT.

stvx: the contents of VRS are stored into the quadword in storage
addressed by the result of EA & 0xffff_ffff_ffff_fff0.

Reported-by: Gopesh Kumar Chaudhary <gopchaud@in.ibm.com>
Reported-by: Balamuruhan S <bala24@linux.vnet.ibm.com>
Signed-off-by: Jose Ricardo Ziviani <joserz@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/kvm_host.h   |   2 +
 arch/powerpc/include/asm/kvm_ppc.h    |   4 +
 arch/powerpc/include/asm/ppc-opcode.h |   6 ++
 arch/powerpc/kvm/emulate_loadstore.c  |  36 +++++++
 arch/powerpc/kvm/powerpc.c            | 150 ++++++++++++++++++++++++++
 5 files changed, 198 insertions(+)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index fef8133becc85..1f53b562726fd 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -690,6 +690,7 @@ struct kvm_vcpu_arch {
 	u8 mmio_vsx_offset;
 	u8 mmio_vsx_copy_type;
 	u8 mmio_vsx_tx_sx_enabled;
+	u8 mmio_vmx_copy_nums;
 	u8 osi_needed;
 	u8 osi_enabled;
 	u8 papr_enabled;
@@ -804,6 +805,7 @@ struct kvm_vcpu_arch {
 #define KVM_MMIO_REG_QPR	0x0040
 #define KVM_MMIO_REG_FQPR	0x0060
 #define KVM_MMIO_REG_VSX	0x0080
+#define KVM_MMIO_REG_VMX	0x00c0
 
 #define __KVM_HAVE_ARCH_WQP
 #define __KVM_HAVE_CREATE_DEVICE
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 941c2a3f231b9..28c2030035193 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -81,6 +81,10 @@ extern int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu,
 extern int kvmppc_handle_vsx_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				unsigned int rt, unsigned int bytes,
 			int is_default_endian, int mmio_sign_extend);
+extern int kvmppc_handle_load128_by2x64(struct kvm_run *run,
+		struct kvm_vcpu *vcpu, unsigned int rt, int is_default_endian);
+extern int kvmppc_handle_store128_by2x64(struct kvm_run *run,
+		struct kvm_vcpu *vcpu, unsigned int rs, int is_default_endian);
 extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			       u64 val, unsigned int bytes,
 			       int is_default_endian);
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index ce0930d68857c..a51febca08c52 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -156,6 +156,12 @@
 #define OP_31_XOP_LFDX          599
 #define OP_31_XOP_LFDUX		631
 
+/* VMX Vector Load Instructions */
+#define OP_31_XOP_LVX           103
+
+/* VMX Vector Store Instructions */
+#define OP_31_XOP_STVX          231
+
 #define OP_LWZ  32
 #define OP_STFS 52
 #define OP_STFSU 53
diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c
index af833531af319..a382e15135e6d 100644
--- a/arch/powerpc/kvm/emulate_loadstore.c
+++ b/arch/powerpc/kvm/emulate_loadstore.c
@@ -58,6 +58,18 @@ static bool kvmppc_check_vsx_disabled(struct kvm_vcpu *vcpu)
 }
 #endif /* CONFIG_VSX */
 
+#ifdef CONFIG_ALTIVEC
+static bool kvmppc_check_altivec_disabled(struct kvm_vcpu *vcpu)
+{
+	if (!(kvmppc_get_msr(vcpu) & MSR_VEC)) {
+		kvmppc_core_queue_vec_unavail(vcpu);
+		return true;
+	}
+
+	return false;
+}
+#endif /* CONFIG_ALTIVEC */
+
 /*
  * XXX to do:
  * lfiwax, lfiwzx
@@ -98,6 +110,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_NONE;
 	vcpu->arch.mmio_sp64_extend = 0;
 	vcpu->arch.mmio_sign_extend = 0;
+	vcpu->arch.mmio_vmx_copy_nums = 0;
 
 	switch (get_op(inst)) {
 	case 31:
@@ -459,6 +472,29 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 							 rs, 4, 1);
 			break;
 #endif /* CONFIG_VSX */
+
+#ifdef CONFIG_ALTIVEC
+		case OP_31_XOP_LVX:
+			if (kvmppc_check_altivec_disabled(vcpu))
+				return EMULATE_DONE;
+			vcpu->arch.vaddr_accessed &= ~0xFULL;
+			vcpu->arch.paddr_accessed &= ~0xFULL;
+			vcpu->arch.mmio_vmx_copy_nums = 2;
+			emulated = kvmppc_handle_load128_by2x64(run, vcpu,
+					KVM_MMIO_REG_VMX|rt, 1);
+			break;
+
+		case OP_31_XOP_STVX:
+			if (kvmppc_check_altivec_disabled(vcpu))
+				return EMULATE_DONE;
+			vcpu->arch.vaddr_accessed &= ~0xFULL;
+			vcpu->arch.paddr_accessed &= ~0xFULL;
+			vcpu->arch.mmio_vmx_copy_nums = 2;
+			emulated = kvmppc_handle_store128_by2x64(run, vcpu,
+					rs, 1);
+			break;
+#endif /* CONFIG_ALTIVEC */
+
 		default:
 			emulated = EMULATE_FAIL;
 			break;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index cf86aeb43fcfb..47c7a302fd031 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -924,6 +924,34 @@ static inline void kvmppc_set_vsr_word(struct kvm_vcpu *vcpu,
 }
 #endif /* CONFIG_VSX */
 
+#ifdef CONFIG_ALTIVEC
+static inline void kvmppc_set_vmx_dword(struct kvm_vcpu *vcpu,
+		u64 gpr)
+{
+	int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
+	u32 hi, lo;
+	u32 di;
+
+#ifdef __BIG_ENDIAN
+	hi = gpr >> 32;
+	lo = gpr & 0xffffffff;
+#else
+	lo = gpr >> 32;
+	hi = gpr & 0xffffffff;
+#endif
+
+	di = 2 - vcpu->arch.mmio_vmx_copy_nums;		/* doubleword index */
+	if (di > 1)
+		return;
+
+	if (vcpu->arch.mmio_host_swabbed)
+		di = 1 - di;
+
+	VCPU_VSX_VR(vcpu, index).u[di * 2] = hi;
+	VCPU_VSX_VR(vcpu, index).u[di * 2 + 1] = lo;
+}
+#endif /* CONFIG_ALTIVEC */
+
 #ifdef CONFIG_PPC_FPU
 static inline u64 sp_to_dp(u32 fprs)
 {
@@ -1026,6 +1054,11 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
 				KVMPPC_VSX_COPY_DWORD_LOAD_DUMP)
 			kvmppc_set_vsr_dword_dump(vcpu, gpr);
 		break;
+#endif
+#ifdef CONFIG_ALTIVEC
+	case KVM_MMIO_REG_VMX:
+		kvmppc_set_vmx_dword(vcpu, gpr);
+		break;
 #endif
 	default:
 		BUG();
@@ -1302,6 +1335,111 @@ static int kvmppc_emulate_mmio_vsx_loadstore(struct kvm_vcpu *vcpu,
 }
 #endif /* CONFIG_VSX */
 
+#ifdef CONFIG_ALTIVEC
+/* handle quadword load access in two halves */
+int kvmppc_handle_load128_by2x64(struct kvm_run *run, struct kvm_vcpu *vcpu,
+		unsigned int rt, int is_default_endian)
+{
+	enum emulation_result emulated;
+
+	while (vcpu->arch.mmio_vmx_copy_nums) {
+		emulated = __kvmppc_handle_load(run, vcpu, rt, 8,
+				is_default_endian, 0);
+
+		if (emulated != EMULATE_DONE)
+			break;
+
+		vcpu->arch.paddr_accessed += run->mmio.len;
+		vcpu->arch.mmio_vmx_copy_nums--;
+	}
+
+	return emulated;
+}
+
+static inline int kvmppc_get_vmx_data(struct kvm_vcpu *vcpu, int rs, u64 *val)
+{
+	vector128 vrs = VCPU_VSX_VR(vcpu, rs);
+	u32 di;
+	u64 w0, w1;
+
+	di = 2 - vcpu->arch.mmio_vmx_copy_nums;		/* doubleword index */
+	if (di > 1)
+		return -1;
+
+	if (vcpu->arch.mmio_host_swabbed)
+		di = 1 - di;
+
+	w0 = vrs.u[di * 2];
+	w1 = vrs.u[di * 2 + 1];
+
+#ifdef __BIG_ENDIAN
+	*val = (w0 << 32) | w1;
+#else
+	*val = (w1 << 32) | w0;
+#endif
+	return 0;
+}
+
+/* handle quadword store in two halves */
+int kvmppc_handle_store128_by2x64(struct kvm_run *run, struct kvm_vcpu *vcpu,
+		unsigned int rs, int is_default_endian)
+{
+	u64 val = 0;
+	enum emulation_result emulated = EMULATE_DONE;
+
+	vcpu->arch.io_gpr = rs;
+
+	while (vcpu->arch.mmio_vmx_copy_nums) {
+		if (kvmppc_get_vmx_data(vcpu, rs, &val) == -1)
+			return EMULATE_FAIL;
+
+		emulated = kvmppc_handle_store(run, vcpu, val, 8,
+				is_default_endian);
+		if (emulated != EMULATE_DONE)
+			break;
+
+		vcpu->arch.paddr_accessed += run->mmio.len;
+		vcpu->arch.mmio_vmx_copy_nums--;
+	}
+
+	return emulated;
+}
+
+static int kvmppc_emulate_mmio_vmx_loadstore(struct kvm_vcpu *vcpu,
+		struct kvm_run *run)
+{
+	enum emulation_result emulated = EMULATE_FAIL;
+	int r;
+
+	vcpu->arch.paddr_accessed += run->mmio.len;
+
+	if (!vcpu->mmio_is_write) {
+		emulated = kvmppc_handle_load128_by2x64(run, vcpu,
+				vcpu->arch.io_gpr, 1);
+	} else {
+		emulated = kvmppc_handle_store128_by2x64(run, vcpu,
+				vcpu->arch.io_gpr, 1);
+	}
+
+	switch (emulated) {
+	case EMULATE_DO_MMIO:
+		run->exit_reason = KVM_EXIT_MMIO;
+		r = RESUME_HOST;
+		break;
+	case EMULATE_FAIL:
+		pr_info("KVM: MMIO emulation failed (VMX repeat)\n");
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		r = RESUME_HOST;
+		break;
+	default:
+		r = RESUME_GUEST;
+		break;
+	}
+	return r;
+}
+#endif /* CONFIG_ALTIVEC */
+
 int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 {
 	int r = 0;
@@ -1420,6 +1558,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 				return r;
 			}
 		}
+#endif
+#ifdef CONFIG_ALTIVEC
+		if (vcpu->arch.mmio_vmx_copy_nums > 0)
+			vcpu->arch.mmio_vmx_copy_nums--;
+
+		if (vcpu->arch.mmio_vmx_copy_nums > 0) {
+			r = kvmppc_emulate_mmio_vmx_loadstore(vcpu, run);
+			if (r == RESUME_HOST) {
+				vcpu->mmio_needed = 1;
+				return r;
+			}
+		}
 #endif
 	} else if (vcpu->arch.osi_needed) {
 		u64 *gprs = run->osi.gprs;

From 99ce7962d52d1948ad6f2785e308d48e76e0a6ef Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 8 Feb 2018 14:02:32 +0100
Subject: [PATCH 462/676] objtool: Fix switch-table detection

Linus reported that GCC-7.3 generated a switch-table construct that
confused objtool. It turns out that, in particular due to KASAN, it is
possible to have unrelated .rodata usage in between the .rodata setup
for the switch-table and the following indirect jump.

The simple linear reverse search from the indirect jump would hit upon
the KASAN .rodata usage first and fail to find a switch_table,
resulting in a spurious 'sibling call with modified stack frame'
warning.

Fix this by creating a 'jump-stack' which we can 'unwind' during
reversal, thereby skipping over much of the in-between code.

This is not fool proof by any means, but is sufficient to make the
known cases work. Future work would be to construct more comprehensive
flow analysis code.

Reported-and-tested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180208130232.GF25235@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/objtool/check.c | 41 +++++++++++++++++++++++++++++++++++++++--
 tools/objtool/check.h |  1 +
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 9cd028aa15098..2e458eb45586c 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -851,8 +851,14 @@ static int add_switch_table(struct objtool_file *file, struct symbol *func,
  *    This is a fairly uncommon pattern which is new for GCC 6.  As of this
  *    writing, there are 11 occurrences of it in the allmodconfig kernel.
  *
+ *    As of GCC 7 there are quite a few more of these and the 'in between' code
+ *    is significant. Esp. with KASAN enabled some of the code between the mov
+ *    and jmpq uses .rodata itself, which can confuse things.
+ *
  *    TODO: Once we have DWARF CFI and smarter instruction decoding logic,
  *    ensure the same register is used in the mov and jump instructions.
+ *
+ *    NOTE: RETPOLINE made it harder still to decode dynamic jumps.
  */
 static struct rela *find_switch_table(struct objtool_file *file,
 				      struct symbol *func,
@@ -874,12 +880,25 @@ static struct rela *find_switch_table(struct objtool_file *file,
 						text_rela->addend + 4);
 		if (!rodata_rela)
 			return NULL;
+
 		file->ignore_unreachables = true;
 		return rodata_rela;
 	}
 
 	/* case 3 */
-	func_for_each_insn_continue_reverse(file, func, insn) {
+	/*
+	 * Backward search using the @first_jump_src links, these help avoid
+	 * much of the 'in between' code. Which avoids us getting confused by
+	 * it.
+	 */
+	for (insn = list_prev_entry(insn, list);
+
+	     &insn->list != &file->insn_list &&
+	     insn->sec == func->sec &&
+	     insn->offset >= func->offset;
+
+	     insn = insn->first_jump_src ?: list_prev_entry(insn, list)) {
+
 		if (insn->type == INSN_JUMP_DYNAMIC)
 			break;
 
@@ -909,14 +928,32 @@ static struct rela *find_switch_table(struct objtool_file *file,
 	return NULL;
 }
 
+
 static int add_func_switch_tables(struct objtool_file *file,
 				  struct symbol *func)
 {
-	struct instruction *insn, *prev_jump = NULL;
+	struct instruction *insn, *last = NULL, *prev_jump = NULL;
 	struct rela *rela, *prev_rela = NULL;
 	int ret;
 
 	func_for_each_insn(file, func, insn) {
+		if (!last)
+			last = insn;
+
+		/*
+		 * Store back-pointers for unconditional forward jumps such
+		 * that find_switch_table() can back-track using those and
+		 * avoid some potentially confusing code.
+		 */
+		if (insn->type == INSN_JUMP_UNCONDITIONAL && insn->jump_dest &&
+		    insn->offset > last->offset &&
+		    insn->jump_dest->offset > insn->offset &&
+		    !insn->jump_dest->first_jump_src) {
+
+			insn->jump_dest->first_jump_src = insn;
+			last = insn->jump_dest;
+		}
+
 		if (insn->type != INSN_JUMP_DYNAMIC)
 			continue;
 
diff --git a/tools/objtool/check.h b/tools/objtool/check.h
index dbadb304a410a..23a1d065cae19 100644
--- a/tools/objtool/check.h
+++ b/tools/objtool/check.h
@@ -47,6 +47,7 @@ struct instruction {
 	bool alt_group, visited, dead_end, ignore, hint, save, restore, ignore_alts;
 	struct symbol *call_dest;
 	struct instruction *jump_dest;
+	struct instruction *first_jump_src;
 	struct list_head alts;
 	struct symbol *func;
 	struct stack_op stack_op;

From 9890bda14d7de44bce7d18a410768290194e44a5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 8 Feb 2018 14:02:32 +0100
Subject: [PATCH 463/676] MAINTAINERS: Add Peter Zijlstra as objtool
 co-maintainer

Since Josh keeps asking, add myself to MAINTAINERS.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 845fc25812f1d..98a22cb607731 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9813,6 +9813,7 @@ F:	drivers/nfc/nxp-nci
 
 OBJTOOL
 M:	Josh Poimboeuf <jpoimboe@redhat.com>
+M:	Peter Zijlstra <peterz@infradead.org>
 S:	Supported
 F:	tools/objtool/
 

From b9058afcd6c725a8d293bb0a1b30bad68113235e Mon Sep 17 00:00:00 2001
From: Tomi Valkeinen <tomi.valkeinen@ti.com>
Date: Fri, 9 Feb 2018 14:43:49 +0100
Subject: [PATCH 464/676] video: omapfb: fix missing #includes

The omapfb driver fails to build after commit 23c35f48f5fb
("pinctrl: remove include file from <linux/device.h>") because it
relies on the <linux/pinctrl/consumer.h> and <linux/seq_file.h>
being pulled in by the <linux/device.h> header implicitly.

Include these headers explicitly to avoid the build failures.

Fixes: 23c35f48f5fb ("pinctrl: remove include file from <linux/device.h>")
Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ti.com>
Tested-by: Tony Lindgren <tony@atomide.com>
[b.zolnierkie: fix include order and patch description]
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
---
 drivers/video/fbdev/omap2/omapfb/dss/dss.c      | 1 +
 drivers/video/fbdev/omap2/omapfb/dss/hdmi_phy.c | 2 ++
 drivers/video/fbdev/omap2/omapfb/dss/hdmi_pll.c | 1 +
 drivers/video/fbdev/omap2/omapfb/dss/hdmi_wp.c  | 2 ++
 4 files changed, 6 insertions(+)

diff --git a/drivers/video/fbdev/omap2/omapfb/dss/dss.c b/drivers/video/fbdev/omap2/omapfb/dss/dss.c
index 39fe7247ff984..f0cac9e0eb944 100644
--- a/drivers/video/fbdev/omap2/omapfb/dss/dss.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/dss.c
@@ -40,6 +40,7 @@
 #include <linux/regulator/consumer.h>
 #include <linux/suspend.h>
 #include <linux/component.h>
+#include <linux/pinctrl/consumer.h>
 
 #include <video/omapfb_dss.h>
 
diff --git a/drivers/video/fbdev/omap2/omapfb/dss/hdmi_phy.c b/drivers/video/fbdev/omap2/omapfb/dss/hdmi_phy.c
index eee09b4bdcf63..11dbc05d5720a 100644
--- a/drivers/video/fbdev/omap2/omapfb/dss/hdmi_phy.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/hdmi_phy.c
@@ -13,6 +13,8 @@
 #include <linux/io.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
+#include <linux/seq_file.h>
+
 #include <video/omapfb_dss.h>
 
 #include "dss.h"
diff --git a/drivers/video/fbdev/omap2/omapfb/dss/hdmi_pll.c b/drivers/video/fbdev/omap2/omapfb/dss/hdmi_pll.c
index eac3665aba6cb..bc591fc12aefa 100644
--- a/drivers/video/fbdev/omap2/omapfb/dss/hdmi_pll.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/hdmi_pll.c
@@ -16,6 +16,7 @@
 #include <linux/io.h>
 #include <linux/platform_device.h>
 #include <linux/clk.h>
+#include <linux/seq_file.h>
 
 #include <video/omapfb_dss.h>
 
diff --git a/drivers/video/fbdev/omap2/omapfb/dss/hdmi_wp.c b/drivers/video/fbdev/omap2/omapfb/dss/hdmi_wp.c
index 705373e4cf386..4af6ba220744d 100644
--- a/drivers/video/fbdev/omap2/omapfb/dss/hdmi_wp.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/hdmi_wp.c
@@ -14,6 +14,8 @@
 #include <linux/err.h>
 #include <linux/io.h>
 #include <linux/platform_device.h>
+#include <linux/seq_file.h>
+
 #include <video/omapfb_dss.h>
 
 #include "dss.h"

From 0afa6b4412988019db14c6bfb8c6cbdf120ca9ad Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 9 Feb 2018 09:39:42 -0500
Subject: [PATCH 465/676] SUNRPC: Don't call __UDPX_INC_STATS() from a
 preemptible context

Calling __UDPX_INC_STATS() from a preemptible context leads to a
warning of the form:

 BUG: using __this_cpu_add() in preemptible [00000000] code: kworker/u5:0/31
 caller is xs_udp_data_receive_workfn+0x194/0x270
 CPU: 1 PID: 31 Comm: kworker/u5:0 Not tainted 4.15.0-rc8-00076-g90ea9f1 #2
 Workqueue: xprtiod xs_udp_data_receive_workfn
 Call Trace:
  dump_stack+0x85/0xc1
  check_preemption_disabled+0xce/0xe0
  xs_udp_data_receive_workfn+0x194/0x270
  process_one_work+0x318/0x620
  worker_thread+0x20a/0x390
  ? process_one_work+0x620/0x620
  kthread+0x120/0x130
  ? __kthread_bind_mask+0x60/0x60
  ret_from_fork+0x24/0x30

Since we're taking a spinlock in those functions anyway, let's fix the
issue by moving the call so that it occurs under the spinlock.

Reported-by: kernel test robot <fengguang.wu@intel.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/xprtsock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 5d0108172ed38..a6b8c1f8f92a8 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1068,18 +1068,18 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt,
 
 	/* Suck it into the iovec, verify checksum if not done by hw. */
 	if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) {
-		__UDPX_INC_STATS(sk, UDP_MIB_INERRORS);
 		spin_lock(&xprt->recv_lock);
+		__UDPX_INC_STATS(sk, UDP_MIB_INERRORS);
 		goto out_unpin;
 	}
 
-	__UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS);
 
 	spin_lock_bh(&xprt->transport_lock);
 	xprt_adjust_cwnd(xprt, task, copied);
 	spin_unlock_bh(&xprt->transport_lock);
 	spin_lock(&xprt->recv_lock);
 	xprt_complete_rqst(task, copied);
+	__UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS);
 out_unpin:
 	xprt_unpin_rqst(rovr);
  out_unlock:

From 26d99834f89e76514076d9cd06f61e56e6a509b8 Mon Sep 17 00:00:00 2001
From: Greg Kurz <groug@kaod.org>
Date: Mon, 22 Jan 2018 22:02:05 +0100
Subject: [PATCH 466/676] 9p/trans_virtio: discard zero-length reply

When a 9p request is successfully flushed, the server is expected to just
mark it as used without sending a 9p reply (ie, without writing data into
the buffer). In this case, virtqueue_get_buf() will return len == 0 and
we must not report a REQ_STATUS_RCVD status to the client, otherwise the
client will erroneously assume the request has not been flushed.

Cc: stable@vger.kernel.org
Signed-off-by: Greg Kurz <groug@kaod.org>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 net/9p/trans_virtio.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index f3a4efcf14564..3aa5a93ad107c 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -160,7 +160,8 @@ static void req_done(struct virtqueue *vq)
 		spin_unlock_irqrestore(&chan->lock, flags);
 		/* Wakeup if anyone waiting for VirtIO ring space. */
 		wake_up(chan->vc_wq);
-		p9_client_cb(chan->client, req, REQ_STATUS_RCVD);
+		if (len)
+			p9_client_cb(chan->client, req, REQ_STATUS_RCVD);
 	}
 }
 

From faefaa97215a0c05105d7ae180fe1a3b5979ad1f Mon Sep 17 00:00:00 2001
From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Date: Fri, 9 Feb 2018 11:41:09 -0600
Subject: [PATCH 467/676] ibmvnic: Reset long term map ID counter

When allocating RX or TX buffer pools, the driver needs to provide a
unique mapping ID to firmware for each pool. This value is assigned
using a counter which is incremented after a new pool is created. The
ID can be an integer ranging from 1-255. When migrating to a device
that requests a different number of queues, this value was not being
reset properly. As a result, after enough migrations, the counter
exceeded the upper bound and pool creation failed. This is fixed by
resetting the counter to one in this case.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index dd4a2946e1da1..ce127a72cda23 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1644,6 +1644,7 @@ static int do_reset(struct ibmvnic_adapter *adapter,
 				return rc;
 		} else if (adapter->req_rx_queues != old_num_rx_queues ||
 			   adapter->req_tx_queues != old_num_tx_queues) {
+			adapter->map_id = 1;
 			release_rx_pools(adapter);
 			release_tx_pools(adapter);
 			init_rx_pools(netdev);

From 1b84ca187510f60f00f4e15255043ce19bb30410 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@axis.com>
Date: Fri, 9 Feb 2018 17:22:45 +0100
Subject: [PATCH 468/676] net: stmmac: discard disabled flags in interrupt
 status register

The interrupt status register in both dwmac1000 and dwmac4 ignores
interrupt enable (for dwmac4) / interrupt mask (for dwmac1000).
Therefore, if we want to check only the bits that can actually trigger
an irq, we have to filter the interrupt status register manually.

Commit 0a764db10337 ("stmmac: Discard masked flags in interrupt status
register") fixed this for dwmac1000. Fix the same issue for dwmac4.

Just like commit 0a764db10337 ("stmmac: Discard masked flags in
interrupt status register"), this makes sure that we do not get
spurious link up/link down prints.

Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index ed222b20fcf19..1e0a7668b7529 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -572,10 +572,12 @@ static int dwmac4_irq_status(struct mac_device_info *hw,
 			     struct stmmac_extra_stats *x)
 {
 	void __iomem *ioaddr = hw->pcsr;
-	u32 intr_status;
+	u32 intr_status = readl(ioaddr + GMAC_INT_STATUS);
+	u32 intr_enable = readl(ioaddr + GMAC_INT_EN);
 	int ret = 0;
 
-	intr_status = readl(ioaddr + GMAC_INT_STATUS);
+	/* Discard disabled bits */
+	intr_status &= intr_enable;
 
 	/* Not used events (e.g. MMC interrupts) are not handled. */
 	if ((intr_status & mmc_tx_irq))

From e879b7ab3739d8f9990961fc7df2f63861bd780b Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@axis.com>
Date: Fri, 9 Feb 2018 17:22:46 +0100
Subject: [PATCH 469/676] net: stmmac: rename GMAC_INT_DEFAULT_MASK for dwmac4

GMAC_INT_DEFAULT_MASK is written to the interrupt enable register.
In previous versions of the IP (e.g. dwmac1000), this register was
instead an interrupt mask register.
To improve clarity and reflect reality, rename GMAC_INT_DEFAULT_MASK
to GMAC_INT_DEFAULT_ENABLE.

Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac4.h      | 2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
index 789dad8a07b5c..7761a26ec9c56 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
@@ -98,7 +98,7 @@
 #define	GMAC_PCS_IRQ_DEFAULT	(GMAC_INT_RGSMIIS | GMAC_INT_PCS_LINK |	\
 				 GMAC_INT_PCS_ANE)
 
-#define	GMAC_INT_DEFAULT_MASK	(GMAC_INT_PMT_EN | GMAC_INT_LPI_EN)
+#define	GMAC_INT_DEFAULT_ENABLE	(GMAC_INT_PMT_EN | GMAC_INT_LPI_EN)
 
 enum dwmac4_irq_status {
 	time_stamp_irq = 0x00001000,
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index 1e0a7668b7529..6badc63d8e6d3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -61,8 +61,8 @@ static void dwmac4_core_init(struct mac_device_info *hw,
 
 	writel(value, ioaddr + GMAC_CONFIG);
 
-	/* Mask GMAC interrupts */
-	value = GMAC_INT_DEFAULT_MASK;
+	/* Enable GMAC interrupts */
+	value = GMAC_INT_DEFAULT_ENABLE;
 	if (hw->pmt)
 		value |= GMAC_INT_PMT_EN;
 	if (hw->pcs)

From 1029117127540fef4edcf4f0887dc3e1f7d5adb2 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@axis.com>
Date: Fri, 9 Feb 2018 17:22:47 +0100
Subject: [PATCH 470/676] net: stmmac: remove redundant enable of PMT irq

For dwmac4, GMAC_INT_DEFAULT_ENABLE already includes
GMAC_INT_PMT_EN, so it is redundant to check if hw->pmt
is set, and if so, setting the bit again.

For dwmac1000, GMAC_INT_DEFAULT_MASK does not include
GMAC_INT_DISABLE_PMT, so it is redundant to check if
hw->pmt is set, and if so, clearing an already cleared bit.

Improve code readability by removing this redundant code.

Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c | 2 --
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c    | 3 +--
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
index 540d21786a43b..ef10baf141862 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
@@ -74,8 +74,6 @@ static void dwmac1000_core_init(struct mac_device_info *hw,
 	/* Mask GMAC interrupts */
 	value = GMAC_INT_DEFAULT_MASK;
 
-	if (hw->pmt)
-		value &= ~GMAC_INT_DISABLE_PMT;
 	if (hw->pcs)
 		value &= ~GMAC_INT_DISABLE_PCS;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index 6badc63d8e6d3..63795ecafc8dc 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -63,8 +63,7 @@ static void dwmac4_core_init(struct mac_device_info *hw,
 
 	/* Enable GMAC interrupts */
 	value = GMAC_INT_DEFAULT_ENABLE;
-	if (hw->pmt)
-		value |= GMAC_INT_PMT_EN;
+
 	if (hw->pcs)
 		value |= GMAC_PCS_IRQ_DEFAULT;
 

From 6e6e41c3112276288ccaf80c70916779b84bb276 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 9 Feb 2018 17:45:49 +0800
Subject: [PATCH 471/676] ptr_ring: fail early if queue occupies more than
 KMALLOC_MAX_SIZE

To avoid slab to warn about exceeded size, fail early if queue
occupies more than KMALLOC_MAX_SIZE.

Reported-by: syzbot+e4d4f9ddd4295539735d@syzkaller.appspotmail.com
Fixes: 2e0ab8ca83c12 ("ptr_ring: array based FIFO for pointers")
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ptr_ring.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 1883d6137e9b7..6051a5fe69131 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -466,6 +466,8 @@ static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r,
 
 static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp)
 {
+	if (size * sizeof(void *) > KMALLOC_MAX_SIZE)
+		return NULL;
 	return kcalloc(size, sizeof(void *), gfp);
 }
 

From 0bf7800f1799b5b1fd7d4f024e9ece53ac489011 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 9 Feb 2018 17:45:50 +0800
Subject: [PATCH 472/676] ptr_ring: try vmalloc() when kmalloc() fails

This patch switch to use kvmalloc_array() for using a vmalloc()
fallback to help in case kmalloc() fails.

Reported-by: syzbot+e4d4f9ddd4295539735d@syzkaller.appspotmail.com
Fixes: 2e0ab8ca83c12 ("ptr_ring: array based FIFO for pointers")
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ptr_ring.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 6051a5fe69131..b884b7794187e 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -464,11 +464,14 @@ static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r,
 	__PTR_RING_PEEK_CALL_v; \
 })
 
+/* Not all gfp_t flags (besides GFP_KERNEL) are allowed. See
+ * documentation for vmalloc for which of them are legal.
+ */
 static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp)
 {
 	if (size * sizeof(void *) > KMALLOC_MAX_SIZE)
 		return NULL;
-	return kcalloc(size, sizeof(void *), gfp);
+	return kvmalloc_array(size, sizeof(void *), gfp | __GFP_ZERO);
 }
 
 static inline void __ptr_ring_set_size(struct ptr_ring *r, int size)
@@ -603,7 +606,7 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
 	spin_unlock(&(r)->producer_lock);
 	spin_unlock_irqrestore(&(r)->consumer_lock, flags);
 
-	kfree(old);
+	kvfree(old);
 
 	return 0;
 }
@@ -643,7 +646,7 @@ static inline int ptr_ring_resize_multiple(struct ptr_ring **rings,
 	}
 
 	for (i = 0; i < nrings; ++i)
-		kfree(queues[i]);
+		kvfree(queues[i]);
 
 	kfree(queues);
 
@@ -651,7 +654,7 @@ static inline int ptr_ring_resize_multiple(struct ptr_ring **rings,
 
 nomem:
 	while (--i >= 0)
-		kfree(queues[i]);
+		kvfree(queues[i]);
 
 	kfree(queues);
 
@@ -666,7 +669,7 @@ static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *))
 	if (destroy)
 		while ((ptr = ptr_ring_consume(r)))
 			destroy(ptr);
-	kfree(r->queue);
+	kvfree(r->queue);
 }
 
 #endif /* _LINUX_PTR_RING_H  */

From 89271c65edd599207dd982007900506283c90ae3 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Fri, 9 Feb 2018 11:03:49 +0100
Subject: [PATCH 473/676] s390/qeth: fix underestimated count of buffer
 elements

For a memory range/skb where the last byte falls onto a page boundary
(ie. 'end' is of the form xxx...xxx001), the PFN_UP() part of the
calculation currently doesn't round up to the next PFN due to an
off-by-one error.
Thus qeth believes that the skb occupies one page less than it
actually does, and may select a IO buffer that doesn't have enough spare
buffer elements to fit all of the skb's data.
HW detects this as a malformed buffer descriptor, and raises an
exception which then triggers device recovery.

Fixes: 2863c61334aa ("qeth: refactor calculation of SBALE count")
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_core.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index db42107bf2f5f..c33fbc4c2e919 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -846,7 +846,7 @@ struct qeth_trap_id {
  */
 static inline int qeth_get_elements_for_range(addr_t start, addr_t end)
 {
-	return PFN_UP(end - 1) - PFN_DOWN(start);
+	return PFN_UP(end) - PFN_DOWN(start);
 }
 
 static inline int qeth_get_micros(void)

From 1c5b2216fbb973a9410e0b06389740b5c1289171 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Date: Fri, 9 Feb 2018 11:03:50 +0100
Subject: [PATCH 474/676] s390/qeth: fix SETIP command handling

send_control_data() applies some special handling to SETIP v4 IPA
commands. But current code parses *all* command types for the SETIP
command code. Limit the command code check to IPA commands.

Fixes: 5b54e16f1a54 ("qeth: do not spin for SETIP ip assist command")
Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/qeth_core.h      |  5 +++++
 drivers/s390/net/qeth_core_main.c | 14 ++++++++------
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index c33fbc4c2e919..959c65cf75d94 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -591,6 +591,11 @@ struct qeth_cmd_buffer {
 	void (*callback) (struct qeth_channel *, struct qeth_cmd_buffer *);
 };
 
+static inline struct qeth_ipa_cmd *__ipa_cmd(struct qeth_cmd_buffer *iob)
+{
+	return (struct qeth_ipa_cmd *)(iob->data + IPA_PDU_HEADER_SIZE);
+}
+
 /**
  * definition of a qeth channel, used for read and write
  */
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index 6abd3bc285e4f..ca72f3311004a 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -2120,7 +2120,7 @@ int qeth_send_control_data(struct qeth_card *card, int len,
 	unsigned long flags;
 	struct qeth_reply *reply = NULL;
 	unsigned long timeout, event_timeout;
-	struct qeth_ipa_cmd *cmd;
+	struct qeth_ipa_cmd *cmd = NULL;
 
 	QETH_CARD_TEXT(card, 2, "sendctl");
 
@@ -2146,10 +2146,13 @@ int qeth_send_control_data(struct qeth_card *card, int len,
 	while (atomic_cmpxchg(&card->write.irq_pending, 0, 1)) ;
 	qeth_prepare_control_data(card, len, iob);
 
-	if (IS_IPA(iob->data))
+	if (IS_IPA(iob->data)) {
+		cmd = __ipa_cmd(iob);
 		event_timeout = QETH_IPA_TIMEOUT;
-	else
+	} else {
 		event_timeout = QETH_TIMEOUT;
+	}
+
 	timeout = jiffies + event_timeout;
 
 	QETH_CARD_TEXT(card, 6, "noirqpnd");
@@ -2174,9 +2177,8 @@ int qeth_send_control_data(struct qeth_card *card, int len,
 
 	/* we have only one long running ipassist, since we can ensure
 	   process context of this command we can sleep */
-	cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE);
-	if ((cmd->hdr.command == IPA_CMD_SETIP) &&
-	    (cmd->hdr.prot_version == QETH_PROT_IPV4)) {
+	if (cmd && cmd->hdr.command == IPA_CMD_SETIP &&
+	    cmd->hdr.prot_version == QETH_PROT_IPV4) {
 		if (!wait_event_timeout(reply->wait_q,
 		    atomic_read(&reply->received), event_timeout))
 			goto time_err;

From 07f2c7ab6f8d0a7e7c5764c4e6cc9c52951b9d9c Mon Sep 17 00:00:00 2001
From: Alexey Kodanev <alexey.kodanev@oracle.com>
Date: Fri, 9 Feb 2018 17:35:23 +0300
Subject: [PATCH 475/676] sctp: verify size of a new chunk in
 _sctp_make_chunk()

When SCTP makes INIT or INIT_ACK packet the total chunk length
can exceed SCTP_MAX_CHUNK_LEN which leads to kernel panic when
transmitting these packets, e.g. the crash on sending INIT_ACK:

[  597.804948] skbuff: skb_over_panic: text:00000000ffae06e4 len:120168
               put:120156 head:000000007aa47635 data:00000000d991c2de
               tail:0x1d640 end:0xfec0 dev:<NULL>
...
[  597.976970] ------------[ cut here ]------------
[  598.033408] kernel BUG at net/core/skbuff.c:104!
[  600.314841] Call Trace:
[  600.345829]  <IRQ>
[  600.371639]  ? sctp_packet_transmit+0x2095/0x26d0 [sctp]
[  600.436934]  skb_put+0x16c/0x200
[  600.477295]  sctp_packet_transmit+0x2095/0x26d0 [sctp]
[  600.540630]  ? sctp_packet_config+0x890/0x890 [sctp]
[  600.601781]  ? __sctp_packet_append_chunk+0x3b4/0xd00 [sctp]
[  600.671356]  ? sctp_cmp_addr_exact+0x3f/0x90 [sctp]
[  600.731482]  sctp_outq_flush+0x663/0x30d0 [sctp]
[  600.788565]  ? sctp_make_init+0xbf0/0xbf0 [sctp]
[  600.845555]  ? sctp_check_transmitted+0x18f0/0x18f0 [sctp]
[  600.912945]  ? sctp_outq_tail+0x631/0x9d0 [sctp]
[  600.969936]  sctp_cmd_interpreter.isra.22+0x3be1/0x5cb0 [sctp]
[  601.041593]  ? sctp_sf_do_5_1B_init+0x85f/0xc30 [sctp]
[  601.104837]  ? sctp_generate_t1_cookie_event+0x20/0x20 [sctp]
[  601.175436]  ? sctp_eat_data+0x1710/0x1710 [sctp]
[  601.233575]  sctp_do_sm+0x182/0x560 [sctp]
[  601.284328]  ? sctp_has_association+0x70/0x70 [sctp]
[  601.345586]  ? sctp_rcv+0xef4/0x32f0 [sctp]
[  601.397478]  ? sctp6_rcv+0xa/0x20 [sctp]
...

Here the chunk size for INIT_ACK packet becomes too big, mostly
because of the state cookie (INIT packet has large size with
many address parameters), plus additional server parameters.

Later this chunk causes the panic in skb_put_data():

  skb_packet_transmit()
      sctp_packet_pack()
          skb_put_data(nskb, chunk->skb->data, chunk->skb->len);

'nskb' (head skb) was previously allocated with packet->size
from u16 'chunk->chunk_hdr->length'.

As suggested by Marcelo we should check the chunk's length in
_sctp_make_chunk() before trying to allocate skb for it and
discard a chunk if its size bigger than SCTP_MAX_CHUNK_LEN.

Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leinter@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_make_chunk.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 793b05ec692be..d01475f5f7106 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1380,9 +1380,14 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
 	struct sctp_chunk *retval;
 	struct sk_buff *skb;
 	struct sock *sk;
+	int chunklen;
+
+	chunklen = SCTP_PAD4(sizeof(*chunk_hdr) + paylen);
+	if (chunklen > SCTP_MAX_CHUNK_LEN)
+		goto nodata;
 
 	/* No need to allocate LL here, as this is only a chunk. */
-	skb = alloc_skb(SCTP_PAD4(sizeof(*chunk_hdr) + paylen), gfp);
+	skb = alloc_skb(chunklen, gfp);
 	if (!skb)
 		goto nodata;
 

From 941ff6f11c020913f5cddf543a9ec63475d7c082 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 9 Feb 2018 14:49:44 +0100
Subject: [PATCH 476/676] bpf: fix rlimit in reuseport net selftest

Fix two issues in the reuseport_bpf selftests that were
reported by Linaro CI:

  [...]
  + ./reuseport_bpf
  ---- IPv4 UDP ----
  Testing EBPF mod 10...
  Reprograming, testing mod 5...
  ./reuseport_bpf: ebpf error. log:
  0: (bf) r6 = r1
  1: (20) r0 = *(u32 *)skb[0]
  2: (97) r0 %= 10
  3: (95) exit
  processed 4 insns
  : Operation not permitted
  + echo FAIL
  [...]
  ---- IPv4 TCP ----
  Testing EBPF mod 10...
  ./reuseport_bpf: failed to bind send socket: Address already in use
  + echo FAIL
  [...]

For the former adjust rlimit since this was the cause of
failure for loading the BPF prog, and for the latter add
SO_REUSEADDR.

Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Link: https://bugs.linaro.org/show_bug.cgi?id=3502
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/reuseport_bpf.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/reuseport_bpf.c b/tools/testing/selftests/net/reuseport_bpf.c
index 4a8217448f201..cad14cd0ea922 100644
--- a/tools/testing/selftests/net/reuseport_bpf.c
+++ b/tools/testing/selftests/net/reuseport_bpf.c
@@ -21,6 +21,7 @@
 #include <sys/epoll.h>
 #include <sys/types.h>
 #include <sys/socket.h>
+#include <sys/resource.h>
 #include <unistd.h>
 
 #ifndef ARRAY_SIZE
@@ -190,11 +191,14 @@ static void send_from(struct test_params p, uint16_t sport, char *buf,
 	struct sockaddr * const saddr = new_any_sockaddr(p.send_family, sport);
 	struct sockaddr * const daddr =
 		new_loopback_sockaddr(p.send_family, p.recv_port);
-	const int fd = socket(p.send_family, p.protocol, 0);
+	const int fd = socket(p.send_family, p.protocol, 0), one = 1;
 
 	if (fd < 0)
 		error(1, errno, "failed to create send socket");
 
+	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)))
+		error(1, errno, "failed to set reuseaddr");
+
 	if (bind(fd, saddr, sockaddr_size()))
 		error(1, errno, "failed to bind send socket");
 
@@ -433,6 +437,21 @@ void enable_fastopen(void)
 	}
 }
 
+static struct rlimit rlim_old, rlim_new;
+
+static  __attribute__((constructor)) void main_ctor(void)
+{
+	getrlimit(RLIMIT_MEMLOCK, &rlim_old);
+	rlim_new.rlim_cur = rlim_old.rlim_cur + (1UL << 20);
+	rlim_new.rlim_max = rlim_old.rlim_max + (1UL << 20);
+	setrlimit(RLIMIT_MEMLOCK, &rlim_new);
+}
+
+static __attribute__((destructor)) void main_dtor(void)
+{
+	setrlimit(RLIMIT_MEMLOCK, &rlim_old);
+}
+
 int main(void)
 {
 	fprintf(stderr, "---- IPv4 UDP ----\n");

From 2fa56a494484f19e06bf4f3464b2155a92beafac Mon Sep 17 00:00:00 2001
From: John Allen <jallen@linux.vnet.ibm.com>
Date: Fri, 9 Feb 2018 13:19:46 -0600
Subject: [PATCH 477/676] ibmvnic: Remove skb->protocol checks in ibmvnic_xmit

Having these checks in ibmvnic_xmit causes problems with VLAN
tagging and balance-alb/tlb bonding modes. The restriction they
imposed can be removed.

Signed-off-by: John Allen <jallen@linux.vnet.ibm.com>
Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index ce127a72cda23..27447260215d1 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1419,10 +1419,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 		hdrs += 2;
 	}
 	/* determine if l2/3/4 headers are sent to firmware */
-	if ((*hdrs >> 7) & 1 &&
-	    (skb->protocol == htons(ETH_P_IP) ||
-	     skb->protocol == htons(ETH_P_IPV6) ||
-	     skb->protocol == htons(ETH_P_ARP))) {
+	if ((*hdrs >> 7) & 1) {
 		build_hdr_descs_arr(tx_buff, &num_entries, *hdrs);
 		tx_crq.v1.n_crq_elem = num_entries;
 		tx_buff->indir_arr[0] = tx_crq;

From ef08e14a3832c88bb8d5ccbb88eab48642fc6aa9 Mon Sep 17 00:00:00 2001
From: Vadim Pasternak <vadimp@mellanox.com>
Date: Fri, 9 Feb 2018 23:59:30 +0000
Subject: [PATCH 478/676] platform/x86: mlx-platform: Add support for new
 msn274x system type

It adds support for new Mellanox system types of basic class msn274x,
containing system MSN2740 (32x100GbE Ethernet switch with cost reduction)
and its derivatives. These are the Top of the Rack system, equipped with
Mellanox Small Form Factor carrier board and switch board with Mellanox
Spectrum device, which supports Ethernet switching with 32X100G ports line
rate of up to EDR speed.

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org>
---
 drivers/platform/x86/mlx-platform.c | 124 ++++++++++++++++++++++++++++
 1 file changed, 124 insertions(+)

diff --git a/drivers/platform/x86/mlx-platform.c b/drivers/platform/x86/mlx-platform.c
index e87fe34eadf5b..3a13285d1aa87 100644
--- a/drivers/platform/x86/mlx-platform.c
+++ b/drivers/platform/x86/mlx-platform.c
@@ -94,6 +94,7 @@
 /* Hotplug devices adapter numbers */
 #define MLXPLAT_CPLD_NR_NONE			-1
 #define MLXPLAT_CPLD_PSU_DEFAULT_NR		10
+#define MLXPLAT_CPLD_PSU_MSNXXXX_NR		4
 #define MLXPLAT_CPLD_FAN1_DEFAULT_NR		11
 #define MLXPLAT_CPLD_FAN2_DEFAULT_NR		12
 #define MLXPLAT_CPLD_FAN3_DEFAULT_NR		13
@@ -335,6 +336,108 @@ struct mlxreg_core_hotplug_platform_data mlxplat_mlxcpld_msn21xx_data = {
 	.mask_low = MLXPLAT_CPLD_LOW_AGGR_MASK_LOW,
 };
 
+/* Platform hotplug msn274x system family data */
+static struct mlxreg_core_data mlxplat_mlxcpld_msn274x_psu_items_data[] = {
+	{
+		.label = "psu1",
+		.reg = MLXPLAT_CPLD_LPC_REG_PSU_OFFSET,
+		.mask = BIT(0),
+		.hpdev.brdinfo = &mlxplat_mlxcpld_psu[0],
+		.hpdev.nr = MLXPLAT_CPLD_PSU_MSNXXXX_NR,
+	},
+	{
+		.label = "psu2",
+		.reg = MLXPLAT_CPLD_LPC_REG_PSU_OFFSET,
+		.mask = BIT(1),
+		.hpdev.brdinfo = &mlxplat_mlxcpld_psu[1],
+		.hpdev.nr = MLXPLAT_CPLD_PSU_MSNXXXX_NR,
+	},
+};
+
+static struct mlxreg_core_data mlxplat_mlxcpld_default_ng_pwr_items_data[] = {
+	{
+		.label = "pwr1",
+		.reg = MLXPLAT_CPLD_LPC_REG_PWR_OFFSET,
+		.mask = BIT(0),
+		.hpdev.brdinfo = &mlxplat_mlxcpld_pwr[0],
+		.hpdev.nr = MLXPLAT_CPLD_PSU_MSNXXXX_NR,
+	},
+	{
+		.label = "pwr2",
+		.reg = MLXPLAT_CPLD_LPC_REG_PWR_OFFSET,
+		.mask = BIT(1),
+		.hpdev.brdinfo = &mlxplat_mlxcpld_pwr[1],
+		.hpdev.nr = MLXPLAT_CPLD_PSU_MSNXXXX_NR,
+	},
+};
+
+static struct mlxreg_core_data mlxplat_mlxcpld_msn274x_fan_items_data[] = {
+	{
+		.label = "fan1",
+		.reg = MLXPLAT_CPLD_LPC_REG_FAN_OFFSET,
+		.mask = BIT(0),
+		.hpdev.nr = MLXPLAT_CPLD_NR_NONE,
+	},
+	{
+		.label = "fan2",
+		.reg = MLXPLAT_CPLD_LPC_REG_FAN_OFFSET,
+		.mask = BIT(1),
+		.hpdev.nr = MLXPLAT_CPLD_NR_NONE,
+	},
+	{
+		.label = "fan3",
+		.reg = MLXPLAT_CPLD_LPC_REG_FAN_OFFSET,
+		.mask = BIT(2),
+		.hpdev.nr = MLXPLAT_CPLD_NR_NONE,
+	},
+	{
+		.label = "fan4",
+		.reg = MLXPLAT_CPLD_LPC_REG_FAN_OFFSET,
+		.mask = BIT(3),
+		.hpdev.nr = MLXPLAT_CPLD_NR_NONE,
+	},
+};
+
+static struct mlxreg_core_item mlxplat_mlxcpld_msn274x_items[] = {
+	{
+		.data = mlxplat_mlxcpld_msn274x_psu_items_data,
+		.aggr_mask = MLXPLAT_CPLD_AGGR_MASK_NG_DEF,
+		.reg = MLXPLAT_CPLD_LPC_REG_PSU_OFFSET,
+		.mask = MLXPLAT_CPLD_PSU_MASK,
+		.count = ARRAY_SIZE(mlxplat_mlxcpld_msn274x_psu_items_data),
+		.inversed = 1,
+		.health = false,
+	},
+	{
+		.data = mlxplat_mlxcpld_default_ng_pwr_items_data,
+		.aggr_mask = MLXPLAT_CPLD_AGGR_MASK_NG_DEF,
+		.reg = MLXPLAT_CPLD_LPC_REG_PWR_OFFSET,
+		.mask = MLXPLAT_CPLD_PWR_MASK,
+		.count = ARRAY_SIZE(mlxplat_mlxcpld_default_ng_pwr_items_data),
+		.inversed = 0,
+		.health = false,
+	},
+	{
+		.data = mlxplat_mlxcpld_msn274x_fan_items_data,
+		.aggr_mask = MLXPLAT_CPLD_AGGR_MASK_NG_DEF,
+		.reg = MLXPLAT_CPLD_LPC_REG_FAN_OFFSET,
+		.mask = MLXPLAT_CPLD_FAN_MASK,
+		.count = ARRAY_SIZE(mlxplat_mlxcpld_msn274x_fan_items_data),
+		.inversed = 1,
+		.health = false,
+	},
+};
+
+static
+struct mlxreg_core_hotplug_platform_data mlxplat_mlxcpld_msn274x_data = {
+	.items = mlxplat_mlxcpld_msn274x_items,
+	.counter = ARRAY_SIZE(mlxplat_mlxcpld_msn274x_items),
+	.cell = MLXPLAT_CPLD_LPC_REG_AGGR_OFFSET,
+	.mask = MLXPLAT_CPLD_AGGR_MASK_NG_DEF,
+	.cell_low = MLXPLAT_CPLD_LPC_REG_AGGRLO_OFFSET,
+	.mask_low = MLXPLAT_CPLD_LOW_AGGR_MASK_LOW,
+};
+
 static bool mlxplat_mlxcpld_writeable_reg(struct device *dev, unsigned int reg)
 {
 	switch (reg) {
@@ -464,7 +567,28 @@ static int __init mlxplat_dmi_msn21xx_matched(const struct dmi_system_id *dmi)
 	return 1;
 };
 
+static int __init mlxplat_dmi_msn274x_matched(const struct dmi_system_id *dmi)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) {
+		mlxplat_mux_data[i].values = mlxplat_msn21xx_channels;
+		mlxplat_mux_data[i].n_values =
+				ARRAY_SIZE(mlxplat_msn21xx_channels);
+	}
+	mlxplat_hotplug = &mlxplat_mlxcpld_msn274x_data;
+
+	return 1;
+};
+
 static const struct dmi_system_id mlxplat_dmi_table[] __initconst = {
+	{
+		.callback = mlxplat_dmi_msn274x_matched,
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "MSN274"),
+		},
+	},
 	{
 		.callback = mlxplat_dmi_default_matched,
 		.matches = {

From a49a41482f61a48ff00f63f809ac0d802cb75424 Mon Sep 17 00:00:00 2001
From: Vadim Pasternak <vadimp@mellanox.com>
Date: Fri, 9 Feb 2018 23:59:31 +0000
Subject: [PATCH 479/676] platform/x86: mlx-platform: Add support for new
 msn201x system type

It adds support for new Mellanox system types of basic half unit size
class msn201x, containing system MSN2010 (18x10GbE plus 4x4x25GbE) half
and its derivatives. This is the Top of the Rack system, equipped with
Mellanox Small Form Factor carrier board and switch board with Mellanox
Spectrum device, which supports Ethernet switching with 32X100G ports line
rate of up to EDR speed.

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org>
---
 drivers/platform/x86/mlx-platform.c | 59 +++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/drivers/platform/x86/mlx-platform.c b/drivers/platform/x86/mlx-platform.c
index 3a13285d1aa87..e4251cfa7b5d6 100644
--- a/drivers/platform/x86/mlx-platform.c
+++ b/drivers/platform/x86/mlx-platform.c
@@ -438,6 +438,44 @@ struct mlxreg_core_hotplug_platform_data mlxplat_mlxcpld_msn274x_data = {
 	.mask_low = MLXPLAT_CPLD_LOW_AGGR_MASK_LOW,
 };
 
+/* Platform hotplug MSN201x system family data */
+static struct mlxreg_core_data mlxplat_mlxcpld_msn201x_pwr_items_data[] = {
+	{
+		.label = "pwr1",
+		.reg = MLXPLAT_CPLD_LPC_REG_PWR_OFFSET,
+		.mask = BIT(0),
+		.hpdev.nr = MLXPLAT_CPLD_NR_NONE,
+	},
+	{
+		.label = "pwr2",
+		.reg = MLXPLAT_CPLD_LPC_REG_PWR_OFFSET,
+		.mask = BIT(1),
+		.hpdev.nr = MLXPLAT_CPLD_NR_NONE,
+	},
+};
+
+static struct mlxreg_core_item mlxplat_mlxcpld_msn201x_items[] = {
+	{
+		.data = mlxplat_mlxcpld_msn201x_pwr_items_data,
+		.aggr_mask = MLXPLAT_CPLD_AGGR_PWR_MASK_DEF,
+		.reg = MLXPLAT_CPLD_LPC_REG_PWR_OFFSET,
+		.mask = MLXPLAT_CPLD_PWR_MASK,
+		.count = ARRAY_SIZE(mlxplat_mlxcpld_msn201x_pwr_items_data),
+		.inversed = 0,
+		.health = false,
+	},
+};
+
+static
+struct mlxreg_core_hotplug_platform_data mlxplat_mlxcpld_msn201x_data = {
+	.items = mlxplat_mlxcpld_msn21xx_items,
+	.counter = ARRAY_SIZE(mlxplat_mlxcpld_msn201x_items),
+	.cell = MLXPLAT_CPLD_LPC_REG_AGGR_OFFSET,
+	.mask = MLXPLAT_CPLD_AGGR_MASK_DEF,
+	.cell_low = MLXPLAT_CPLD_LPC_REG_AGGRLO_OFFSET,
+	.mask_low = MLXPLAT_CPLD_LOW_AGGR_MASK_LOW,
+};
+
 static bool mlxplat_mlxcpld_writeable_reg(struct device *dev, unsigned int reg)
 {
 	switch (reg) {
@@ -581,6 +619,20 @@ static int __init mlxplat_dmi_msn274x_matched(const struct dmi_system_id *dmi)
 	return 1;
 };
 
+static int __init mlxplat_dmi_msn201x_matched(const struct dmi_system_id *dmi)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) {
+		mlxplat_mux_data[i].values = mlxplat_msn21xx_channels;
+		mlxplat_mux_data[i].n_values =
+				ARRAY_SIZE(mlxplat_msn21xx_channels);
+	}
+	mlxplat_hotplug = &mlxplat_mlxcpld_msn201x_data;
+
+	return 1;
+};
+
 static const struct dmi_system_id mlxplat_dmi_table[] __initconst = {
 	{
 		.callback = mlxplat_dmi_msn274x_matched,
@@ -624,6 +676,13 @@ static const struct dmi_system_id mlxplat_dmi_table[] __initconst = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "MSN21"),
 		},
 	},
+	{
+		.callback = mlxplat_dmi_msn201x_matched,
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "MSN201"),
+		},
+	},
 	{ }
 };
 

From 1bd42d94ccab4eab5dc9dc9d303a687a61cde9bd Mon Sep 17 00:00:00 2001
From: Vadim Pasternak <vadimp@mellanox.com>
Date: Fri, 9 Feb 2018 23:59:32 +0000
Subject: [PATCH 480/676] platform/x86: mlx-platform: Add support for new 200G
 IB and Ethernet systems

It adds support for new Mellanox system types of basic classes qmb7, sn34,
sn37, containing systems QMB700 (40x200GbE InfiniBand switch), SN3700
(32x200GbE and 16x400GbE Ethernet switch) and SN3410 (6x400GbE plus
48x50GbE Ethernet switch). These are the Top of the Rack systems, equipped
with Mellanox COM-Express carrier board and switch board with Mellanox
Quantum device, which supports InfiniBand switching with 40X200G ports and
line rate of up to HDR speed or with Mellanox Spectrum-2 device, which
supports Ethernet switching with 32X200G ports line rate of up to HDR
speed.

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org>
---
 drivers/platform/x86/mlx-platform.c | 142 ++++++++++++++++++++++++++++
 1 file changed, 142 insertions(+)

diff --git a/drivers/platform/x86/mlx-platform.c b/drivers/platform/x86/mlx-platform.c
index e4251cfa7b5d6..454e14f022855 100644
--- a/drivers/platform/x86/mlx-platform.c
+++ b/drivers/platform/x86/mlx-platform.c
@@ -83,6 +83,7 @@
 #define MLXPLAT_CPLD_PSU_MASK		GENMASK(1, 0)
 #define MLXPLAT_CPLD_PWR_MASK		GENMASK(1, 0)
 #define MLXPLAT_CPLD_FAN_MASK		GENMASK(3, 0)
+#define MLXPLAT_CPLD_FAN_NG_MASK	GENMASK(5, 0)
 
 /* Start channel numbers */
 #define MLXPLAT_CPLD_CH1			2
@@ -170,6 +171,15 @@ static struct i2c_board_info mlxplat_mlxcpld_psu[] = {
 	},
 };
 
+static struct i2c_board_info mlxplat_mlxcpld_ng_psu[] = {
+	{
+		I2C_BOARD_INFO("24c32", 0x51),
+	},
+	{
+		I2C_BOARD_INFO("24c32", 0x50),
+	},
+};
+
 static struct i2c_board_info mlxplat_mlxcpld_pwr[] = {
 	{
 		I2C_BOARD_INFO("dps460", 0x59),
@@ -476,6 +486,103 @@ struct mlxreg_core_hotplug_platform_data mlxplat_mlxcpld_msn201x_data = {
 	.mask_low = MLXPLAT_CPLD_LOW_AGGR_MASK_LOW,
 };
 
+/* Platform hotplug next generation system family data */
+static struct mlxreg_core_data mlxplat_mlxcpld_default_ng_psu_items_data[] = {
+	{
+		.label = "psu1",
+		.reg = MLXPLAT_CPLD_LPC_REG_PSU_OFFSET,
+		.mask = BIT(0),
+		.hpdev.brdinfo = &mlxplat_mlxcpld_ng_psu[0],
+		.hpdev.nr = MLXPLAT_CPLD_PSU_MSNXXXX_NR,
+	},
+	{
+		.label = "psu2",
+		.reg = MLXPLAT_CPLD_LPC_REG_PSU_OFFSET,
+		.mask = BIT(1),
+		.hpdev.brdinfo = &mlxplat_mlxcpld_ng_psu[1],
+		.hpdev.nr = MLXPLAT_CPLD_PSU_MSNXXXX_NR,
+	},
+};
+
+static struct mlxreg_core_data mlxplat_mlxcpld_default_ng_fan_items_data[] = {
+	{
+		.label = "fan1",
+		.reg = MLXPLAT_CPLD_LPC_REG_FAN_OFFSET,
+		.mask = BIT(0),
+		.hpdev.nr = MLXPLAT_CPLD_NR_NONE,
+	},
+	{
+		.label = "fan2",
+		.reg = MLXPLAT_CPLD_LPC_REG_FAN_OFFSET,
+		.mask = BIT(1),
+		.hpdev.nr = MLXPLAT_CPLD_NR_NONE,
+	},
+	{
+		.label = "fan3",
+		.reg = MLXPLAT_CPLD_LPC_REG_FAN_OFFSET,
+		.mask = BIT(2),
+		.hpdev.nr = MLXPLAT_CPLD_NR_NONE,
+	},
+	{
+		.label = "fan4",
+		.reg = MLXPLAT_CPLD_LPC_REG_FAN_OFFSET,
+		.mask = BIT(3),
+		.hpdev.nr = MLXPLAT_CPLD_NR_NONE,
+	},
+	{
+		.label = "fan5",
+		.reg = MLXPLAT_CPLD_LPC_REG_FAN_OFFSET,
+		.mask = BIT(4),
+		.hpdev.nr = MLXPLAT_CPLD_NR_NONE,
+	},
+	{
+		.label = "fan6",
+		.reg = MLXPLAT_CPLD_LPC_REG_FAN_OFFSET,
+		.mask = BIT(5),
+		.hpdev.nr = MLXPLAT_CPLD_NR_NONE,
+	},
+};
+
+static struct mlxreg_core_item mlxplat_mlxcpld_default_ng_items[] = {
+	{
+		.data = mlxplat_mlxcpld_default_ng_psu_items_data,
+		.aggr_mask = MLXPLAT_CPLD_AGGR_MASK_NG_DEF,
+		.reg = MLXPLAT_CPLD_LPC_REG_PSU_OFFSET,
+		.mask = MLXPLAT_CPLD_PSU_MASK,
+		.count = ARRAY_SIZE(mlxplat_mlxcpld_default_ng_psu_items_data),
+		.inversed = 1,
+		.health = false,
+	},
+	{
+		.data = mlxplat_mlxcpld_default_ng_pwr_items_data,
+		.aggr_mask = MLXPLAT_CPLD_AGGR_MASK_NG_DEF,
+		.reg = MLXPLAT_CPLD_LPC_REG_PWR_OFFSET,
+		.mask = MLXPLAT_CPLD_PWR_MASK,
+		.count = ARRAY_SIZE(mlxplat_mlxcpld_default_ng_pwr_items_data),
+		.inversed = 0,
+		.health = false,
+	},
+	{
+		.data = mlxplat_mlxcpld_default_ng_fan_items_data,
+		.aggr_mask = MLXPLAT_CPLD_AGGR_MASK_NG_DEF,
+		.reg = MLXPLAT_CPLD_LPC_REG_FAN_OFFSET,
+		.mask = MLXPLAT_CPLD_FAN_NG_MASK,
+		.count = ARRAY_SIZE(mlxplat_mlxcpld_default_ng_fan_items_data),
+		.inversed = 1,
+		.health = false,
+	},
+};
+
+static
+struct mlxreg_core_hotplug_platform_data mlxplat_mlxcpld_default_ng_data = {
+	.items = mlxplat_mlxcpld_default_ng_items,
+	.counter = ARRAY_SIZE(mlxplat_mlxcpld_default_ng_items),
+	.cell = MLXPLAT_CPLD_LPC_REG_AGGR_OFFSET,
+	.mask = MLXPLAT_CPLD_AGGR_MASK_NG_DEF,
+	.cell_low = MLXPLAT_CPLD_LPC_REG_AGGRLO_OFFSET,
+	.mask_low = MLXPLAT_CPLD_LOW_AGGR_MASK_LOW,
+};
+
 static bool mlxplat_mlxcpld_writeable_reg(struct device *dev, unsigned int reg)
 {
 	switch (reg) {
@@ -633,6 +740,20 @@ static int __init mlxplat_dmi_msn201x_matched(const struct dmi_system_id *dmi)
 	return 1;
 };
 
+static int __init mlxplat_dmi_qmb7xx_matched(const struct dmi_system_id *dmi)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) {
+		mlxplat_mux_data[i].values = mlxplat_msn21xx_channels;
+		mlxplat_mux_data[i].n_values =
+				ARRAY_SIZE(mlxplat_msn21xx_channels);
+	}
+	mlxplat_hotplug = &mlxplat_mlxcpld_default_ng_data;
+
+	return 1;
+};
+
 static const struct dmi_system_id mlxplat_dmi_table[] __initconst = {
 	{
 		.callback = mlxplat_dmi_msn274x_matched,
@@ -683,6 +804,27 @@ static const struct dmi_system_id mlxplat_dmi_table[] __initconst = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "MSN201"),
 		},
 	},
+	{
+		.callback = mlxplat_dmi_qmb7xx_matched,
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "QMB7"),
+		},
+	},
+	{
+		.callback = mlxplat_dmi_qmb7xx_matched,
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "SN37"),
+		},
+	},
+	{
+		.callback = mlxplat_dmi_qmb7xx_matched,
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "SN34"),
+		},
+	},
 	{ }
 };
 

From d717f24d8c68081caae2374cf5ea6c4e62c490fc Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Fri, 9 Feb 2018 01:19:07 +0900
Subject: [PATCH 481/676] kconfig: add xrealloc() helper

We already have xmalloc(), xcalloc().  Add xrealloc() as well
to save tedious error handling.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 scripts/kconfig/confdata.c  |  2 +-
 scripts/kconfig/lkc.h       |  1 +
 scripts/kconfig/nconf.gui.c |  3 ++-
 scripts/kconfig/symbol.c    |  2 +-
 scripts/kconfig/util.c      | 11 ++++++++++-
 scripts/kconfig/zconf.l     |  2 +-
 6 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c
index f7927391de301..5c12dc91ef348 100644
--- a/scripts/kconfig/confdata.c
+++ b/scripts/kconfig/confdata.c
@@ -201,7 +201,7 @@ static int add_byte(int c, char **lineptr, size_t slen, size_t *n)
 	if (new_size > *n) {
 		new_size += LINE_GROWTH - 1;
 		new_size *= 2;
-		nline = realloc(*lineptr, new_size);
+		nline = xrealloc(*lineptr, new_size);
 		if (!nline)
 			return -1;
 
diff --git a/scripts/kconfig/lkc.h b/scripts/kconfig/lkc.h
index 16cb62b92650d..4e23febbe4b28 100644
--- a/scripts/kconfig/lkc.h
+++ b/scripts/kconfig/lkc.h
@@ -114,6 +114,7 @@ struct file *file_lookup(const char *name);
 int file_write_dep(const char *name);
 void *xmalloc(size_t size);
 void *xcalloc(size_t nmemb, size_t size);
+void *xrealloc(void *p, size_t size);
 
 struct gstr {
 	size_t len;
diff --git a/scripts/kconfig/nconf.gui.c b/scripts/kconfig/nconf.gui.c
index a64b1c31253e1..88874acfda367 100644
--- a/scripts/kconfig/nconf.gui.c
+++ b/scripts/kconfig/nconf.gui.c
@@ -6,6 +6,7 @@
  *
  */
 #include "nconf.h"
+#include "lkc.h"
 
 /* a list of all the different widgets we use */
 attributes_t attributes[ATTR_MAX+1] = {0};
@@ -374,7 +375,7 @@ int dialog_inputbox(WINDOW *main_window,
 
 	if (strlen(init)+1 > *result_len) {
 		*result_len = strlen(init)+1;
-		*resultp = result = realloc(result, *result_len);
+		*resultp = result = xrealloc(result, *result_len);
 	}
 
 	/* find the widest line of msg: */
diff --git a/scripts/kconfig/symbol.c b/scripts/kconfig/symbol.c
index c4409ee7fee20..60a76f958f334 100644
--- a/scripts/kconfig/symbol.c
+++ b/scripts/kconfig/symbol.c
@@ -936,7 +936,7 @@ const char *sym_expand_string_value(const char *in)
 		newlen = strlen(res) + strlen(symval) + strlen(src) + 1;
 		if (newlen > reslen) {
 			reslen = newlen;
-			res = realloc(res, reslen);
+			res = xrealloc(res, reslen);
 		}
 
 		strcat(res, symval);
diff --git a/scripts/kconfig/util.c b/scripts/kconfig/util.c
index 0e76042473ccd..138894ef49ea1 100644
--- a/scripts/kconfig/util.c
+++ b/scripts/kconfig/util.c
@@ -104,7 +104,7 @@ void str_append(struct gstr *gs, const char *s)
 	if (s) {
 		l = strlen(gs->s) + strlen(s) + 1;
 		if (l > gs->len) {
-			gs->s   = realloc(gs->s, l);
+			gs->s = xrealloc(gs->s, l);
 			gs->len = l;
 		}
 		strcat(gs->s, s);
@@ -145,3 +145,12 @@ void *xcalloc(size_t nmemb, size_t size)
 	fprintf(stderr, "Out of memory.\n");
 	exit(1);
 }
+
+void *xrealloc(void *p, size_t size)
+{
+	p = realloc(p, size);
+	if (p)
+		return p;
+	fprintf(stderr, "Out of memory.\n");
+	exit(1);
+}
diff --git a/scripts/kconfig/zconf.l b/scripts/kconfig/zconf.l
index 0ba4900050c13..02de6fe302a9a 100644
--- a/scripts/kconfig/zconf.l
+++ b/scripts/kconfig/zconf.l
@@ -52,7 +52,7 @@ static void append_string(const char *str, int size)
 	if (new_size > text_asize) {
 		new_size += START_STRSIZE - 1;
 		new_size &= -START_STRSIZE;
-		text = realloc(text, new_size);
+		text = xrealloc(text, new_size);
 		text_asize = new_size;
 	}
 	memcpy(text + text_size, str, size);

From 523ca58b7db2e30e3c185a7927dd80a30c1bc743 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Fri, 9 Feb 2018 01:19:08 +0900
Subject: [PATCH 482/676] kconfig: remove const qualifier from
 sym_expand_string_value()

This function returns realloc'ed memory, so the returned pointer
must be passed to free() when done.  So, 'const' qualifier is odd.
It is allowed to modify the expanded string.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---
 scripts/kconfig/lkc_proto.h | 2 +-
 scripts/kconfig/symbol.c    | 2 +-
 scripts/kconfig/util.c      | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/kconfig/lkc_proto.h b/scripts/kconfig/lkc_proto.h
index 5d86e2dfae59d..9dc8abfb1dc3c 100644
--- a/scripts/kconfig/lkc_proto.h
+++ b/scripts/kconfig/lkc_proto.h
@@ -31,7 +31,7 @@ extern struct symbol * symbol_hash[SYMBOL_HASHSIZE];
 
 struct symbol * sym_lookup(const char *name, int flags);
 struct symbol * sym_find(const char *name);
-const char * sym_expand_string_value(const char *in);
+char *sym_expand_string_value(const char *in);
 const char * sym_escape_string_value(const char *in);
 struct symbol ** sym_re_search(const char *pattern);
 const char * sym_type_name(enum symbol_type type);
diff --git a/scripts/kconfig/symbol.c b/scripts/kconfig/symbol.c
index 60a76f958f334..cca9663be5ddd 100644
--- a/scripts/kconfig/symbol.c
+++ b/scripts/kconfig/symbol.c
@@ -899,7 +899,7 @@ struct symbol *sym_find(const char *name)
  * name to be expanded shall be prefixed by a '$'. Unknown symbol expands to
  * the empty string.
  */
-const char *sym_expand_string_value(const char *in)
+char *sym_expand_string_value(const char *in)
 {
 	const char *src;
 	char *res;
diff --git a/scripts/kconfig/util.c b/scripts/kconfig/util.c
index 138894ef49ea1..b98a79e30e04a 100644
--- a/scripts/kconfig/util.c
+++ b/scripts/kconfig/util.c
@@ -14,11 +14,11 @@
 struct file *file_lookup(const char *name)
 {
 	struct file *file;
-	const char *file_name = sym_expand_string_value(name);
+	char *file_name = sym_expand_string_value(name);
 
 	for (file = file_list; file; file = file->next) {
 		if (!strcmp(name, file->name)) {
-			free((void *)file_name);
+			free(file_name);
 			return file;
 		}
 	}

From 7a501609c2cb73381e925827c504a4c2c2cb0817 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Sat, 10 Feb 2018 01:35:16 +0000
Subject: [PATCH 483/676] mconsole_proc(): don't mess with file->f_pos

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/um/drivers/mconsole_kern.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index c4d162a94be9d..d5f9a2d1da1ba 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -130,6 +130,7 @@ void mconsole_proc(struct mc_request *req)
 	struct file *file;
 	int first_chunk = 1;
 	char *ptr = req->request.data;
+	loff_t pos = 0;
 
 	ptr += strlen("proc");
 	ptr = skip_spaces(ptr);
@@ -148,7 +149,7 @@ void mconsole_proc(struct mc_request *req)
 	}
 
 	do {
-		len = kernel_read(file, buf, PAGE_SIZE - 1, &file->f_pos);
+		len = kernel_read(file, buf, PAGE_SIZE - 1, &pos);
 		if (len < 0) {
 			mconsole_reply(req, "Read of file failed", 1, 0);
 			goto out_free;

From 14b1fcc62043729d12e8ae00f8297ab2ffe9fa91 Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit@vmware.com>
Date: Fri, 9 Feb 2018 09:06:38 -0800
Subject: [PATCH 484/676] x86/mm/pti: Fix PTI comment in entry_SYSCALL_64()

The comment is confusing since the path is taken when
CONFIG_PAGE_TABLE_ISOLATION=y is disabled (while the comment says it is not
taken).

Signed-off-by: Nadav Amit <namit@vmware.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: nadav.amit@gmail.com
Link: http://lkml.kernel.org/r/20180209170638.15161-1-namit@vmware.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_64.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9e48002b953b1..932a445febee7 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -213,7 +213,7 @@ ENTRY(entry_SYSCALL_64)
 
 	swapgs
 	/*
-	 * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
+	 * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
 	 * is not required to switch CR3.
 	 */
 	movq	%rsp, PER_CPU_VAR(rsp_scratch)

From c591c2e36ccc9a08f265841d2fd68e35327ab3c4 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 9 Feb 2018 17:23:58 +1100
Subject: [PATCH 485/676] powerpc/pci: Fix broken INTx configuration via OF

59f47eff03a0 ("powerpc/pci: Use of_irq_parse_and_map_pci() helper")
replaced of_irq_parse_pci() + irq_create_of_mapping() with
of_irq_parse_and_map_pci(), but neglected to capture the virq
returned by irq_create_of_mapping(), so virq remained zero, which
caused INTx configuration to fail.

Save the virq value returned by of_irq_parse_and_map_pci() and correct
the virq declaration to match the of_irq_parse_and_map_pci() signature.

Fixes: 59f47eff03a0 "powerpc/pci: Use of_irq_parse_and_map_pci() helper"
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[bhelgaas: changelog]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 arch/powerpc/kernel/pci-common.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 344af823c3c46..4e884db7b213f 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -339,7 +339,7 @@ struct pci_controller* pci_find_hose_for_OF_device(struct device_node* node)
  */
 static int pci_read_irq_line(struct pci_dev *pci_dev)
 {
-	unsigned int virq = 0;
+	int virq;
 
 	pr_debug("PCI: Try to map irq for %s...\n", pci_name(pci_dev));
 
@@ -347,7 +347,8 @@ static int pci_read_irq_line(struct pci_dev *pci_dev)
 	memset(&oirq, 0xff, sizeof(oirq));
 #endif
 	/* Try to get a mapping from the device-tree */
-	if (!of_irq_parse_and_map_pci(pci_dev, 0, 0)) {
+	virq = of_irq_parse_and_map_pci(pci_dev, 0, 0);
+	if (virq <= 0) {
 		u8 line, pin;
 
 		/* If that fails, lets fallback to what is in the config

From 3efd6e8ebe19f0774c82de582849539b60cc4d97 Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Tue, 6 Feb 2018 06:48:29 -0800
Subject: [PATCH 486/676] nvme_fc: correct abort race condition on resets

During reset handling, there is live io completing while the reset
is taking place. The reset path attempts to abort all outstanding io,
counting the number of ios that were reset. It then waits for those
ios to be reclaimed from the lldd before continuing.

The transport's logic on io state and flag setting was poor, allowing
ios to complete simultaneous to the abort request. The completed ios
were counted, but as the completion had already occurred, the
completion never reduced the count. As the count never zeros, the
reset/delete never completes.

Tighten it up by unconditionally changing the op state to completed
when the io done handler is called.  The reset/abort path now changes
the op state to aborted, but the abort only continues if the op
state was live priviously. If complete, the abort is backed out.
Thus proper counting of io aborts and their completions is working
again.

Also removed the TERMIO state on the op as it's redundant with the
op's aborted state.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/fc.c | 98 +++++++++++-------------------------------
 1 file changed, 26 insertions(+), 72 deletions(-)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index e2df22d56b2ab..4673882ce1522 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1512,13 +1512,19 @@ nvme_fc_exit_request(struct blk_mq_tag_set *set, struct request *rq,
 static int
 __nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op)
 {
-	int state;
+	unsigned long flags;
+	int opstate;
+
+	spin_lock_irqsave(&ctrl->lock, flags);
+	opstate = atomic_xchg(&op->state, FCPOP_STATE_ABORTED);
+	if (opstate != FCPOP_STATE_ACTIVE)
+		atomic_set(&op->state, opstate);
+	else if (ctrl->flags & FCCTRL_TERMIO)
+		ctrl->iocnt++;
+	spin_unlock_irqrestore(&ctrl->lock, flags);
 
-	state = atomic_xchg(&op->state, FCPOP_STATE_ABORTED);
-	if (state != FCPOP_STATE_ACTIVE) {
-		atomic_set(&op->state, state);
+	if (opstate != FCPOP_STATE_ACTIVE)
 		return -ECANCELED;
-	}
 
 	ctrl->lport->ops->fcp_abort(&ctrl->lport->localport,
 					&ctrl->rport->remoteport,
@@ -1532,52 +1538,23 @@ static void
 nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl)
 {
 	struct nvme_fc_fcp_op *aen_op = ctrl->aen_ops;
-	unsigned long flags;
-	int i, ret;
-
-	for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) {
-		if (atomic_read(&aen_op->state) != FCPOP_STATE_ACTIVE)
-			continue;
-
-		spin_lock_irqsave(&ctrl->lock, flags);
-		if (ctrl->flags & FCCTRL_TERMIO) {
-			ctrl->iocnt++;
-			aen_op->flags |= FCOP_FLAGS_TERMIO;
-		}
-		spin_unlock_irqrestore(&ctrl->lock, flags);
-
-		ret = __nvme_fc_abort_op(ctrl, aen_op);
-		if (ret) {
-			/*
-			 * if __nvme_fc_abort_op failed the io wasn't
-			 * active. Thus this call path is running in
-			 * parallel to the io complete. Treat as non-error.
-			 */
+	int i;
 
-			/* back out the flags/counters */
-			spin_lock_irqsave(&ctrl->lock, flags);
-			if (ctrl->flags & FCCTRL_TERMIO)
-				ctrl->iocnt--;
-			aen_op->flags &= ~FCOP_FLAGS_TERMIO;
-			spin_unlock_irqrestore(&ctrl->lock, flags);
-			return;
-		}
-	}
+	for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++)
+		__nvme_fc_abort_op(ctrl, aen_op);
 }
 
 static inline int
 __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl,
-		struct nvme_fc_fcp_op *op)
+		struct nvme_fc_fcp_op *op, int opstate)
 {
 	unsigned long flags;
 	bool complete_rq = false;
 
 	spin_lock_irqsave(&ctrl->lock, flags);
-	if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) {
-		if (ctrl->flags & FCCTRL_TERMIO) {
-			if (!--ctrl->iocnt)
-				wake_up(&ctrl->ioabort_wait);
-		}
+	if (opstate == FCPOP_STATE_ABORTED && ctrl->flags & FCCTRL_TERMIO) {
+		if (!--ctrl->iocnt)
+			wake_up(&ctrl->ioabort_wait);
 	}
 	if (op->flags & FCOP_FLAGS_RELEASED)
 		complete_rq = true;
@@ -1601,6 +1578,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 	__le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1);
 	union nvme_result result;
 	bool terminate_assoc = true;
+	int opstate;
 
 	/*
 	 * WARNING:
@@ -1639,11 +1617,12 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 	 * association to be terminated.
 	 */
 
+	opstate = atomic_xchg(&op->state, FCPOP_STATE_COMPLETE);
+
 	fc_dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma,
 				sizeof(op->rsp_iu), DMA_FROM_DEVICE);
 
-	if (atomic_read(&op->state) == FCPOP_STATE_ABORTED ||
-			op->flags & FCOP_FLAGS_TERMIO)
+	if (opstate == FCPOP_STATE_ABORTED)
 		status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);
 	else if (freq->status)
 		status = cpu_to_le16(NVME_SC_INTERNAL << 1);
@@ -1708,7 +1687,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 done:
 	if (op->flags & FCOP_FLAGS_AEN) {
 		nvme_complete_async_event(&queue->ctrl->ctrl, status, &result);
-		__nvme_fc_fcpop_chk_teardowns(ctrl, op);
+		__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
 		atomic_set(&op->state, FCPOP_STATE_IDLE);
 		op->flags = FCOP_FLAGS_AEN;	/* clear other flags */
 		nvme_fc_ctrl_put(ctrl);
@@ -1725,7 +1704,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 	     ctrl->ctrl.state == NVME_CTRL_CONNECTING))
 		status |= cpu_to_le16(NVME_SC_DNR << 1);
 
-	if (__nvme_fc_fcpop_chk_teardowns(ctrl, op))
+	if (__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate))
 		__nvme_fc_final_op_cleanup(rq);
 	else
 		nvme_end_request(rq, status, result);
@@ -2421,8 +2400,7 @@ __nvme_fc_final_op_cleanup(struct request *rq)
 	struct nvme_fc_ctrl *ctrl = op->ctrl;
 
 	atomic_set(&op->state, FCPOP_STATE_IDLE);
-	op->flags &= ~(FCOP_FLAGS_TERMIO | FCOP_FLAGS_RELEASED |
-			FCOP_FLAGS_COMPLETE);
+	op->flags &= ~(FCOP_FLAGS_RELEASED | FCOP_FLAGS_COMPLETE);
 
 	nvme_fc_unmap_data(ctrl, rq, op);
 	nvme_complete_rq(rq);
@@ -2476,35 +2454,11 @@ nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved)
 	struct nvme_ctrl *nctrl = data;
 	struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
 	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req);
-	unsigned long flags;
-	int status;
 
 	if (!blk_mq_request_started(req))
 		return;
 
-	spin_lock_irqsave(&ctrl->lock, flags);
-	if (ctrl->flags & FCCTRL_TERMIO) {
-		ctrl->iocnt++;
-		op->flags |= FCOP_FLAGS_TERMIO;
-	}
-	spin_unlock_irqrestore(&ctrl->lock, flags);
-
-	status = __nvme_fc_abort_op(ctrl, op);
-	if (status) {
-		/*
-		 * if __nvme_fc_abort_op failed the io wasn't
-		 * active. Thus this call path is running in
-		 * parallel to the io complete. Treat as non-error.
-		 */
-
-		/* back out the flags/counters */
-		spin_lock_irqsave(&ctrl->lock, flags);
-		if (ctrl->flags & FCCTRL_TERMIO)
-			ctrl->iocnt--;
-		op->flags &= ~FCOP_FLAGS_TERMIO;
-		spin_unlock_irqrestore(&ctrl->lock, flags);
-		return;
-	}
+	__nvme_fc_abort_op(ctrl, op);
 }
 
 

From c3aedd225f8bcc3b3e61df074bc045b80542b38a Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Tue, 6 Feb 2018 06:48:30 -0800
Subject: [PATCH 487/676] nvme_fc: cleanup io completion

There was some old cold that dealt with complete_rq being called
prior to the lldd returning the io completion. This is garbage code.
The complete_rq routine was being called after eh_timeouts were
called and it was due to eh_timeouts not being handled properly.
The timeouts were fixed in prior patches so that in general, a
timeout will initiate an abort and the reset timer restarted as
the abort operation will take care of completing things. Given the
reset timer restarted, the erroneous complete_rq calls were eliminated.

So remove the work that was synchronizing complete_rq with io
completion.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/fc.c | 63 ++++++++----------------------------------
 1 file changed, 12 insertions(+), 51 deletions(-)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 4673882ce1522..7f51f8414b972 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -55,9 +55,7 @@ struct nvme_fc_queue {
 
 enum nvme_fcop_flags {
 	FCOP_FLAGS_TERMIO	= (1 << 0),
-	FCOP_FLAGS_RELEASED	= (1 << 1),
-	FCOP_FLAGS_COMPLETE	= (1 << 2),
-	FCOP_FLAGS_AEN		= (1 << 3),
+	FCOP_FLAGS_AEN		= (1 << 1),
 };
 
 struct nvmefc_ls_req_op {
@@ -1470,7 +1468,6 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
 
 /* *********************** NVME Ctrl Routines **************************** */
 
-static void __nvme_fc_final_op_cleanup(struct request *rq);
 static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg);
 
 static int
@@ -1544,25 +1541,20 @@ nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl)
 		__nvme_fc_abort_op(ctrl, aen_op);
 }
 
-static inline int
+static inline void
 __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl,
 		struct nvme_fc_fcp_op *op, int opstate)
 {
 	unsigned long flags;
-	bool complete_rq = false;
 
-	spin_lock_irqsave(&ctrl->lock, flags);
-	if (opstate == FCPOP_STATE_ABORTED && ctrl->flags & FCCTRL_TERMIO) {
-		if (!--ctrl->iocnt)
-			wake_up(&ctrl->ioabort_wait);
+	if (opstate == FCPOP_STATE_ABORTED) {
+		spin_lock_irqsave(&ctrl->lock, flags);
+		if (ctrl->flags & FCCTRL_TERMIO) {
+			if (!--ctrl->iocnt)
+				wake_up(&ctrl->ioabort_wait);
+		}
+		spin_unlock_irqrestore(&ctrl->lock, flags);
 	}
-	if (op->flags & FCOP_FLAGS_RELEASED)
-		complete_rq = true;
-	else
-		op->flags |= FCOP_FLAGS_COMPLETE;
-	spin_unlock_irqrestore(&ctrl->lock, flags);
-
-	return complete_rq;
 }
 
 static void
@@ -1704,10 +1696,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 	     ctrl->ctrl.state == NVME_CTRL_CONNECTING))
 		status |= cpu_to_le16(NVME_SC_DNR << 1);
 
-	if (__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate))
-		__nvme_fc_final_op_cleanup(rq);
-	else
-		nvme_end_request(rq, status, result);
+	__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
+	nvme_end_request(rq, status, result);
 
 check_error:
 	if (terminate_assoc)
@@ -2394,45 +2384,16 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg)
 }
 
 static void
-__nvme_fc_final_op_cleanup(struct request *rq)
+nvme_fc_complete_rq(struct request *rq)
 {
 	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
 	struct nvme_fc_ctrl *ctrl = op->ctrl;
 
 	atomic_set(&op->state, FCPOP_STATE_IDLE);
-	op->flags &= ~(FCOP_FLAGS_RELEASED | FCOP_FLAGS_COMPLETE);
 
 	nvme_fc_unmap_data(ctrl, rq, op);
 	nvme_complete_rq(rq);
 	nvme_fc_ctrl_put(ctrl);
-
-}
-
-static void
-nvme_fc_complete_rq(struct request *rq)
-{
-	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
-	struct nvme_fc_ctrl *ctrl = op->ctrl;
-	unsigned long flags;
-	bool completed = false;
-
-	/*
-	 * the core layer, on controller resets after calling
-	 * nvme_shutdown_ctrl(), calls complete_rq without our
-	 * calling blk_mq_complete_request(), thus there may still
-	 * be live i/o outstanding with the LLDD. Means transport has
-	 * to track complete calls vs fcpio_done calls to know what
-	 * path to take on completes and dones.
-	 */
-	spin_lock_irqsave(&ctrl->lock, flags);
-	if (op->flags & FCOP_FLAGS_COMPLETE)
-		completed = true;
-	else
-		op->flags |= FCOP_FLAGS_RELEASED;
-	spin_unlock_irqrestore(&ctrl->lock, flags);
-
-	if (completed)
-		__nvme_fc_final_op_cleanup(rq);
 }
 
 /*

From 1751342095f0d2b36fa8114d8e12c5688c455ac4 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Sat, 10 Feb 2018 23:39:22 +0000
Subject: [PATCH 488/676] x86/speculation: Update Speculation Control microcode
 blacklist

Intel have retroactively blessed the 0xc2 microcode on Skylake mobile
and desktop parts, and the Gemini Lake 0x22 microcode is apparently fine
too. We blacklisted the latter purely because it was present with all
the other problematic ones in the 2018-01-08 release, but now it's
explicitly listed as OK.

We still list 0x84 for the various Kaby Lake / Coffee Lake parts, as
that appeared in one version of the blacklist and then reverted to
0x80 again. We can change it if 0x84 is actually announced to be safe.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan.van.de.ven@intel.com
Cc: jmattson@google.com
Cc: karahmed@amazon.de
Cc: kvm@vger.kernel.org
Cc: pbonzini@redhat.com
Cc: rkrcmar@redhat.com
Cc: sironi@amazon.de
Link: http://lkml.kernel.org/r/1518305967-31356-2-git-send-email-dwmw@amazon.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/intel.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 319bf989fad1e..f73b8148dd551 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -123,8 +123,6 @@ static const struct sku_microcode spectre_bad_microcodes[] = {
 	{ INTEL_FAM6_KABYLAKE_MOBILE,	0x09,	0x84 },
 	{ INTEL_FAM6_SKYLAKE_X,		0x03,	0x0100013e },
 	{ INTEL_FAM6_SKYLAKE_X,		0x04,	0x0200003c },
-	{ INTEL_FAM6_SKYLAKE_MOBILE,	0x03,	0xc2 },
-	{ INTEL_FAM6_SKYLAKE_DESKTOP,	0x03,	0xc2 },
 	{ INTEL_FAM6_BROADWELL_CORE,	0x04,	0x28 },
 	{ INTEL_FAM6_BROADWELL_GT3E,	0x01,	0x1b },
 	{ INTEL_FAM6_BROADWELL_XEON_D,	0x02,	0x14 },
@@ -136,8 +134,6 @@ static const struct sku_microcode spectre_bad_microcodes[] = {
 	{ INTEL_FAM6_HASWELL_X,		0x02,	0x3b },
 	{ INTEL_FAM6_HASWELL_X,		0x04,	0x10 },
 	{ INTEL_FAM6_IVYBRIDGE_X,	0x04,	0x42a },
-	/* Updated in the 20180108 release; blacklist until we know otherwise */
-	{ INTEL_FAM6_ATOM_GEMINI_LAKE,	0x01,	0x22 },
 	/* Observed in the wild */
 	{ INTEL_FAM6_SANDYBRIDGE_X,	0x06,	0x61b },
 	{ INTEL_FAM6_SANDYBRIDGE_X,	0x07,	0x712 },

From c80c5ec1b2fa8d3675fc2a6807a64771ea156698 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sat, 10 Feb 2018 15:53:14 +0100
Subject: [PATCH 489/676] x86/MCE: Fix build warning introduced by "x86: do not
 use print_symbol()"

The following commit:

  7b6061627eb8 ("x86: do not use print_symbol()")

... introduced a new build warning on 32-bit x86:

  arch/x86/kernel/cpu/mcheck/mce.c:237:21: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
      pr_cont("{%pS}", (void *)m->ip);
                       ^

Fix the type mismatch between the 'void *' expected by %pS and the mce->ip
field which is u64 by casting to long.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-kernel@vger.kernel.org
Fixes: 7b6061627eb8 ("x86: do not use print_symbol()")
Link: http://lkml.kernel.org/r/20180210145314.22174-1-bp@alien8.de
[ Cleaned up the changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3a8e88a611ebf..75f405ac085c5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -234,7 +234,7 @@ static void __print_mce(struct mce *m)
 			m->cs, m->ip);
 
 		if (m->cs == __KERNEL_CS)
-			pr_cont("{%pS}", (void *)m->ip);
+			pr_cont("{%pS}", (void *)(unsigned long)m->ip);
 		pr_cont("\n");
 	}
 

From a0d0bb4deba831085d3eeb32d39fe73713ce6eb2 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 9 Feb 2018 16:51:03 -0800
Subject: [PATCH 490/676] x86/Kconfig: Simplify NR_CPUS config

Clean up and simplify the X86 NR_CPUS Kconfig symbol/option by
introducing RANGE_BEGIN_CPUS, RANGE_END_CPUS, and DEF_CONFIG_CPUS.
Then combine some default values when their conditionals can be
reduced.

Also move the X86_BIGSMP kconfig option inside an "if X86_32"/"endif"
config block and drop its explicit "depends on X86_32".

Combine the max. 8192 cases of RANGE_END_CPUS (X86_64 only).
Split RANGE_END_CPUS and DEF_CONFIG_CPUS into separate cases for
X86_32 and X86_64.

Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org>
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/0b833246-ed4b-e451-c426-c4464725be92@infradead.org
Link: lkml.kernel.org/r/CA+55aFzOd3j6ZUSkEwTdk85qtt1JywOtm3ZAb-qAvt8_hJ6D4A@mail.gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig | 57 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 42 insertions(+), 15 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 63bf349b2b24a..9d921b78b1453 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -423,12 +423,6 @@ config X86_MPPARSE
 	  For old smp systems that do not have proper acpi support. Newer systems
 	  (esp with 64bit cpus) with acpi support, MADT and DSDT will override it
 
-config X86_BIGSMP
-	bool "Support for big SMP systems with more than 8 CPUs"
-	depends on X86_32 && SMP
-	---help---
-	  This option is needed for the systems that have more than 8 CPUs
-
 config GOLDFISH
        def_bool y
        depends on X86_GOLDFISH
@@ -460,6 +454,12 @@ config INTEL_RDT
 	  Say N if unsure.
 
 if X86_32
+config X86_BIGSMP
+	bool "Support for big SMP systems with more than 8 CPUs"
+	depends on SMP
+	---help---
+	  This option is needed for the systems that have more than 8 CPUs
+
 config X86_EXTENDED_PLATFORM
 	bool "Support for extended (non-PC) x86 platforms"
 	default y
@@ -949,17 +949,44 @@ config MAXSMP
 	  Enable maximum number of CPUS and NUMA Nodes for this architecture.
 	  If unsure, say N.
 
+config RANGE_END_CPUS
+	int
+	depends on X86_32
+	default 8 if SMP && !X86_BIGSMP
+	default 64 if SMP && X86_BIGSMP
+	default 1 if !SMP
+
+config RANGE_END_CPUS
+	int
+	depends on X86_64
+	default 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK
+	default 8192 if SMP && (MAXSMP || CPUMASK_OFFSTACK)
+	default 1 if !SMP
+
+config RANGE_BEGIN_CPUS
+	int
+	default 1 if !SMP
+	default RANGE_END_CPUS if MAXSMP
+	default 2
+
+config DEF_CONFIG_CPUS
+	int
+	depends on X86_32
+	default 1 if !SMP
+	default 32 if X86_BIGSMP
+	default 8 if SMP
+
+config DEF_CONFIG_CPUS
+	int
+	depends on X86_64
+	default 1 if !SMP
+	default 8192 if MAXSMP
+	default 64 if SMP
+
 config NR_CPUS
 	int "Maximum number of CPUs" if SMP && !MAXSMP
-	range 2 8 if SMP && X86_32 && !X86_BIGSMP
-	range 2 64 if SMP && X86_32 && X86_BIGSMP
-	range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK && X86_64
-	range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64
-	default "1" if !SMP
-	default "8192" if MAXSMP
-	default "32" if SMP && X86_BIGSMP
-	default "8" if SMP && X86_32
-	default "64" if SMP
+	range RANGE_BEGIN_CPUS RANGE_END_CPUS
+	default DEF_CONFIG_CPUS
 	---help---
 	  This allows you to specify the maximum number of CPUs which this
 	  kernel will support.  If CPUMASK_OFFSTACK is enabled, the maximum

From aec6487e994d2f625197970a56a4aac40c2c7547 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sat, 10 Feb 2018 12:36:29 +0100
Subject: [PATCH 491/676] x86/Kconfig: Further simplify the NR_CPUS config

Clean up various aspects of the x86 CONFIG_NR_CPUS configuration switches:

- Rename the three CONFIG_NR_CPUS related variables to create a common
  namespace for them:

    RANGE_BEGIN_CPUS => NR_CPUS_RANGE_BEGIN
    RANGE_END_CPUS   => NR_CPUS_RANGE_END
    DEF_CONFIG_CPUS  => NR_CPUS_DEFAULT

- Align them vertically, such as:

    config NR_CPUS_RANGE_END
            int
            depends on X86_64
            default 8192 if  SMP && ( MAXSMP ||  CPUMASK_OFFSTACK)
            default  512 if  SMP && (!MAXSMP && !CPUMASK_OFFSTACK)
            default    1 if !SMP

- Update help text, add more comments.

Test results:

 # i386 allnoconfig:
 CONFIG_NR_CPUS_RANGE_BEGIN=1
 CONFIG_NR_CPUS_RANGE_END=1
 CONFIG_NR_CPUS_DEFAULT=1
 CONFIG_NR_CPUS=1

 # i386 defconfig:
 CONFIG_NR_CPUS_RANGE_BEGIN=2
 CONFIG_NR_CPUS_RANGE_END=8
 CONFIG_NR_CPUS_DEFAULT=8
 CONFIG_NR_CPUS=8

 # i386 allyesconfig:
 CONFIG_NR_CPUS_RANGE_BEGIN=2
 CONFIG_NR_CPUS_RANGE_END=64
 CONFIG_NR_CPUS_DEFAULT=32
 CONFIG_NR_CPUS=32

 # x86_64 allnoconfig:
 CONFIG_NR_CPUS_RANGE_BEGIN=1
 CONFIG_NR_CPUS_RANGE_END=1
 CONFIG_NR_CPUS_DEFAULT=1
 CONFIG_NR_CPUS=1

 # x86_64 defconfig:
 CONFIG_NR_CPUS_RANGE_BEGIN=2
 CONFIG_NR_CPUS_RANGE_END=512
 CONFIG_NR_CPUS_DEFAULT=64
 CONFIG_NR_CPUS=64

 # x86_64 allyesconfig:
 CONFIG_NR_CPUS_RANGE_BEGIN=8192
 CONFIG_NR_CPUS_RANGE_END=8192
 CONFIG_NR_CPUS_DEFAULT=8192
 CONFIG_NR_CPUS=8192

Acked-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180210113629.jcv6su3r4suuno63@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig | 66 +++++++++++++++++++++++++++++-------------------
 1 file changed, 40 insertions(+), 26 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9d921b78b1453..a528c14d45a52 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -949,52 +949,66 @@ config MAXSMP
 	  Enable maximum number of CPUS and NUMA Nodes for this architecture.
 	  If unsure, say N.
 
-config RANGE_END_CPUS
+#
+# The maximum number of CPUs supported:
+#
+# The main config value is NR_CPUS, which defaults to NR_CPUS_DEFAULT,
+# and which can be configured interactively in the
+# [NR_CPUS_RANGE_BEGIN ... NR_CPUS_RANGE_END] range.
+#
+# The ranges are different on 32-bit and 64-bit kernels, depending on
+# hardware capabilities and scalability features of the kernel.
+#
+# ( If MAXSMP is enabled we just use the highest possible value and disable
+#   interactive configuration. )
+#
+
+config NR_CPUS_RANGE_BEGIN
 	int
-	depends on X86_32
-	default 8 if SMP && !X86_BIGSMP
-	default 64 if SMP && X86_BIGSMP
-	default 1 if !SMP
+	default NR_CPUS_RANGE_END if MAXSMP
+	default    1 if !SMP
+	default    2
 
-config RANGE_END_CPUS
+config NR_CPUS_RANGE_END
 	int
-	depends on X86_64
-	default 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK
-	default 8192 if SMP && (MAXSMP || CPUMASK_OFFSTACK)
-	default 1 if !SMP
+	depends on X86_32
+	default   64 if  SMP &&  X86_BIGSMP
+	default    8 if  SMP && !X86_BIGSMP
+	default    1 if !SMP
 
-config RANGE_BEGIN_CPUS
+config NR_CPUS_RANGE_END
 	int
-	default 1 if !SMP
-	default RANGE_END_CPUS if MAXSMP
-	default 2
+	depends on X86_64
+	default 8192 if  SMP && ( MAXSMP ||  CPUMASK_OFFSTACK)
+	default  512 if  SMP && (!MAXSMP && !CPUMASK_OFFSTACK)
+	default    1 if !SMP
 
-config DEF_CONFIG_CPUS
+config NR_CPUS_DEFAULT
 	int
 	depends on X86_32
-	default 1 if !SMP
-	default 32 if X86_BIGSMP
-	default 8 if SMP
+	default   32 if  X86_BIGSMP
+	default    8 if  SMP
+	default    1 if !SMP
 
-config DEF_CONFIG_CPUS
+config NR_CPUS_DEFAULT
 	int
 	depends on X86_64
-	default 1 if !SMP
-	default 8192 if MAXSMP
-	default 64 if SMP
+	default 8192 if  MAXSMP
+	default   64 if  SMP
+	default    1 if !SMP
 
 config NR_CPUS
 	int "Maximum number of CPUs" if SMP && !MAXSMP
-	range RANGE_BEGIN_CPUS RANGE_END_CPUS
-	default DEF_CONFIG_CPUS
+	range NR_CPUS_RANGE_BEGIN NR_CPUS_RANGE_END
+	default NR_CPUS_DEFAULT
 	---help---
 	  This allows you to specify the maximum number of CPUs which this
 	  kernel will support.  If CPUMASK_OFFSTACK is enabled, the maximum
 	  supported value is 8192, otherwise the maximum value is 512.  The
 	  minimum value which makes sense is 2.
 
-	  This is purely to save memory - each supported CPU adds
-	  approximately eight kilobytes to the kernel image.
+	  This is purely to save memory: each supported CPU adds about 8KB
+	  to the kernel image.
 
 config SCHED_SMT
 	bool "SMT (Hyperthreading) scheduler support"

From 79e902382637a2f421b7f295dcf9934d80d84d7d Mon Sep 17 00:00:00 2001
From: Juri Lelli <juri.lelli@redhat.com>
Date: Fri, 9 Feb 2018 17:01:14 +0100
Subject: [PATCH 492/676] Documentation/locking/mutex-design: Update to reflect
 latest changes

Commit 3ca0ff571b09 ("locking/mutex: Rework mutex::owner") reworked the
basic mutex implementation to deal with several problems. Documentation
was however left unchanged and became stale.

Update mutex-design.txt to reflect changes introduced by the above commit.

Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-doc@vger.kernel.org
Link: http://lkml.kernel.org/r/20180209160114.19980-1-juri.lelli@redhat.com
[ Small readability tweaks to the text. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/locking/mutex-design.txt | 49 +++++++++-----------------
 1 file changed, 17 insertions(+), 32 deletions(-)

diff --git a/Documentation/locking/mutex-design.txt b/Documentation/locking/mutex-design.txt
index 60c482df1a38d..818aca19612f4 100644
--- a/Documentation/locking/mutex-design.txt
+++ b/Documentation/locking/mutex-design.txt
@@ -21,37 +21,23 @@ Implementation
 --------------
 
 Mutexes are represented by 'struct mutex', defined in include/linux/mutex.h
-and implemented in kernel/locking/mutex.c. These locks use a three
-state atomic counter (->count) to represent the different possible
-transitions that can occur during the lifetime of a lock:
-
-	  1: unlocked
-	  0: locked, no waiters
-   negative: locked, with potential waiters
-
-In its most basic form it also includes a wait-queue and a spinlock
-that serializes access to it. CONFIG_SMP systems can also include
-a pointer to the lock task owner (->owner) as well as a spinner MCS
-lock (->osq), both described below in (ii).
+and implemented in kernel/locking/mutex.c. These locks use an atomic variable
+(->owner) to keep track of the lock state during its lifetime.  Field owner
+actually contains 'struct task_struct *' to the current lock owner and it is
+therefore NULL if not currently owned. Since task_struct pointers are aligned
+at at least L1_CACHE_BYTES, low bits (3) are used to store extra state (e.g.,
+if waiter list is non-empty).  In its most basic form it also includes a
+wait-queue and a spinlock that serializes access to it. Furthermore,
+CONFIG_MUTEX_SPIN_ON_OWNER=y systems use a spinner MCS lock (->osq), described
+below in (ii).
 
 When acquiring a mutex, there are three possible paths that can be
 taken, depending on the state of the lock:
 
-(i) fastpath: tries to atomically acquire the lock by decrementing the
-    counter. If it was already taken by another task it goes to the next
-    possible path. This logic is architecture specific. On x86-64, the
-    locking fastpath is 2 instructions:
-
-    0000000000000e10 <mutex_lock>:
-    e21:   f0 ff 0b                lock decl (%rbx)
-    e24:   79 08                   jns    e2e <mutex_lock+0x1e>
-
-   the unlocking fastpath is equally tight:
-
-    0000000000000bc0 <mutex_unlock>:
-    bc8:   f0 ff 07                lock incl (%rdi)
-    bcb:   7f 0a                   jg     bd7 <mutex_unlock+0x17>
-
+(i) fastpath: tries to atomically acquire the lock by cmpxchg()ing the owner with
+    the current task. This only works in the uncontended case (cmpxchg() checks
+    against 0UL, so all 3 state bits above have to be 0). If the lock is
+    contended it goes to the next possible path.
 
 (ii) midpath: aka optimistic spinning, tries to spin for acquisition
      while the lock owner is running and there are no other tasks ready
@@ -143,11 +129,10 @@ Test if the mutex is taken:
 Disadvantages
 -------------
 
-Unlike its original design and purpose, 'struct mutex' is larger than
-most locks in the kernel. E.g: on x86-64 it is 40 bytes, almost twice
-as large as 'struct semaphore' (24 bytes) and tied, along with rwsems,
-for the largest lock in the kernel. Larger structure sizes mean more
-CPU cache and memory footprint.
+Unlike its original design and purpose, 'struct mutex' is among the largest
+locks in the kernel. E.g: on x86-64 it is 32 bytes, where 'struct semaphore'
+is 24 bytes and rw_semaphore is 40 bytes. Larger structure sizes mean more CPU
+cache and memory footprint.
 
 When to use mutexes
 -------------------

From 5d13c73179983f8692adf397e65f2f57c613a917 Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre <malat@debian.org>
Date: Sun, 11 Feb 2018 22:59:18 +0800
Subject: [PATCH 493/676] nios2: dts: Remove leading 0x and 0s from bindings
 notation

Improve the DTS files by removing all the leading "0x" and zeros to fix the
following dtc warnings:

Warning (unit_address_format): Node /XXX unit name should not have leading "0x"

and

Warning (unit_address_format): Node /XXX unit name should not have leading 0s

Converted using the following command:

find . -type f \( -iname *.dts -o -iname *.dtsi \) -exec sed -E -i -e "s/@0x([0-9a-fA-F\.]+)\s?\{/@\L\1 \{/g" -e "s/@0+([0-9a-fA-F\.]+)\s?\{/@\L\1 \{/g" {} +

For simplicity, two sed expressions were used to solve each warnings separately.

To make the regex expression more robust a few other issues were resolved,
namely setting unit-address to lower case, and adding a whitespace before the
the opening curly brace:

https://elinux.org/Device_Tree_Linux#Linux_conventions

This is a follow up to commit 4c9847b7375a ("dt-bindings: Remove leading 0x from bindings notation")

Reported-by: David Daney <ddaney@caviumnetworks.com>
Suggested-by: Rob Herring <robh@kernel.org>
Signed-off-by: Mathieu Malaterre <malat@debian.org>
Acked-by: Ley Foon Tan <ley.foon.tan@intel.com>
---
 arch/nios2/boot/dts/3c120_devboard.dts | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/nios2/boot/dts/3c120_devboard.dts b/arch/nios2/boot/dts/3c120_devboard.dts
index 36ccdf05837de..56f4b5df6d650 100644
--- a/arch/nios2/boot/dts/3c120_devboard.dts
+++ b/arch/nios2/boot/dts/3c120_devboard.dts
@@ -29,7 +29,7 @@
 		#address-cells = <1>;
 		#size-cells = <0>;
 
-		cpu: cpu@0x0 {
+		cpu: cpu@0 {
 			device_type = "cpu";
 			compatible = "altr,nios2-1.0";
 			reg = <0x00000000>;
@@ -69,7 +69,7 @@
 		compatible = "altr,avalon", "simple-bus";
 		bus-frequency = <125000000>;
 
-		pb_cpu_to_io: bridge@0x8000000 {
+		pb_cpu_to_io: bridge@8000000 {
 			compatible = "simple-bus";
 			reg = <0x08000000 0x00800000>;
 			#address-cells = <1>;
@@ -83,7 +83,7 @@
 				<0x00008000 0x08008000 0x00000020>,
 				<0x00400000 0x08400000 0x00000020>;
 
-			timer_1ms: timer@0x400000 {
+			timer_1ms: timer@400000 {
 				compatible = "altr,timer-1.0";
 				reg = <0x00400000 0x00000020>;
 				interrupt-parent = <&cpu>;
@@ -91,7 +91,7 @@
 				clock-frequency = <125000000>;
 			};
 
-			timer_0: timer@0x8000 {
+			timer_0: timer@8000 {
 				compatible = "altr,timer-1.0";
 				reg = < 0x00008000 0x00000020 >;
 				interrupt-parent = < &cpu >;
@@ -99,14 +99,14 @@
 				clock-frequency = < 125000000 >;
 			};
 
-			jtag_uart: serial@0x4d50 {
+			jtag_uart: serial@4d50 {
 				compatible = "altr,juart-1.0";
 				reg = <0x00004d50 0x00000008>;
 				interrupt-parent = <&cpu>;
 				interrupts = <1>;
 			};
 
-			tse_mac: ethernet@0x4000 {
+			tse_mac: ethernet@4000 {
 				compatible = "altr,tse-1.0";
 				reg = <0x00004000 0x00000400>,
 					<0x00004400 0x00000040>,
@@ -133,7 +133,7 @@
 				};
 			};
 
-			uart: serial@0x4c80 {
+			uart: serial@4c80 {
 				compatible = "altr,uart-1.0";
 				reg = <0x00004c80 0x00000020>;
 				interrupt-parent = <&cpu>;
@@ -143,7 +143,7 @@
 			};
 		};
 
-		cfi_flash_64m: flash@0x0 {
+		cfi_flash_64m: flash@0 {
 			compatible = "cfi-flash";
 			reg = <0x00000000 0x04000000>;
 			bank-width = <2>;

From e0691ebb33c12084ef11b6b59228c004e19f59c8 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Sun, 11 Feb 2018 23:01:17 +0800
Subject: [PATCH 494/676] nios2: defconfig: Cleanup from old Kconfig options

Remove old, dead Kconfig option INET_LRO. It is gone since
commit 7bbf3cae65b6 ("ipv4: Remove inet_lro library").

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Acked-by: Ley Foon Tan <ley.foon.tan@intel.com>
---
 arch/nios2/configs/10m50_defconfig | 1 -
 arch/nios2/configs/3c120_defconfig | 1 -
 2 files changed, 2 deletions(-)

diff --git a/arch/nios2/configs/10m50_defconfig b/arch/nios2/configs/10m50_defconfig
index 8b2a30b3b34fe..c601c8ff1ae68 100644
--- a/arch/nios2/configs/10m50_defconfig
+++ b/arch/nios2/configs/10m50_defconfig
@@ -33,7 +33,6 @@ CONFIG_IP_PNP_RARP=y
 # CONFIG_INET_XFRM_MODE_TRANSPORT is not set
 # CONFIG_INET_XFRM_MODE_TUNNEL is not set
 # CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
 # CONFIG_IPV6 is not set
 # CONFIG_WIRELESS is not set
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
diff --git a/arch/nios2/configs/3c120_defconfig b/arch/nios2/configs/3c120_defconfig
index 9451940678a03..fce33588d55cd 100644
--- a/arch/nios2/configs/3c120_defconfig
+++ b/arch/nios2/configs/3c120_defconfig
@@ -35,7 +35,6 @@ CONFIG_IP_PNP_RARP=y
 # CONFIG_INET_XFRM_MODE_TRANSPORT is not set
 # CONFIG_INET_XFRM_MODE_TUNNEL is not set
 # CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
 # CONFIG_IPV6 is not set
 # CONFIG_WIRELESS is not set
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"

From f8d0cbf28df14b6b6801f027a396fd593971a922 Mon Sep 17 00:00:00 2001
From: Max Filippov <jcmvbkbc@gmail.com>
Date: Sun, 11 Feb 2018 01:07:54 -0800
Subject: [PATCH 495/676] xtensa: fix build with KASAN

The commit 917538e212a2 ("kasan: clean up KASAN_SHADOW_SCALE_SHIFT
usage") removed KASAN_SHADOW_SCALE_SHIFT definition from
include/linux/kasan.h and added it to architecture-specific headers,
except for xtensa. This broke the xtensa build with KASAN enabled.
Define KASAN_SHADOW_SCALE_SHIFT in arch/xtensa/include/asm/kasan.h

Reported by: kbuild test robot <fengguang.wu@intel.com>
Fixes: 917538e212a2 ("kasan: clean up KASAN_SHADOW_SCALE_SHIFT usage")
Acked-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
---
 arch/xtensa/include/asm/kasan.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/xtensa/include/asm/kasan.h b/arch/xtensa/include/asm/kasan.h
index 54be80876e578..216b6f32c3759 100644
--- a/arch/xtensa/include/asm/kasan.h
+++ b/arch/xtensa/include/asm/kasan.h
@@ -10,6 +10,8 @@
 #include <linux/sizes.h>
 #include <asm/kmem_layout.h>
 
+#define KASAN_SHADOW_SCALE_SHIFT 3
+
 /* Start of area covered by KASAN */
 #define KASAN_START_VADDR __XTENSA_UL_CONST(0x90000000)
 /* Start of the shadow map */

From a9a08845e9acbd224e4ee466f5c1275ed50054e8 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 11 Feb 2018 14:34:03 -0800
Subject: [PATCH 496/676] vfs: do bulk POLL* -> EPOLL* replacement

This is the mindless scripted replacement of kernel use of POLL*
variables as described by Al, done by this script:

    for V in IN OUT PRI ERR RDNORM RDBAND WRNORM WRBAND HUP RDHUP NVAL MSG; do
        L=`git grep -l -w POLL$V | grep -v '^t' | grep -v /um/ | grep -v '^sa' | grep -v '/poll.h$'|grep -v '^D'`
        for f in $L; do sed -i "-es/^\([^\"]*\)\(\<POLL$V\>\)/\\1E\\2/" $f; done
    done

with de-mangling cleanups yet to come.

NOTE! On almost all architectures, the EPOLL* constants have the same
values as the POLL* constants do.  But they keyword here is "almost".
For various bad reasons they aren't the same, and epoll() doesn't
actually work quite correctly in some cases due to this on Sparc et al.

The next patch from Al will sort out the final differences, and we
should be all done.

Scripted-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/cris/arch-v10/drivers/gpio.c             |  2 +-
 arch/cris/arch-v10/drivers/sync_serial.c      |  8 ++--
 arch/cris/arch-v32/drivers/sync_serial.c      | 10 ++---
 arch/ia64/kernel/perfmon.c                    |  2 +-
 arch/mips/kernel/rtlx.c                       |  4 +-
 arch/powerpc/kernel/rtasd.c                   |  2 +-
 .../platforms/cell/spufs/backing_ops.c        |  8 ++--
 arch/powerpc/platforms/cell/spufs/file.c      | 10 ++---
 arch/powerpc/platforms/cell/spufs/hw_ops.c    |  8 ++--
 arch/powerpc/platforms/powernv/opal-prd.c     |  2 +-
 arch/x86/kernel/apm_32.c                      |  2 +-
 arch/x86/kernel/cpu/mcheck/dev-mcelog.c       |  4 +-
 block/bsg.c                                   |  4 +-
 crypto/af_alg.c                               | 16 ++++----
 drivers/acpi/acpi_dbg.c                       |  4 +-
 drivers/android/binder.c                      |  4 +-
 drivers/bluetooth/hci_vhci.c                  |  4 +-
 drivers/char/apm-emulation.c                  |  2 +-
 drivers/char/dsp56k.c                         |  2 +-
 drivers/char/dtlk.c                           |  6 +--
 drivers/char/hpet.c                           |  2 +-
 drivers/char/ipmi/bt-bmc.c                    |  4 +-
 drivers/char/ipmi/ipmi_devintf.c              |  2 +-
 drivers/char/ipmi/ipmi_watchdog.c             |  2 +-
 drivers/char/pcmcia/cm4040_cs.c               |  4 +-
 drivers/char/ppdev.c                          |  2 +-
 drivers/char/random.c                         |  4 +-
 drivers/char/rtc.c                            |  2 +-
 drivers/char/snsc.c                           |  4 +-
 drivers/char/sonypi.c                         |  2 +-
 drivers/char/tpm/tpm_vtpm_proxy.c             |  6 +--
 drivers/char/virtio_console.c                 |  8 ++--
 drivers/char/xillybus/xillybus_core.c         | 12 +++---
 drivers/dma-buf/dma-buf.c                     | 26 ++++++------
 drivers/dma-buf/sync_file.c                   |  2 +-
 drivers/firewire/core-cdev.c                  |  4 +-
 drivers/firewire/nosy.c                       |  4 +-
 drivers/gpio/gpiolib.c                        |  4 +-
 drivers/gpu/drm/drm_file.c                    |  2 +-
 drivers/gpu/drm/i915/i915_perf.c              | 10 ++---
 drivers/gpu/vga/vgaarb.c                      |  2 +-
 drivers/hid/hid-debug.c                       |  4 +-
 drivers/hid/hid-roccat.c                      |  4 +-
 drivers/hid/hid-sensor-custom.c               |  2 +-
 drivers/hid/hidraw.c                          |  4 +-
 drivers/hid/uhid.c                            |  2 +-
 drivers/hid/usbhid/hiddev.c                   |  4 +-
 drivers/hsi/clients/cmt_speech.c              |  4 +-
 drivers/hv/hv_utils_transport.c               |  4 +-
 drivers/iio/buffer/industrialio-buffer-dma.c  |  4 +-
 drivers/iio/industrialio-buffer.c             |  6 +--
 drivers/iio/industrialio-event.c              |  6 +--
 drivers/infiniband/core/ucm.c                 |  2 +-
 drivers/infiniband/core/ucma.c                |  2 +-
 drivers/infiniband/core/user_mad.c            |  4 +-
 drivers/infiniband/core/uverbs_main.c         |  2 +-
 drivers/infiniband/hw/hfi1/file_ops.c         |  8 ++--
 drivers/infiniband/hw/qib/qib_file_ops.c      |  8 ++--
 drivers/infiniband/ulp/iser/iscsi_iser.c      |  2 +-
 drivers/input/evdev.c                         |  6 +--
 drivers/input/input.c                         |  2 +-
 drivers/input/joydev.c                        |  4 +-
 drivers/input/misc/hp_sdc_rtc.c               |  2 +-
 drivers/input/misc/uinput.c                   |  2 +-
 drivers/input/mousedev.c                      |  4 +-
 drivers/input/serio/serio_raw.c               |  4 +-
 drivers/input/serio/userio.c                  |  2 +-
 drivers/isdn/capi/capi.c                      |  6 +--
 drivers/isdn/divert/divert_procfs.c           |  4 +-
 drivers/isdn/hardware/eicon/divamnt.c         |  4 +-
 drivers/isdn/hardware/eicon/divasi.c          | 10 ++---
 drivers/isdn/hardware/eicon/divasmain.c       |  4 +-
 drivers/isdn/hardware/eicon/divasproc.c       |  2 +-
 drivers/isdn/hysdn/hysdn_proclog.c            |  2 +-
 drivers/isdn/i4l/isdn_common.c                | 12 +++---
 drivers/isdn/i4l/isdn_ppp.c                   |  8 ++--
 drivers/isdn/mISDN/timerdev.c                 |  4 +-
 drivers/leds/uleds.c                          |  2 +-
 drivers/macintosh/smu.c                       |  2 +-
 drivers/macintosh/via-pmu.c                   |  2 +-
 drivers/mailbox/mailbox-test.c                |  2 +-
 drivers/md/dm-ioctl.c                         |  2 +-
 drivers/md/md.c                               |  6 +--
 drivers/media/cec/cec-api.c                   |  8 ++--
 drivers/media/common/saa7146/saa7146_fops.c   |  6 +--
 drivers/media/common/siano/smsdvb-debugfs.c   |  2 +-
 .../media/common/videobuf2/videobuf2-core.c   | 30 +++++++-------
 .../media/common/videobuf2/videobuf2-v4l2.c   |  8 ++--
 drivers/media/dvb-core/dmxdev.c               | 14 +++----
 drivers/media/dvb-core/dvb_ca_en50221.c       |  4 +-
 drivers/media/dvb-core/dvb_frontend.c         |  2 +-
 drivers/media/firewire/firedtv-ci.c           |  2 +-
 drivers/media/i2c/saa6588.c                   |  2 +-
 drivers/media/media-devnode.c                 |  2 +-
 drivers/media/pci/bt8xx/bttv-driver.c         | 22 +++++-----
 drivers/media/pci/cx18/cx18-fileops.c         | 16 ++++----
 drivers/media/pci/ddbridge/ddbridge-core.c    |  4 +-
 drivers/media/pci/ivtv/ivtv-fileops.c         | 16 ++++----
 drivers/media/pci/meye/meye.c                 |  2 +-
 drivers/media/pci/saa7164/saa7164-encoder.c   |  6 +--
 drivers/media/pci/saa7164/saa7164-vbi.c       |  2 +-
 drivers/media/pci/ttpci/av7110_av.c           | 10 ++---
 drivers/media/pci/ttpci/av7110_ca.c           |  4 +-
 drivers/media/pci/zoran/zoran_driver.c        | 16 ++++----
 drivers/media/platform/fsl-viu.c              |  4 +-
 drivers/media/platform/s5p-mfc/s5p_mfc.c      |  8 ++--
 .../media/platform/soc_camera/soc_camera.c    |  4 +-
 drivers/media/platform/vivid/vivid-radio-rx.c |  2 +-
 drivers/media/platform/vivid/vivid-radio-tx.c |  2 +-
 drivers/media/radio/radio-cadet.c             |  4 +-
 drivers/media/radio/radio-si476x.c            |  6 +--
 drivers/media/radio/radio-wl1273.c            |  4 +-
 .../media/radio/si470x/radio-si470x-common.c  |  4 +-
 drivers/media/radio/wl128x/fmdrv_v4l2.c       |  2 +-
 drivers/media/rc/lirc_dev.c                   | 12 +++---
 drivers/media/usb/cpia2/cpia2_core.c          |  4 +-
 drivers/media/usb/cx231xx/cx231xx-417.c       |  4 +-
 drivers/media/usb/cx231xx/cx231xx-video.c     | 10 ++---
 drivers/media/usb/gspca/gspca.c               | 12 +++---
 drivers/media/usb/hdpvr/hdpvr-video.c         |  4 +-
 drivers/media/usb/pvrusb2/pvrusb2-v4l2.c      |  6 +--
 drivers/media/usb/stkwebcam/stk-webcam.c      |  4 +-
 drivers/media/usb/tm6000/tm6000-video.c       | 14 +++----
 drivers/media/v4l2-core/v4l2-ctrls.c          |  2 +-
 drivers/media/v4l2-core/v4l2-dev.c            |  2 +-
 drivers/media/v4l2-core/v4l2-mem2mem.c        | 14 +++----
 drivers/media/v4l2-core/v4l2-subdev.c         |  4 +-
 drivers/media/v4l2-core/videobuf-core.c       | 10 ++---
 drivers/mfd/ab8500-debugfs.c                  |  2 +-
 drivers/misc/cxl/file.c                       |  4 +-
 drivers/misc/hpilo.c                          |  4 +-
 drivers/misc/lis3lv02d/lis3lv02d.c            |  2 +-
 drivers/misc/mei/main.c                       | 12 +++---
 drivers/misc/mic/cosm/cosm_scif_server.c      | 16 ++++----
 .../misc/mic/cosm_client/cosm_scif_client.c   |  4 +-
 drivers/misc/mic/scif/scif_api.c              | 24 +++++------
 drivers/misc/mic/vop/vop_vringh.c             |  8 ++--
 drivers/misc/ocxl/file.c                      |  4 +-
 drivers/misc/phantom.c                        |  4 +-
 drivers/misc/vmw_vmci/vmci_host.c             |  2 +-
 drivers/net/ieee802154/ca8210.c               |  4 +-
 drivers/net/ppp/ppp_generic.c                 |  8 ++--
 drivers/net/tap.c                             | 10 ++---
 drivers/net/tun.c                             | 12 +++---
 .../net/wireless/ralink/rt2x00/rt2x00debug.c  |  2 +-
 drivers/pci/switch/switchtec.c                |  6 +--
 drivers/platform/chrome/cros_ec_debugfs.c     |  2 +-
 drivers/platform/goldfish/goldfish_pipe.c     |  8 ++--
 drivers/platform/x86/sony-laptop.c            |  2 +-
 drivers/pps/pps.c                             |  2 +-
 drivers/ptp/ptp_chardev.c                     |  2 +-
 drivers/rapidio/devices/rio_mport_cdev.c      |  2 +-
 drivers/rpmsg/qcom_smd.c                      |  2 +-
 drivers/rpmsg/rpmsg_char.c                    |  4 +-
 drivers/rtc/rtc-dev.c                         |  2 +-
 drivers/s390/block/dasd_eer.c                 |  2 +-
 drivers/s390/char/monreader.c                 |  4 +-
 drivers/scsi/megaraid/megaraid_sas_base.c     |  2 +-
 drivers/scsi/mpt3sas/mpt3sas_ctl.c            |  2 +-
 drivers/scsi/sg.c                             | 12 +++---
 drivers/staging/comedi/comedi_fops.c          |  4 +-
 drivers/staging/comedi/drivers/serial2002.c   |  4 +-
 drivers/staging/fwserial/fwserial.c           |  2 +-
 drivers/staging/greybus/tools/loopback_test.c |  4 +-
 drivers/staging/irda/net/af_irda.c            | 14 +++----
 drivers/staging/irda/net/irnet/irnet_ppp.c    |  8 ++--
 .../media/atomisp/pci/atomisp2/atomisp_fops.c |  2 +-
 drivers/staging/media/bcm2048/radio-bcm2048.c |  2 +-
 drivers/staging/most/cdev/cdev.c              |  4 +-
 drivers/staging/most/video/video.c            |  2 +-
 drivers/staging/speakup/speakup_soft.c        |  2 +-
 drivers/tty/n_gsm.c                           |  6 +--
 drivers/tty/n_hdlc.c                          |  8 ++--
 drivers/tty/n_r3964.c                         |  4 +-
 drivers/tty/n_tty.c                           | 16 ++++----
 drivers/tty/pty.c                             |  4 +-
 drivers/tty/tty_io.c                          | 14 +++----
 drivers/tty/tty_ldisc.c                       |  4 +-
 drivers/tty/vt/vc_screen.c                    |  2 +-
 drivers/uio/uio.c                             |  2 +-
 drivers/usb/class/cdc-wdm.c                   |  8 ++--
 drivers/usb/class/usblp.c                     |  4 +-
 drivers/usb/class/usbtmc.c                    |  4 +-
 drivers/usb/core/devices.c                    |  2 +-
 drivers/usb/core/devio.c                      |  6 +--
 drivers/usb/gadget/function/f_fs.c            |  8 ++--
 drivers/usb/gadget/function/f_hid.c           |  4 +-
 drivers/usb/gadget/function/f_printer.c       |  4 +-
 drivers/usb/gadget/legacy/inode.c             |  6 +--
 drivers/usb/misc/iowarrior.c                  |  8 ++--
 drivers/usb/misc/ldusb.c                      |  6 +--
 drivers/usb/misc/legousbtower.c               |  6 +--
 drivers/usb/mon/mon_bin.c                     |  2 +-
 drivers/vfio/virqfd.c                         |  8 ++--
 drivers/vhost/net.c                           |  4 +-
 drivers/vhost/vhost.c                         | 10 ++---
 drivers/virt/fsl_hypervisor.c                 |  2 +-
 drivers/xen/evtchn.c                          |  6 +--
 drivers/xen/mcelog.c                          |  2 +-
 drivers/xen/pvcalls-front.c                   | 14 +++----
 drivers/xen/xenbus/xenbus_dev_frontend.c      |  2 +-
 fs/cachefiles/daemon.c                        |  6 +--
 fs/coda/psdev.c                               |  4 +-
 fs/debugfs/file.c                             |  2 +-
 fs/dlm/plock.c                                |  2 +-
 fs/dlm/user.c                                 |  2 +-
 fs/ecryptfs/miscdev.c                         |  2 +-
 fs/eventfd.c                                  | 18 ++++-----
 fs/eventpoll.c                                | 22 +++++-----
 fs/fcntl.c                                    | 12 +++---
 fs/fuse/dev.c                                 |  8 ++--
 fs/fuse/file.c                                |  2 +-
 fs/kernfs/file.c                              |  4 +-
 fs/notify/fanotify/fanotify_user.c            |  2 +-
 fs/notify/inotify/inotify_user.c              |  2 +-
 fs/ocfs2/dlmfs/dlmfs.c                        |  6 +--
 fs/orangefs/devorangefs-req.c                 |  2 +-
 fs/pipe.c                                     | 22 +++++-----
 fs/proc/kmsg.c                                |  2 +-
 fs/proc/proc_sysctl.c                         |  4 +-
 fs/proc_namespace.c                           |  4 +-
 fs/select.c                                   | 10 ++---
 fs/signalfd.c                                 |  4 +-
 fs/timerfd.c                                  |  2 +-
 fs/userfaultfd.c                              | 16 ++++----
 include/linux/scif.h                          | 16 ++++----
 include/media/videobuf2-core.h                |  6 +--
 include/net/inet_connection_sock.h            |  2 +-
 ipc/mqueue.c                                  |  4 +-
 kernel/events/core.c                          |  2 +-
 kernel/events/ring_buffer.c                   |  2 +-
 kernel/printk/printk.c                        |  6 +--
 kernel/relay.c                                |  4 +-
 kernel/time/posix-clock.c                     |  2 +-
 kernel/trace/ring_buffer.c                    |  4 +-
 kernel/trace/trace.c                          |  4 +-
 mm/memcontrol.c                               |  4 +-
 mm/swapfile.c                                 |  4 +-
 net/9p/trans_fd.c                             | 26 ++++++------
 net/atm/common.c                              |  8 ++--
 net/batman-adv/icmp_socket.c                  |  2 +-
 net/batman-adv/log.c                          |  2 +-
 net/bluetooth/af_bluetooth.c                  | 16 ++++----
 net/caif/caif_socket.c                        | 12 +++---
 net/core/datagram.c                           | 16 ++++----
 net/core/sock.c                               | 10 ++---
 net/core/stream.c                             |  4 +-
 net/dccp/proto.c                              | 12 +++---
 net/decnet/af_decnet.c                        |  2 +-
 net/ipv4/af_inet.c                            |  2 +-
 net/ipv4/tcp.c                                | 34 ++++++++--------
 net/ipv4/tcp_input.c                          |  2 +-
 net/ipv4/udp.c                                |  6 +--
 net/iucv/af_iucv.c                            | 18 ++++-----
 net/kcm/kcmsock.c                             |  6 +--
 net/nfc/llcp_sock.c                           | 16 ++++----
 net/packet/af_packet.c                        |  4 +-
 net/phonet/socket.c                           | 10 ++---
 net/rds/af_rds.c                              | 16 ++++----
 net/rfkill/core.c                             |  4 +-
 net/rxrpc/af_rxrpc.c                          |  4 +-
 net/sctp/socket.c                             | 20 +++++-----
 net/smc/af_smc.c                              | 24 +++++------
 net/smc/smc_rx.c                              |  4 +-
 net/smc/smc_tx.c                              |  4 +-
 net/sunrpc/cache.c                            |  4 +-
 net/sunrpc/rpc_pipe.c                         |  6 +--
 net/tipc/socket.c                             | 22 +++++-----
 net/unix/af_unix.c                            | 40 +++++++++----------
 net/vmw_vsock/af_vsock.c                      | 30 +++++++-------
 security/apparmor/apparmorfs.c                |  2 +-
 security/tomoyo/audit.c                       |  6 +--
 security/tomoyo/common.c                      | 14 +++----
 security/tomoyo/securityfs_if.c               |  4 +-
 sound/core/compress_offload.c                 | 10 ++---
 sound/core/control.c                          |  2 +-
 sound/core/info.c                             |  4 +-
 sound/core/init.c                             |  2 +-
 sound/core/oss/pcm_oss.c                      |  4 +-
 sound/core/pcm_native.c                       | 14 +++----
 sound/core/rawmidi.c                          |  4 +-
 sound/core/seq/oss/seq_oss_rw.c               |  4 +-
 sound/core/seq/seq_clientmgr.c                |  4 +-
 sound/core/timer.c                            |  4 +-
 sound/firewire/bebob/bebob_hwdep.c            |  2 +-
 sound/firewire/dice/dice-hwdep.c              |  2 +-
 sound/firewire/digi00x/digi00x-hwdep.c        |  2 +-
 sound/firewire/fireface/ff-hwdep.c            |  2 +-
 sound/firewire/fireworks/fireworks_hwdep.c    |  4 +-
 sound/firewire/motu/motu-hwdep.c              |  4 +-
 sound/firewire/oxfw/oxfw-hwdep.c              |  2 +-
 sound/firewire/tascam/tascam-hwdep.c          |  2 +-
 sound/oss/dmasound/dmasound_core.c            |  2 +-
 sound/usb/mixer_quirks.c                      |  2 +-
 sound/usb/usx2y/us122l.c                      |  4 +-
 sound/usb/usx2y/usX2Yhwdep.c                  |  4 +-
 virt/kvm/eventfd.c                            |  8 ++--
 297 files changed, 913 insertions(+), 913 deletions(-)

diff --git a/arch/cris/arch-v10/drivers/gpio.c b/arch/cris/arch-v10/drivers/gpio.c
index a2986c60aaac2..cd0e05d89d42f 100644
--- a/arch/cris/arch-v10/drivers/gpio.c
+++ b/arch/cris/arch-v10/drivers/gpio.c
@@ -173,7 +173,7 @@ static __poll_t gpio_poll(struct file *file, poll_table *wait)
 
 	if ((data & priv->highalarm) ||
 	    (~data & priv->lowalarm)) {
-		mask = POLLIN|POLLRDNORM;
+		mask = EPOLLIN|EPOLLRDNORM;
 	}
 
 out:
diff --git a/arch/cris/arch-v10/drivers/sync_serial.c b/arch/cris/arch-v10/drivers/sync_serial.c
index 177843c640715..ed1a568a72170 100644
--- a/arch/cris/arch-v10/drivers/sync_serial.c
+++ b/arch/cris/arch-v10/drivers/sync_serial.c
@@ -666,16 +666,16 @@ static __poll_t sync_serial_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &port->in_wait_q, wait);
 	/* Some room to write */
 	if (port->out_count < OUT_BUFFER_SIZE)
-		mask |=  POLLOUT | POLLWRNORM;
+		mask |=  EPOLLOUT | EPOLLWRNORM;
 	/* At least an inbufchunk of data */
 	if (sync_data_avail(port) >= port->inbufchunk)
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	DEBUGPOLL(if (mask != prev_mask)
 		printk(KERN_DEBUG "sync_serial_poll: mask 0x%08X %s %s\n",
 			mask,
-			mask & POLLOUT ? "POLLOUT" : "",
-			mask & POLLIN ? "POLLIN" : "");
+			mask & EPOLLOUT ? "POLLOUT" : "",
+			mask & EPOLLIN ? "POLLIN" : "");
 		prev_mask = mask;
 	);
 	return mask;
diff --git a/arch/cris/arch-v32/drivers/sync_serial.c b/arch/cris/arch-v32/drivers/sync_serial.c
index e20e0b9a3a5ce..1b0ce8a8af167 100644
--- a/arch/cris/arch-v32/drivers/sync_serial.c
+++ b/arch/cris/arch-v32/drivers/sync_serial.c
@@ -574,24 +574,24 @@ static __poll_t sync_serial_poll(struct file *file, poll_table *wait)
 
 	/* No active transfer, descriptors are available */
 	if (port->output && !port->tr_running)
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 
 	/* Descriptor and buffer space available. */
 	if (port->output &&
 	    port->active_tr_descr != port->catch_tr_descr &&
 	    port->out_buf_count < OUT_BUFFER_SIZE)
-		mask |=  POLLOUT | POLLWRNORM;
+		mask |=  EPOLLOUT | EPOLLWRNORM;
 
 	/* At least an inbufchunk of data */
 	if (port->input && sync_data_avail(port) >= port->inbufchunk)
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	DEBUGPOLL(
 	if (mask != prev_mask)
 		pr_info("sync_serial_poll: mask 0x%08X %s %s\n",
 			mask,
-			mask & POLLOUT ? "POLLOUT" : "",
-			mask & POLLIN ? "POLLIN" : "");
+			mask & EPOLLOUT ? "POLLOUT" : "",
+			mask & EPOLLIN ? "POLLIN" : "");
 		prev_mask = mask;
 	);
 	return mask;
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 8586024940963..8fb280e331141 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -1670,7 +1670,7 @@ pfm_poll(struct file *filp, poll_table * wait)
 	PROTECT_CTX(ctx, flags);
 
 	if (PFM_CTXQ_EMPTY(ctx) == 0)
-		mask =  POLLIN | POLLRDNORM;
+		mask =  EPOLLIN | EPOLLRDNORM;
 
 	UNPROTECT_CTX(ctx, flags);
 
diff --git a/arch/mips/kernel/rtlx.c b/arch/mips/kernel/rtlx.c
index bbb0f4770c0d8..18c509c59f338 100644
--- a/arch/mips/kernel/rtlx.c
+++ b/arch/mips/kernel/rtlx.c
@@ -349,11 +349,11 @@ static __poll_t file_poll(struct file *file, poll_table *wait)
 
 	/* data available to read? */
 	if (rtlx_read_poll(minor, 0))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* space to write */
 	if (rtlx_write_poll(minor))
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 
 	return mask;
 }
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
index fc600a8b1e77b..f915db93cd429 100644
--- a/arch/powerpc/kernel/rtasd.c
+++ b/arch/powerpc/kernel/rtasd.c
@@ -392,7 +392,7 @@ static __poll_t rtas_log_poll(struct file *file, poll_table * wait)
 {
 	poll_wait(file, &rtas_log_wait, wait);
 	if (rtas_log_size)
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	return 0;
 }
 
diff --git a/arch/powerpc/platforms/cell/spufs/backing_ops.c b/arch/powerpc/platforms/cell/spufs/backing_ops.c
index 1a9a756b0b2f7..857580a78bbd8 100644
--- a/arch/powerpc/platforms/cell/spufs/backing_ops.c
+++ b/arch/powerpc/platforms/cell/spufs/backing_ops.c
@@ -101,9 +101,9 @@ static __poll_t spu_backing_mbox_stat_poll(struct spu_context *ctx,
 	   but first mark any pending interrupts as done so
 	   we don't get woken up unnecessarily */
 
-	if (events & (POLLIN | POLLRDNORM)) {
+	if (events & (EPOLLIN | EPOLLRDNORM)) {
 		if (stat & 0xff0000)
-			ret |= POLLIN | POLLRDNORM;
+			ret |= EPOLLIN | EPOLLRDNORM;
 		else {
 			ctx->csa.priv1.int_stat_class2_RW &=
 				~CLASS2_MAILBOX_INTR;
@@ -111,9 +111,9 @@ static __poll_t spu_backing_mbox_stat_poll(struct spu_context *ctx,
 				CLASS2_ENABLE_MAILBOX_INTR;
 		}
 	}
-	if (events & (POLLOUT | POLLWRNORM)) {
+	if (events & (EPOLLOUT | EPOLLWRNORM)) {
 		if (stat & 0x00ff00)
-			ret = POLLOUT | POLLWRNORM;
+			ret = EPOLLOUT | EPOLLWRNORM;
 		else {
 			ctx->csa.priv1.int_stat_class2_RW &=
 				~CLASS2_MAILBOX_THRESHOLD_INTR;
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index c1be486da8993..469bdd0b748f7 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -774,7 +774,7 @@ static __poll_t spufs_ibox_poll(struct file *file, poll_table *wait)
 	 * that poll should not sleep.  Will be fixed later.
 	 */
 	mutex_lock(&ctx->state_mutex);
-	mask = ctx->ops->mbox_stat_poll(ctx, POLLIN | POLLRDNORM);
+	mask = ctx->ops->mbox_stat_poll(ctx, EPOLLIN | EPOLLRDNORM);
 	spu_release(ctx);
 
 	return mask;
@@ -910,7 +910,7 @@ static __poll_t spufs_wbox_poll(struct file *file, poll_table *wait)
 	 * that poll should not sleep.  Will be fixed later.
 	 */
 	mutex_lock(&ctx->state_mutex);
-	mask = ctx->ops->mbox_stat_poll(ctx, POLLOUT | POLLWRNORM);
+	mask = ctx->ops->mbox_stat_poll(ctx, EPOLLOUT | EPOLLWRNORM);
 	spu_release(ctx);
 
 	return mask;
@@ -1710,9 +1710,9 @@ static __poll_t spufs_mfc_poll(struct file *file,poll_table *wait)
 
 	mask = 0;
 	if (free_elements & 0xffff)
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 	if (tagstatus & ctx->tagwait)
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	pr_debug("%s: free %d tagstatus %d tagwait %d\n", __func__,
 		free_elements, tagstatus, ctx->tagwait);
@@ -2469,7 +2469,7 @@ static __poll_t spufs_switch_log_poll(struct file *file, poll_table *wait)
 		return rc;
 
 	if (spufs_switch_log_used(ctx) > 0)
-		mask |= POLLIN;
+		mask |= EPOLLIN;
 
 	spu_release(ctx);
 
diff --git a/arch/powerpc/platforms/cell/spufs/hw_ops.c b/arch/powerpc/platforms/cell/spufs/hw_ops.c
index fff58198b5b6e..ae9d24d31eed8 100644
--- a/arch/powerpc/platforms/cell/spufs/hw_ops.c
+++ b/arch/powerpc/platforms/cell/spufs/hw_ops.c
@@ -70,17 +70,17 @@ static __poll_t spu_hw_mbox_stat_poll(struct spu_context *ctx, __poll_t events)
 	   but first mark any pending interrupts as done so
 	   we don't get woken up unnecessarily */
 
-	if (events & (POLLIN | POLLRDNORM)) {
+	if (events & (EPOLLIN | EPOLLRDNORM)) {
 		if (stat & 0xff0000)
-			ret |= POLLIN | POLLRDNORM;
+			ret |= EPOLLIN | EPOLLRDNORM;
 		else {
 			spu_int_stat_clear(spu, 2, CLASS2_MAILBOX_INTR);
 			spu_int_mask_or(spu, 2, CLASS2_ENABLE_MAILBOX_INTR);
 		}
 	}
-	if (events & (POLLOUT | POLLWRNORM)) {
+	if (events & (EPOLLOUT | EPOLLWRNORM)) {
 		if (stat & 0x00ff00)
-			ret = POLLOUT | POLLWRNORM;
+			ret = EPOLLOUT | EPOLLWRNORM;
 		else {
 			spu_int_stat_clear(spu, 2,
 					CLASS2_MAILBOX_THRESHOLD_INTR);
diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c
index c18de0a9b1bdb..4070bb4e9da4a 100644
--- a/arch/powerpc/platforms/powernv/opal-prd.c
+++ b/arch/powerpc/platforms/powernv/opal-prd.c
@@ -153,7 +153,7 @@ static __poll_t opal_prd_poll(struct file *file,
 	poll_wait(file, &opal_prd_msg_wait, wait);
 
 	if (!opal_msg_queue_empty())
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 
 	return 0;
 }
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index dc0ca8e29c750..dfcbe6924eafa 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1515,7 +1515,7 @@ static __poll_t do_poll(struct file *fp, poll_table *wait)
 		return 0;
 	poll_wait(fp, &apm_waitqueue, wait);
 	if (!queue_empty(as))
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	return 0;
 }
 
diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
index 213e8c2ca702f..97685a0c31751 100644
--- a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
@@ -247,9 +247,9 @@ static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait)
 {
 	poll_wait(file, &mce_chrdev_wait, wait);
 	if (READ_ONCE(mcelog.next))
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	if (!mce_apei_read_done && apei_check_mce())
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	return 0;
 }
 
diff --git a/block/bsg.c b/block/bsg.c
index 2e2c1e2222097..06dc96e1f6700 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -849,9 +849,9 @@ static __poll_t bsg_poll(struct file *file, poll_table *wait)
 
 	spin_lock_irq(&bd->lock);
 	if (!list_empty(&bd->done_list))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	if (bd->queued_cmds < bd->max_queue)
-		mask |= POLLOUT;
+		mask |= EPOLLOUT;
 	spin_unlock_irq(&bd->lock);
 
 	return mask;
diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index 0f8d8d5523c31..c49766b03165c 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -735,9 +735,9 @@ void af_alg_wmem_wakeup(struct sock *sk)
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
 	if (skwq_has_sleeper(wq))
-		wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
-							   POLLRDNORM |
-							   POLLRDBAND);
+		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN |
+							   EPOLLRDNORM |
+							   EPOLLRDBAND);
 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
 	rcu_read_unlock();
 }
@@ -800,9 +800,9 @@ void af_alg_data_wakeup(struct sock *sk)
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
 	if (skwq_has_sleeper(wq))
-		wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
-							   POLLRDNORM |
-							   POLLRDBAND);
+		wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
+							   EPOLLRDNORM |
+							   EPOLLRDBAND);
 	sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	rcu_read_unlock();
 }
@@ -1076,10 +1076,10 @@ __poll_t af_alg_poll(struct file *file, struct socket *sock,
 	mask = 0;
 
 	if (!ctx->more || ctx->used)
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	if (af_alg_writable(sk))
-		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
 
 	return mask;
 }
diff --git a/drivers/acpi/acpi_dbg.c b/drivers/acpi/acpi_dbg.c
index 2ff5c8c04e3b9..f21c99ec46ee0 100644
--- a/drivers/acpi/acpi_dbg.c
+++ b/drivers/acpi/acpi_dbg.c
@@ -724,9 +724,9 @@ static __poll_t acpi_aml_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, &acpi_aml_io.wait, wait);
 	if (acpi_aml_user_readable())
-		masks |= POLLIN | POLLRDNORM;
+		masks |= EPOLLIN | EPOLLRDNORM;
 	if (acpi_aml_user_writable())
-		masks |= POLLOUT | POLLWRNORM;
+		masks |= EPOLLOUT | EPOLLWRNORM;
 
 	return masks;
 }
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index d21040c5d343f..15e3d3c2260dd 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -4371,7 +4371,7 @@ static int binder_thread_release(struct binder_proc *proc,
 	 */
 	if ((thread->looper & BINDER_LOOPER_STATE_POLL) &&
 	    waitqueue_active(&thread->wait)) {
-		wake_up_poll(&thread->wait, POLLHUP | POLLFREE);
+		wake_up_poll(&thread->wait, EPOLLHUP | POLLFREE);
 	}
 
 	binder_inner_proc_unlock(thread->proc);
@@ -4401,7 +4401,7 @@ static __poll_t binder_poll(struct file *filp,
 	poll_wait(filp, &thread->wait, wait);
 
 	if (binder_has_work(thread, wait_for_proc_work))
-		return POLLIN;
+		return EPOLLIN;
 
 	return 0;
 }
diff --git a/drivers/bluetooth/hci_vhci.c b/drivers/bluetooth/hci_vhci.c
index 0521748a1972a..22f9145a426fd 100644
--- a/drivers/bluetooth/hci_vhci.c
+++ b/drivers/bluetooth/hci_vhci.c
@@ -306,9 +306,9 @@ static __poll_t vhci_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &data->read_wait, wait);
 
 	if (!skb_queue_empty(&data->readq))
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 
-	return POLLOUT | POLLWRNORM;
+	return EPOLLOUT | EPOLLWRNORM;
 }
 
 static void vhci_open_timeout(struct work_struct *work)
diff --git a/drivers/char/apm-emulation.c b/drivers/char/apm-emulation.c
index a2a1c1478cd08..a5e2f9e557eaa 100644
--- a/drivers/char/apm-emulation.c
+++ b/drivers/char/apm-emulation.c
@@ -241,7 +241,7 @@ static __poll_t apm_poll(struct file *fp, poll_table * wait)
 	struct apm_user *as = fp->private_data;
 
 	poll_wait(fp, &apm_waitqueue, wait);
-	return queue_empty(&as->queue) ? 0 : POLLIN | POLLRDNORM;
+	return queue_empty(&as->queue) ? 0 : EPOLLIN | EPOLLRDNORM;
 }
 
 /*
diff --git a/drivers/char/dsp56k.c b/drivers/char/dsp56k.c
index 2f92cc46698b2..06749e295adaf 100644
--- a/drivers/char/dsp56k.c
+++ b/drivers/char/dsp56k.c
@@ -414,7 +414,7 @@ static __poll_t dsp56k_poll(struct file *file, poll_table *wait)
 	{
 	case DSP56K_DEV_56001:
 		/* poll_wait(file, ???, wait); */
-		return POLLIN | POLLRDNORM | POLLOUT;
+		return EPOLLIN | EPOLLRDNORM | EPOLLOUT;
 
 	default:
 		printk("DSP56k driver: Unknown minor device: %d\n", dev);
diff --git a/drivers/char/dtlk.c b/drivers/char/dtlk.c
index 2697c22e3be25..f882460b5a442 100644
--- a/drivers/char/dtlk.c
+++ b/drivers/char/dtlk.c
@@ -62,7 +62,7 @@
 #include <linux/uaccess.h>	/* for get_user, etc. */
 #include <linux/wait.h>		/* for wait_queue */
 #include <linux/init.h>		/* for __init, module_{init,exit} */
-#include <linux/poll.h>		/* for POLLIN, etc. */
+#include <linux/poll.h>		/* for EPOLLIN, etc. */
 #include <linux/dtlk.h>		/* local header file for DoubleTalk values */
 
 #ifdef TRACING
@@ -244,11 +244,11 @@ static __poll_t dtlk_poll(struct file *file, poll_table * wait)
 
 	if (dtlk_has_indexing && dtlk_readable()) {
 	        del_timer(&dtlk_timer);
-		mask = POLLIN | POLLRDNORM;
+		mask = EPOLLIN | EPOLLRDNORM;
 	}
 	if (dtlk_writeable()) {
 	        del_timer(&dtlk_timer);
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 	}
 	/* there are no exception conditions */
 
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index dbed4953f86cd..be426eb2a3535 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -359,7 +359,7 @@ static __poll_t hpet_poll(struct file *file, poll_table * wait)
 	spin_unlock_irq(&hpet_lock);
 
 	if (v != 0)
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 
 	return 0;
 }
diff --git a/drivers/char/ipmi/bt-bmc.c b/drivers/char/ipmi/bt-bmc.c
index 7992c870b0a21..c95b93b7598ba 100644
--- a/drivers/char/ipmi/bt-bmc.c
+++ b/drivers/char/ipmi/bt-bmc.c
@@ -349,10 +349,10 @@ static __poll_t bt_bmc_poll(struct file *file, poll_table *wait)
 	ctrl = bt_inb(bt_bmc, BT_CTRL);
 
 	if (ctrl & BT_CTRL_H2B_ATN)
-		mask |= POLLIN;
+		mask |= EPOLLIN;
 
 	if (!(ctrl & (BT_CTRL_H_BUSY | BT_CTRL_B2H_ATN)))
-		mask |= POLLOUT;
+		mask |= EPOLLOUT;
 
 	return mask;
 }
diff --git a/drivers/char/ipmi/ipmi_devintf.c b/drivers/char/ipmi/ipmi_devintf.c
index a011a7739f5e7..5f1bc91747358 100644
--- a/drivers/char/ipmi/ipmi_devintf.c
+++ b/drivers/char/ipmi/ipmi_devintf.c
@@ -89,7 +89,7 @@ static __poll_t ipmi_poll(struct file *file, poll_table *wait)
 	spin_lock_irqsave(&priv->recv_msg_lock, flags);
 
 	if (!list_empty(&(priv->recv_msgs)))
-		mask |= (POLLIN | POLLRDNORM);
+		mask |= (EPOLLIN | EPOLLRDNORM);
 
 	spin_unlock_irqrestore(&priv->recv_msg_lock, flags);
 
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index 34bc1f3ca4143..a58acdcf74146 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -895,7 +895,7 @@ static __poll_t ipmi_poll(struct file *file, poll_table *wait)
 
 	spin_lock(&ipmi_read_lock);
 	if (data_to_read)
-		mask |= (POLLIN | POLLRDNORM);
+		mask |= (EPOLLIN | EPOLLRDNORM);
 	spin_unlock(&ipmi_read_lock);
 
 	return mask;
diff --git a/drivers/char/pcmcia/cm4040_cs.c b/drivers/char/pcmcia/cm4040_cs.c
index 819fe37a3683b..f80965407d3cc 100644
--- a/drivers/char/pcmcia/cm4040_cs.c
+++ b/drivers/char/pcmcia/cm4040_cs.c
@@ -423,9 +423,9 @@ static __poll_t cm4040_poll(struct file *filp, poll_table *wait)
 	poll_wait(filp, &dev->poll_wait, wait);
 
 	if (test_and_clear_bit(BS_READABLE, &dev->buffer_status))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	if (test_and_clear_bit(BS_WRITABLE, &dev->buffer_status))
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 
 	DEBUGP(2, dev, "<- cm4040_poll(%u)\n", mask);
 
diff --git a/drivers/char/ppdev.c b/drivers/char/ppdev.c
index 7a56d1a13ec3a..1ae77b41050ab 100644
--- a/drivers/char/ppdev.c
+++ b/drivers/char/ppdev.c
@@ -776,7 +776,7 @@ static __poll_t pp_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, &pp->irq_wait, wait);
 	if (atomic_read(&pp->irqc))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	return mask;
 }
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 80f2c326db47d..e5b3d3ba46604 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1793,9 +1793,9 @@ random_poll(struct file *file, poll_table * wait)
 	poll_wait(file, &random_write_wait, wait);
 	mask = 0;
 	if (ENTROPY_BITS(&input_pool) >= random_read_wakeup_bits)
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	if (ENTROPY_BITS(&input_pool) < random_write_wakeup_bits)
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 	return mask;
 }
 
diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c
index c6a317120a550..0c858d027bf3d 100644
--- a/drivers/char/rtc.c
+++ b/drivers/char/rtc.c
@@ -804,7 +804,7 @@ static __poll_t rtc_poll(struct file *file, poll_table *wait)
 	spin_unlock_irq(&rtc_lock);
 
 	if (l != 0)
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	return 0;
 }
 #endif
diff --git a/drivers/char/snsc.c b/drivers/char/snsc.c
index 7f49fa0f41d77..5918ea7499bb1 100644
--- a/drivers/char/snsc.c
+++ b/drivers/char/snsc.c
@@ -340,10 +340,10 @@ scdrv_poll(struct file *file, struct poll_table_struct *wait)
 
 	if (status > 0) {
 		if (status & SAL_IROUTER_INTR_RECV) {
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 		}
 		if (status & SAL_IROUTER_INTR_XMIT) {
-			mask |= POLLOUT | POLLWRNORM;
+			mask |= EPOLLOUT | EPOLLWRNORM;
 		}
 	}
 
diff --git a/drivers/char/sonypi.c b/drivers/char/sonypi.c
index fc041c462aa4e..186689833231d 100644
--- a/drivers/char/sonypi.c
+++ b/drivers/char/sonypi.c
@@ -944,7 +944,7 @@ static __poll_t sonypi_misc_poll(struct file *file, poll_table *wait)
 {
 	poll_wait(file, &sonypi_device.fifo_proc_list, wait);
 	if (kfifo_len(&sonypi_device.fifo))
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	return 0;
 }
 
diff --git a/drivers/char/tpm/tpm_vtpm_proxy.c b/drivers/char/tpm/tpm_vtpm_proxy.c
index 674218b50b13b..e4f79f9204507 100644
--- a/drivers/char/tpm/tpm_vtpm_proxy.c
+++ b/drivers/char/tpm/tpm_vtpm_proxy.c
@@ -180,15 +180,15 @@ static __poll_t vtpm_proxy_fops_poll(struct file *filp, poll_table *wait)
 
 	poll_wait(filp, &proxy_dev->wq, wait);
 
-	ret = POLLOUT;
+	ret = EPOLLOUT;
 
 	mutex_lock(&proxy_dev->buf_lock);
 
 	if (proxy_dev->req_len)
-		ret |= POLLIN | POLLRDNORM;
+		ret |= EPOLLIN | EPOLLRDNORM;
 
 	if (!(proxy_dev->state & STATE_OPENED_FLAG))
-		ret |= POLLHUP;
+		ret |= EPOLLHUP;
 
 	mutex_unlock(&proxy_dev->buf_lock);
 
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 813a2e46824d4..468f061340126 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -992,15 +992,15 @@ static __poll_t port_fops_poll(struct file *filp, poll_table *wait)
 
 	if (!port->guest_connected) {
 		/* Port got unplugged */
-		return POLLHUP;
+		return EPOLLHUP;
 	}
 	ret = 0;
 	if (!will_read_block(port))
-		ret |= POLLIN | POLLRDNORM;
+		ret |= EPOLLIN | EPOLLRDNORM;
 	if (!will_write_block(port))
-		ret |= POLLOUT;
+		ret |= EPOLLOUT;
 	if (!port->host_connected)
-		ret |= POLLHUP;
+		ret |= EPOLLHUP;
 
 	return ret;
 }
diff --git a/drivers/char/xillybus/xillybus_core.c b/drivers/char/xillybus/xillybus_core.c
index 88e1cf475d3f5..a11af94e2e650 100644
--- a/drivers/char/xillybus/xillybus_core.c
+++ b/drivers/char/xillybus/xillybus_core.c
@@ -1758,15 +1758,15 @@ static __poll_t xillybus_poll(struct file *filp, poll_table *wait)
 
 		spin_lock_irqsave(&channel->wr_spinlock, flags);
 		if (!channel->wr_empty || channel->wr_ready)
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 
 		if (channel->wr_hangup)
 			/*
-			 * Not POLLHUP, because its behavior is in the
-			 * mist, and POLLIN does what we want: Wake up
+			 * Not EPOLLHUP, because its behavior is in the
+			 * mist, and EPOLLIN does what we want: Wake up
 			 * the read file descriptor so it sees EOF.
 			 */
-			mask |=  POLLIN | POLLRDNORM;
+			mask |=  EPOLLIN | EPOLLRDNORM;
 		spin_unlock_irqrestore(&channel->wr_spinlock, flags);
 	}
 
@@ -1781,12 +1781,12 @@ static __poll_t xillybus_poll(struct file *filp, poll_table *wait)
 
 		spin_lock_irqsave(&channel->rd_spinlock, flags);
 		if (!channel->rd_full)
-			mask |= POLLOUT | POLLWRNORM;
+			mask |= EPOLLOUT | EPOLLWRNORM;
 		spin_unlock_irqrestore(&channel->rd_spinlock, flags);
 	}
 
 	if (channel->endpoint->fatal_error)
-		mask |= POLLERR;
+		mask |= EPOLLERR;
 
 	return mask;
 }
diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 5394507138381..d78d5fc173dc3 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -135,10 +135,10 @@ static loff_t dma_buf_llseek(struct file *file, loff_t offset, int whence)
  * Userspace can query the state of these implicitly tracked fences using poll()
  * and related system calls:
  *
- * - Checking for POLLIN, i.e. read access, can be use to query the state of the
+ * - Checking for EPOLLIN, i.e. read access, can be use to query the state of the
  *   most recent write or exclusive fence.
  *
- * - Checking for POLLOUT, i.e. write access, can be used to query the state of
+ * - Checking for EPOLLOUT, i.e. write access, can be used to query the state of
  *   all attached fences, shared and exclusive ones.
  *
  * Note that this only signals the completion of the respective fences, i.e. the
@@ -168,13 +168,13 @@ static __poll_t dma_buf_poll(struct file *file, poll_table *poll)
 
 	dmabuf = file->private_data;
 	if (!dmabuf || !dmabuf->resv)
-		return POLLERR;
+		return EPOLLERR;
 
 	resv = dmabuf->resv;
 
 	poll_wait(file, &dmabuf->poll, poll);
 
-	events = poll_requested_events(poll) & (POLLIN | POLLOUT);
+	events = poll_requested_events(poll) & (EPOLLIN | EPOLLOUT);
 	if (!events)
 		return 0;
 
@@ -193,12 +193,12 @@ static __poll_t dma_buf_poll(struct file *file, poll_table *poll)
 		goto retry;
 	}
 
-	if (fence_excl && (!(events & POLLOUT) || shared_count == 0)) {
+	if (fence_excl && (!(events & EPOLLOUT) || shared_count == 0)) {
 		struct dma_buf_poll_cb_t *dcb = &dmabuf->cb_excl;
-		__poll_t pevents = POLLIN;
+		__poll_t pevents = EPOLLIN;
 
 		if (shared_count == 0)
-			pevents |= POLLOUT;
+			pevents |= EPOLLOUT;
 
 		spin_lock_irq(&dmabuf->poll.lock);
 		if (dcb->active) {
@@ -228,19 +228,19 @@ static __poll_t dma_buf_poll(struct file *file, poll_table *poll)
 		}
 	}
 
-	if ((events & POLLOUT) && shared_count > 0) {
+	if ((events & EPOLLOUT) && shared_count > 0) {
 		struct dma_buf_poll_cb_t *dcb = &dmabuf->cb_shared;
 		int i;
 
 		/* Only queue a new callback if no event has fired yet */
 		spin_lock_irq(&dmabuf->poll.lock);
 		if (dcb->active)
-			events &= ~POLLOUT;
+			events &= ~EPOLLOUT;
 		else
-			dcb->active = POLLOUT;
+			dcb->active = EPOLLOUT;
 		spin_unlock_irq(&dmabuf->poll.lock);
 
-		if (!(events & POLLOUT))
+		if (!(events & EPOLLOUT))
 			goto out;
 
 		for (i = 0; i < shared_count; ++i) {
@@ -253,14 +253,14 @@ static __poll_t dma_buf_poll(struct file *file, poll_table *poll)
 				 *
 				 * call dma_buf_poll_cb and force a recheck!
 				 */
-				events &= ~POLLOUT;
+				events &= ~EPOLLOUT;
 				dma_buf_poll_cb(NULL, &dcb->cb);
 				break;
 			}
 			if (!dma_fence_add_callback(fence, &dcb->cb,
 						    dma_buf_poll_cb)) {
 				dma_fence_put(fence);
-				events &= ~POLLOUT;
+				events &= ~EPOLLOUT;
 				break;
 			}
 			dma_fence_put(fence);
diff --git a/drivers/dma-buf/sync_file.c b/drivers/dma-buf/sync_file.c
index 8e8c4a12a0bc0..35dd06479867f 100644
--- a/drivers/dma-buf/sync_file.c
+++ b/drivers/dma-buf/sync_file.c
@@ -325,7 +325,7 @@ static __poll_t sync_file_poll(struct file *file, poll_table *wait)
 			wake_up_all(&sync_file->wq);
 	}
 
-	return dma_fence_is_signaled(sync_file->fence) ? POLLIN : 0;
+	return dma_fence_is_signaled(sync_file->fence) ? EPOLLIN : 0;
 }
 
 static long sync_file_ioctl_merge(struct sync_file *sync_file,
diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index 523391bb3fbe3..f0587273940e4 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -1792,9 +1792,9 @@ static __poll_t fw_device_op_poll(struct file *file, poll_table * pt)
 	poll_wait(file, &client->wait, pt);
 
 	if (fw_device_is_shutdown(client->device))
-		mask |= POLLHUP | POLLERR;
+		mask |= EPOLLHUP | EPOLLERR;
 	if (!list_empty(&client->event_list))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	return mask;
 }
diff --git a/drivers/firewire/nosy.c b/drivers/firewire/nosy.c
index fee2e9e7ea20a..a128dd1126ae4 100644
--- a/drivers/firewire/nosy.c
+++ b/drivers/firewire/nosy.c
@@ -337,10 +337,10 @@ nosy_poll(struct file *file, poll_table *pt)
 	poll_wait(file, &client->buffer.wait, pt);
 
 	if (atomic_read(&client->buffer.size) > 0)
-		ret = POLLIN | POLLRDNORM;
+		ret = EPOLLIN | EPOLLRDNORM;
 
 	if (list_empty(&client->lynx->link))
-		ret |= POLLHUP;
+		ret |= EPOLLHUP;
 
 	return ret;
 }
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 36ca5064486e8..d66de67ef307c 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -630,7 +630,7 @@ static __poll_t lineevent_poll(struct file *filep,
 	poll_wait(filep, &le->wait, wait);
 
 	if (!kfifo_is_empty(&le->events))
-		events = POLLIN | POLLRDNORM;
+		events = EPOLLIN | EPOLLRDNORM;
 
 	return events;
 }
@@ -775,7 +775,7 @@ static irqreturn_t lineevent_irq_thread(int irq, void *p)
 
 	ret = kfifo_put(&le->events, ge);
 	if (ret != 0)
-		wake_up_poll(&le->wait, POLLIN);
+		wake_up_poll(&le->wait, EPOLLIN);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
index 9a17bd3639d12..e394799979a6e 100644
--- a/drivers/gpu/drm/drm_file.c
+++ b/drivers/gpu/drm/drm_file.c
@@ -567,7 +567,7 @@ __poll_t drm_poll(struct file *filp, struct poll_table_struct *wait)
 	poll_wait(filp, &file_priv->event_wait, wait);
 
 	if (!list_empty(&file_priv->event_list))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	return mask;
 }
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index e42d9a4de322e..0be50e43507de 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -244,7 +244,7 @@
  * The two separate pointers let us decouple read()s from tail pointer aging.
  *
  * The tail pointers are checked and updated at a limited rate within a hrtimer
- * callback (the same callback that is used for delivering POLLIN events)
+ * callback (the same callback that is used for delivering EPOLLIN events)
  *
  * Initially the tails are marked invalid with %INVALID_TAIL_PTR which
  * indicates that an updated tail pointer is needed.
@@ -2292,13 +2292,13 @@ static ssize_t i915_perf_read(struct file *file,
 		mutex_unlock(&dev_priv->perf.lock);
 	}
 
-	/* We allow the poll checking to sometimes report false positive POLLIN
+	/* We allow the poll checking to sometimes report false positive EPOLLIN
 	 * events where we might actually report EAGAIN on read() if there's
 	 * not really any data available. In this situation though we don't
-	 * want to enter a busy loop between poll() reporting a POLLIN event
+	 * want to enter a busy loop between poll() reporting a EPOLLIN event
 	 * and read() returning -EAGAIN. Clearing the oa.pollin state here
 	 * effectively ensures we back off until the next hrtimer callback
-	 * before reporting another POLLIN event.
+	 * before reporting another EPOLLIN event.
 	 */
 	if (ret >= 0 || ret == -EAGAIN) {
 		/* Maybe make ->pollin per-stream state if we support multiple
@@ -2358,7 +2358,7 @@ static __poll_t i915_perf_poll_locked(struct drm_i915_private *dev_priv,
 	 * samples to read.
 	 */
 	if (dev_priv->perf.oa.pollin)
-		events |= POLLIN;
+		events |= EPOLLIN;
 
 	return events;
 }
diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c
index dfd8d0048980a..1c5e74cb9279b 100644
--- a/drivers/gpu/vga/vgaarb.c
+++ b/drivers/gpu/vga/vgaarb.c
@@ -1271,7 +1271,7 @@ static __poll_t vga_arb_fpoll(struct file *file, poll_table *wait)
 	pr_debug("%s\n", __func__);
 
 	poll_wait(file, &vga_wait_queue, wait);
-	return POLLIN;
+	return EPOLLIN;
 }
 
 static int vga_arb_open(struct inode *inode, struct file *file)
diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c
index c783fd5ef8096..4f4e7a08a07be 100644
--- a/drivers/hid/hid-debug.c
+++ b/drivers/hid/hid-debug.c
@@ -1185,9 +1185,9 @@ static __poll_t hid_debug_events_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, &list->hdev->debug_wait, wait);
 	if (list->head != list->tail)
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	if (!list->hdev->debug)
-		return POLLERR | POLLHUP;
+		return EPOLLERR | EPOLLHUP;
 	return 0;
 }
 
diff --git a/drivers/hid/hid-roccat.c b/drivers/hid/hid-roccat.c
index b7e86aba6f337..5be8de70c6517 100644
--- a/drivers/hid/hid-roccat.c
+++ b/drivers/hid/hid-roccat.c
@@ -142,9 +142,9 @@ static __poll_t roccat_poll(struct file *file, poll_table *wait)
 	struct roccat_reader *reader = file->private_data;
 	poll_wait(file, &reader->device->wait, wait);
 	if (reader->cbuf_start != reader->device->cbuf_end)
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	if (!reader->device->exist)
-		return POLLERR | POLLHUP;
+		return EPOLLERR | EPOLLHUP;
 	return 0;
 }
 
diff --git a/drivers/hid/hid-sensor-custom.c b/drivers/hid/hid-sensor-custom.c
index 21ed6c55c40a4..e8a114157f87b 100644
--- a/drivers/hid/hid-sensor-custom.c
+++ b/drivers/hid/hid-sensor-custom.c
@@ -714,7 +714,7 @@ static __poll_t hid_sensor_custom_poll(struct file *file,
 	poll_wait(file, &sensor_inst->wait, wait);
 
 	if (!kfifo_is_empty(&sensor_inst->data_fifo))
-		mask = POLLIN | POLLRDNORM;
+		mask = EPOLLIN | EPOLLRDNORM;
 
 	return mask;
 }
diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c
index be210219f9829..fbfcc80094329 100644
--- a/drivers/hid/hidraw.c
+++ b/drivers/hid/hidraw.c
@@ -255,9 +255,9 @@ static __poll_t hidraw_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, &list->hidraw->wait, wait);
 	if (list->head != list->tail)
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	if (!list->hidraw->exist)
-		return POLLERR | POLLHUP;
+		return EPOLLERR | EPOLLHUP;
 	return 0;
 }
 
diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c
index fc43850a155ee..4e0e7baf85136 100644
--- a/drivers/hid/uhid.c
+++ b/drivers/hid/uhid.c
@@ -760,7 +760,7 @@ static __poll_t uhid_char_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &uhid->waitq, wait);
 
 	if (uhid->head != uhid->tail)
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 
 	return 0;
 }
diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c
index 0ff3e7e70c8df..e3ce233f8bdcc 100644
--- a/drivers/hid/usbhid/hiddev.c
+++ b/drivers/hid/usbhid/hiddev.c
@@ -428,9 +428,9 @@ static __poll_t hiddev_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, &list->hiddev->wait, wait);
 	if (list->head != list->tail)
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	if (!list->hiddev->exist)
-		return POLLERR | POLLHUP;
+		return EPOLLERR | EPOLLHUP;
 	return 0;
 }
 
diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c
index 8fbbacb0fe21e..324cb8ec94050 100644
--- a/drivers/hsi/clients/cmt_speech.c
+++ b/drivers/hsi/clients/cmt_speech.c
@@ -1132,9 +1132,9 @@ static __poll_t cs_char_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &cs_char_data.wait, wait);
 	spin_lock_bh(&csdata->lock);
 	if (!list_empty(&csdata->chardev_queue))
-		ret = POLLIN | POLLRDNORM;
+		ret = EPOLLIN | EPOLLRDNORM;
 	else if (!list_empty(&csdata->dataind_queue))
-		ret = POLLIN | POLLRDNORM;
+		ret = EPOLLIN | EPOLLRDNORM;
 	spin_unlock_bh(&csdata->lock);
 
 	return ret;
diff --git a/drivers/hv/hv_utils_transport.c b/drivers/hv/hv_utils_transport.c
index 047959e74bb10..8327775279362 100644
--- a/drivers/hv/hv_utils_transport.c
+++ b/drivers/hv/hv_utils_transport.c
@@ -113,10 +113,10 @@ static __poll_t hvt_op_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &hvt->outmsg_q, wait);
 
 	if (hvt->mode == HVUTIL_TRANSPORT_DESTROY)
-		return POLLERR | POLLHUP;
+		return EPOLLERR | EPOLLHUP;
 
 	if (hvt->outmsg_len > 0)
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 
 	return 0;
 }
diff --git a/drivers/iio/buffer/industrialio-buffer-dma.c b/drivers/iio/buffer/industrialio-buffer-dma.c
index ff03324dee132..05e0c353e0898 100644
--- a/drivers/iio/buffer/industrialio-buffer-dma.c
+++ b/drivers/iio/buffer/industrialio-buffer-dma.c
@@ -222,7 +222,7 @@ void iio_dma_buffer_block_done(struct iio_dma_buffer_block *block)
 	spin_unlock_irqrestore(&queue->list_lock, flags);
 
 	iio_buffer_block_put_atomic(block);
-	wake_up_interruptible_poll(&queue->buffer.pollq, POLLIN | POLLRDNORM);
+	wake_up_interruptible_poll(&queue->buffer.pollq, EPOLLIN | EPOLLRDNORM);
 }
 EXPORT_SYMBOL_GPL(iio_dma_buffer_block_done);
 
@@ -251,7 +251,7 @@ void iio_dma_buffer_block_list_abort(struct iio_dma_buffer_queue *queue,
 	}
 	spin_unlock_irqrestore(&queue->list_lock, flags);
 
-	wake_up_interruptible_poll(&queue->buffer.pollq, POLLIN | POLLRDNORM);
+	wake_up_interruptible_poll(&queue->buffer.pollq, EPOLLIN | EPOLLRDNORM);
 }
 EXPORT_SYMBOL_GPL(iio_dma_buffer_block_list_abort);
 
diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
index 6184c100a94a5..79abf70a126dd 100644
--- a/drivers/iio/industrialio-buffer.c
+++ b/drivers/iio/industrialio-buffer.c
@@ -166,7 +166,7 @@ ssize_t iio_buffer_read_first_n_outer(struct file *filp, char __user *buf,
  * @wait:	Poll table structure pointer for which the driver adds
  *		a wait queue
  *
- * Return: (POLLIN | POLLRDNORM) if data is available for reading
+ * Return: (EPOLLIN | EPOLLRDNORM) if data is available for reading
  *	   or 0 for other cases
  */
 __poll_t iio_buffer_poll(struct file *filp,
@@ -180,7 +180,7 @@ __poll_t iio_buffer_poll(struct file *filp,
 
 	poll_wait(filp, &rb->pollq, wait);
 	if (iio_buffer_ready(indio_dev, rb, rb->watermark, 0))
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	return 0;
 }
 
@@ -1396,7 +1396,7 @@ static int iio_push_to_buffer(struct iio_buffer *buffer, const void *data)
 	 * We can't just test for watermark to decide if we wake the poll queue
 	 * because read may request less samples than the watermark.
 	 */
-	wake_up_interruptible_poll(&buffer->pollq, POLLIN | POLLRDNORM);
+	wake_up_interruptible_poll(&buffer->pollq, EPOLLIN | EPOLLRDNORM);
 	return 0;
 }
 
diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c
index 0bcf073e46dbe..c6dfdf0aaac51 100644
--- a/drivers/iio/industrialio-event.c
+++ b/drivers/iio/industrialio-event.c
@@ -80,7 +80,7 @@ int iio_push_event(struct iio_dev *indio_dev, u64 ev_code, s64 timestamp)
 
 		copied = kfifo_put(&ev_int->det_events, ev);
 		if (copied != 0)
-			wake_up_poll(&ev_int->wait, POLLIN);
+			wake_up_poll(&ev_int->wait, EPOLLIN);
 	}
 
 	return 0;
@@ -92,7 +92,7 @@ EXPORT_SYMBOL(iio_push_event);
  * @filep:	File structure pointer to identify the device
  * @wait:	Poll table pointer to add the wait queue on
  *
- * Return: (POLLIN | POLLRDNORM) if data is available for reading
+ * Return: (EPOLLIN | EPOLLRDNORM) if data is available for reading
  *	   or a negative error code on failure
  */
 static __poll_t iio_event_poll(struct file *filep,
@@ -108,7 +108,7 @@ static __poll_t iio_event_poll(struct file *filep,
 	poll_wait(filep, &ev_int->wait, wait);
 
 	if (!kfifo_is_empty(&ev_int->det_events))
-		events = POLLIN | POLLRDNORM;
+		events = EPOLLIN | EPOLLRDNORM;
 
 	return events;
 }
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
index 8ae636bb09e57..01702265c1e1a 100644
--- a/drivers/infiniband/core/ucm.c
+++ b/drivers/infiniband/core/ucm.c
@@ -1144,7 +1144,7 @@ static __poll_t ib_ucm_poll(struct file *filp,
 	poll_wait(filp, &file->poll_wait, wait);
 
 	if (!list_empty(&file->events))
-		mask = POLLIN | POLLRDNORM;
+		mask = EPOLLIN | EPOLLRDNORM;
 
 	return mask;
 }
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 6ba4231f2b073..f015f1bf88c9c 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -1639,7 +1639,7 @@ static __poll_t ucma_poll(struct file *filp, struct poll_table_struct *wait)
 	poll_wait(filp, &file->poll_wait, wait);
 
 	if (!list_empty(&file->event_list))
-		mask = POLLIN | POLLRDNORM;
+		mask = EPOLLIN | EPOLLRDNORM;
 
 	return mask;
 }
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index 78c77962422e0..bb98c9e4a7fd2 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -633,12 +633,12 @@ static __poll_t ib_umad_poll(struct file *filp, struct poll_table_struct *wait)
 	struct ib_umad_file *file = filp->private_data;
 
 	/* we will always be able to post a MAD send */
-	__poll_t mask = POLLOUT | POLLWRNORM;
+	__poll_t mask = EPOLLOUT | EPOLLWRNORM;
 
 	poll_wait(filp, &file->recv_wait, wait);
 
 	if (!list_empty(&file->recv_list))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	return mask;
 }
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 5b811bf574d69..395a3b091229f 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -351,7 +351,7 @@ static __poll_t ib_uverbs_event_poll(struct ib_uverbs_event_queue *ev_queue,
 
 	spin_lock_irq(&ev_queue->lock);
 	if (!list_empty(&ev_queue->event_list))
-		pollflags = POLLIN | POLLRDNORM;
+		pollflags = EPOLLIN | EPOLLRDNORM;
 	spin_unlock_irq(&ev_queue->lock);
 
 	return pollflags;
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index d9a0f2590294b..41fafebe3b0d9 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -612,13 +612,13 @@ static __poll_t hfi1_poll(struct file *fp, struct poll_table_struct *pt)
 
 	uctxt = ((struct hfi1_filedata *)fp->private_data)->uctxt;
 	if (!uctxt)
-		pollflag = POLLERR;
+		pollflag = EPOLLERR;
 	else if (uctxt->poll_type == HFI1_POLL_TYPE_URGENT)
 		pollflag = poll_urgent(fp, pt);
 	else  if (uctxt->poll_type == HFI1_POLL_TYPE_ANYRCV)
 		pollflag = poll_next(fp, pt);
 	else /* invalid */
-		pollflag = POLLERR;
+		pollflag = EPOLLERR;
 
 	return pollflag;
 }
@@ -1435,7 +1435,7 @@ static __poll_t poll_urgent(struct file *fp,
 
 	spin_lock_irq(&dd->uctxt_lock);
 	if (uctxt->urgent != uctxt->urgent_poll) {
-		pollflag = POLLIN | POLLRDNORM;
+		pollflag = EPOLLIN | EPOLLRDNORM;
 		uctxt->urgent_poll = uctxt->urgent;
 	} else {
 		pollflag = 0;
@@ -1462,7 +1462,7 @@ static __poll_t poll_next(struct file *fp,
 		hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt);
 		pollflag = 0;
 	} else {
-		pollflag = POLLIN | POLLRDNORM;
+		pollflag = EPOLLIN | EPOLLRDNORM;
 	}
 	spin_unlock_irq(&dd->uctxt_lock);
 
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index f7593b5e2b761..52c29db3a2f4a 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -1085,7 +1085,7 @@ static __poll_t qib_poll_urgent(struct qib_ctxtdata *rcd,
 
 	spin_lock_irq(&dd->uctxt_lock);
 	if (rcd->urgent != rcd->urgent_poll) {
-		pollflag = POLLIN | POLLRDNORM;
+		pollflag = EPOLLIN | EPOLLRDNORM;
 		rcd->urgent_poll = rcd->urgent;
 	} else {
 		pollflag = 0;
@@ -1111,7 +1111,7 @@ static __poll_t qib_poll_next(struct qib_ctxtdata *rcd,
 		dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_INTRAVAIL_ENB, rcd->ctxt);
 		pollflag = 0;
 	} else
-		pollflag = POLLIN | POLLRDNORM;
+		pollflag = EPOLLIN | EPOLLRDNORM;
 	spin_unlock_irq(&dd->uctxt_lock);
 
 	return pollflag;
@@ -1124,13 +1124,13 @@ static __poll_t qib_poll(struct file *fp, struct poll_table_struct *pt)
 
 	rcd = ctxt_fp(fp);
 	if (!rcd)
-		pollflag = POLLERR;
+		pollflag = EPOLLERR;
 	else if (rcd->poll_type == QIB_POLL_TYPE_URGENT)
 		pollflag = qib_poll_urgent(rcd, fp, pt);
 	else  if (rcd->poll_type == QIB_POLL_TYPE_ANYRCV)
 		pollflag = qib_poll_next(rcd, fp, pt);
 	else /* invalid */
-		pollflag = POLLERR;
+		pollflag = EPOLLERR;
 
 	return pollflag;
 }
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 19624e023ebd9..0336643c2ed65 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -874,7 +874,7 @@ iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
 	iser_info("iser conn %p rc = %d\n", iser_conn, rc);
 
 	if (rc > 0)
-		return 1; /* success, this is the equivalent of POLLOUT */
+		return 1; /* success, this is the equivalent of EPOLLOUT */
 	else if (!rc)
 		return 0; /* timeout */
 	else
diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index 94049fdc583cc..c81c79d01d930 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -650,12 +650,12 @@ static __poll_t evdev_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &evdev->wait, wait);
 
 	if (evdev->exist && !client->revoked)
-		mask = POLLOUT | POLLWRNORM;
+		mask = EPOLLOUT | EPOLLWRNORM;
 	else
-		mask = POLLHUP | POLLERR;
+		mask = EPOLLHUP | EPOLLERR;
 
 	if (client->packet_head != client->tail)
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	return mask;
 }
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 0d0b2ab1bb6bc..9785546420a72 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -1053,7 +1053,7 @@ static __poll_t input_proc_devices_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &input_devices_poll_wait, wait);
 	if (file->f_version != input_devices_state) {
 		file->f_version = input_devices_state;
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	}
 
 	return 0;
diff --git a/drivers/input/joydev.c b/drivers/input/joydev.c
index fe3255572886c..4c1e427dfabb9 100644
--- a/drivers/input/joydev.c
+++ b/drivers/input/joydev.c
@@ -442,8 +442,8 @@ static __poll_t joydev_poll(struct file *file, poll_table *wait)
 	struct joydev *joydev = client->joydev;
 
 	poll_wait(file, &joydev->wait, wait);
-	return (joydev_data_pending(client) ? (POLLIN | POLLRDNORM) : 0) |
-		(joydev->exist ?  0 : (POLLHUP | POLLERR));
+	return (joydev_data_pending(client) ? (EPOLLIN | EPOLLRDNORM) : 0) |
+		(joydev->exist ?  0 : (EPOLLHUP | EPOLLERR));
 }
 
 static int joydev_handle_JSIOCSAXMAP(struct joydev *joydev,
diff --git a/drivers/input/misc/hp_sdc_rtc.c b/drivers/input/misc/hp_sdc_rtc.c
index 9c3f7ec3bd3d6..49b34de0aed4d 100644
--- a/drivers/input/misc/hp_sdc_rtc.c
+++ b/drivers/input/misc/hp_sdc_rtc.c
@@ -414,7 +414,7 @@ static __poll_t hp_sdc_rtc_poll(struct file *file, poll_table *wait)
 
 	l = 0;
         if (l != 0)
-                return POLLIN | POLLRDNORM;
+                return EPOLLIN | EPOLLRDNORM;
         return 0;
 }
 
diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c
index f640c591ef23f..96a887f336982 100644
--- a/drivers/input/misc/uinput.c
+++ b/drivers/input/misc/uinput.c
@@ -704,7 +704,7 @@ static __poll_t uinput_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &udev->waitq, wait);
 
 	if (udev->head != udev->tail)
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 
 	return 0;
 }
diff --git a/drivers/input/mousedev.c b/drivers/input/mousedev.c
index 731d84ae51017..e08228061bcdd 100644
--- a/drivers/input/mousedev.c
+++ b/drivers/input/mousedev.c
@@ -765,9 +765,9 @@ static __poll_t mousedev_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, &mousedev->wait, wait);
 
-	mask = mousedev->exist ? POLLOUT | POLLWRNORM : POLLHUP | POLLERR;
+	mask = mousedev->exist ? EPOLLOUT | EPOLLWRNORM : EPOLLHUP | EPOLLERR;
 	if (client->ready || client->buffer)
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	return mask;
 }
diff --git a/drivers/input/serio/serio_raw.c b/drivers/input/serio/serio_raw.c
index fccf55a380b2f..17b7fbecd9fe7 100644
--- a/drivers/input/serio/serio_raw.c
+++ b/drivers/input/serio/serio_raw.c
@@ -247,9 +247,9 @@ static __poll_t serio_raw_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, &serio_raw->wait, wait);
 
-	mask = serio_raw->dead ? POLLHUP | POLLERR : POLLOUT | POLLWRNORM;
+	mask = serio_raw->dead ? EPOLLHUP | EPOLLERR : EPOLLOUT | EPOLLWRNORM;
 	if (serio_raw->head != serio_raw->tail)
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	return mask;
 }
diff --git a/drivers/input/serio/userio.c b/drivers/input/serio/userio.c
index a63de06b08bcc..9ab5c45c3a9fe 100644
--- a/drivers/input/serio/userio.c
+++ b/drivers/input/serio/userio.c
@@ -255,7 +255,7 @@ static __poll_t userio_char_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &userio->waitq, wait);
 
 	if (userio->head != userio->tail)
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 
 	return 0;
 }
diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c
index e268811dc544b..19cd93783c87d 100644
--- a/drivers/isdn/capi/capi.c
+++ b/drivers/isdn/capi/capi.c
@@ -731,12 +731,12 @@ capi_poll(struct file *file, poll_table *wait)
 	__poll_t mask = 0;
 
 	if (!cdev->ap.applid)
-		return POLLERR;
+		return EPOLLERR;
 
 	poll_wait(file, &(cdev->recvwait), wait);
-	mask = POLLOUT | POLLWRNORM;
+	mask = EPOLLOUT | EPOLLWRNORM;
 	if (!skb_queue_empty(&cdev->recvqueue))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	return mask;
 }
 
diff --git a/drivers/isdn/divert/divert_procfs.c b/drivers/isdn/divert/divert_procfs.c
index 34b7704042a4a..342585e04fd3f 100644
--- a/drivers/isdn/divert/divert_procfs.c
+++ b/drivers/isdn/divert/divert_procfs.c
@@ -125,9 +125,9 @@ isdn_divert_poll(struct file *file, poll_table *wait)
 	__poll_t mask = 0;
 
 	poll_wait(file, &(rd_queue), wait);
-	/* mask = POLLOUT | POLLWRNORM; */
+	/* mask = EPOLLOUT | EPOLLWRNORM; */
 	if (*((struct divert_info **) file->private_data)) {
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	}
 	return mask;
 }				/* isdn_divert_poll */
diff --git a/drivers/isdn/hardware/eicon/divamnt.c b/drivers/isdn/hardware/eicon/divamnt.c
index 70f16102a0010..5a95587b31172 100644
--- a/drivers/isdn/hardware/eicon/divamnt.c
+++ b/drivers/isdn/hardware/eicon/divamnt.c
@@ -103,9 +103,9 @@ static __poll_t maint_poll(struct file *file, poll_table *wait)
 	__poll_t mask = 0;
 
 	poll_wait(file, &msgwaitq, wait);
-	mask = POLLOUT | POLLWRNORM;
+	mask = EPOLLOUT | EPOLLWRNORM;
 	if (file->private_data || diva_dbg_q_length()) {
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	}
 	return (mask);
 }
diff --git a/drivers/isdn/hardware/eicon/divasi.c b/drivers/isdn/hardware/eicon/divasi.c
index da5cc5ab7e2de..525518c945fe6 100644
--- a/drivers/isdn/hardware/eicon/divasi.c
+++ b/drivers/isdn/hardware/eicon/divasi.c
@@ -370,31 +370,31 @@ static __poll_t um_idi_poll(struct file *file, poll_table *wait)
 	diva_um_idi_os_context_t *p_os;
 
 	if (!file->private_data) {
-		return (POLLERR);
+		return (EPOLLERR);
 	}
 
 	if ((!(p_os =
 	       (diva_um_idi_os_context_t *)
 	       diva_um_id_get_os_context(file->private_data)))
 	    || p_os->aborted) {
-		return (POLLERR);
+		return (EPOLLERR);
 	}
 
 	poll_wait(file, &p_os->read_wait, wait);
 
 	if (p_os->aborted) {
-		return (POLLERR);
+		return (EPOLLERR);
 	}
 
 	switch (diva_user_mode_idi_ind_ready(file->private_data, file)) {
 	case (-1):
-		return (POLLERR);
+		return (EPOLLERR);
 
 	case 0:
 		return (0);
 	}
 
-	return (POLLIN | POLLRDNORM);
+	return (EPOLLIN | EPOLLRDNORM);
 }
 
 static int um_idi_open(struct inode *inode, struct file *file)
diff --git a/drivers/isdn/hardware/eicon/divasmain.c b/drivers/isdn/hardware/eicon/divasmain.c
index fbc788e6f0db9..b9980e84f9db5 100644
--- a/drivers/isdn/hardware/eicon/divasmain.c
+++ b/drivers/isdn/hardware/eicon/divasmain.c
@@ -653,9 +653,9 @@ static ssize_t divas_read(struct file *file, char __user *buf,
 static __poll_t divas_poll(struct file *file, poll_table *wait)
 {
 	if (!file->private_data) {
-		return (POLLERR);
+		return (EPOLLERR);
 	}
-	return (POLLIN | POLLRDNORM);
+	return (EPOLLIN | EPOLLRDNORM);
 }
 
 static const struct file_operations divas_fops = {
diff --git a/drivers/isdn/hardware/eicon/divasproc.c b/drivers/isdn/hardware/eicon/divasproc.c
index 3478f6f099eba..f52f4622b10b0 100644
--- a/drivers/isdn/hardware/eicon/divasproc.c
+++ b/drivers/isdn/hardware/eicon/divasproc.c
@@ -101,7 +101,7 @@ divas_write(struct file *file, const char __user *buf, size_t count, loff_t *off
 
 static __poll_t divas_poll(struct file *file, poll_table *wait)
 {
-	return (POLLERR);
+	return (EPOLLERR);
 }
 
 static int divas_open(struct inode *inode, struct file *file)
diff --git a/drivers/isdn/hysdn/hysdn_proclog.c b/drivers/isdn/hysdn/hysdn_proclog.c
index 6abea6915f494..6e898b90e86e6 100644
--- a/drivers/isdn/hysdn/hysdn_proclog.c
+++ b/drivers/isdn/hysdn/hysdn_proclog.c
@@ -294,7 +294,7 @@ hysdn_log_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &(pd->rd_queue), wait);
 
 	if (*((struct log_data **) file->private_data))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	return mask;
 }				/* hysdn_log_poll */
diff --git a/drivers/isdn/i4l/isdn_common.c b/drivers/isdn/i4l/isdn_common.c
index 0521c32949d47..7c6f3f5d9d9a2 100644
--- a/drivers/isdn/i4l/isdn_common.c
+++ b/drivers/isdn/i4l/isdn_common.c
@@ -1237,22 +1237,22 @@ isdn_poll(struct file *file, poll_table *wait)
 	mutex_lock(&isdn_mutex);
 	if (minor == ISDN_MINOR_STATUS) {
 		poll_wait(file, &(dev->info_waitq), wait);
-		/* mask = POLLOUT | POLLWRNORM; */
+		/* mask = EPOLLOUT | EPOLLWRNORM; */
 		if (file->private_data) {
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 		}
 		goto out;
 	}
 	if (minor >= ISDN_MINOR_CTRL && minor <= ISDN_MINOR_CTRLMAX) {
 		if (drvidx < 0) {
 			/* driver deregistered while file open */
-			mask = POLLHUP;
+			mask = EPOLLHUP;
 			goto out;
 		}
 		poll_wait(file, &(dev->drv[drvidx]->st_waitq), wait);
-		mask = POLLOUT | POLLWRNORM;
+		mask = EPOLLOUT | EPOLLWRNORM;
 		if (dev->drv[drvidx]->stavail) {
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 		}
 		goto out;
 	}
@@ -1262,7 +1262,7 @@ isdn_poll(struct file *file, poll_table *wait)
 		goto out;
 	}
 #endif
-	mask = POLLERR;
+	mask = EPOLLERR;
 out:
 	mutex_unlock(&isdn_mutex);
 	return mask;
diff --git a/drivers/isdn/i4l/isdn_ppp.c b/drivers/isdn/i4l/isdn_ppp.c
index 57884319b4b13..a7b275ea5de1d 100644
--- a/drivers/isdn/i4l/isdn_ppp.c
+++ b/drivers/isdn/i4l/isdn_ppp.c
@@ -704,12 +704,12 @@ isdn_ppp_poll(struct file *file, poll_table *wait)
 
 	if (!(is->state & IPPP_OPEN)) {
 		if (is->state == IPPP_CLOSEWAIT)
-			return POLLHUP;
+			return EPOLLHUP;
 		printk(KERN_DEBUG "isdn_ppp: device not open\n");
-		return POLLERR;
+		return EPOLLERR;
 	}
 	/* we're always ready to send .. */
-	mask = POLLOUT | POLLWRNORM;
+	mask = EPOLLOUT | EPOLLWRNORM;
 
 	spin_lock_irqsave(&is->buflock, flags);
 	bl = is->last;
@@ -719,7 +719,7 @@ isdn_ppp_poll(struct file *file, poll_table *wait)
 	 */
 	if (bf->next != bl || (is->state & IPPP_NOBLOCK)) {
 		is->state &= ~IPPP_NOBLOCK;
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	}
 	spin_unlock_irqrestore(&is->buflock, flags);
 	return mask;
diff --git a/drivers/isdn/mISDN/timerdev.c b/drivers/isdn/mISDN/timerdev.c
index f4272d4e0a26c..211ed6cffd10e 100644
--- a/drivers/isdn/mISDN/timerdev.c
+++ b/drivers/isdn/mISDN/timerdev.c
@@ -145,7 +145,7 @@ static __poll_t
 mISDN_poll(struct file *filep, poll_table *wait)
 {
 	struct mISDNtimerdev	*dev = filep->private_data;
-	__poll_t		mask = POLLERR;
+	__poll_t		mask = EPOLLERR;
 
 	if (*debug & DEBUG_TIMER)
 		printk(KERN_DEBUG "%s(%p, %p)\n", __func__, filep, wait);
@@ -153,7 +153,7 @@ mISDN_poll(struct file *filep, poll_table *wait)
 		poll_wait(filep, &dev->wait, wait);
 		mask = 0;
 		if (dev->work || !list_empty(&dev->expired))
-			mask |= (POLLIN | POLLRDNORM);
+			mask |= (EPOLLIN | EPOLLRDNORM);
 		if (*debug & DEBUG_TIMER)
 			printk(KERN_DEBUG "%s work(%d) empty(%d)\n", __func__,
 			       dev->work, list_empty(&dev->expired));
diff --git a/drivers/leds/uleds.c b/drivers/leds/uleds.c
index 5beacab05ed74..0c43bfac9598e 100644
--- a/drivers/leds/uleds.c
+++ b/drivers/leds/uleds.c
@@ -183,7 +183,7 @@ static __poll_t uleds_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &udev->waitq, wait);
 
 	if (udev->new_data)
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 
 	return 0;
 }
diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c
index 346e6f5f77be7..e8ae2e54151cc 100644
--- a/drivers/macintosh/smu.c
+++ b/drivers/macintosh/smu.c
@@ -1259,7 +1259,7 @@ static __poll_t smu_fpoll(struct file *file, poll_table *wait)
 
 		spin_lock_irqsave(&pp->lock, flags);
 		if (pp->busy && pp->cmd.status != 1)
-			mask |= POLLIN;
+			mask |= EPOLLIN;
 		spin_unlock_irqrestore(&pp->lock, flags);
 	}
 	if (pp->mode == smu_file_events) {
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index 08849e33c5679..94c0f3f7df699 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -2169,7 +2169,7 @@ pmu_fpoll(struct file *filp, poll_table *wait)
 	poll_wait(filp, &pp->wait, wait);
 	spin_lock_irqsave(&pp->lock, flags);
 	if (pp->rb_get != pp->rb_put)
-		mask |= POLLIN;
+		mask |= EPOLLIN;
 	spin_unlock_irqrestore(&pp->lock, flags);
 	return mask;
 }
diff --git a/drivers/mailbox/mailbox-test.c b/drivers/mailbox/mailbox-test.c
index f84730d63b1f2..58bfafc34bc46 100644
--- a/drivers/mailbox/mailbox-test.c
+++ b/drivers/mailbox/mailbox-test.c
@@ -243,7 +243,7 @@ mbox_test_message_poll(struct file *filp, struct poll_table_struct *wait)
 	poll_wait(filp, &tdev->waitq, wait);
 
 	if (mbox_test_message_data_ready(tdev))
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	return 0;
 }
 
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 3f6791afd3e45..a89fd8f44453e 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1937,7 +1937,7 @@ static __poll_t dm_poll(struct file *filp, poll_table *wait)
 	poll_wait(filp, &dm_global_eventq, wait);
 
 	if ((int)(atomic_read(&dm_global_event_nr) - priv->global_event_nr) > 0)
-		mask |= POLLIN;
+		mask |= EPOLLIN;
 
 	return mask;
 }
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0081ace39a649..bc67ab6844f02 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7891,14 +7891,14 @@ static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
 	__poll_t mask;
 
 	if (md_unloading)
-		return POLLIN|POLLRDNORM|POLLERR|POLLPRI;
+		return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
 	poll_wait(filp, &md_event_waiters, wait);
 
 	/* always allow read */
-	mask = POLLIN | POLLRDNORM;
+	mask = EPOLLIN | EPOLLRDNORM;
 
 	if (seq->poll_event != atomic_read(&md_event_count))
-		mask |= POLLERR | POLLPRI;
+		mask |= EPOLLERR | EPOLLPRI;
 	return mask;
 }
 
diff --git a/drivers/media/cec/cec-api.c b/drivers/media/cec/cec-api.c
index ecc89d9a279b2..492db12b8c4dc 100644
--- a/drivers/media/cec/cec-api.c
+++ b/drivers/media/cec/cec-api.c
@@ -51,15 +51,15 @@ static __poll_t cec_poll(struct file *filp,
 	__poll_t res = 0;
 
 	if (!cec_is_registered(adap))
-		return POLLERR | POLLHUP;
+		return EPOLLERR | EPOLLHUP;
 	mutex_lock(&adap->lock);
 	if (adap->is_configured &&
 	    adap->transmit_queue_sz < CEC_MAX_MSG_TX_QUEUE_SZ)
-		res |= POLLOUT | POLLWRNORM;
+		res |= EPOLLOUT | EPOLLWRNORM;
 	if (fh->queued_msgs)
-		res |= POLLIN | POLLRDNORM;
+		res |= EPOLLIN | EPOLLRDNORM;
 	if (fh->total_queued_events)
-		res |= POLLPRI;
+		res |= EPOLLPRI;
 	poll_wait(filp, &fh->wait, poll);
 	mutex_unlock(&adap->lock);
 	return res;
diff --git a/drivers/media/common/saa7146/saa7146_fops.c b/drivers/media/common/saa7146/saa7146_fops.c
index 8ee3eebef4dbd..d4987fd05d05f 100644
--- a/drivers/media/common/saa7146/saa7146_fops.c
+++ b/drivers/media/common/saa7146/saa7146_fops.c
@@ -332,7 +332,7 @@ static __poll_t __fops_poll(struct file *file, struct poll_table_struct *wait)
 
 	if (vdev->vfl_type == VFL_TYPE_VBI) {
 		if (fh->dev->ext_vv_data->capabilities & V4L2_CAP_SLICED_VBI_OUTPUT)
-			return res | POLLOUT | POLLWRNORM;
+			return res | EPOLLOUT | EPOLLWRNORM;
 		if( 0 == fh->vbi_q.streaming )
 			return res | videobuf_poll_stream(file, &fh->vbi_q, wait);
 		q = &fh->vbi_q;
@@ -346,13 +346,13 @@ static __poll_t __fops_poll(struct file *file, struct poll_table_struct *wait)
 
 	if (!buf) {
 		DEB_D("buf == NULL!\n");
-		return res | POLLERR;
+		return res | EPOLLERR;
 	}
 
 	poll_wait(file, &buf->done, wait);
 	if (buf->state == VIDEOBUF_DONE || buf->state == VIDEOBUF_ERROR) {
 		DEB_D("poll succeeded!\n");
-		return res | POLLIN | POLLRDNORM;
+		return res | EPOLLIN | EPOLLRDNORM;
 	}
 
 	DEB_D("nothing to poll for, buf->state:%d\n", buf->state);
diff --git a/drivers/media/common/siano/smsdvb-debugfs.c b/drivers/media/common/siano/smsdvb-debugfs.c
index 403645fe90795..40891f4f842b9 100644
--- a/drivers/media/common/siano/smsdvb-debugfs.c
+++ b/drivers/media/common/siano/smsdvb-debugfs.c
@@ -371,7 +371,7 @@ static __poll_t smsdvb_stats_poll(struct file *file, poll_table *wait)
 	rc = smsdvb_stats_wait_read(debug_data);
 	kref_put(&debug_data->refcount, smsdvb_debugfs_data_release);
 
-	return rc > 0 ? POLLIN | POLLRDNORM : 0;
+	return rc > 0 ? EPOLLIN | EPOLLRDNORM : 0;
 }
 
 static ssize_t smsdvb_stats_read(struct file *file, char __user *user_buf,
diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c
index 9a84c70927145..debe35fc66b41 100644
--- a/drivers/media/common/videobuf2/videobuf2-core.c
+++ b/drivers/media/common/videobuf2/videobuf2-core.c
@@ -2038,9 +2038,9 @@ __poll_t vb2_core_poll(struct vb2_queue *q, struct file *file,
 	struct vb2_buffer *vb = NULL;
 	unsigned long flags;
 
-	if (!q->is_output && !(req_events & (POLLIN | POLLRDNORM)))
+	if (!q->is_output && !(req_events & (EPOLLIN | EPOLLRDNORM)))
 		return 0;
-	if (q->is_output && !(req_events & (POLLOUT | POLLWRNORM)))
+	if (q->is_output && !(req_events & (EPOLLOUT | EPOLLWRNORM)))
 		return 0;
 
 	/*
@@ -2048,18 +2048,18 @@ __poll_t vb2_core_poll(struct vb2_queue *q, struct file *file,
 	 */
 	if (q->num_buffers == 0 && !vb2_fileio_is_active(q)) {
 		if (!q->is_output && (q->io_modes & VB2_READ) &&
-				(req_events & (POLLIN | POLLRDNORM))) {
+				(req_events & (EPOLLIN | EPOLLRDNORM))) {
 			if (__vb2_init_fileio(q, 1))
-				return POLLERR;
+				return EPOLLERR;
 		}
 		if (q->is_output && (q->io_modes & VB2_WRITE) &&
-				(req_events & (POLLOUT | POLLWRNORM))) {
+				(req_events & (EPOLLOUT | EPOLLWRNORM))) {
 			if (__vb2_init_fileio(q, 0))
-				return POLLERR;
+				return EPOLLERR;
 			/*
 			 * Write to OUTPUT queue can be done immediately.
 			 */
-			return POLLOUT | POLLWRNORM;
+			return EPOLLOUT | EPOLLWRNORM;
 		}
 	}
 
@@ -2068,24 +2068,24 @@ __poll_t vb2_core_poll(struct vb2_queue *q, struct file *file,
 	 * error flag is set.
 	 */
 	if (!vb2_is_streaming(q) || q->error)
-		return POLLERR;
+		return EPOLLERR;
 
 	/*
 	 * If this quirk is set and QBUF hasn't been called yet then
-	 * return POLLERR as well. This only affects capture queues, output
+	 * return EPOLLERR as well. This only affects capture queues, output
 	 * queues will always initialize waiting_for_buffers to false.
 	 * This quirk is set by V4L2 for backwards compatibility reasons.
 	 */
 	if (q->quirk_poll_must_check_waiting_for_buffers &&
-	    q->waiting_for_buffers && (req_events & (POLLIN | POLLRDNORM)))
-		return POLLERR;
+	    q->waiting_for_buffers && (req_events & (EPOLLIN | EPOLLRDNORM)))
+		return EPOLLERR;
 
 	/*
 	 * For output streams you can call write() as long as there are fewer
 	 * buffers queued than there are buffers available.
 	 */
 	if (q->is_output && q->fileio && q->queued_count < q->num_buffers)
-		return POLLOUT | POLLWRNORM;
+		return EPOLLOUT | EPOLLWRNORM;
 
 	if (list_empty(&q->done_list)) {
 		/*
@@ -2093,7 +2093,7 @@ __poll_t vb2_core_poll(struct vb2_queue *q, struct file *file,
 		 * return immediately. DQBUF will return -EPIPE.
 		 */
 		if (q->last_buffer_dequeued)
-			return POLLIN | POLLRDNORM;
+			return EPOLLIN | EPOLLRDNORM;
 
 		poll_wait(file, &q->done_wq, wait);
 	}
@@ -2110,8 +2110,8 @@ __poll_t vb2_core_poll(struct vb2_queue *q, struct file *file,
 	if (vb && (vb->state == VB2_BUF_STATE_DONE
 			|| vb->state == VB2_BUF_STATE_ERROR)) {
 		return (q->is_output) ?
-				POLLOUT | POLLWRNORM :
-				POLLIN | POLLRDNORM;
+				EPOLLOUT | EPOLLWRNORM :
+				EPOLLIN | EPOLLRDNORM;
 	}
 	return 0;
 }
diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c
index d9a487aab99c9..886a2d8d5c6c4 100644
--- a/drivers/media/common/videobuf2/videobuf2-v4l2.c
+++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c
@@ -658,7 +658,7 @@ int vb2_queue_init(struct vb2_queue *q)
 			== V4L2_BUF_FLAG_TIMESTAMP_COPY;
 	/*
 	 * For compatibility with vb1: if QBUF hasn't been called yet, then
-	 * return POLLERR as well. This only affects capture queues, output
+	 * return EPOLLERR as well. This only affects capture queues, output
 	 * queues will always initialize waiting_for_buffers to false.
 	 */
 	q->quirk_poll_must_check_waiting_for_buffers = true;
@@ -683,8 +683,8 @@ __poll_t vb2_poll(struct vb2_queue *q, struct file *file, poll_table *wait)
 		struct v4l2_fh *fh = file->private_data;
 
 		if (v4l2_event_pending(fh))
-			res = POLLPRI;
-		else if (req_events & POLLPRI)
+			res = EPOLLPRI;
+		else if (req_events & EPOLLPRI)
 			poll_wait(file, &fh->wait, wait);
 	}
 
@@ -921,7 +921,7 @@ __poll_t vb2_fop_poll(struct file *file, poll_table *wait)
 	WARN_ON(!lock);
 
 	if (lock && mutex_lock_interruptible(lock))
-		return POLLERR;
+		return EPOLLERR;
 
 	fileio = q->fileio;
 
diff --git a/drivers/media/dvb-core/dmxdev.c b/drivers/media/dvb-core/dmxdev.c
index bc198f84b9cd0..6d53af00190e3 100644
--- a/drivers/media/dvb-core/dmxdev.c
+++ b/drivers/media/dvb-core/dmxdev.c
@@ -1179,7 +1179,7 @@ static __poll_t dvb_demux_poll(struct file *file, poll_table *wait)
 	__poll_t mask = 0;
 
 	if ((!dmxdevfilter) || dmxdevfilter->dev->exit)
-		return POLLERR;
+		return EPOLLERR;
 	if (dvb_vb2_is_streaming(&dmxdevfilter->vb2_ctx))
 		return dvb_vb2_poll(&dmxdevfilter->vb2_ctx, file, wait);
 
@@ -1191,10 +1191,10 @@ static __poll_t dvb_demux_poll(struct file *file, poll_table *wait)
 		return 0;
 
 	if (dmxdevfilter->buffer.error)
-		mask |= (POLLIN | POLLRDNORM | POLLPRI | POLLERR);
+		mask |= (EPOLLIN | EPOLLRDNORM | EPOLLPRI | EPOLLERR);
 
 	if (!dvb_ringbuffer_empty(&dmxdevfilter->buffer))
-		mask |= (POLLIN | POLLRDNORM | POLLPRI);
+		mask |= (EPOLLIN | EPOLLRDNORM | EPOLLPRI);
 
 	return mask;
 }
@@ -1331,7 +1331,7 @@ static __poll_t dvb_dvr_poll(struct file *file, poll_table *wait)
 	dprintk("%s\n", __func__);
 
 	if (dmxdev->exit)
-		return POLLERR;
+		return EPOLLERR;
 	if (dvb_vb2_is_streaming(&dmxdev->dvr_vb2_ctx))
 		return dvb_vb2_poll(&dmxdev->dvr_vb2_ctx, file, wait);
 
@@ -1343,12 +1343,12 @@ static __poll_t dvb_dvr_poll(struct file *file, poll_table *wait)
 #endif
 	if (need_ringbuffer) {
 		if (dmxdev->dvr_buffer.error)
-			mask |= (POLLIN | POLLRDNORM | POLLPRI | POLLERR);
+			mask |= (EPOLLIN | EPOLLRDNORM | EPOLLPRI | EPOLLERR);
 
 		if (!dvb_ringbuffer_empty(&dmxdev->dvr_buffer))
-			mask |= (POLLIN | POLLRDNORM | POLLPRI);
+			mask |= (EPOLLIN | EPOLLRDNORM | EPOLLPRI);
 	} else
-		mask |= (POLLOUT | POLLWRNORM | POLLPRI);
+		mask |= (EPOLLOUT | EPOLLWRNORM | EPOLLPRI);
 
 	return mask;
 }
diff --git a/drivers/media/dvb-core/dvb_ca_en50221.c b/drivers/media/dvb-core/dvb_ca_en50221.c
index b462ebc0c544d..204d0f6c678da 100644
--- a/drivers/media/dvb-core/dvb_ca_en50221.c
+++ b/drivers/media/dvb-core/dvb_ca_en50221.c
@@ -1796,7 +1796,7 @@ static __poll_t dvb_ca_en50221_io_poll(struct file *file, poll_table *wait)
 	dprintk("%s\n", __func__);
 
 	if (dvb_ca_en50221_io_read_condition(ca, &result, &slot) == 1)
-		mask |= POLLIN;
+		mask |= EPOLLIN;
 
 	/* if there is something, return now */
 	if (mask)
@@ -1806,7 +1806,7 @@ static __poll_t dvb_ca_en50221_io_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &ca->wait_queue, wait);
 
 	if (dvb_ca_en50221_io_read_condition(ca, &result, &slot) == 1)
-		mask |= POLLIN;
+		mask |= EPOLLIN;
 
 	return mask;
 }
diff --git a/drivers/media/dvb-core/dvb_frontend.c b/drivers/media/dvb-core/dvb_frontend.c
index 87fc1bcae5ae6..a7ed16e0841d5 100644
--- a/drivers/media/dvb-core/dvb_frontend.c
+++ b/drivers/media/dvb-core/dvb_frontend.c
@@ -2646,7 +2646,7 @@ static __poll_t dvb_frontend_poll(struct file *file, struct poll_table_struct *w
 	poll_wait (file, &fepriv->events.wait_queue, wait);
 
 	if (fepriv->events.eventw != fepriv->events.eventr)
-		return (POLLIN | POLLRDNORM | POLLPRI);
+		return (EPOLLIN | EPOLLRDNORM | EPOLLPRI);
 
 	return 0;
 }
diff --git a/drivers/media/firewire/firedtv-ci.c b/drivers/media/firewire/firedtv-ci.c
index b4ddfff742671..8dc5a7495abee 100644
--- a/drivers/media/firewire/firedtv-ci.c
+++ b/drivers/media/firewire/firedtv-ci.c
@@ -209,7 +209,7 @@ static int fdtv_ca_ioctl(struct file *file, unsigned int cmd, void *arg)
 
 static __poll_t fdtv_ca_io_poll(struct file *file, poll_table *wait)
 {
-	return POLLIN;
+	return EPOLLIN;
 }
 
 static const struct file_operations fdtv_ca_fops = {
diff --git a/drivers/media/i2c/saa6588.c b/drivers/media/i2c/saa6588.c
index 00640233a5e32..c3089bd34df25 100644
--- a/drivers/media/i2c/saa6588.c
+++ b/drivers/media/i2c/saa6588.c
@@ -413,7 +413,7 @@ static long saa6588_ioctl(struct v4l2_subdev *sd, unsigned int cmd, void *arg)
 	case SAA6588_CMD_POLL:
 		a->result = 0;
 		if (s->data_available_for_read)
-			a->result |= POLLIN | POLLRDNORM;
+			a->result |= EPOLLIN | EPOLLRDNORM;
 		poll_wait(a->instance, &s->read_queue, a->event_list);
 		break;
 
diff --git a/drivers/media/media-devnode.c b/drivers/media/media-devnode.c
index 3049b1f505e58..67ac51eff15c3 100644
--- a/drivers/media/media-devnode.c
+++ b/drivers/media/media-devnode.c
@@ -105,7 +105,7 @@ static __poll_t media_poll(struct file *filp,
 	struct media_devnode *devnode = media_devnode_data(filp);
 
 	if (!media_devnode_is_registered(devnode))
-		return POLLERR | POLLHUP;
+		return EPOLLERR | EPOLLHUP;
 	if (!devnode->fops->poll)
 		return DEFAULT_POLLMASK;
 	return devnode->fops->poll(filp, poll);
diff --git a/drivers/media/pci/bt8xx/bttv-driver.c b/drivers/media/pci/bt8xx/bttv-driver.c
index c988669e22ff9..f697698fe38de 100644
--- a/drivers/media/pci/bt8xx/bttv-driver.c
+++ b/drivers/media/pci/bt8xx/bttv-driver.c
@@ -2964,39 +2964,39 @@ static __poll_t bttv_poll(struct file *file, poll_table *wait)
 	__poll_t req_events = poll_requested_events(wait);
 
 	if (v4l2_event_pending(&fh->fh))
-		rc = POLLPRI;
-	else if (req_events & POLLPRI)
+		rc = EPOLLPRI;
+	else if (req_events & EPOLLPRI)
 		poll_wait(file, &fh->fh.wait, wait);
 
-	if (!(req_events & (POLLIN | POLLRDNORM)))
+	if (!(req_events & (EPOLLIN | EPOLLRDNORM)))
 		return rc;
 
 	if (V4L2_BUF_TYPE_VBI_CAPTURE == fh->type) {
 		if (!check_alloc_btres_lock(fh->btv,fh,RESOURCE_VBI))
-			return rc | POLLERR;
+			return rc | EPOLLERR;
 		return rc | videobuf_poll_stream(file, &fh->vbi, wait);
 	}
 
 	if (check_btres(fh,RESOURCE_VIDEO_STREAM)) {
 		/* streaming capture */
 		if (list_empty(&fh->cap.stream))
-			return rc | POLLERR;
+			return rc | EPOLLERR;
 		buf = list_entry(fh->cap.stream.next,struct bttv_buffer,vb.stream);
 	} else {
 		/* read() capture */
 		if (NULL == fh->cap.read_buf) {
 			/* need to capture a new frame */
 			if (locked_btres(fh->btv,RESOURCE_VIDEO_STREAM))
-				return rc | POLLERR;
+				return rc | EPOLLERR;
 			fh->cap.read_buf = videobuf_sg_alloc(fh->cap.msize);
 			if (NULL == fh->cap.read_buf)
-				return rc | POLLERR;
+				return rc | EPOLLERR;
 			fh->cap.read_buf->memory = V4L2_MEMORY_USERPTR;
 			field = videobuf_next_field(&fh->cap);
 			if (0 != fh->cap.ops->buf_prepare(&fh->cap,fh->cap.read_buf,field)) {
 				kfree (fh->cap.read_buf);
 				fh->cap.read_buf = NULL;
-				return rc | POLLERR;
+				return rc | EPOLLERR;
 			}
 			fh->cap.ops->buf_queue(&fh->cap,fh->cap.read_buf);
 			fh->cap.read_off = 0;
@@ -3007,7 +3007,7 @@ static __poll_t bttv_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &buf->vb.done, wait);
 	if (buf->vb.state == VIDEOBUF_DONE ||
 	    buf->vb.state == VIDEOBUF_ERROR)
-		rc = rc | POLLIN|POLLRDNORM;
+		rc = rc | EPOLLIN|EPOLLRDNORM;
 	return rc;
 }
 
@@ -3338,8 +3338,8 @@ static __poll_t radio_poll(struct file *file, poll_table *wait)
 	__poll_t res = 0;
 
 	if (v4l2_event_pending(&fh->fh))
-		res = POLLPRI;
-	else if (req_events & POLLPRI)
+		res = EPOLLPRI;
+	else if (req_events & EPOLLPRI)
 		poll_wait(file, &fh->fh.wait, wait);
 	radio_enable(btv);
 	cmd.instance = file;
diff --git a/drivers/media/pci/cx18/cx18-fileops.c b/drivers/media/pci/cx18/cx18-fileops.c
index a8dbb922ba4b9..a3f44e30f8219 100644
--- a/drivers/media/pci/cx18/cx18-fileops.c
+++ b/drivers/media/pci/cx18/cx18-fileops.c
@@ -613,7 +613,7 @@ __poll_t cx18_v4l2_enc_poll(struct file *filp, poll_table *wait)
 
 	/* Start a capture if there is none */
 	if (!eof && !test_bit(CX18_F_S_STREAMING, &s->s_flags) &&
-			(req_events & (POLLIN | POLLRDNORM))) {
+			(req_events & (EPOLLIN | EPOLLRDNORM))) {
 		int rc;
 
 		mutex_lock(&cx->serialize_lock);
@@ -622,7 +622,7 @@ __poll_t cx18_v4l2_enc_poll(struct file *filp, poll_table *wait)
 		if (rc) {
 			CX18_DEBUG_INFO("Could not start capture for %s (%d)\n",
 					s->name, rc);
-			return POLLERR;
+			return EPOLLERR;
 		}
 		CX18_DEBUG_FILE("Encoder poll started capture\n");
 	}
@@ -632,23 +632,23 @@ __poll_t cx18_v4l2_enc_poll(struct file *filp, poll_table *wait)
 		__poll_t videobuf_poll = videobuf_poll_stream(filp, &s->vbuf_q, wait);
 
 		if (v4l2_event_pending(&id->fh))
-			res |= POLLPRI;
-		if (eof && videobuf_poll == POLLERR)
-			return res | POLLHUP;
+			res |= EPOLLPRI;
+		if (eof && videobuf_poll == EPOLLERR)
+			return res | EPOLLHUP;
 		return res | videobuf_poll;
 	}
 
 	/* add stream's waitq to the poll list */
 	CX18_DEBUG_HI_FILE("Encoder poll\n");
 	if (v4l2_event_pending(&id->fh))
-		res |= POLLPRI;
+		res |= EPOLLPRI;
 	else
 		poll_wait(filp, &s->waitq, wait);
 
 	if (atomic_read(&s->q_full.depth))
-		return res | POLLIN | POLLRDNORM;
+		return res | EPOLLIN | EPOLLRDNORM;
 	if (eof)
-		return res | POLLHUP;
+		return res | EPOLLHUP;
 	return res;
 }
 
diff --git a/drivers/media/pci/ddbridge/ddbridge-core.c b/drivers/media/pci/ddbridge/ddbridge-core.c
index 42b42824382cf..f9bee36f1cadb 100644
--- a/drivers/media/pci/ddbridge/ddbridge-core.c
+++ b/drivers/media/pci/ddbridge/ddbridge-core.c
@@ -745,9 +745,9 @@ static __poll_t ts_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &input->dma->wq, wait);
 	poll_wait(file, &output->dma->wq, wait);
 	if (ddb_input_avail(input) >= 188)
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	if (ddb_output_free(output) >= 188)
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 	return mask;
 }
 
diff --git a/drivers/media/pci/ivtv/ivtv-fileops.c b/drivers/media/pci/ivtv/ivtv-fileops.c
index 4aa7735072017..6196daae4b3e0 100644
--- a/drivers/media/pci/ivtv/ivtv-fileops.c
+++ b/drivers/media/pci/ivtv/ivtv-fileops.c
@@ -747,7 +747,7 @@ __poll_t ivtv_v4l2_dec_poll(struct file *filp, poll_table *wait)
 		/* Turn off the old-style vsync events */
 		clear_bit(IVTV_F_I_EV_VSYNC_ENABLED, &itv->i_flags);
 		if (v4l2_event_pending(&id->fh))
-			res = POLLPRI;
+			res = EPOLLPRI;
 	} else {
 		/* This is the old-style API which is here only for backwards
 		   compatibility. */
@@ -755,12 +755,12 @@ __poll_t ivtv_v4l2_dec_poll(struct file *filp, poll_table *wait)
 		set_bit(IVTV_F_I_EV_VSYNC_ENABLED, &itv->i_flags);
 		if (test_bit(IVTV_F_I_EV_VSYNC, &itv->i_flags) ||
 		    test_bit(IVTV_F_I_EV_DEC_STOPPED, &itv->i_flags))
-			res = POLLPRI;
+			res = EPOLLPRI;
 	}
 
 	/* Allow write if buffers are available for writing */
 	if (s->q_free.buffers)
-		res |= POLLOUT | POLLWRNORM;
+		res |= EPOLLOUT | EPOLLWRNORM;
 	return res;
 }
 
@@ -776,7 +776,7 @@ __poll_t ivtv_v4l2_enc_poll(struct file *filp, poll_table *wait)
 	/* Start a capture if there is none */
 	if (!eof && !test_bit(IVTV_F_S_STREAMING, &s->s_flags) &&
 			s->type != IVTV_ENC_STREAM_TYPE_RAD &&
-			(req_events & (POLLIN | POLLRDNORM))) {
+			(req_events & (EPOLLIN | EPOLLRDNORM))) {
 		int rc;
 
 		mutex_lock(&itv->serialize_lock);
@@ -785,7 +785,7 @@ __poll_t ivtv_v4l2_enc_poll(struct file *filp, poll_table *wait)
 		if (rc) {
 			IVTV_DEBUG_INFO("Could not start capture for %s (%d)\n",
 					s->name, rc);
-			return POLLERR;
+			return EPOLLERR;
 		}
 		IVTV_DEBUG_FILE("Encoder poll started capture\n");
 	}
@@ -794,14 +794,14 @@ __poll_t ivtv_v4l2_enc_poll(struct file *filp, poll_table *wait)
 	IVTV_DEBUG_HI_FILE("Encoder poll\n");
 	poll_wait(filp, &s->waitq, wait);
 	if (v4l2_event_pending(&id->fh))
-		res |= POLLPRI;
+		res |= EPOLLPRI;
 	else
 		poll_wait(filp, &id->fh.wait, wait);
 
 	if (s->q_full.length || s->q_io.length)
-		return res | POLLIN | POLLRDNORM;
+		return res | EPOLLIN | EPOLLRDNORM;
 	if (eof)
-		return res | POLLHUP;
+		return res | EPOLLHUP;
 	return res;
 }
 
diff --git a/drivers/media/pci/meye/meye.c b/drivers/media/pci/meye/meye.c
index ae83293723bac..dedcdb5734270 100644
--- a/drivers/media/pci/meye/meye.c
+++ b/drivers/media/pci/meye/meye.c
@@ -1430,7 +1430,7 @@ static __poll_t meye_poll(struct file *file, poll_table *wait)
 	mutex_lock(&meye.lock);
 	poll_wait(file, &meye.proc_list, wait);
 	if (kfifo_len(&meye.doneq))
-		res |= POLLIN | POLLRDNORM;
+		res |= EPOLLIN | EPOLLRDNORM;
 	mutex_unlock(&meye.lock);
 	return res;
 }
diff --git a/drivers/media/pci/saa7164/saa7164-encoder.c b/drivers/media/pci/saa7164/saa7164-encoder.c
index e7b31a5b14fdd..32136ebe4f61c 100644
--- a/drivers/media/pci/saa7164/saa7164-encoder.c
+++ b/drivers/media/pci/saa7164/saa7164-encoder.c
@@ -925,13 +925,13 @@ static __poll_t fops_poll(struct file *file, poll_table *wait)
 	saa7164_histogram_update(&port->poll_interval,
 		port->last_poll_msecs_diff);
 
-	if (!(req_events & (POLLIN | POLLRDNORM)))
+	if (!(req_events & (EPOLLIN | EPOLLRDNORM)))
 		return mask;
 
 	if (atomic_cmpxchg(&fh->v4l_reading, 0, 1) == 0) {
 		if (atomic_inc_return(&port->v4l_reader_count) == 1) {
 			if (saa7164_encoder_initialize(port) < 0)
-				return mask | POLLERR;
+				return mask | EPOLLERR;
 			saa7164_encoder_start_streaming(port);
 			msleep(200);
 		}
@@ -939,7 +939,7 @@ static __poll_t fops_poll(struct file *file, poll_table *wait)
 
 	/* Pull the first buffer from the used list */
 	if (!list_empty(&port->list_buf_used.list))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	return mask;
 }
diff --git a/drivers/media/pci/saa7164/saa7164-vbi.c b/drivers/media/pci/saa7164/saa7164-vbi.c
index 6f97c8f2e00d8..64ab91c24c186 100644
--- a/drivers/media/pci/saa7164/saa7164-vbi.c
+++ b/drivers/media/pci/saa7164/saa7164-vbi.c
@@ -650,7 +650,7 @@ static __poll_t fops_poll(struct file *file, poll_table *wait)
 
 	/* Pull the first buffer from the used list */
 	if (!list_empty(&port->list_buf_used.list))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	return mask;
 }
diff --git a/drivers/media/pci/ttpci/av7110_av.c b/drivers/media/pci/ttpci/av7110_av.c
index 4d10e2f979d20..4daba76ec240b 100644
--- a/drivers/media/pci/ttpci/av7110_av.c
+++ b/drivers/media/pci/ttpci/av7110_av.c
@@ -951,15 +951,15 @@ static __poll_t dvb_video_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &av7110->video_events.wait_queue, wait);
 
 	if (av7110->video_events.eventw != av7110->video_events.eventr)
-		mask = POLLPRI;
+		mask = EPOLLPRI;
 
 	if ((file->f_flags & O_ACCMODE) != O_RDONLY) {
 		if (av7110->playing) {
 			if (FREE_COND)
-				mask |= (POLLOUT | POLLWRNORM);
+				mask |= (EPOLLOUT | EPOLLWRNORM);
 		} else {
 			/* if not playing: may play if asked for */
-			mask |= (POLLOUT | POLLWRNORM);
+			mask |= (EPOLLOUT | EPOLLWRNORM);
 		}
 	}
 
@@ -1001,9 +1001,9 @@ static __poll_t dvb_audio_poll(struct file *file, poll_table *wait)
 
 	if (av7110->playing) {
 		if (dvb_ringbuffer_free(&av7110->aout) >= 20 * 1024)
-			mask |= (POLLOUT | POLLWRNORM);
+			mask |= (EPOLLOUT | EPOLLWRNORM);
 	} else /* if not playing: may play if asked for */
-		mask = (POLLOUT | POLLWRNORM);
+		mask = (EPOLLOUT | EPOLLWRNORM);
 
 	return mask;
 }
diff --git a/drivers/media/pci/ttpci/av7110_ca.c b/drivers/media/pci/ttpci/av7110_ca.c
index 96ca227cf51b6..d8c2f1b34d74c 100644
--- a/drivers/media/pci/ttpci/av7110_ca.c
+++ b/drivers/media/pci/ttpci/av7110_ca.c
@@ -237,10 +237,10 @@ static __poll_t dvb_ca_poll (struct file *file, poll_table *wait)
 	poll_wait(file, &wbuf->queue, wait);
 
 	if (!dvb_ringbuffer_empty(rbuf))
-		mask |= (POLLIN | POLLRDNORM);
+		mask |= (EPOLLIN | EPOLLRDNORM);
 
 	if (dvb_ringbuffer_free(wbuf) > 1024)
-		mask |= (POLLOUT | POLLWRNORM);
+		mask |= (EPOLLOUT | EPOLLWRNORM);
 
 	return mask;
 }
diff --git a/drivers/media/pci/zoran/zoran_driver.c b/drivers/media/pci/zoran/zoran_driver.c
index c464dae0389c9..8d4e7d930a663 100644
--- a/drivers/media/pci/zoran/zoran_driver.c
+++ b/drivers/media/pci/zoran/zoran_driver.c
@@ -2513,10 +2513,10 @@ zoran_poll (struct file *file,
 
 	/* we should check whether buffers are ready to be synced on
 	 * (w/o waits - O_NONBLOCK) here
-	 * if ready for read (sync), return POLLIN|POLLRDNORM,
-	 * if ready for write (sync), return POLLOUT|POLLWRNORM,
-	 * if error, return POLLERR,
-	 * if no buffers queued or so, return POLLNVAL
+	 * if ready for read (sync), return EPOLLIN|EPOLLRDNORM,
+	 * if ready for write (sync), return EPOLLOUT|EPOLLWRNORM,
+	 * if error, return EPOLLERR,
+	 * if no buffers queued or so, return EPOLLNVAL
 	 */
 
 	switch (fh->map_mode) {
@@ -2536,7 +2536,7 @@ zoran_poll (struct file *file,
 		if (fh->buffers.active != ZORAN_FREE &&
 		    /* Buffer ready to DQBUF? */
 		    zr->v4l_buffers.buffer[frame].state == BUZ_STATE_DONE)
-			res |= POLLIN | POLLRDNORM;
+			res |= EPOLLIN | EPOLLRDNORM;
 		spin_unlock_irqrestore(&zr->spinlock, flags);
 
 		break;
@@ -2557,9 +2557,9 @@ zoran_poll (struct file *file,
 		if (fh->buffers.active != ZORAN_FREE &&
 		    zr->jpg_buffers.buffer[frame].state == BUZ_STATE_DONE) {
 			if (fh->map_mode == ZORAN_MAP_MODE_JPG_REC)
-				res |= POLLIN | POLLRDNORM;
+				res |= EPOLLIN | EPOLLRDNORM;
 			else
-				res |= POLLOUT | POLLWRNORM;
+				res |= EPOLLOUT | EPOLLWRNORM;
 		}
 		spin_unlock_irqrestore(&zr->spinlock, flags);
 
@@ -2570,7 +2570,7 @@ zoran_poll (struct file *file,
 			KERN_ERR
 			"%s: %s - internal error, unknown map_mode=%d\n",
 			ZR_DEVNAME(zr), __func__, fh->map_mode);
-		res |= POLLERR;
+		res |= EPOLLERR;
 	}
 
 	return res;
diff --git a/drivers/media/platform/fsl-viu.c b/drivers/media/platform/fsl-viu.c
index de285a269390a..200c47c69a758 100644
--- a/drivers/media/platform/fsl-viu.c
+++ b/drivers/media/platform/fsl-viu.c
@@ -1272,9 +1272,9 @@ static __poll_t viu_poll(struct file *file, struct poll_table_struct *wait)
 	__poll_t res = v4l2_ctrl_poll(file, wait);
 
 	if (V4L2_BUF_TYPE_VIDEO_CAPTURE != fh->type)
-		return POLLERR;
+		return EPOLLERR;
 
-	if (!(req_events & (POLLIN | POLLRDNORM)))
+	if (!(req_events & (EPOLLIN | EPOLLRDNORM)))
 		return res;
 
 	mutex_lock(&dev->lock);
diff --git a/drivers/media/platform/s5p-mfc/s5p_mfc.c b/drivers/media/platform/s5p-mfc/s5p_mfc.c
index f15cf24c1c639..d5b94fc0040e4 100644
--- a/drivers/media/platform/s5p-mfc/s5p_mfc.c
+++ b/drivers/media/platform/s5p-mfc/s5p_mfc.c
@@ -1008,7 +1008,7 @@ static __poll_t s5p_mfc_poll(struct file *file,
 	 */
 	if ((!src_q->streaming || list_empty(&src_q->queued_list))
 		&& (!dst_q->streaming || list_empty(&dst_q->queued_list))) {
-		rc = POLLERR;
+		rc = EPOLLERR;
 		goto end;
 	}
 	mutex_unlock(&dev->mfc_mutex);
@@ -1017,14 +1017,14 @@ static __poll_t s5p_mfc_poll(struct file *file,
 	poll_wait(file, &dst_q->done_wq, wait);
 	mutex_lock(&dev->mfc_mutex);
 	if (v4l2_event_pending(&ctx->fh))
-		rc |= POLLPRI;
+		rc |= EPOLLPRI;
 	spin_lock_irqsave(&src_q->done_lock, flags);
 	if (!list_empty(&src_q->done_list))
 		src_vb = list_first_entry(&src_q->done_list, struct vb2_buffer,
 								done_entry);
 	if (src_vb && (src_vb->state == VB2_BUF_STATE_DONE
 				|| src_vb->state == VB2_BUF_STATE_ERROR))
-		rc |= POLLOUT | POLLWRNORM;
+		rc |= EPOLLOUT | EPOLLWRNORM;
 	spin_unlock_irqrestore(&src_q->done_lock, flags);
 	spin_lock_irqsave(&dst_q->done_lock, flags);
 	if (!list_empty(&dst_q->done_list))
@@ -1032,7 +1032,7 @@ static __poll_t s5p_mfc_poll(struct file *file,
 								done_entry);
 	if (dst_vb && (dst_vb->state == VB2_BUF_STATE_DONE
 				|| dst_vb->state == VB2_BUF_STATE_ERROR))
-		rc |= POLLIN | POLLRDNORM;
+		rc |= EPOLLIN | EPOLLRDNORM;
 	spin_unlock_irqrestore(&dst_q->done_lock, flags);
 end:
 	mutex_unlock(&dev->mfc_mutex);
diff --git a/drivers/media/platform/soc_camera/soc_camera.c b/drivers/media/platform/soc_camera/soc_camera.c
index 70fc5f01942de..c86dd2fdab84a 100644
--- a/drivers/media/platform/soc_camera/soc_camera.c
+++ b/drivers/media/platform/soc_camera/soc_camera.c
@@ -809,10 +809,10 @@ static __poll_t soc_camera_poll(struct file *file, poll_table *pt)
 {
 	struct soc_camera_device *icd = file->private_data;
 	struct soc_camera_host *ici = to_soc_camera_host(icd->parent);
-	__poll_t res = POLLERR;
+	__poll_t res = EPOLLERR;
 
 	if (icd->streamer != file)
-		return POLLERR;
+		return EPOLLERR;
 
 	mutex_lock(&ici->host_lock);
 	res = ici->ops->poll(file, pt);
diff --git a/drivers/media/platform/vivid/vivid-radio-rx.c b/drivers/media/platform/vivid/vivid-radio-rx.c
index fcb7a9f015b6b..f834f7df8cf9f 100644
--- a/drivers/media/platform/vivid/vivid-radio-rx.c
+++ b/drivers/media/platform/vivid/vivid-radio-rx.c
@@ -142,7 +142,7 @@ ssize_t vivid_radio_rx_read(struct file *file, char __user *buf,
 
 __poll_t vivid_radio_rx_poll(struct file *file, struct poll_table_struct *wait)
 {
-	return POLLIN | POLLRDNORM | v4l2_ctrl_poll(file, wait);
+	return EPOLLIN | EPOLLRDNORM | v4l2_ctrl_poll(file, wait);
 }
 
 int vivid_radio_rx_enum_freq_bands(struct file *file, void *fh, struct v4l2_frequency_band *band)
diff --git a/drivers/media/platform/vivid/vivid-radio-tx.c b/drivers/media/platform/vivid/vivid-radio-tx.c
index af4907a197a33..308b13f85dc08 100644
--- a/drivers/media/platform/vivid/vivid-radio-tx.c
+++ b/drivers/media/platform/vivid/vivid-radio-tx.c
@@ -105,7 +105,7 @@ ssize_t vivid_radio_tx_write(struct file *file, const char __user *buf,
 
 __poll_t vivid_radio_tx_poll(struct file *file, struct poll_table_struct *wait)
 {
-	return POLLOUT | POLLWRNORM | v4l2_ctrl_poll(file, wait);
+	return EPOLLOUT | EPOLLWRNORM | v4l2_ctrl_poll(file, wait);
 }
 
 int vidioc_g_modulator(struct file *file, void *fh, struct v4l2_modulator *a)
diff --git a/drivers/media/radio/radio-cadet.c b/drivers/media/radio/radio-cadet.c
index af7c68b344d1a..5b82e63885cdd 100644
--- a/drivers/media/radio/radio-cadet.c
+++ b/drivers/media/radio/radio-cadet.c
@@ -488,14 +488,14 @@ static __poll_t cadet_poll(struct file *file, struct poll_table_struct *wait)
 	__poll_t res = v4l2_ctrl_poll(file, wait);
 
 	poll_wait(file, &dev->read_queue, wait);
-	if (dev->rdsstat == 0 && (req_events & (POLLIN | POLLRDNORM))) {
+	if (dev->rdsstat == 0 && (req_events & (EPOLLIN | EPOLLRDNORM))) {
 		mutex_lock(&dev->lock);
 		if (dev->rdsstat == 0)
 			cadet_start_rds(dev);
 		mutex_unlock(&dev->lock);
 	}
 	if (cadet_has_rds_data(dev))
-		res |= POLLIN | POLLRDNORM;
+		res |= EPOLLIN | EPOLLRDNORM;
 	return res;
 }
 
diff --git a/drivers/media/radio/radio-si476x.c b/drivers/media/radio/radio-si476x.c
index bff9789ae9bc1..b52e678c6901c 100644
--- a/drivers/media/radio/radio-si476x.c
+++ b/drivers/media/radio/radio-si476x.c
@@ -1158,15 +1158,15 @@ static __poll_t si476x_radio_fops_poll(struct file *file,
 	__poll_t req_events = poll_requested_events(pts);
 	__poll_t err = v4l2_ctrl_poll(file, pts);
 
-	if (req_events & (POLLIN | POLLRDNORM)) {
+	if (req_events & (EPOLLIN | EPOLLRDNORM)) {
 		if (atomic_read(&radio->core->is_alive))
 			poll_wait(file, &radio->core->rds_read_queue, pts);
 
 		if (!atomic_read(&radio->core->is_alive))
-			err = POLLHUP;
+			err = EPOLLHUP;
 
 		if (!kfifo_is_empty(&radio->core->rds_fifo))
-			err = POLLIN | POLLRDNORM;
+			err = EPOLLIN | EPOLLRDNORM;
 	}
 
 	return err;
diff --git a/drivers/media/radio/radio-wl1273.c b/drivers/media/radio/radio-wl1273.c
index f92b0f9241a99..58e9445916026 100644
--- a/drivers/media/radio/radio-wl1273.c
+++ b/drivers/media/radio/radio-wl1273.c
@@ -1104,10 +1104,10 @@ static __poll_t wl1273_fm_fops_poll(struct file *file,
 		poll_wait(file, &radio->read_queue, pts);
 
 		if (radio->rd_index != radio->wr_index)
-			return POLLIN | POLLRDNORM;
+			return EPOLLIN | EPOLLRDNORM;
 
 	} else if (core->mode == WL1273_MODE_TX) {
-		return POLLOUT | POLLWRNORM;
+		return EPOLLOUT | EPOLLWRNORM;
 	}
 
 	return 0;
diff --git a/drivers/media/radio/si470x/radio-si470x-common.c b/drivers/media/radio/si470x/radio-si470x-common.c
index 5b477b7d6a66d..e0054e0f410df 100644
--- a/drivers/media/radio/si470x/radio-si470x-common.c
+++ b/drivers/media/radio/si470x/radio-si470x-common.c
@@ -514,7 +514,7 @@ static __poll_t si470x_fops_poll(struct file *file,
 	__poll_t req_events = poll_requested_events(pts);
 	__poll_t retval = v4l2_ctrl_poll(file, pts);
 
-	if (req_events & (POLLIN | POLLRDNORM)) {
+	if (req_events & (EPOLLIN | EPOLLRDNORM)) {
 		/* switch on rds reception */
 		if ((radio->registers[SYSCONFIG1] & SYSCONFIG1_RDS) == 0)
 			si470x_rds_on(radio);
@@ -522,7 +522,7 @@ static __poll_t si470x_fops_poll(struct file *file,
 		poll_wait(file, &radio->read_queue, pts);
 
 		if (radio->rd_index != radio->wr_index)
-			retval |= POLLIN | POLLRDNORM;
+			retval |= EPOLLIN | EPOLLRDNORM;
 	}
 
 	return retval;
diff --git a/drivers/media/radio/wl128x/fmdrv_v4l2.c b/drivers/media/radio/wl128x/fmdrv_v4l2.c
index fd603c1b96bbf..dccdf6558e6ab 100644
--- a/drivers/media/radio/wl128x/fmdrv_v4l2.c
+++ b/drivers/media/radio/wl128x/fmdrv_v4l2.c
@@ -112,7 +112,7 @@ static __poll_t fm_v4l2_fops_poll(struct file *file, struct poll_table_struct *p
 	ret = fmc_is_rds_data_available(fmdev, file, pts);
 	mutex_unlock(&fmdev->mutex);
 	if (ret < 0)
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 
 	return 0;
 }
diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c
index b3544988586e8..cc863044c880a 100644
--- a/drivers/media/rc/lirc_dev.c
+++ b/drivers/media/rc/lirc_dev.c
@@ -109,7 +109,7 @@ void ir_lirc_raw_event(struct rc_dev *dev, struct ir_raw_event ev)
 		if (LIRC_IS_TIMEOUT(sample) && !fh->send_timeout_reports)
 			continue;
 		if (kfifo_put(&fh->rawir, sample))
-			wake_up_poll(&fh->wait_poll, POLLIN | POLLRDNORM);
+			wake_up_poll(&fh->wait_poll, EPOLLIN | EPOLLRDNORM);
 	}
 	spin_unlock_irqrestore(&dev->lirc_fh_lock, flags);
 }
@@ -130,7 +130,7 @@ void ir_lirc_scancode_event(struct rc_dev *dev, struct lirc_scancode *lsc)
 	spin_lock_irqsave(&dev->lirc_fh_lock, flags);
 	list_for_each_entry(fh, &dev->lirc_fh, list) {
 		if (kfifo_put(&fh->scancodes, *lsc))
-			wake_up_poll(&fh->wait_poll, POLLIN | POLLRDNORM);
+			wake_up_poll(&fh->wait_poll, EPOLLIN | EPOLLRDNORM);
 	}
 	spin_unlock_irqrestore(&dev->lirc_fh_lock, flags);
 }
@@ -603,15 +603,15 @@ static __poll_t ir_lirc_poll(struct file *file, struct poll_table_struct *wait)
 	poll_wait(file, &fh->wait_poll, wait);
 
 	if (!rcdev->registered) {
-		events = POLLHUP | POLLERR;
+		events = EPOLLHUP | EPOLLERR;
 	} else if (rcdev->driver_type != RC_DRIVER_IR_RAW_TX) {
 		if (fh->rec_mode == LIRC_MODE_SCANCODE &&
 		    !kfifo_is_empty(&fh->scancodes))
-			events = POLLIN | POLLRDNORM;
+			events = EPOLLIN | EPOLLRDNORM;
 
 		if (fh->rec_mode == LIRC_MODE_MODE2 &&
 		    !kfifo_is_empty(&fh->rawir))
-			events = POLLIN | POLLRDNORM;
+			events = EPOLLIN | EPOLLRDNORM;
 	}
 
 	return events;
@@ -779,7 +779,7 @@ void ir_lirc_unregister(struct rc_dev *dev)
 
 	spin_lock_irqsave(&dev->lirc_fh_lock, flags);
 	list_for_each_entry(fh, &dev->lirc_fh, list)
-		wake_up_poll(&fh->wait_poll, POLLHUP | POLLERR);
+		wake_up_poll(&fh->wait_poll, EPOLLHUP | EPOLLERR);
 	spin_unlock_irqrestore(&dev->lirc_fh_lock, flags);
 
 	cdev_device_del(&dev->lirc_cdev, &dev->lirc_dev);
diff --git a/drivers/media/usb/cpia2/cpia2_core.c b/drivers/media/usb/cpia2/cpia2_core.c
index e7524920c6187..3dfbb545c0e38 100644
--- a/drivers/media/usb/cpia2/cpia2_core.c
+++ b/drivers/media/usb/cpia2/cpia2_core.c
@@ -2375,7 +2375,7 @@ __poll_t cpia2_poll(struct camera_data *cam, struct file *filp,
 {
 	__poll_t status = v4l2_ctrl_poll(filp, wait);
 
-	if ((poll_requested_events(wait) & (POLLIN | POLLRDNORM)) &&
+	if ((poll_requested_events(wait) & (EPOLLIN | EPOLLRDNORM)) &&
 			!cam->streaming) {
 		/* Start streaming */
 		cpia2_usb_stream_start(cam,
@@ -2385,7 +2385,7 @@ __poll_t cpia2_poll(struct camera_data *cam, struct file *filp,
 	poll_wait(filp, &cam->wq_stream, wait);
 
 	if (cam->curbuff->status == FRAME_READY)
-		status |= POLLIN | POLLRDNORM;
+		status |= EPOLLIN | EPOLLRDNORM;
 
 	return status;
 }
diff --git a/drivers/media/usb/cx231xx/cx231xx-417.c b/drivers/media/usb/cx231xx/cx231xx-417.c
index 103e3299b77f1..b80e6857e2eba 100644
--- a/drivers/media/usb/cx231xx/cx231xx-417.c
+++ b/drivers/media/usb/cx231xx/cx231xx-417.c
@@ -1821,11 +1821,11 @@ static __poll_t mpeg_poll(struct file *file,
 	__poll_t res = 0;
 
 	if (v4l2_event_pending(&fh->fh))
-		res |= POLLPRI;
+		res |= EPOLLPRI;
 	else
 		poll_wait(file, &fh->fh.wait, wait);
 
-	if (!(req_events & (POLLIN | POLLRDNORM)))
+	if (!(req_events & (EPOLLIN | EPOLLRDNORM)))
 		return res;
 
 	mutex_lock(&dev->lock);
diff --git a/drivers/media/usb/cx231xx/cx231xx-video.c b/drivers/media/usb/cx231xx/cx231xx-video.c
index 271f35208c494..5b321b8ada3ac 100644
--- a/drivers/media/usb/cx231xx/cx231xx-video.c
+++ b/drivers/media/usb/cx231xx/cx231xx-video.c
@@ -2018,19 +2018,19 @@ static __poll_t cx231xx_v4l2_poll(struct file *filp, poll_table *wait)
 
 	rc = check_dev(dev);
 	if (rc < 0)
-		return POLLERR;
+		return EPOLLERR;
 
 	rc = res_get(fh);
 
 	if (unlikely(rc < 0))
-		return POLLERR;
+		return EPOLLERR;
 
 	if (v4l2_event_pending(&fh->fh))
-		res |= POLLPRI;
+		res |= EPOLLPRI;
 	else
 		poll_wait(filp, &fh->fh.wait, wait);
 
-	if (!(req_events & (POLLIN | POLLRDNORM)))
+	if (!(req_events & (EPOLLIN | EPOLLRDNORM)))
 		return res;
 
 	if ((V4L2_BUF_TYPE_VIDEO_CAPTURE == fh->type) ||
@@ -2040,7 +2040,7 @@ static __poll_t cx231xx_v4l2_poll(struct file *filp, poll_table *wait)
 		mutex_unlock(&dev->lock);
 		return res;
 	}
-	return res | POLLERR;
+	return res | EPOLLERR;
 }
 
 /*
diff --git a/drivers/media/usb/gspca/gspca.c b/drivers/media/usb/gspca/gspca.c
index 87e18d0c57664..d29773b8f696d 100644
--- a/drivers/media/usb/gspca/gspca.c
+++ b/drivers/media/usb/gspca/gspca.c
@@ -1877,14 +1877,14 @@ static __poll_t dev_poll(struct file *file, poll_table *wait)
 
 	gspca_dbg(gspca_dev, D_FRAM, "poll\n");
 
-	if (req_events & POLLPRI)
+	if (req_events & EPOLLPRI)
 		ret |= v4l2_ctrl_poll(file, wait);
 
-	if (req_events & (POLLIN | POLLRDNORM)) {
+	if (req_events & (EPOLLIN | EPOLLRDNORM)) {
 		/* if reqbufs is not done, the user would use read() */
 		if (gspca_dev->memory == GSPCA_MEMORY_NO) {
 			if (read_alloc(gspca_dev, file) != 0) {
-				ret |= POLLERR;
+				ret |= EPOLLERR;
 				goto out;
 			}
 		}
@@ -1893,17 +1893,17 @@ static __poll_t dev_poll(struct file *file, poll_table *wait)
 
 		/* check if an image has been received */
 		if (mutex_lock_interruptible(&gspca_dev->queue_lock) != 0) {
-			ret |= POLLERR;
+			ret |= EPOLLERR;
 			goto out;
 		}
 		if (gspca_dev->fr_o != atomic_read(&gspca_dev->fr_i))
-			ret |= POLLIN | POLLRDNORM;
+			ret |= EPOLLIN | EPOLLRDNORM;
 		mutex_unlock(&gspca_dev->queue_lock);
 	}
 
 out:
 	if (!gspca_dev->present)
-		ret |= POLLHUP;
+		ret |= EPOLLHUP;
 
 	return ret;
 }
diff --git a/drivers/media/usb/hdpvr/hdpvr-video.c b/drivers/media/usb/hdpvr/hdpvr-video.c
index 660d4a65401f5..77c3d331ff314 100644
--- a/drivers/media/usb/hdpvr/hdpvr-video.c
+++ b/drivers/media/usb/hdpvr/hdpvr-video.c
@@ -528,7 +528,7 @@ static __poll_t hdpvr_poll(struct file *filp, poll_table *wait)
 	struct hdpvr_device *dev = video_drvdata(filp);
 	__poll_t mask = v4l2_ctrl_poll(filp, wait);
 
-	if (!(req_events & (POLLIN | POLLRDNORM)))
+	if (!(req_events & (EPOLLIN | EPOLLRDNORM)))
 		return mask;
 
 	mutex_lock(&dev->io_mutex);
@@ -553,7 +553,7 @@ static __poll_t hdpvr_poll(struct file *filp, poll_table *wait)
 		buf = hdpvr_get_next_buffer(dev);
 	}
 	if (buf && buf->status == BUFSTAT_READY)
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	return mask;
 }
diff --git a/drivers/media/usb/pvrusb2/pvrusb2-v4l2.c b/drivers/media/usb/pvrusb2/pvrusb2-v4l2.c
index ad6290e1b6999..9fdc57c1658fc 100644
--- a/drivers/media/usb/pvrusb2/pvrusb2-v4l2.c
+++ b/drivers/media/usb/pvrusb2/pvrusb2-v4l2.c
@@ -1181,19 +1181,19 @@ static __poll_t pvr2_v4l2_poll(struct file *file, poll_table *wait)
 	int ret;
 
 	if (fh->fw_mode_flag) {
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 		return mask;
 	}
 
 	if (!fh->rhp) {
 		ret = pvr2_v4l2_iosetup(fh);
-		if (ret) return POLLERR;
+		if (ret) return EPOLLERR;
 	}
 
 	poll_wait(file,&fh->wait_data,wait);
 
 	if (pvr2_ioread_avail(fh->rhp) >= 0) {
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	}
 
 	return mask;
diff --git a/drivers/media/usb/stkwebcam/stk-webcam.c b/drivers/media/usb/stkwebcam/stk-webcam.c
index 17ad978c01726..22389b56ec246 100644
--- a/drivers/media/usb/stkwebcam/stk-webcam.c
+++ b/drivers/media/usb/stkwebcam/stk-webcam.c
@@ -729,10 +729,10 @@ static __poll_t v4l_stk_poll(struct file *fp, poll_table *wait)
 	poll_wait(fp, &dev->wait_frame, wait);
 
 	if (!is_present(dev))
-		return POLLERR;
+		return EPOLLERR;
 
 	if (!list_empty(&dev->sio_full))
-		return res | POLLIN | POLLRDNORM;
+		return res | EPOLLIN | EPOLLRDNORM;
 
 	return res;
 }
diff --git a/drivers/media/usb/tm6000/tm6000-video.c b/drivers/media/usb/tm6000/tm6000-video.c
index 96266fa4738c5..8314d3fa9241b 100644
--- a/drivers/media/usb/tm6000/tm6000-video.c
+++ b/drivers/media/usb/tm6000/tm6000-video.c
@@ -1424,25 +1424,25 @@ __tm6000_poll(struct file *file, struct poll_table_struct *wait)
 	__poll_t res = 0;
 
 	if (v4l2_event_pending(&fh->fh))
-		res = POLLPRI;
-	else if (req_events & POLLPRI)
+		res = EPOLLPRI;
+	else if (req_events & EPOLLPRI)
 		poll_wait(file, &fh->fh.wait, wait);
 	if (V4L2_BUF_TYPE_VIDEO_CAPTURE != fh->type)
-		return res | POLLERR;
+		return res | EPOLLERR;
 
 	if (!!is_res_streaming(fh->dev, fh))
-		return res | POLLERR;
+		return res | EPOLLERR;
 
 	if (!is_res_read(fh->dev, fh)) {
 		/* streaming capture */
 		if (list_empty(&fh->vb_vidq.stream))
-			return res | POLLERR;
+			return res | EPOLLERR;
 		buf = list_entry(fh->vb_vidq.stream.next, struct tm6000_buffer, vb.stream);
 		poll_wait(file, &buf->vb.done, wait);
 		if (buf->vb.state == VIDEOBUF_DONE ||
 		    buf->vb.state == VIDEOBUF_ERROR)
-			return res | POLLIN | POLLRDNORM;
-	} else if (req_events & (POLLIN | POLLRDNORM)) {
+			return res | EPOLLIN | EPOLLRDNORM;
+	} else if (req_events & (EPOLLIN | EPOLLRDNORM)) {
 		/* read() capture */
 		return res | videobuf_poll_stream(file, &fh->vb_vidq, wait);
 	}
diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c
index b076571494345..ce08b50b82900 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls.c
@@ -3462,7 +3462,7 @@ __poll_t v4l2_ctrl_poll(struct file *file, struct poll_table_struct *wait)
 	struct v4l2_fh *fh = file->private_data;
 
 	if (v4l2_event_pending(fh))
-		return POLLPRI;
+		return EPOLLPRI;
 	poll_wait(file, &fh->wait, wait);
 	return 0;
 }
diff --git a/drivers/media/v4l2-core/v4l2-dev.c b/drivers/media/v4l2-core/v4l2-dev.c
index cd8127d3f863b..0301fe426a435 100644
--- a/drivers/media/v4l2-core/v4l2-dev.c
+++ b/drivers/media/v4l2-core/v4l2-dev.c
@@ -334,7 +334,7 @@ static ssize_t v4l2_write(struct file *filp, const char __user *buf,
 static __poll_t v4l2_poll(struct file *filp, struct poll_table_struct *poll)
 {
 	struct video_device *vdev = video_devdata(filp);
-	__poll_t res = POLLERR | POLLHUP;
+	__poll_t res = EPOLLERR | EPOLLHUP;
 
 	if (!vdev->fops->poll)
 		return DEFAULT_POLLMASK;
diff --git a/drivers/media/v4l2-core/v4l2-mem2mem.c b/drivers/media/v4l2-core/v4l2-mem2mem.c
index 186156f8952ab..c4f963d96a79d 100644
--- a/drivers/media/v4l2-core/v4l2-mem2mem.c
+++ b/drivers/media/v4l2-core/v4l2-mem2mem.c
@@ -514,10 +514,10 @@ __poll_t v4l2_m2m_poll(struct file *file, struct v4l2_m2m_ctx *m2m_ctx,
 		struct v4l2_fh *fh = file->private_data;
 
 		if (v4l2_event_pending(fh))
-			rc = POLLPRI;
-		else if (req_events & POLLPRI)
+			rc = EPOLLPRI;
+		else if (req_events & EPOLLPRI)
 			poll_wait(file, &fh->wait, wait);
-		if (!(req_events & (POLLOUT | POLLWRNORM | POLLIN | POLLRDNORM)))
+		if (!(req_events & (EPOLLOUT | EPOLLWRNORM | EPOLLIN | EPOLLRDNORM)))
 			return rc;
 	}
 
@@ -531,7 +531,7 @@ __poll_t v4l2_m2m_poll(struct file *file, struct v4l2_m2m_ctx *m2m_ctx,
 	 */
 	if ((!src_q->streaming || list_empty(&src_q->queued_list))
 		&& (!dst_q->streaming || list_empty(&dst_q->queued_list))) {
-		rc |= POLLERR;
+		rc |= EPOLLERR;
 		goto end;
 	}
 
@@ -548,7 +548,7 @@ __poll_t v4l2_m2m_poll(struct file *file, struct v4l2_m2m_ctx *m2m_ctx,
 		 */
 		if (dst_q->last_buffer_dequeued) {
 			spin_unlock_irqrestore(&dst_q->done_lock, flags);
-			return rc | POLLIN | POLLRDNORM;
+			return rc | EPOLLIN | EPOLLRDNORM;
 		}
 
 		poll_wait(file, &dst_q->done_wq, wait);
@@ -561,7 +561,7 @@ __poll_t v4l2_m2m_poll(struct file *file, struct v4l2_m2m_ctx *m2m_ctx,
 						done_entry);
 	if (src_vb && (src_vb->state == VB2_BUF_STATE_DONE
 			|| src_vb->state == VB2_BUF_STATE_ERROR))
-		rc |= POLLOUT | POLLWRNORM;
+		rc |= EPOLLOUT | EPOLLWRNORM;
 	spin_unlock_irqrestore(&src_q->done_lock, flags);
 
 	spin_lock_irqsave(&dst_q->done_lock, flags);
@@ -570,7 +570,7 @@ __poll_t v4l2_m2m_poll(struct file *file, struct v4l2_m2m_ctx *m2m_ctx,
 						done_entry);
 	if (dst_vb && (dst_vb->state == VB2_BUF_STATE_DONE
 			|| dst_vb->state == VB2_BUF_STATE_ERROR))
-		rc |= POLLIN | POLLRDNORM;
+		rc |= EPOLLIN | EPOLLRDNORM;
 	spin_unlock_irqrestore(&dst_q->done_lock, flags);
 
 end:
diff --git a/drivers/media/v4l2-core/v4l2-subdev.c b/drivers/media/v4l2-core/v4l2-subdev.c
index 28966fa8c6100..c5639817db349 100644
--- a/drivers/media/v4l2-core/v4l2-subdev.c
+++ b/drivers/media/v4l2-core/v4l2-subdev.c
@@ -476,12 +476,12 @@ static __poll_t subdev_poll(struct file *file, poll_table *wait)
 	struct v4l2_fh *fh = file->private_data;
 
 	if (!(sd->flags & V4L2_SUBDEV_FL_HAS_EVENTS))
-		return POLLERR;
+		return EPOLLERR;
 
 	poll_wait(file, &fh->wait, wait);
 
 	if (v4l2_event_pending(fh))
-		return POLLPRI;
+		return EPOLLPRI;
 
 	return 0;
 }
diff --git a/drivers/media/v4l2-core/videobuf-core.c b/drivers/media/v4l2-core/videobuf-core.c
index 9a89d3ae170fd..2b3981842b4b7 100644
--- a/drivers/media/v4l2-core/videobuf-core.c
+++ b/drivers/media/v4l2-core/videobuf-core.c
@@ -1131,11 +1131,11 @@ __poll_t videobuf_poll_stream(struct file *file,
 		if (!list_empty(&q->stream))
 			buf = list_entry(q->stream.next,
 					 struct videobuf_buffer, stream);
-	} else if (req_events & (POLLIN | POLLRDNORM)) {
+	} else if (req_events & (EPOLLIN | EPOLLRDNORM)) {
 		if (!q->reading)
 			__videobuf_read_start(q);
 		if (!q->reading) {
-			rc = POLLERR;
+			rc = EPOLLERR;
 		} else if (NULL == q->read_buf) {
 			q->read_buf = list_entry(q->stream.next,
 						 struct videobuf_buffer,
@@ -1146,7 +1146,7 @@ __poll_t videobuf_poll_stream(struct file *file,
 		buf = q->read_buf;
 	}
 	if (!buf)
-		rc = POLLERR;
+		rc = EPOLLERR;
 
 	if (0 == rc) {
 		poll_wait(file, &buf->done, wait);
@@ -1157,10 +1157,10 @@ __poll_t videobuf_poll_stream(struct file *file,
 			case V4L2_BUF_TYPE_VBI_OUTPUT:
 			case V4L2_BUF_TYPE_SLICED_VBI_OUTPUT:
 			case V4L2_BUF_TYPE_SDR_OUTPUT:
-				rc = POLLOUT | POLLWRNORM;
+				rc = EPOLLOUT | EPOLLWRNORM;
 				break;
 			default:
-				rc = POLLIN | POLLRDNORM;
+				rc = EPOLLIN | EPOLLRDNORM;
 				break;
 			}
 		}
diff --git a/drivers/mfd/ab8500-debugfs.c b/drivers/mfd/ab8500-debugfs.c
index fcb3a92ae85f8..8ba41073dd89f 100644
--- a/drivers/mfd/ab8500-debugfs.c
+++ b/drivers/mfd/ab8500-debugfs.c
@@ -1267,7 +1267,7 @@ static irqreturn_t ab8500_debug_handler(int irq, void *data)
 	if (irq_abb < num_irqs)
 		irq_count[irq_abb]++;
 	/*
-	 * This makes it possible to use poll for events (POLLPRI | POLLERR)
+	 * This makes it possible to use poll for events (EPOLLPRI | EPOLLERR)
 	 * from userspace on sysfs file named <irq-nr>
 	 */
 	sprintf(buf, "%d", irq);
diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c
index 0162516f5e57b..bd6ddbdb5cd16 100644
--- a/drivers/misc/cxl/file.c
+++ b/drivers/misc/cxl/file.c
@@ -378,11 +378,11 @@ __poll_t afu_poll(struct file *file, struct poll_table_struct *poll)
 
 	spin_lock_irqsave(&ctx->lock, flags);
 	if (ctx_event_pending(ctx))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	else if (ctx->status == CLOSED)
 		/* Only error on closed when there are no futher events pending
 		 */
-		mask |= POLLERR;
+		mask |= EPOLLERR;
 	spin_unlock_irqrestore(&ctx->lock, flags);
 
 	pr_devel("afu_poll pe: %i returning %#x\n", ctx->pe, mask);
diff --git a/drivers/misc/hpilo.c b/drivers/misc/hpilo.c
index 35693c0a78e28..e9c9ef52c76a8 100644
--- a/drivers/misc/hpilo.c
+++ b/drivers/misc/hpilo.c
@@ -519,9 +519,9 @@ static __poll_t ilo_poll(struct file *fp, poll_table *wait)
 	poll_wait(fp, &data->ccb_waitq, wait);
 
 	if (is_channel_reset(driver_ccb))
-		return POLLERR;
+		return EPOLLERR;
 	else if (ilo_pkt_recv(data->ilo_hw, driver_ccb))
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 
 	return 0;
 }
diff --git a/drivers/misc/lis3lv02d/lis3lv02d.c b/drivers/misc/lis3lv02d/lis3lv02d.c
index e49888eab87d5..e9bb1cfa6a7a2 100644
--- a/drivers/misc/lis3lv02d/lis3lv02d.c
+++ b/drivers/misc/lis3lv02d/lis3lv02d.c
@@ -658,7 +658,7 @@ static __poll_t lis3lv02d_misc_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, &lis3->misc_wait, wait);
 	if (atomic_read(&lis3->count))
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	return 0;
 }
 
diff --git a/drivers/misc/mei/main.c b/drivers/misc/mei/main.c
index 505b710291e69..758dc73602d5e 100644
--- a/drivers/misc/mei/main.c
+++ b/drivers/misc/mei/main.c
@@ -551,31 +551,31 @@ static __poll_t mei_poll(struct file *file, poll_table *wait)
 	bool notify_en;
 
 	if (WARN_ON(!cl || !cl->dev))
-		return POLLERR;
+		return EPOLLERR;
 
 	dev = cl->dev;
 
 	mutex_lock(&dev->device_lock);
 
-	notify_en = cl->notify_en && (req_events & POLLPRI);
+	notify_en = cl->notify_en && (req_events & EPOLLPRI);
 
 	if (dev->dev_state != MEI_DEV_ENABLED ||
 	    !mei_cl_is_connected(cl)) {
-		mask = POLLERR;
+		mask = EPOLLERR;
 		goto out;
 	}
 
 	if (notify_en) {
 		poll_wait(file, &cl->ev_wait, wait);
 		if (cl->notify_ev)
-			mask |= POLLPRI;
+			mask |= EPOLLPRI;
 	}
 
-	if (req_events & (POLLIN | POLLRDNORM)) {
+	if (req_events & (EPOLLIN | EPOLLRDNORM)) {
 		poll_wait(file, &cl->rx_wait, wait);
 
 		if (!list_empty(&cl->rd_completed))
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 		else
 			mei_cl_read_start(cl, mei_cl_mtu(cl), file);
 	}
diff --git a/drivers/misc/mic/cosm/cosm_scif_server.c b/drivers/misc/mic/cosm/cosm_scif_server.c
index 85f7d09cc65fd..05a63286741c8 100644
--- a/drivers/misc/mic/cosm/cosm_scif_server.c
+++ b/drivers/misc/mic/cosm/cosm_scif_server.c
@@ -55,7 +55,7 @@
  *    message being sent to host SCIF. SCIF_DISCNCT message processing on the
  *    host SCIF sets the host COSM SCIF endpoint state to DISCONNECTED and wakes
  *    up the host COSM thread blocked in scif_poll(..) resulting in
- *    scif_poll(..)  returning POLLHUP.
+ *    scif_poll(..)  returning EPOLLHUP.
  * 5. On the card, scif_peer_release_dev is next called which results in an
  *    SCIF_EXIT message being sent to the host and after receiving the
  *    SCIF_EXIT_ACK from the host the peer device teardown on the card is
@@ -79,7 +79,7 @@
  *    processing. This results in the COSM endpoint on the card being closed and
  *    the SCIF host peer device on the card getting unregistered similar to
  *    steps 3, 4 and 5 for the card shutdown case above. scif_poll(..) on the
- *    host returns POLLHUP as a result.
+ *    host returns EPOLLHUP as a result.
  * 4. On the host, card peer device unregister and SCIF HW remove(..) also
  *    subsequently complete.
  *
@@ -87,11 +87,11 @@
  * ----------
  * If a reset is issued after the card has crashed, there is no SCIF_DISCNT
  * message from the card which would result in scif_poll(..) returning
- * POLLHUP. In this case when the host SCIF driver sends a SCIF_REMOVE_NODE
+ * EPOLLHUP. In this case when the host SCIF driver sends a SCIF_REMOVE_NODE
  * message to itself resulting in the card SCIF peer device being unregistered,
  * this results in a scif_peer_release_dev -> scif_cleanup_scifdev->
  * scif_invalidate_ep call sequence which sets the endpoint state to
- * DISCONNECTED and results in scif_poll(..) returning POLLHUP.
+ * DISCONNECTED and results in scif_poll(..) returning EPOLLHUP.
  */
 
 #define COSM_SCIF_BACKLOG 16
@@ -190,7 +190,7 @@ static void cosm_send_time(struct cosm_device *cdev)
 
 /*
  * Close this cosm_device's endpoint after its peer endpoint on the card has
- * been closed. In all cases except MIC card crash POLLHUP on the host is
+ * been closed. In all cases except MIC card crash EPOLLHUP on the host is
  * triggered by the client's endpoint being closed.
  */
 static void cosm_scif_close(struct cosm_device *cdev)
@@ -252,7 +252,7 @@ void cosm_scif_work(struct work_struct *work)
 
 	while (1) {
 		pollepd.epd = cdev->epd;
-		pollepd.events = POLLIN;
+		pollepd.events = EPOLLIN;
 
 		/* Drop the mutex before blocking in scif_poll(..) */
 		mutex_unlock(&cdev->cosm_mutex);
@@ -266,11 +266,11 @@ void cosm_scif_work(struct work_struct *work)
 		}
 
 		/* There is a message from the card */
-		if (pollepd.revents & POLLIN)
+		if (pollepd.revents & EPOLLIN)
 			cosm_scif_recv(cdev);
 
 		/* The peer endpoint is closed or this endpoint disconnected */
-		if (pollepd.revents & POLLHUP) {
+		if (pollepd.revents & EPOLLHUP) {
 			cosm_scif_close(cdev);
 			break;
 		}
diff --git a/drivers/misc/mic/cosm_client/cosm_scif_client.c b/drivers/misc/mic/cosm_client/cosm_scif_client.c
index aa530fcceaa99..beafc0da40278 100644
--- a/drivers/misc/mic/cosm_client/cosm_scif_client.c
+++ b/drivers/misc/mic/cosm_client/cosm_scif_client.c
@@ -160,7 +160,7 @@ static int cosm_scif_client(void *unused)
 
 	while (!kthread_should_stop()) {
 		pollepd.epd = client_epd;
-		pollepd.events = POLLIN;
+		pollepd.events = EPOLLIN;
 
 		rc = scif_poll(&pollepd, 1, COSM_HEARTBEAT_SEND_MSEC);
 		if (rc < 0) {
@@ -171,7 +171,7 @@ static int cosm_scif_client(void *unused)
 			continue;
 		}
 
-		if (pollepd.revents & POLLIN)
+		if (pollepd.revents & EPOLLIN)
 			cosm_client_recv();
 
 		msg.id = COSM_MSG_HEARTBEAT;
diff --git a/drivers/misc/mic/scif/scif_api.c b/drivers/misc/mic/scif/scif_api.c
index 8a3e48ec37dd6..7b2dddcdd46d5 100644
--- a/drivers/misc/mic/scif/scif_api.c
+++ b/drivers/misc/mic/scif/scif_api.c
@@ -1328,7 +1328,7 @@ __scif_pollfd(struct file *f, poll_table *wait, struct scif_endpt *ep)
 			if (ep->state == SCIFEP_CONNECTED ||
 			    ep->state == SCIFEP_DISCONNECTED ||
 			    ep->conn_err)
-				mask |= POLLOUT;
+				mask |= EPOLLOUT;
 			goto exit;
 		}
 	}
@@ -1338,34 +1338,34 @@ __scif_pollfd(struct file *f, poll_table *wait, struct scif_endpt *ep)
 		_scif_poll_wait(f, &ep->conwq, wait, ep);
 		if (ep->state == SCIFEP_LISTENING) {
 			if (ep->conreqcnt)
-				mask |= POLLIN;
+				mask |= EPOLLIN;
 			goto exit;
 		}
 	}
 
 	/* Endpoint is connected or disconnected */
 	if (ep->state == SCIFEP_CONNECTED || ep->state == SCIFEP_DISCONNECTED) {
-		if (poll_requested_events(wait) & POLLIN)
+		if (poll_requested_events(wait) & EPOLLIN)
 			_scif_poll_wait(f, &ep->recvwq, wait, ep);
-		if (poll_requested_events(wait) & POLLOUT)
+		if (poll_requested_events(wait) & EPOLLOUT)
 			_scif_poll_wait(f, &ep->sendwq, wait, ep);
 		if (ep->state == SCIFEP_CONNECTED ||
 		    ep->state == SCIFEP_DISCONNECTED) {
 			/* Data can be read without blocking */
 			if (scif_rb_count(&ep->qp_info.qp->inbound_q, 1))
-				mask |= POLLIN;
+				mask |= EPOLLIN;
 			/* Data can be written without blocking */
 			if (scif_rb_space(&ep->qp_info.qp->outbound_q))
-				mask |= POLLOUT;
-			/* Return POLLHUP if endpoint is disconnected */
+				mask |= EPOLLOUT;
+			/* Return EPOLLHUP if endpoint is disconnected */
 			if (ep->state == SCIFEP_DISCONNECTED)
-				mask |= POLLHUP;
+				mask |= EPOLLHUP;
 			goto exit;
 		}
 	}
 
-	/* Return POLLERR if the endpoint is in none of the above states */
-	mask |= POLLERR;
+	/* Return EPOLLERR if the endpoint is in none of the above states */
+	mask |= EPOLLERR;
 exit:
 	spin_unlock(&ep->lock);
 	return mask;
@@ -1398,10 +1398,10 @@ scif_poll(struct scif_pollepd *ufds, unsigned int nfds, long timeout_msecs)
 	pt = &table.pt;
 	while (1) {
 		for (i = 0; i < nfds; i++) {
-			pt->_key = ufds[i].events | POLLERR | POLLHUP;
+			pt->_key = ufds[i].events | EPOLLERR | EPOLLHUP;
 			mask = __scif_pollfd(ufds[i].epd->anon,
 					     pt, ufds[i].epd);
-			mask &= ufds[i].events | POLLERR | POLLHUP;
+			mask &= ufds[i].events | EPOLLERR | EPOLLHUP;
 			if (mask) {
 				count++;
 				pt->_qproc = NULL;
diff --git a/drivers/misc/mic/vop/vop_vringh.c b/drivers/misc/mic/vop/vop_vringh.c
index 01d1f2ba7bb82..cbc8ebcff5cfe 100644
--- a/drivers/misc/mic/vop/vop_vringh.c
+++ b/drivers/misc/mic/vop/vop_vringh.c
@@ -1010,7 +1010,7 @@ static long vop_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 }
 
 /*
- * We return POLLIN | POLLOUT from poll when new buffers are enqueued, and
+ * We return EPOLLIN | EPOLLOUT from poll when new buffers are enqueued, and
  * not when previously enqueued buffers may be available. This means that
  * in the card->host (TX) path, when userspace is unblocked by poll it
  * must drain all available descriptors or it can stall.
@@ -1022,15 +1022,15 @@ static __poll_t vop_poll(struct file *f, poll_table *wait)
 
 	mutex_lock(&vdev->vdev_mutex);
 	if (vop_vdev_inited(vdev)) {
-		mask = POLLERR;
+		mask = EPOLLERR;
 		goto done;
 	}
 	poll_wait(f, &vdev->waitq, wait);
 	if (vop_vdev_inited(vdev)) {
-		mask = POLLERR;
+		mask = EPOLLERR;
 	} else if (vdev->poll_wake) {
 		vdev->poll_wake = 0;
-		mask = POLLIN | POLLOUT;
+		mask = EPOLLIN | EPOLLOUT;
 	}
 done:
 	mutex_unlock(&vdev->vdev_mutex);
diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c
index c90c1a578d2f1..d9aa407db06a1 100644
--- a/drivers/misc/ocxl/file.c
+++ b/drivers/misc/ocxl/file.c
@@ -215,9 +215,9 @@ static unsigned int afu_poll(struct file *file, struct poll_table_struct *wait)
 	mutex_unlock(&ctx->status_mutex);
 
 	if (afu_events_pending(ctx))
-		mask = POLLIN | POLLRDNORM;
+		mask = EPOLLIN | EPOLLRDNORM;
 	else if (closed)
-		mask = POLLERR;
+		mask = EPOLLERR;
 
 	return mask;
 }
diff --git a/drivers/misc/phantom.c b/drivers/misc/phantom.c
index 8fa68cf308e02..b084245f6238e 100644
--- a/drivers/misc/phantom.c
+++ b/drivers/misc/phantom.c
@@ -265,9 +265,9 @@ static __poll_t phantom_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &dev->wait, wait);
 
 	if (!(dev->status & PHB_RUNNING))
-		mask = POLLERR;
+		mask = EPOLLERR;
 	else if (atomic_read(&dev->counter))
-		mask = POLLIN | POLLRDNORM;
+		mask = EPOLLIN | EPOLLRDNORM;
 
 	pr_debug("phantom_poll end: %x/%d\n", mask, atomic_read(&dev->counter));
 
diff --git a/drivers/misc/vmw_vmci/vmci_host.c b/drivers/misc/vmw_vmci/vmci_host.c
index 6640e76515339..83e0c95d20a47 100644
--- a/drivers/misc/vmw_vmci/vmci_host.c
+++ b/drivers/misc/vmw_vmci/vmci_host.c
@@ -182,7 +182,7 @@ static __poll_t vmci_host_poll(struct file *filp, poll_table *wait)
 		if (context->pending_datagrams > 0 ||
 		    vmci_handle_arr_get_size(
 				context->pending_doorbell_array) > 0) {
-			mask = POLLIN;
+			mask = EPOLLIN;
 		}
 		spin_unlock(&context->lock);
 	}
diff --git a/drivers/net/ieee802154/ca8210.c b/drivers/net/ieee802154/ca8210.c
index e412dfdda7ddd..377af43b81b3d 100644
--- a/drivers/net/ieee802154/ca8210.c
+++ b/drivers/net/ieee802154/ca8210.c
@@ -2648,11 +2648,11 @@ static __poll_t ca8210_test_int_poll(
 
 	poll_wait(filp, &priv->test.readq, ptable);
 	if (!kfifo_is_empty(&priv->test.up_fifo))
-		return_flags |= (POLLIN | POLLRDNORM);
+		return_flags |= (EPOLLIN | EPOLLRDNORM);
 	if (wait_event_interruptible(
 		priv->test.readq,
 		!kfifo_is_empty(&priv->test.up_fifo))) {
-		return POLLERR;
+		return EPOLLERR;
 	}
 	return return_flags;
 }
diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index ef6b2126b23a2..255a5def56e94 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -539,11 +539,11 @@ static __poll_t ppp_poll(struct file *file, poll_table *wait)
 	if (!pf)
 		return 0;
 	poll_wait(file, &pf->rwait, wait);
-	mask = POLLOUT | POLLWRNORM;
+	mask = EPOLLOUT | EPOLLWRNORM;
 	if (skb_peek(&pf->rq))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	if (pf->dead)
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 	else if (pf->kind == INTERFACE) {
 		/* see comment in ppp_read */
 		struct ppp *ppp = PF_TO_PPP(pf);
@@ -551,7 +551,7 @@ static __poll_t ppp_poll(struct file *file, poll_table *wait)
 		ppp_recv_lock(ppp);
 		if (ppp->n_channels == 0 &&
 		    (ppp->flags & SC_LOOP_TRAFFIC) == 0)
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 		ppp_recv_unlock(ppp);
 	}
 
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 0a5ed004781cb..9b6cb780affec 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -377,7 +377,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
 	}
 
 wake_up:
-	wake_up_interruptible_poll(sk_sleep(&q->sk), POLLIN | POLLRDNORM | POLLRDBAND);
+	wake_up_interruptible_poll(sk_sleep(&q->sk), EPOLLIN | EPOLLRDNORM | EPOLLRDBAND);
 	return RX_HANDLER_CONSUMED;
 
 drop:
@@ -487,7 +487,7 @@ static void tap_sock_write_space(struct sock *sk)
 
 	wqueue = sk_sleep(sk);
 	if (wqueue && waitqueue_active(wqueue))
-		wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
+		wake_up_interruptible_poll(wqueue, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
 }
 
 static void tap_sock_destruct(struct sock *sk)
@@ -572,7 +572,7 @@ static int tap_release(struct inode *inode, struct file *file)
 static __poll_t tap_poll(struct file *file, poll_table *wait)
 {
 	struct tap_queue *q = file->private_data;
-	__poll_t mask = POLLERR;
+	__poll_t mask = EPOLLERR;
 
 	if (!q)
 		goto out;
@@ -581,12 +581,12 @@ static __poll_t tap_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &q->wq.wait, wait);
 
 	if (!ptr_ring_empty(&q->ring))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	if (sock_writeable(&q->sk) ||
 	    (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &q->sock.flags) &&
 	     sock_writeable(&q->sk)))
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 
 out:
 	return mask;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 17e496b88f812..81e6cc951e7fc 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1437,7 +1437,7 @@ static __poll_t tun_chr_poll(struct file *file, poll_table *wait)
 	__poll_t mask = 0;
 
 	if (!tun)
-		return POLLERR;
+		return EPOLLERR;
 
 	sk = tfile->socket.sk;
 
@@ -1446,16 +1446,16 @@ static __poll_t tun_chr_poll(struct file *file, poll_table *wait)
 	poll_wait(file, sk_sleep(sk), wait);
 
 	if (!ptr_ring_empty(&tfile->tx_ring))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	if (tun->dev->flags & IFF_UP &&
 	    (sock_writeable(sk) ||
 	     (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
 	      sock_writeable(sk))))
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 
 	if (tun->dev->reg_state != NETREG_REGISTERED)
-		mask = POLLERR;
+		mask = EPOLLERR;
 
 	tun_put(tun);
 	return mask;
@@ -2310,8 +2310,8 @@ static void tun_sock_write_space(struct sock *sk)
 
 	wqueue = sk_sleep(sk);
 	if (wqueue && waitqueue_active(wqueue))
-		wake_up_interruptible_sync_poll(wqueue, POLLOUT |
-						POLLWRNORM | POLLWRBAND);
+		wake_up_interruptible_sync_poll(wqueue, EPOLLOUT |
+						EPOLLWRNORM | EPOLLWRBAND);
 
 	tfile = container_of(sk, struct tun_file, sk);
 	kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00debug.c b/drivers/net/wireless/ralink/rt2x00/rt2x00debug.c
index 72c55d1f8903e..ac2572943ed08 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2x00debug.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2x00debug.c
@@ -309,7 +309,7 @@ static __poll_t rt2x00debug_poll_queue_dump(struct file *file,
 	poll_wait(file, &intf->frame_dump_waitqueue, wait);
 
 	if (!skb_queue_empty(&intf->frame_dump_skbqueue))
-		return POLLOUT | POLLWRNORM;
+		return EPOLLOUT | EPOLLWRNORM;
 
 	return 0;
 }
diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index a60c0ab7883d1..47cd0c037433d 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -511,15 +511,15 @@ static __poll_t switchtec_dev_poll(struct file *filp, poll_table *wait)
 	poll_wait(filp, &stdev->event_wq, wait);
 
 	if (lock_mutex_and_test_alive(stdev))
-		return POLLIN | POLLRDHUP | POLLOUT | POLLERR | POLLHUP;
+		return EPOLLIN | EPOLLRDHUP | EPOLLOUT | EPOLLERR | EPOLLHUP;
 
 	mutex_unlock(&stdev->mrpc_mutex);
 
 	if (try_wait_for_completion(&stuser->comp))
-		ret |= POLLIN | POLLRDNORM;
+		ret |= EPOLLIN | EPOLLRDNORM;
 
 	if (stuser->event_cnt != atomic_read(&stdev->event_cnt))
-		ret |= POLLPRI | POLLRDBAND;
+		ret |= EPOLLPRI | EPOLLRDBAND;
 
 	return ret;
 }
diff --git a/drivers/platform/chrome/cros_ec_debugfs.c b/drivers/platform/chrome/cros_ec_debugfs.c
index 5473e602f7e0f..0e88e18362c10 100644
--- a/drivers/platform/chrome/cros_ec_debugfs.c
+++ b/drivers/platform/chrome/cros_ec_debugfs.c
@@ -200,7 +200,7 @@ static __poll_t cros_ec_console_log_poll(struct file *file,
 	if (CIRC_CNT(debug_info->log_buffer.head,
 		     debug_info->log_buffer.tail,
 		     LOG_SIZE))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	mutex_unlock(&debug_info->log_mutex);
 
 	return mask;
diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c
index 999f1152655ab..3e32a4c14d5fc 100644
--- a/drivers/platform/goldfish/goldfish_pipe.c
+++ b/drivers/platform/goldfish/goldfish_pipe.c
@@ -549,13 +549,13 @@ static __poll_t goldfish_pipe_poll(struct file *filp, poll_table *wait)
 		return -ERESTARTSYS;
 
 	if (status & PIPE_POLL_IN)
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	if (status & PIPE_POLL_OUT)
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 	if (status & PIPE_POLL_HUP)
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 	if (test_bit(BIT_CLOSED_ON_HOST, &pipe->flags))
-		mask |= POLLERR;
+		mask |= EPOLLERR;
 
 	return mask;
 }
diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c
index a4fabf9d75f33..b205b037fd61e 100644
--- a/drivers/platform/x86/sony-laptop.c
+++ b/drivers/platform/x86/sony-laptop.c
@@ -4128,7 +4128,7 @@ static __poll_t sonypi_misc_poll(struct file *file, poll_table *wait)
 {
 	poll_wait(file, &sonypi_compat.fifo_proc_list, wait);
 	if (kfifo_len(&sonypi_compat.fifo))
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	return 0;
 }
 
diff --git a/drivers/pps/pps.c b/drivers/pps/pps.c
index 1d42385b1aa55..8febacb8fc54d 100644
--- a/drivers/pps/pps.c
+++ b/drivers/pps/pps.c
@@ -55,7 +55,7 @@ static __poll_t pps_cdev_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, &pps->queue, wait);
 
-	return POLLIN | POLLRDNORM;
+	return EPOLLIN | EPOLLRDNORM;
 }
 
 static int pps_cdev_fasync(int fd, struct file *file, int on)
diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c
index a593b4cf47bf2..767c485af59b2 100644
--- a/drivers/ptp/ptp_chardev.c
+++ b/drivers/ptp/ptp_chardev.c
@@ -286,7 +286,7 @@ __poll_t ptp_poll(struct posix_clock *pc, struct file *fp, poll_table *wait)
 
 	poll_wait(fp, &ptp->tsev_wq, wait);
 
-	return queue_cnt(&ptp->tsevq) ? POLLIN : 0;
+	return queue_cnt(&ptp->tsevq) ? EPOLLIN : 0;
 }
 
 #define EXTTS_BUFSIZE (PTP_BUF_TIMESTAMPS * sizeof(struct ptp_extts_event))
diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c
index 6092b3a5978e9..cfb54e01d758f 100644
--- a/drivers/rapidio/devices/rio_mport_cdev.c
+++ b/drivers/rapidio/devices/rio_mport_cdev.c
@@ -2325,7 +2325,7 @@ static __poll_t mport_cdev_poll(struct file *filp, poll_table *wait)
 
 	poll_wait(filp, &priv->event_rx_wait, wait);
 	if (kfifo_len(&priv->event_fifo))
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 
 	return 0;
 }
diff --git a/drivers/rpmsg/qcom_smd.c b/drivers/rpmsg/qcom_smd.c
index 8428eba8cb736..92d0c6a7a8372 100644
--- a/drivers/rpmsg/qcom_smd.c
+++ b/drivers/rpmsg/qcom_smd.c
@@ -967,7 +967,7 @@ static __poll_t qcom_smd_poll(struct rpmsg_endpoint *ept,
 	poll_wait(filp, &channel->fblockread_event, wait);
 
 	if (qcom_smd_get_tx_avail(channel) > 20)
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 
 	return mask;
 }
diff --git a/drivers/rpmsg/rpmsg_char.c b/drivers/rpmsg/rpmsg_char.c
index e622fcda30fab..64b6de9763ee2 100644
--- a/drivers/rpmsg/rpmsg_char.c
+++ b/drivers/rpmsg/rpmsg_char.c
@@ -262,12 +262,12 @@ static __poll_t rpmsg_eptdev_poll(struct file *filp, poll_table *wait)
 	__poll_t mask = 0;
 
 	if (!eptdev->ept)
-		return POLLERR;
+		return EPOLLERR;
 
 	poll_wait(filp, &eptdev->readq, wait);
 
 	if (!skb_queue_empty(&eptdev->queue))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	mask |= rpmsg_poll(eptdev->ept, filp, wait);
 
diff --git a/drivers/rtc/rtc-dev.c b/drivers/rtc/rtc-dev.c
index 5a7b30d0773bc..efa221e8bc22d 100644
--- a/drivers/rtc/rtc-dev.c
+++ b/drivers/rtc/rtc-dev.c
@@ -203,7 +203,7 @@ static __poll_t rtc_dev_poll(struct file *file, poll_table *wait)
 
 	data = rtc->irq_data;
 
-	return (data != 0) ? (POLLIN | POLLRDNORM) : 0;
+	return (data != 0) ? (EPOLLIN | EPOLLRDNORM) : 0;
 }
 
 static long rtc_dev_ioctl(struct file *file,
diff --git a/drivers/s390/block/dasd_eer.c b/drivers/s390/block/dasd_eer.c
index 0c075d100252b..fb2c3599d95c2 100644
--- a/drivers/s390/block/dasd_eer.c
+++ b/drivers/s390/block/dasd_eer.c
@@ -671,7 +671,7 @@ static __poll_t dasd_eer_poll(struct file *filp, poll_table *ptable)
 	poll_wait(filp, &dasd_eer_read_wait_queue, ptable);
 	spin_lock_irqsave(&bufferlock, flags);
 	if (eerb->head != eerb->tail)
-		mask = POLLIN | POLLRDNORM ;
+		mask = EPOLLIN | EPOLLRDNORM ;
 	else
 		mask = 0;
 	spin_unlock_irqrestore(&bufferlock, flags);
diff --git a/drivers/s390/char/monreader.c b/drivers/s390/char/monreader.c
index 956f662908a65..7bc616b253f16 100644
--- a/drivers/s390/char/monreader.c
+++ b/drivers/s390/char/monreader.c
@@ -435,9 +435,9 @@ static __poll_t mon_poll(struct file *filp, struct poll_table_struct *p)
 
 	poll_wait(filp, &mon_read_wait_queue, p);
 	if (unlikely(atomic_read(&monpriv->iucv_severed)))
-		return POLLERR;
+		return EPOLLERR;
 	if (atomic_read(&monpriv->read_ready))
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	return 0;
 }
 
diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c
index 2791141bd0356..a71ee67df0847 100644
--- a/drivers/scsi/megaraid/megaraid_sas_base.c
+++ b/drivers/scsi/megaraid/megaraid_sas_base.c
@@ -7041,7 +7041,7 @@ static __poll_t megasas_mgmt_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &megasas_poll_wait, wait);
 	spin_lock_irqsave(&poll_aen_lock, flags);
 	if (megasas_poll_wait_aen)
-		mask = (POLLIN | POLLRDNORM);
+		mask = (EPOLLIN | EPOLLRDNORM);
 	else
 		mask = 0;
 	megasas_poll_wait_aen = 0;
diff --git a/drivers/scsi/mpt3sas/mpt3sas_ctl.c b/drivers/scsi/mpt3sas/mpt3sas_ctl.c
index 9cddc3074cd15..523971aeb4c17 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_ctl.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_ctl.c
@@ -546,7 +546,7 @@ _ctl_poll(struct file *filep, poll_table *wait)
 	list_for_each_entry(ioc, &mpt3sas_ioc_list, list) {
 		if (ioc->aen_event_read_flag) {
 			spin_unlock(&gioc_lock);
-			return POLLIN | POLLRDNORM;
+			return EPOLLIN | EPOLLRDNORM;
 		}
 	}
 	spin_unlock(&gioc_lock);
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 0c434453aab38..c198b96368dd6 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1152,27 +1152,27 @@ sg_poll(struct file *filp, poll_table * wait)
 
 	sfp = filp->private_data;
 	if (!sfp)
-		return POLLERR;
+		return EPOLLERR;
 	sdp = sfp->parentdp;
 	if (!sdp)
-		return POLLERR;
+		return EPOLLERR;
 	poll_wait(filp, &sfp->read_wait, wait);
 	read_lock_irqsave(&sfp->rq_list_lock, iflags);
 	list_for_each_entry(srp, &sfp->rq_list, entry) {
 		/* if any read waiting, flag it */
 		if ((0 == res) && (1 == srp->done) && (!srp->sg_io_owned))
-			res = POLLIN | POLLRDNORM;
+			res = EPOLLIN | EPOLLRDNORM;
 		++count;
 	}
 	read_unlock_irqrestore(&sfp->rq_list_lock, iflags);
 
 	if (atomic_read(&sdp->detaching))
-		res |= POLLHUP;
+		res |= EPOLLHUP;
 	else if (!sfp->cmd_q) {
 		if (0 == count)
-			res |= POLLOUT | POLLWRNORM;
+			res |= EPOLLOUT | EPOLLWRNORM;
 	} else if (count < SG_MAX_QUEUE)
-		res |= POLLOUT | POLLWRNORM;
+		res |= EPOLLOUT | EPOLLWRNORM;
 	SCSI_LOG_TIMEOUT(3, sg_printk(KERN_INFO, sdp,
 				      "sg_poll: res=0x%x\n", (__force u32) res));
 	return res;
diff --git a/drivers/staging/comedi/comedi_fops.c b/drivers/staging/comedi/comedi_fops.c
index ef733847eebed..c13772a0df58a 100644
--- a/drivers/staging/comedi/comedi_fops.c
+++ b/drivers/staging/comedi/comedi_fops.c
@@ -2288,7 +2288,7 @@ static __poll_t comedi_poll(struct file *file, poll_table *wait)
 		if (s->busy != file || !comedi_is_subdevice_running(s) ||
 		    (s->async->cmd.flags & CMDF_WRITE) ||
 		    comedi_buf_read_n_available(s) > 0)
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 	}
 
 	s = comedi_file_write_subdevice(file);
@@ -2300,7 +2300,7 @@ static __poll_t comedi_poll(struct file *file, poll_table *wait)
 		if (s->busy != file || !comedi_is_subdevice_running(s) ||
 		    !(s->async->cmd.flags & CMDF_WRITE) ||
 		    comedi_buf_write_n_available(s) >= bps)
-			mask |= POLLOUT | POLLWRNORM;
+			mask |= EPOLLOUT | EPOLLWRNORM;
 	}
 
 done:
diff --git a/drivers/staging/comedi/drivers/serial2002.c b/drivers/staging/comedi/drivers/serial2002.c
index ab69eeb2c1f18..b3f3b4a201af1 100644
--- a/drivers/staging/comedi/drivers/serial2002.c
+++ b/drivers/staging/comedi/drivers/serial2002.c
@@ -114,8 +114,8 @@ static void serial2002_tty_read_poll_wait(struct file *f, int timeout)
 		__poll_t mask;
 
 		mask = f->f_op->poll(f, &table.pt);
-		if (mask & (POLLRDNORM | POLLRDBAND | POLLIN |
-			    POLLHUP | POLLERR)) {
+		if (mask & (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN |
+			    EPOLLHUP | EPOLLERR)) {
 			break;
 		}
 		now = ktime_get();
diff --git a/drivers/staging/fwserial/fwserial.c b/drivers/staging/fwserial/fwserial.c
index 1993b03a6f2d6..e8bfe5520bc79 100644
--- a/drivers/staging/fwserial/fwserial.c
+++ b/drivers/staging/fwserial/fwserial.c
@@ -37,7 +37,7 @@ module_param_named(loop, create_loop_dev, bool, 0644);
 /*
  * Threshold below which the tty is woken for writing
  * - should be equal to WAKEUP_CHARS in drivers/tty/n_tty.c because
- *   even if the writer is woken, n_tty_poll() won't set POLLOUT until
+ *   even if the writer is woken, n_tty_poll() won't set EPOLLOUT until
  *   our fifo is below this level
  */
 #define WAKEUP_CHARS             256
diff --git a/drivers/staging/greybus/tools/loopback_test.c b/drivers/staging/greybus/tools/loopback_test.c
index c51610ce24af8..b82e2befe9355 100644
--- a/drivers/staging/greybus/tools/loopback_test.c
+++ b/drivers/staging/greybus/tools/loopback_test.c
@@ -663,7 +663,7 @@ static int open_poll_files(struct loopback_test *t)
 			goto err;
 		}
 		read(t->fds[fds_idx].fd, &dummy, 1);
-		t->fds[fds_idx].events = POLLERR|POLLPRI;
+		t->fds[fds_idx].events = EPOLLERR|EPOLLPRI;
 		t->fds[fds_idx].revents = 0;
 		fds_idx++;
 	}
@@ -756,7 +756,7 @@ static int wait_for_complete(struct loopback_test *t)
 		}
 
 		for (i = 0; i < t->poll_count; i++) {
-			if (t->fds[i].revents & POLLPRI) {
+			if (t->fds[i].revents & EPOLLPRI) {
 				/* Dummy read to clear the event */
 				read(t->fds[i].fd, &dummy, 1);
 				number_of_events++;
diff --git a/drivers/staging/irda/net/af_irda.c b/drivers/staging/irda/net/af_irda.c
index f1d128b2dae9e..2f1e9ab3d6d0f 100644
--- a/drivers/staging/irda/net/af_irda.c
+++ b/drivers/staging/irda/net/af_irda.c
@@ -1749,16 +1749,16 @@ static __poll_t irda_poll(struct file * file, struct socket *sock,
 
 	/* Exceptional events? */
 	if (sk->sk_err)
-		mask |= POLLERR;
+		mask |= EPOLLERR;
 	if (sk->sk_shutdown & RCV_SHUTDOWN) {
 		pr_debug("%s(), POLLHUP\n", __func__);
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 	}
 
 	/* Readable? */
 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
 		pr_debug("Socket is readable\n");
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	}
 
 	/* Connection-based need to check for termination and startup */
@@ -1766,14 +1766,14 @@ static __poll_t irda_poll(struct file * file, struct socket *sock,
 	case SOCK_STREAM:
 		if (sk->sk_state == TCP_CLOSE) {
 			pr_debug("%s(), POLLHUP\n", __func__);
-			mask |= POLLHUP;
+			mask |= EPOLLHUP;
 		}
 
 		if (sk->sk_state == TCP_ESTABLISHED) {
 			if ((self->tx_flow == FLOW_START) &&
 			    sock_writeable(sk))
 			{
-				mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+				mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
 			}
 		}
 		break;
@@ -1781,12 +1781,12 @@ static __poll_t irda_poll(struct file * file, struct socket *sock,
 		if ((self->tx_flow == FLOW_START) &&
 		    sock_writeable(sk))
 		{
-			mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+			mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
 		}
 		break;
 	case SOCK_DGRAM:
 		if (sock_writeable(sk))
-			mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+			mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
 		break;
 	default:
 		break;
diff --git a/drivers/staging/irda/net/irnet/irnet_ppp.c b/drivers/staging/irda/net/irnet/irnet_ppp.c
index 75bf9e34311d5..c90a158af4b73 100644
--- a/drivers/staging/irda/net/irnet/irnet_ppp.c
+++ b/drivers/staging/irda/net/irnet/irnet_ppp.c
@@ -429,10 +429,10 @@ irnet_ctrl_poll(irnet_socket *	ap,
   DENTER(CTRL_TRACE, "(ap=0x%p)\n", ap);
 
   poll_wait(file, &irnet_events.rwait, wait);
-  mask = POLLOUT | POLLWRNORM;
+  mask = EPOLLOUT | EPOLLWRNORM;
   /* If there is unread events */
   if(ap->event_index != irnet_events.index)
-    mask |= POLLIN | POLLRDNORM;
+    mask |= EPOLLIN | EPOLLRDNORM;
 #ifdef INITIAL_DISCOVERY
   if(ap->disco_number != -1)
     {
@@ -441,7 +441,7 @@ irnet_ctrl_poll(irnet_socket *	ap,
 	irnet_get_discovery_log(ap);
       /* Recheck */
       if(ap->disco_number != -1)
-	mask |= POLLIN | POLLRDNORM;
+	mask |= EPOLLIN | EPOLLRDNORM;
     }
 #endif /* INITIAL_DISCOVERY */
 
@@ -618,7 +618,7 @@ dev_irnet_poll(struct file *	file,
   DENTER(FS_TRACE, "(file=0x%p, ap=0x%p)\n",
 	 file, ap);
 
-  mask = POLLOUT | POLLWRNORM;
+  mask = EPOLLOUT | EPOLLWRNORM;
   DABORT(ap == NULL, mask, FS_ERROR, "ap is NULL !!!\n");
 
   /* If we are connected to ppp_generic, let it handle the job */
diff --git a/drivers/staging/media/atomisp/pci/atomisp2/atomisp_fops.c b/drivers/staging/media/atomisp/pci/atomisp2/atomisp_fops.c
index 6657ebbe068a1..4f9f9dca5e6a1 100644
--- a/drivers/staging/media/atomisp/pci/atomisp2/atomisp_fops.c
+++ b/drivers/staging/media/atomisp/pci/atomisp2/atomisp_fops.c
@@ -1265,7 +1265,7 @@ static __poll_t atomisp_poll(struct file *file,
 	rt_mutex_lock(&isp->mutex);
 	if (pipe->capq.streaming != 1) {
 		rt_mutex_unlock(&isp->mutex);
-		return POLLERR;
+		return EPOLLERR;
 	}
 	rt_mutex_unlock(&isp->mutex);
 
diff --git a/drivers/staging/media/bcm2048/radio-bcm2048.c b/drivers/staging/media/bcm2048/radio-bcm2048.c
index 4ffff6f8b809c..06d1920150da3 100644
--- a/drivers/staging/media/bcm2048/radio-bcm2048.c
+++ b/drivers/staging/media/bcm2048/radio-bcm2048.c
@@ -2183,7 +2183,7 @@ static __poll_t bcm2048_fops_poll(struct file *file,
 	poll_wait(file, &bdev->read_queue, pts);
 
 	if (bdev->rds_data_available)
-		retval = POLLIN | POLLRDNORM;
+		retval = EPOLLIN | EPOLLRDNORM;
 
 	return retval;
 }
diff --git a/drivers/staging/most/cdev/cdev.c b/drivers/staging/most/cdev/cdev.c
index c183489c4a1c5..4d7fce8731fe6 100644
--- a/drivers/staging/most/cdev/cdev.c
+++ b/drivers/staging/most/cdev/cdev.c
@@ -292,10 +292,10 @@ static __poll_t comp_poll(struct file *filp, poll_table *wait)
 
 	if (c->cfg->direction == MOST_CH_RX) {
 		if (!kfifo_is_empty(&c->fifo))
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 	} else {
 		if (!kfifo_is_empty(&c->fifo) || ch_has_mbo(c))
-			mask |= POLLOUT | POLLWRNORM;
+			mask |= EPOLLOUT | EPOLLWRNORM;
 	}
 	return mask;
 }
diff --git a/drivers/staging/most/video/video.c b/drivers/staging/most/video/video.c
index ef23e8524b1e0..9d7e747519d9a 100644
--- a/drivers/staging/most/video/video.c
+++ b/drivers/staging/most/video/video.c
@@ -213,7 +213,7 @@ static __poll_t comp_vdev_poll(struct file *filp, poll_table *wait)
 	if (!data_ready(mdev))
 		poll_wait(filp, &mdev->wait_data, wait);
 	if (data_ready(mdev))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	return mask;
 }
diff --git a/drivers/staging/speakup/speakup_soft.c b/drivers/staging/speakup/speakup_soft.c
index 0e74d09e18ea3..0a1a7c259ab0a 100644
--- a/drivers/staging/speakup/speakup_soft.c
+++ b/drivers/staging/speakup/speakup_soft.c
@@ -325,7 +325,7 @@ static __poll_t softsynth_poll(struct file *fp, struct poll_table_struct *wait)
 
 	spin_lock_irqsave(&speakup_info.spinlock, flags);
 	if (!synth_buffer_empty() || speakup_info.flushing)
-		ret = POLLIN | POLLRDNORM;
+		ret = EPOLLIN | EPOLLRDNORM;
 	spin_unlock_irqrestore(&speakup_info.spinlock, flags);
 	return ret;
 }
diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c
index 3b3af7e0ce1c7..3b3e1f6632d71 100644
--- a/drivers/tty/n_gsm.c
+++ b/drivers/tty/n_gsm.c
@@ -2477,11 +2477,11 @@ static __poll_t gsmld_poll(struct tty_struct *tty, struct file *file,
 	poll_wait(file, &tty->read_wait, wait);
 	poll_wait(file, &tty->write_wait, wait);
 	if (tty_hung_up_p(file))
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 	if (!tty_is_writelocked(tty) && tty_write_room(tty) > 0)
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 	if (gsm->dead)
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 	return mask;
 }
 
diff --git a/drivers/tty/n_hdlc.c b/drivers/tty/n_hdlc.c
index 929434ebee506..dabb391909aad 100644
--- a/drivers/tty/n_hdlc.c
+++ b/drivers/tty/n_hdlc.c
@@ -814,14 +814,14 @@ static __poll_t n_hdlc_tty_poll(struct tty_struct *tty, struct file *filp,
 
 		/* set bits for operations that won't block */
 		if (!list_empty(&n_hdlc->rx_buf_list.list))
-			mask |= POLLIN | POLLRDNORM;	/* readable */
+			mask |= EPOLLIN | EPOLLRDNORM;	/* readable */
 		if (test_bit(TTY_OTHER_CLOSED, &tty->flags))
-			mask |= POLLHUP;
+			mask |= EPOLLHUP;
 		if (tty_hung_up_p(filp))
-			mask |= POLLHUP;
+			mask |= EPOLLHUP;
 		if (!tty_is_writelocked(tty) &&
 				!list_empty(&n_hdlc->tx_free_buf_list.list))
-			mask |= POLLOUT | POLLWRNORM;	/* writable */
+			mask |= EPOLLOUT | EPOLLWRNORM;	/* writable */
 	}
 	return mask;
 }	/* end of n_hdlc_tty_poll() */
diff --git a/drivers/tty/n_r3964.c b/drivers/tty/n_r3964.c
index e81d3db8ad632..dbf1ab36758eb 100644
--- a/drivers/tty/n_r3964.c
+++ b/drivers/tty/n_r3964.c
@@ -1223,7 +1223,7 @@ static __poll_t r3964_poll(struct tty_struct *tty, struct file *file,
 	struct r3964_client_info *pClient;
 	struct r3964_message *pMsg = NULL;
 	unsigned long flags;
-	__poll_t result = POLLOUT;
+	__poll_t result = EPOLLOUT;
 
 	TRACE_L("POLL");
 
@@ -1234,7 +1234,7 @@ static __poll_t r3964_poll(struct tty_struct *tty, struct file *file,
 		pMsg = pClient->first_msg;
 		spin_unlock_irqrestore(&pInfo->lock, flags);
 		if (pMsg)
-			result |= POLLIN | POLLRDNORM;
+			result |= EPOLLIN | EPOLLRDNORM;
 	} else {
 		result = -EINVAL;
 	}
diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 478a9b40fd039..5c0e59e8fe46b 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -1344,7 +1344,7 @@ n_tty_receive_char_special(struct tty_struct *tty, unsigned char c)
 			put_tty_queue(c, ldata);
 			smp_store_release(&ldata->canon_head, ldata->read_head);
 			kill_fasync(&tty->fasync, SIGIO, POLL_IN);
-			wake_up_interruptible_poll(&tty->read_wait, POLLIN);
+			wake_up_interruptible_poll(&tty->read_wait, EPOLLIN);
 			return 0;
 		}
 	}
@@ -1625,7 +1625,7 @@ static void __receive_buf(struct tty_struct *tty, const unsigned char *cp,
 
 	if (read_cnt(ldata)) {
 		kill_fasync(&tty->fasync, SIGIO, POLL_IN);
-		wake_up_interruptible_poll(&tty->read_wait, POLLIN);
+		wake_up_interruptible_poll(&tty->read_wait, EPOLLIN);
 	}
 }
 
@@ -2376,22 +2376,22 @@ static __poll_t n_tty_poll(struct tty_struct *tty, struct file *file,
 	poll_wait(file, &tty->read_wait, wait);
 	poll_wait(file, &tty->write_wait, wait);
 	if (input_available_p(tty, 1))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	else {
 		tty_buffer_flush_work(tty->port);
 		if (input_available_p(tty, 1))
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 	}
 	if (tty->packet && tty->link->ctrl_status)
-		mask |= POLLPRI | POLLIN | POLLRDNORM;
+		mask |= EPOLLPRI | EPOLLIN | EPOLLRDNORM;
 	if (test_bit(TTY_OTHER_CLOSED, &tty->flags))
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 	if (tty_hung_up_p(file))
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 	if (tty->ops->write && !tty_is_writelocked(tty) &&
 			tty_chars_in_buffer(tty) < WAKEUP_CHARS &&
 			tty_write_room(tty) > 0)
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 	return mask;
 }
 
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index 64338442050ef..6c7151edd7155 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -344,7 +344,7 @@ static void pty_start(struct tty_struct *tty)
 		tty->ctrl_status &= ~TIOCPKT_STOP;
 		tty->ctrl_status |= TIOCPKT_START;
 		spin_unlock_irqrestore(&tty->ctrl_lock, flags);
-		wake_up_interruptible_poll(&tty->link->read_wait, POLLIN);
+		wake_up_interruptible_poll(&tty->link->read_wait, EPOLLIN);
 	}
 }
 
@@ -357,7 +357,7 @@ static void pty_stop(struct tty_struct *tty)
 		tty->ctrl_status &= ~TIOCPKT_START;
 		tty->ctrl_status |= TIOCPKT_STOP;
 		spin_unlock_irqrestore(&tty->ctrl_lock, flags);
-		wake_up_interruptible_poll(&tty->link->read_wait, POLLIN);
+		wake_up_interruptible_poll(&tty->link->read_wait, EPOLLIN);
 	}
 }
 
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 6a89835453d3b..eb9133b472f48 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -445,7 +445,7 @@ static ssize_t hung_up_tty_write(struct file *file, const char __user *buf,
 /* No kernel lock held - none needed ;) */
 static __poll_t hung_up_tty_poll(struct file *filp, poll_table *wait)
 {
-	return POLLIN | POLLOUT | POLLERR | POLLHUP | POLLRDNORM | POLLWRNORM;
+	return EPOLLIN | EPOLLOUT | EPOLLERR | EPOLLHUP | EPOLLRDNORM | EPOLLWRNORM;
 }
 
 static long hung_up_tty_ioctl(struct file *file, unsigned int cmd,
@@ -533,7 +533,7 @@ void tty_wakeup(struct tty_struct *tty)
 			tty_ldisc_deref(ld);
 		}
 	}
-	wake_up_interruptible_poll(&tty->write_wait, POLLOUT);
+	wake_up_interruptible_poll(&tty->write_wait, EPOLLOUT);
 }
 
 EXPORT_SYMBOL_GPL(tty_wakeup);
@@ -867,7 +867,7 @@ static ssize_t tty_read(struct file *file, char __user *buf, size_t count,
 static void tty_write_unlock(struct tty_struct *tty)
 {
 	mutex_unlock(&tty->atomic_write_lock);
-	wake_up_interruptible_poll(&tty->write_wait, POLLOUT);
+	wake_up_interruptible_poll(&tty->write_wait, EPOLLOUT);
 }
 
 static int tty_write_lock(struct tty_struct *tty, int ndelay)
@@ -1667,21 +1667,21 @@ int tty_release(struct inode *inode, struct file *filp)
 
 		if (tty->count <= 1) {
 			if (waitqueue_active(&tty->read_wait)) {
-				wake_up_poll(&tty->read_wait, POLLIN);
+				wake_up_poll(&tty->read_wait, EPOLLIN);
 				do_sleep++;
 			}
 			if (waitqueue_active(&tty->write_wait)) {
-				wake_up_poll(&tty->write_wait, POLLOUT);
+				wake_up_poll(&tty->write_wait, EPOLLOUT);
 				do_sleep++;
 			}
 		}
 		if (o_tty && o_tty->count <= 1) {
 			if (waitqueue_active(&o_tty->read_wait)) {
-				wake_up_poll(&o_tty->read_wait, POLLIN);
+				wake_up_poll(&o_tty->read_wait, EPOLLIN);
 				do_sleep++;
 			}
 			if (waitqueue_active(&o_tty->write_wait)) {
-				wake_up_poll(&o_tty->write_wait, POLLOUT);
+				wake_up_poll(&o_tty->write_wait, EPOLLOUT);
 				do_sleep++;
 			}
 		}
diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c
index 4e7946c0484bf..050f4d6508917 100644
--- a/drivers/tty/tty_ldisc.c
+++ b/drivers/tty/tty_ldisc.c
@@ -735,8 +735,8 @@ void tty_ldisc_hangup(struct tty_struct *tty, bool reinit)
 		tty_ldisc_deref(ld);
 	}
 
-	wake_up_interruptible_poll(&tty->write_wait, POLLOUT);
-	wake_up_interruptible_poll(&tty->read_wait, POLLIN);
+	wake_up_interruptible_poll(&tty->write_wait, EPOLLOUT);
+	wake_up_interruptible_poll(&tty->read_wait, EPOLLIN);
 
 	/*
 	 * Shutdown the current line discipline, and reset it to
diff --git a/drivers/tty/vt/vc_screen.c b/drivers/tty/vt/vc_screen.c
index 3e64ccd0040f8..e4a66e1fd05fb 100644
--- a/drivers/tty/vt/vc_screen.c
+++ b/drivers/tty/vt/vc_screen.c
@@ -563,7 +563,7 @@ static __poll_t
 vcs_poll(struct file *file, poll_table *wait)
 {
 	struct vcs_poll_data *poll = vcs_poll_data_get(file);
-	__poll_t ret = DEFAULT_POLLMASK|POLLERR|POLLPRI;
+	__poll_t ret = DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
 
 	if (poll) {
 		poll_wait(file, &poll->waitq, wait);
diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c
index 85bc1aaea4a42..fd4848392e0dd 100644
--- a/drivers/uio/uio.c
+++ b/drivers/uio/uio.c
@@ -506,7 +506,7 @@ static __poll_t uio_poll(struct file *filep, poll_table *wait)
 
 	poll_wait(filep, &idev->wait, wait);
 	if (listener->event_count != atomic_read(&idev->event))
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	return 0;
 }
 
diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c
index 9627ea6ec3aea..a0d284ef3f40a 100644
--- a/drivers/usb/class/cdc-wdm.c
+++ b/drivers/usb/class/cdc-wdm.c
@@ -603,16 +603,16 @@ static __poll_t wdm_poll(struct file *file, struct poll_table_struct *wait)
 
 	spin_lock_irqsave(&desc->iuspin, flags);
 	if (test_bit(WDM_DISCONNECTING, &desc->flags)) {
-		mask = POLLHUP | POLLERR;
+		mask = EPOLLHUP | EPOLLERR;
 		spin_unlock_irqrestore(&desc->iuspin, flags);
 		goto desc_out;
 	}
 	if (test_bit(WDM_READ, &desc->flags))
-		mask = POLLIN | POLLRDNORM;
+		mask = EPOLLIN | EPOLLRDNORM;
 	if (desc->rerr || desc->werr)
-		mask |= POLLERR;
+		mask |= EPOLLERR;
 	if (!test_bit(WDM_IN_USE, &desc->flags))
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 	spin_unlock_irqrestore(&desc->iuspin, flags);
 
 	poll_wait(file, &desc->wait, wait);
diff --git a/drivers/usb/class/usblp.c b/drivers/usb/class/usblp.c
index 425247b7f7281..d058d7a31e7c3 100644
--- a/drivers/usb/class/usblp.c
+++ b/drivers/usb/class/usblp.c
@@ -479,8 +479,8 @@ static __poll_t usblp_poll(struct file *file, struct poll_table_struct *wait)
 	poll_wait(file, &usblp->rwait, wait);
 	poll_wait(file, &usblp->wwait, wait);
 	spin_lock_irqsave(&usblp->lock, flags);
-	ret = ((usblp->bidir && usblp->rcomplete) ? POLLIN  | POLLRDNORM : 0) |
-	   ((usblp->no_paper || usblp->wcomplete) ? POLLOUT | POLLWRNORM : 0);
+	ret = ((usblp->bidir && usblp->rcomplete) ? EPOLLIN  | EPOLLRDNORM : 0) |
+	   ((usblp->no_paper || usblp->wcomplete) ? EPOLLOUT | EPOLLWRNORM : 0);
 	spin_unlock_irqrestore(&usblp->lock, flags);
 	return ret;
 }
diff --git a/drivers/usb/class/usbtmc.c b/drivers/usb/class/usbtmc.c
index 7ea67a55be103..bdb1de0c0cef6 100644
--- a/drivers/usb/class/usbtmc.c
+++ b/drivers/usb/class/usbtmc.c
@@ -1265,13 +1265,13 @@ static __poll_t usbtmc_poll(struct file *file, poll_table *wait)
 	mutex_lock(&data->io_mutex);
 
 	if (data->zombie) {
-		mask = POLLHUP | POLLERR;
+		mask = EPOLLHUP | EPOLLERR;
 		goto no_poll;
 	}
 
 	poll_wait(file, &data->waitq, wait);
 
-	mask = (atomic_read(&data->srq_asserted)) ? POLLIN | POLLRDNORM : 0;
+	mask = (atomic_read(&data->srq_asserted)) ? EPOLLIN | EPOLLRDNORM : 0;
 
 no_poll:
 	mutex_unlock(&data->io_mutex);
diff --git a/drivers/usb/core/devices.c b/drivers/usb/core/devices.c
index e2cec448779e4..3de3c750b5f6e 100644
--- a/drivers/usb/core/devices.c
+++ b/drivers/usb/core/devices.c
@@ -632,7 +632,7 @@ static __poll_t usb_device_poll(struct file *file,
 	event_count = atomic_read(&device_event.count);
 	if (file->f_version != event_count) {
 		file->f_version = event_count;
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	}
 
 	return 0;
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index bf00166cbee01..d526595bc959c 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -2578,11 +2578,11 @@ static __poll_t usbdev_poll(struct file *file,
 
 	poll_wait(file, &ps->wait, wait);
 	if (file->f_mode & FMODE_WRITE && !list_empty(&ps->async_completed))
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 	if (!connected(ps))
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 	if (list_empty(&ps->list))
-		mask |= POLLERR;
+		mask |= EPOLLERR;
 	return mask;
 }
 
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 67564725e3710..8f2cf3baa19c1 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -644,7 +644,7 @@ static long ffs_ep0_ioctl(struct file *file, unsigned code, unsigned long value)
 static __poll_t ffs_ep0_poll(struct file *file, poll_table *wait)
 {
 	struct ffs_data *ffs = file->private_data;
-	__poll_t mask = POLLWRNORM;
+	__poll_t mask = EPOLLWRNORM;
 	int ret;
 
 	poll_wait(file, &ffs->ev.waitq, wait);
@@ -656,19 +656,19 @@ static __poll_t ffs_ep0_poll(struct file *file, poll_table *wait)
 	switch (ffs->state) {
 	case FFS_READ_DESCRIPTORS:
 	case FFS_READ_STRINGS:
-		mask |= POLLOUT;
+		mask |= EPOLLOUT;
 		break;
 
 	case FFS_ACTIVE:
 		switch (ffs->setup_state) {
 		case FFS_NO_SETUP:
 			if (ffs->ev.count)
-				mask |= POLLIN;
+				mask |= EPOLLIN;
 			break;
 
 		case FFS_SETUP_PENDING:
 		case FFS_SETUP_CANCELLED:
-			mask |= (POLLIN | POLLOUT);
+			mask |= (EPOLLIN | EPOLLOUT);
 			break;
 		}
 	case FFS_CLOSING:
diff --git a/drivers/usb/gadget/function/f_hid.c b/drivers/usb/gadget/function/f_hid.c
index a73efb1c47d0a..54e859dcb25c3 100644
--- a/drivers/usb/gadget/function/f_hid.c
+++ b/drivers/usb/gadget/function/f_hid.c
@@ -422,10 +422,10 @@ static __poll_t f_hidg_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &hidg->write_queue, wait);
 
 	if (WRITE_COND)
-		ret |= POLLOUT | POLLWRNORM;
+		ret |= EPOLLOUT | EPOLLWRNORM;
 
 	if (READ_COND)
-		ret |= POLLIN | POLLRDNORM;
+		ret |= EPOLLIN | EPOLLRDNORM;
 
 	return ret;
 }
diff --git a/drivers/usb/gadget/function/f_printer.c b/drivers/usb/gadget/function/f_printer.c
index 453578c4af696..d359efe06c769 100644
--- a/drivers/usb/gadget/function/f_printer.c
+++ b/drivers/usb/gadget/function/f_printer.c
@@ -698,11 +698,11 @@ printer_poll(struct file *fd, poll_table *wait)
 
 	spin_lock_irqsave(&dev->lock, flags);
 	if (likely(!list_empty(&dev->tx_reqs)))
-		status |= POLLOUT | POLLWRNORM;
+		status |= EPOLLOUT | EPOLLWRNORM;
 
 	if (likely(dev->current_rx_bytes) ||
 			likely(!list_empty(&dev->rx_buffers)))
-		status |= POLLIN | POLLRDNORM;
+		status |= EPOLLIN | EPOLLRDNORM;
 
 	spin_unlock_irqrestore(&dev->lock, flags);
 
diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
index 5960e76f4c751..37ca0e669bd85 100644
--- a/drivers/usb/gadget/legacy/inode.c
+++ b/drivers/usb/gadget/legacy/inode.c
@@ -1225,16 +1225,16 @@ ep0_poll (struct file *fd, poll_table *wait)
        /* report fd mode change before acting on it */
        if (dev->setup_abort) {
                dev->setup_abort = 0;
-               mask = POLLHUP;
+               mask = EPOLLHUP;
                goto out;
        }
 
        if (dev->state == STATE_DEV_SETUP) {
                if (dev->setup_in || dev->setup_can_stall)
-                       mask = POLLOUT;
+                       mask = EPOLLOUT;
        } else {
                if (dev->ev_next != 0)
-                       mask = POLLIN;
+                       mask = EPOLLIN;
        }
 out:
        spin_unlock_irq(&dev->lock);
diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c
index 1fa00b35f4adb..8d33187ce2af3 100644
--- a/drivers/usb/misc/iowarrior.c
+++ b/drivers/usb/misc/iowarrior.c
@@ -683,19 +683,19 @@ static __poll_t iowarrior_poll(struct file *file, poll_table * wait)
 	__poll_t mask = 0;
 
 	if (!dev->present)
-		return POLLERR | POLLHUP;
+		return EPOLLERR | EPOLLHUP;
 
 	poll_wait(file, &dev->read_wait, wait);
 	poll_wait(file, &dev->write_wait, wait);
 
 	if (!dev->present)
-		return POLLERR | POLLHUP;
+		return EPOLLERR | EPOLLHUP;
 
 	if (read_index(dev) != -1)
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	if (atomic_read(&dev->write_busy) < MAX_WRITES_IN_FLIGHT)
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 	return mask;
 }
 
diff --git a/drivers/usb/misc/ldusb.c b/drivers/usb/misc/ldusb.c
index 074398c1e410f..63b9e85dc0e93 100644
--- a/drivers/usb/misc/ldusb.c
+++ b/drivers/usb/misc/ldusb.c
@@ -417,15 +417,15 @@ static __poll_t ld_usb_poll(struct file *file, poll_table *wait)
 	dev = file->private_data;
 
 	if (!dev->intf)
-		return POLLERR | POLLHUP;
+		return EPOLLERR | EPOLLHUP;
 
 	poll_wait(file, &dev->read_wait, wait);
 	poll_wait(file, &dev->write_wait, wait);
 
 	if (dev->ring_head != dev->ring_tail)
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	if (!dev->interrupt_out_busy)
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 
 	return mask;
 }
diff --git a/drivers/usb/misc/legousbtower.c b/drivers/usb/misc/legousbtower.c
index 941c45028828d..bf47bd8bc76f7 100644
--- a/drivers/usb/misc/legousbtower.c
+++ b/drivers/usb/misc/legousbtower.c
@@ -517,17 +517,17 @@ static __poll_t tower_poll (struct file *file, poll_table *wait)
 	dev = file->private_data;
 
 	if (!dev->udev)
-		return POLLERR | POLLHUP;
+		return EPOLLERR | EPOLLHUP;
 
 	poll_wait(file, &dev->read_wait, wait);
 	poll_wait(file, &dev->write_wait, wait);
 
 	tower_check_for_read_packet(dev);
 	if (dev->read_packet_length > 0) {
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	}
 	if (!dev->interrupt_out_busy) {
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 	}
 
 	return mask;
diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c
index cc5b296bff3fd..2761fad66b95e 100644
--- a/drivers/usb/mon/mon_bin.c
+++ b/drivers/usb/mon/mon_bin.c
@@ -1203,7 +1203,7 @@ mon_bin_poll(struct file *file, struct poll_table_struct *wait)
 
 	spin_lock_irqsave(&rp->b_lock, flags);
 	if (!MON_RING_EMPTY(rp))
-		mask |= POLLIN | POLLRDNORM;    /* readable */
+		mask |= EPOLLIN | EPOLLRDNORM;    /* readable */
 	spin_unlock_irqrestore(&rp->b_lock, flags);
 	return mask;
 }
diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c
index 8cc4b48ff1273..085700f1be100 100644
--- a/drivers/vfio/virqfd.c
+++ b/drivers/vfio/virqfd.c
@@ -48,7 +48,7 @@ static int virqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void
 	struct virqfd *virqfd = container_of(wait, struct virqfd, wait);
 	__poll_t flags = key_to_poll(key);
 
-	if (flags & POLLIN) {
+	if (flags & EPOLLIN) {
 		/* An event has been signaled, call function */
 		if ((!virqfd->handler ||
 		     virqfd->handler(virqfd->opaque, virqfd->data)) &&
@@ -56,7 +56,7 @@ static int virqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void
 			schedule_work(&virqfd->inject);
 	}
 
-	if (flags & POLLHUP) {
+	if (flags & EPOLLHUP) {
 		unsigned long flags;
 		spin_lock_irqsave(&virqfd_lock, flags);
 
@@ -172,14 +172,14 @@ int vfio_virqfd_enable(void *opaque,
 	 * Check if there was an event already pending on the eventfd
 	 * before we registered and trigger it as if we didn't miss it.
 	 */
-	if (events & POLLIN) {
+	if (events & EPOLLIN) {
 		if ((!handler || handler(opaque, data)) && thread)
 			schedule_work(&virqfd->inject);
 	}
 
 	/*
 	 * Do not drop the file until the irqfd is fully initialized,
-	 * otherwise we might race against the POLLHUP.
+	 * otherwise we might race against the EPOLLHUP.
 	 */
 	fdput(irqfd);
 
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index c613d2e3d371c..610cba276d476 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -952,8 +952,8 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 	}
 	vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);
 
-	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
-	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
+	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev);
+	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);
 
 	f->private_data = n;
 
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2db5af8e8652a..1b3e8d2d5c8b4 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -211,7 +211,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file)
 	mask = file->f_op->poll(file, &poll->table);
 	if (mask)
 		vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
-	if (mask & POLLERR) {
+	if (mask & EPOLLERR) {
 		if (poll->wqh)
 			remove_wait_queue(poll->wqh, &poll->wait);
 		ret = -EINVAL;
@@ -440,7 +440,7 @@ void vhost_dev_init(struct vhost_dev *dev,
 		vhost_vq_reset(dev, vq);
 		if (vq->handle_kick)
 			vhost_poll_init(&vq->poll, vq->handle_kick,
-					POLLIN, dev);
+					EPOLLIN, dev);
 	}
 }
 EXPORT_SYMBOL_GPL(vhost_dev_init);
@@ -630,7 +630,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 	vhost_umem_clean(dev->iotlb);
 	dev->iotlb = NULL;
 	vhost_clear_msg(dev);
-	wake_up_interruptible_poll(&dev->wait, POLLIN | POLLRDNORM);
+	wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
 	WARN_ON(!llist_empty(&dev->work_list));
 	if (dev->worker) {
 		kthread_stop(dev->worker);
@@ -1057,7 +1057,7 @@ __poll_t vhost_chr_poll(struct file *file, struct vhost_dev *dev,
 	poll_wait(file, &dev->wait, wait);
 
 	if (!list_empty(&dev->read_list))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	return mask;
 }
@@ -2356,7 +2356,7 @@ void vhost_enqueue_msg(struct vhost_dev *dev, struct list_head *head,
 	list_add_tail(&node->node, head);
 	spin_unlock(&dev->iotlb_lock);
 
-	wake_up_interruptible_poll(&dev->wait, POLLIN | POLLRDNORM);
+	wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
 }
 EXPORT_SYMBOL_GPL(vhost_enqueue_msg);
 
diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c
index b0597bef45551..4e05d7f711fef 100644
--- a/drivers/virt/fsl_hypervisor.c
+++ b/drivers/virt/fsl_hypervisor.c
@@ -574,7 +574,7 @@ static __poll_t fsl_hv_poll(struct file *filp, struct poll_table_struct *p)
 	spin_lock_irqsave(&dbq->lock, flags);
 
 	poll_wait(filp, &dbq->wait, p);
-	mask = (dbq->head == dbq->tail) ? 0 : (POLLIN | POLLRDNORM);
+	mask = (dbq->head == dbq->tail) ? 0 : (EPOLLIN | EPOLLRDNORM);
 
 	spin_unlock_irqrestore(&dbq->lock, flags);
 
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index 72c0416a01cc3..8cac07ab60abd 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -623,14 +623,14 @@ static long evtchn_ioctl(struct file *file,
 
 static __poll_t evtchn_poll(struct file *file, poll_table *wait)
 {
-	__poll_t mask = POLLOUT | POLLWRNORM;
+	__poll_t mask = EPOLLOUT | EPOLLWRNORM;
 	struct per_user_data *u = file->private_data;
 
 	poll_wait(file, &u->evtchn_wait, wait);
 	if (u->ring_cons != u->ring_prod)
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	if (u->ring_overflow)
-		mask = POLLERR;
+		mask = EPOLLERR;
 	return mask;
 }
 
diff --git a/drivers/xen/mcelog.c b/drivers/xen/mcelog.c
index 9ade533d9e406..262835ace35d4 100644
--- a/drivers/xen/mcelog.c
+++ b/drivers/xen/mcelog.c
@@ -144,7 +144,7 @@ static __poll_t xen_mce_chrdev_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &xen_mce_chrdev_wait, wait);
 
 	if (xen_mcelog.next)
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 
 	return 0;
 }
diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
index 78804e71f9a6b..753d9cb437d02 100644
--- a/drivers/xen/pvcalls-front.c
+++ b/drivers/xen/pvcalls-front.c
@@ -892,7 +892,7 @@ static __poll_t pvcalls_front_poll_passive(struct file *file,
 
 		if (req_id != PVCALLS_INVALID_ID &&
 		    READ_ONCE(bedata->rsp[req_id].req_id) == req_id)
-			return POLLIN | POLLRDNORM;
+			return EPOLLIN | EPOLLRDNORM;
 
 		poll_wait(file, &map->passive.inflight_accept_req, wait);
 		return 0;
@@ -900,7 +900,7 @@ static __poll_t pvcalls_front_poll_passive(struct file *file,
 
 	if (test_and_clear_bit(PVCALLS_FLAG_POLL_RET,
 			       (void *)&map->passive.flags))
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 
 	/*
 	 * First check RET, then INFLIGHT. No barriers necessary to
@@ -949,11 +949,11 @@ static __poll_t pvcalls_front_poll_active(struct file *file,
 
 	poll_wait(file, &map->active.inflight_conn_req, wait);
 	if (pvcalls_front_write_todo(map))
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 	if (pvcalls_front_read_todo(map))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	if (in_error != 0 || out_error != 0)
-		mask |= POLLERR;
+		mask |= EPOLLERR;
 
 	return mask;
 }
@@ -968,14 +968,14 @@ __poll_t pvcalls_front_poll(struct file *file, struct socket *sock,
 	pvcalls_enter();
 	if (!pvcalls_front_dev) {
 		pvcalls_exit();
-		return POLLNVAL;
+		return EPOLLNVAL;
 	}
 	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
 
 	map = (struct sock_mapping *) sock->sk->sk_send_head;
 	if (!map) {
 		pvcalls_exit();
-		return POLLNVAL;
+		return EPOLLNVAL;
 	}
 	if (map->active_socket)
 		ret = pvcalls_front_poll_active(file, bedata, map, wait);
diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c
index e17ec3fce590e..a493e99bed213 100644
--- a/drivers/xen/xenbus/xenbus_dev_frontend.c
+++ b/drivers/xen/xenbus/xenbus_dev_frontend.c
@@ -651,7 +651,7 @@ static __poll_t xenbus_file_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, &u->read_waitq, wait);
 	if (!list_empty(&u->read_buffers))
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	return 0;
 }
 
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 7edbd06799520..3fdee214a5bbc 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -289,7 +289,7 @@ static ssize_t cachefiles_daemon_write(struct file *file,
 
 /*
  * poll for culling state
- * - use POLLOUT to indicate culling state
+ * - use EPOLLOUT to indicate culling state
  */
 static __poll_t cachefiles_daemon_poll(struct file *file,
 					   struct poll_table_struct *poll)
@@ -301,10 +301,10 @@ static __poll_t cachefiles_daemon_poll(struct file *file,
 	mask = 0;
 
 	if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
-		mask |= POLLIN;
+		mask |= EPOLLIN;
 
 	if (test_bit(CACHEFILES_CULLING, &cache->flags))
-		mask |= POLLOUT;
+		mask |= EPOLLOUT;
 
 	return mask;
 }
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 80b9b84391a9d..c5234c21b5394 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -64,12 +64,12 @@ static struct class *coda_psdev_class;
 static __poll_t coda_psdev_poll(struct file *file, poll_table * wait)
 {
         struct venus_comm *vcp = (struct venus_comm *) file->private_data;
-	__poll_t mask = POLLOUT | POLLWRNORM;
+	__poll_t mask = EPOLLOUT | EPOLLWRNORM;
 
 	poll_wait(file, &vcp->vc_waitq, wait);
 	mutex_lock(&vcp->vc_mutex);
 	if (!list_empty(&vcp->vc_pending))
-                mask |= POLLIN | POLLRDNORM;
+                mask |= EPOLLIN | EPOLLRDNORM;
 	mutex_unlock(&vcp->vc_mutex);
 
 	return mask;
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 20bb73a931dd2..1f99678ff5d3e 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -214,7 +214,7 @@ static __poll_t full_proxy_poll(struct file *filp,
 	const struct file_operations *real_fops;
 
 	if (debugfs_file_get(dentry))
-		return POLLHUP;
+		return EPOLLHUP;
 
 	real_fops = debugfs_real_fops(filp);
 	r = real_fops->poll(filp, wait);
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index a4c63e9e63854..c7d5a2ea3d030 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -471,7 +471,7 @@ static __poll_t dev_poll(struct file *file, poll_table *wait)
 
 	spin_lock(&ops_lock);
 	if (!list_empty(&send_list))
-		mask = POLLIN | POLLRDNORM;
+		mask = EPOLLIN | EPOLLRDNORM;
 	spin_unlock(&ops_lock);
 
 	return mask;
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 662432af8ce88..2a669390cd7f6 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -896,7 +896,7 @@ static __poll_t device_poll(struct file *file, poll_table *wait)
 	spin_lock(&proc->asts_spin);
 	if (!list_empty(&proc->asts)) {
 		spin_unlock(&proc->asts_spin);
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	}
 	spin_unlock(&proc->asts_spin);
 	return 0;
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 7423e792a092e..2d1158e5f9505 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -59,7 +59,7 @@ ecryptfs_miscdev_poll(struct file *file, poll_table *pt)
 	poll_wait(file, &daemon->wait, pt);
 	mutex_lock(&daemon->mux);
 	if (!list_empty(&daemon->msg_ctx_out_queue))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 out_unlock_daemon:
 	daemon->flags &= ~ECRYPTFS_DAEMON_IN_POLL;
 	mutex_unlock(&daemon->mux);
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 04fd824142a12..012f5bd46dfa1 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -45,7 +45,7 @@ struct eventfd_ctx {
  *
  * This function is supposed to be called by the kernel in paths that do not
  * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
- * value, and we signal this as overflow condition by returning a POLLERR
+ * value, and we signal this as overflow condition by returning a EPOLLERR
  * to poll(2).
  *
  * Returns the amount by which the counter was incremented.  This will be less
@@ -60,7 +60,7 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
 		n = ULLONG_MAX - ctx->count;
 	ctx->count += n;
 	if (waitqueue_active(&ctx->wqh))
-		wake_up_locked_poll(&ctx->wqh, POLLIN);
+		wake_up_locked_poll(&ctx->wqh, EPOLLIN);
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
 	return n;
@@ -96,7 +96,7 @@ static int eventfd_release(struct inode *inode, struct file *file)
 {
 	struct eventfd_ctx *ctx = file->private_data;
 
-	wake_up_poll(&ctx->wqh, POLLHUP);
+	wake_up_poll(&ctx->wqh, EPOLLHUP);
 	eventfd_ctx_put(ctx);
 	return 0;
 }
@@ -150,11 +150,11 @@ static __poll_t eventfd_poll(struct file *file, poll_table *wait)
 	count = READ_ONCE(ctx->count);
 
 	if (count > 0)
-		events |= POLLIN;
+		events |= EPOLLIN;
 	if (count == ULLONG_MAX)
-		events |= POLLERR;
+		events |= EPOLLERR;
 	if (ULLONG_MAX - 1 > count)
-		events |= POLLOUT;
+		events |= EPOLLOUT;
 
 	return events;
 }
@@ -187,7 +187,7 @@ int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *w
 	eventfd_ctx_do_read(ctx, cnt);
 	__remove_wait_queue(&ctx->wqh, wait);
 	if (*cnt != 0 && waitqueue_active(&ctx->wqh))
-		wake_up_locked_poll(&ctx->wqh, POLLOUT);
+		wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
 	return *cnt != 0 ? 0 : -EAGAIN;
@@ -231,7 +231,7 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
 	if (likely(res > 0)) {
 		eventfd_ctx_do_read(ctx, &ucnt);
 		if (waitqueue_active(&ctx->wqh))
-			wake_up_locked_poll(&ctx->wqh, POLLOUT);
+			wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
 	}
 	spin_unlock_irq(&ctx->wqh.lock);
 
@@ -281,7 +281,7 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
 	if (likely(res > 0)) {
 		ctx->count += ucnt;
 		if (waitqueue_active(&ctx->wqh))
-			wake_up_locked_poll(&ctx->wqh, POLLIN);
+			wake_up_locked_poll(&ctx->wqh, EPOLLIN);
 	}
 	spin_unlock_irq(&ctx->wqh.lock);
 
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index d1a490c7e6c3c..0f3494ed3ed0c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -95,9 +95,9 @@
 /* Epoll private bits inside the event mask */
 #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
 
-#define EPOLLINOUT_BITS (POLLIN | POLLOUT)
+#define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)
 
-#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | POLLERR | POLLHUP | \
+#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
 				EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
 
 /* Maximum number of nesting allowed inside epoll sets */
@@ -555,7 +555,7 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
 	wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie;
 
 	spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1);
-	wake_up_locked_poll(wqueue, POLLIN);
+	wake_up_locked_poll(wqueue, EPOLLIN);
 	spin_unlock_irqrestore(&wqueue->lock, flags);
 
 	return 0;
@@ -575,7 +575,7 @@ static void ep_poll_safewake(wait_queue_head_t *wq)
 
 static void ep_poll_safewake(wait_queue_head_t *wq)
 {
-	wake_up_poll(wq, POLLIN);
+	wake_up_poll(wq, EPOLLIN);
 }
 
 #endif
@@ -908,7 +908,7 @@ static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head
 
 	list_for_each_entry_safe(epi, tmp, head, rdllink) {
 		if (ep_item_poll(epi, &pt, depth)) {
-			return POLLIN | POLLRDNORM;
+			return EPOLLIN | EPOLLRDNORM;
 		} else {
 			/*
 			 * Item has been dropped into the ready list by the poll
@@ -1181,12 +1181,12 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 		if ((epi->event.events & EPOLLEXCLUSIVE) &&
 					!(pollflags & POLLFREE)) {
 			switch (pollflags & EPOLLINOUT_BITS) {
-			case POLLIN:
-				if (epi->event.events & POLLIN)
+			case EPOLLIN:
+				if (epi->event.events & EPOLLIN)
 					ewake = 1;
 				break;
-			case POLLOUT:
-				if (epi->event.events & POLLOUT)
+			case EPOLLOUT:
+				if (epi->event.events & EPOLLOUT)
 					ewake = 1;
 				break;
 			case 0:
@@ -2105,7 +2105,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	switch (op) {
 	case EPOLL_CTL_ADD:
 		if (!epi) {
-			epds.events |= POLLERR | POLLHUP;
+			epds.events |= EPOLLERR | EPOLLHUP;
 			error = ep_insert(ep, &epds, tf.file, fd, full_check);
 		} else
 			error = -EEXIST;
@@ -2121,7 +2121,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	case EPOLL_CTL_MOD:
 		if (epi) {
 			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
-				epds.events |= POLLERR | POLLHUP;
+				epds.events |= EPOLLERR | EPOLLHUP;
 				error = ep_modify(ep, epi, &epds);
 			}
 		} else
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 4fc731876d6ba..1e97f1fda90c9 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -691,12 +691,12 @@ COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
 /* Table to convert sigio signal codes into poll band bitmaps */
 
 static const __poll_t band_table[NSIGPOLL] = {
-	POLLIN | POLLRDNORM,			/* POLL_IN */
-	POLLOUT | POLLWRNORM | POLLWRBAND,	/* POLL_OUT */
-	POLLIN | POLLRDNORM | POLLMSG,		/* POLL_MSG */
-	POLLERR,				/* POLL_ERR */
-	POLLPRI | POLLRDBAND,			/* POLL_PRI */
-	POLLHUP | POLLERR			/* POLL_HUP */
+	EPOLLIN | EPOLLRDNORM,			/* POLL_IN */
+	EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND,	/* POLL_OUT */
+	EPOLLIN | EPOLLRDNORM | EPOLLMSG,		/* POLL_MSG */
+	EPOLLERR,				/* POLL_ERR */
+	EPOLLPRI | EPOLLRDBAND,			/* POLL_PRI */
+	EPOLLHUP | EPOLLERR			/* POLL_HUP */
 };
 
 static inline int sigio_perm(struct task_struct *p,
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index aa089a6925d0e..5d06384c2cae3 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2006,21 +2006,21 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 
 static __poll_t fuse_dev_poll(struct file *file, poll_table *wait)
 {
-	__poll_t mask = POLLOUT | POLLWRNORM;
+	__poll_t mask = EPOLLOUT | EPOLLWRNORM;
 	struct fuse_iqueue *fiq;
 	struct fuse_dev *fud = fuse_get_dev(file);
 
 	if (!fud)
-		return POLLERR;
+		return EPOLLERR;
 
 	fiq = &fud->fc->iq;
 	poll_wait(file, &fiq->waitq, wait);
 
 	spin_lock(&fiq->waitq.lock);
 	if (!fiq->connected)
-		mask = POLLERR;
+		mask = EPOLLERR;
 	else if (request_pending(fiq))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	spin_unlock(&fiq->waitq.lock);
 
 	return mask;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index e85e974dd211d..a201fb0ac64f9 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2791,7 +2791,7 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait)
 		fc->no_poll = 1;
 		return DEFAULT_POLLMASK;
 	}
-	return POLLERR;
+	return EPOLLERR;
 }
 EXPORT_SYMBOL_GPL(fuse_file_poll);
 
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index a03ce34225787..fd5ce883072e6 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -823,7 +823,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn)
  * the content and then you use 'poll' or 'select' to wait for
  * the content to change.  When the content changes (assuming the
  * manager for the kobject supports notification), poll will
- * return POLLERR|POLLPRI, and select will return the fd whether
+ * return EPOLLERR|EPOLLPRI, and select will return the fd whether
  * it is waiting for read, write, or exceptions.
  * Once poll/select indicates that the value has changed, you
  * need to close and re-open the file, or seek to 0 and read again.
@@ -851,7 +851,7 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
 	return DEFAULT_POLLMASK;
 
  trigger:
-	return DEFAULT_POLLMASK|POLLERR|POLLPRI;
+	return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;
 }
 
 static void kernfs_notify_workfn(struct work_struct *work)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index ef08d64c84b86..c07eb3d655eae 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -247,7 +247,7 @@ static __poll_t fanotify_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &group->notification_waitq, wait);
 	spin_lock(&group->notification_lock);
 	if (!fsnotify_notify_queue_is_empty(group))
-		ret = POLLIN | POLLRDNORM;
+		ret = EPOLLIN | EPOLLRDNORM;
 	spin_unlock(&group->notification_lock);
 
 	return ret;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 5c29bf16814f4..2c908b31d6c93 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -115,7 +115,7 @@ static __poll_t inotify_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &group->notification_waitq, wait);
 	spin_lock(&group->notification_lock);
 	if (!fsnotify_notify_queue_is_empty(group))
-		ret = POLLIN | POLLRDNORM;
+		ret = EPOLLIN | EPOLLRDNORM;
 	spin_unlock(&group->notification_lock);
 
 	return ret;
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 385fcefa8bc53..602c71f327409 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -71,7 +71,7 @@ struct workqueue_struct *user_dlm_worker;
  * Over time, dlmfs has added some features that were not part of the
  * initial ABI.  Unfortunately, some of these features are not detectable
  * via standard usage.  For example, Linux's default poll always returns
- * POLLIN, so there is no way for a caller of poll(2) to know when dlmfs
+ * EPOLLIN, so there is no way for a caller of poll(2) to know when dlmfs
  * added poll support.  Instead, we provide this list of new capabilities.
  *
  * Capabilities is a read-only attribute.  We do it as a module parameter
@@ -83,7 +83,7 @@ struct workqueue_struct *user_dlm_worker;
  * interaction.
  *
  * Capabilities:
- * - bast	: POLLIN against the file descriptor of a held lock
+ * - bast	: EPOLLIN against the file descriptor of a held lock
  *		  signifies a bast fired on the lock.
  */
 #define DLMFS_CAPABILITIES "bast stackglue"
@@ -230,7 +230,7 @@ static __poll_t dlmfs_file_poll(struct file *file, poll_table *wait)
 
 	spin_lock(&ip->ip_lockres.l_lock);
 	if (ip->ip_lockres.l_flags & USER_LOCK_BLOCKED)
-		event = POLLIN | POLLRDNORM;
+		event = EPOLLIN | EPOLLRDNORM;
 	spin_unlock(&ip->ip_lockres.l_lock);
 
 	return event;
diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
index f073cd9e6687b..b03057afac2a0 100644
--- a/fs/orangefs/devorangefs-req.c
+++ b/fs/orangefs/devorangefs-req.c
@@ -823,7 +823,7 @@ static __poll_t orangefs_devreq_poll(struct file *file,
 	poll_wait(file, &orangefs_request_list_waitq, poll_table);
 
 	if (!list_empty(&orangefs_request_list))
-		poll_revent_mask |= POLLIN;
+		poll_revent_mask |= EPOLLIN;
 	return poll_revent_mask;
 }
 
diff --git a/fs/pipe.c b/fs/pipe.c
index 0913aed7fd0de..7b1954caf3885 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -327,7 +327,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 			break;
 		}
 		if (do_wakeup) {
-			wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
+			wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
  			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 		}
 		pipe_wait(pipe);
@@ -336,7 +336,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 
 	/* Signal writers asynchronously that there is more room. */
 	if (do_wakeup) {
-		wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
+		wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
 	if (ret > 0)
@@ -463,7 +463,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 			break;
 		}
 		if (do_wakeup) {
-			wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
+			wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM);
 			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 			do_wakeup = 0;
 		}
@@ -474,7 +474,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 out:
 	__pipe_unlock(pipe);
 	if (do_wakeup) {
-		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
+		wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 	}
 	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
@@ -523,19 +523,19 @@ pipe_poll(struct file *filp, poll_table *wait)
 	nrbufs = pipe->nrbufs;
 	mask = 0;
 	if (filp->f_mode & FMODE_READ) {
-		mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
+		mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0;
 		if (!pipe->writers && filp->f_version != pipe->w_counter)
-			mask |= POLLHUP;
+			mask |= EPOLLHUP;
 	}
 
 	if (filp->f_mode & FMODE_WRITE) {
-		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
+		mask |= (nrbufs < pipe->buffers) ? EPOLLOUT | EPOLLWRNORM : 0;
 		/*
-		 * Most Unices do not set POLLERR for FIFOs but on Linux they
+		 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
 		 * behave exactly like pipes for poll().
 		 */
 		if (!pipe->readers)
-			mask |= POLLERR;
+			mask |= EPOLLERR;
 	}
 
 	return mask;
@@ -568,7 +568,7 @@ pipe_release(struct inode *inode, struct file *file)
 		pipe->writers--;
 
 	if (pipe->readers || pipe->writers) {
-		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
+		wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM | EPOLLERR | EPOLLHUP);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
@@ -936,7 +936,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 
 		if (!is_pipe && !pipe->writers) {
 			if ((filp->f_flags & O_NONBLOCK)) {
-				/* suppress POLLHUP until we have
+				/* suppress EPOLLHUP until we have
 				 * seen a writer */
 				filp->f_version = pipe->w_counter;
 			} else {
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index f0bfb45c3f9fe..4f4a2abb225eb 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -44,7 +44,7 @@ static __poll_t kmsg_poll(struct file *file, poll_table *wait)
 {
 	poll_wait(file, &log_wait, wait);
 	if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	return 0;
 }
 
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 63325377621a9..c41ab261397df 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -640,7 +640,7 @@ static __poll_t proc_sys_poll(struct file *filp, poll_table *wait)
 
 	/* sysctl was unregistered */
 	if (IS_ERR(head))
-		return POLLERR | POLLHUP;
+		return EPOLLERR | EPOLLHUP;
 
 	if (!table->proc_handler)
 		goto out;
@@ -653,7 +653,7 @@ static __poll_t proc_sys_poll(struct file *filp, poll_table *wait)
 
 	if (event != atomic_read(&table->poll->event)) {
 		filp->private_data = proc_sys_poll_event(table->poll);
-		ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
+		ret = EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
 	}
 
 out:
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index c8528d587e09d..e16fb8f2049e7 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -23,7 +23,7 @@ static __poll_t mounts_poll(struct file *file, poll_table *wait)
 	struct seq_file *m = file->private_data;
 	struct proc_mounts *p = m->private;
 	struct mnt_namespace *ns = p->ns;
-	__poll_t res = POLLIN | POLLRDNORM;
+	__poll_t res = EPOLLIN | EPOLLRDNORM;
 	int event;
 
 	poll_wait(file, &p->ns->poll, wait);
@@ -31,7 +31,7 @@ static __poll_t mounts_poll(struct file *file, poll_table *wait)
 	event = READ_ONCE(ns->event);
 	if (m->poll_event != event) {
 		m->poll_event = event;
-		res |= POLLERR | POLLPRI;
+		res |= EPOLLERR | EPOLLPRI;
 	}
 
 	return res;
diff --git a/fs/select.c b/fs/select.c
index ec14171dd78a3..b6c36254028ad 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -432,9 +432,9 @@ static int max_select_fd(unsigned long n, fd_set_bits *fds)
 	return max;
 }
 
-#define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
-#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
-#define POLLEX_SET (POLLPRI)
+#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR)
+#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR)
+#define POLLEX_SET (EPOLLPRI)
 
 static inline void wait_key_set(poll_table *wait, unsigned long in,
 				unsigned long out, unsigned long bit,
@@ -814,11 +814,11 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
 	fd = pollfd->fd;
 	if (fd >= 0) {
 		struct fd f = fdget(fd);
-		mask = POLLNVAL;
+		mask = EPOLLNVAL;
 		if (f.file) {
 			/* userland u16 ->events contains POLL... bitmap */
 			__poll_t filter = demangle_poll(pollfd->events) |
-						POLLERR | POLLHUP;
+						EPOLLERR | EPOLLHUP;
 			mask = DEFAULT_POLLMASK;
 			if (f.file->f_op->poll) {
 				pwait->_key = filter;
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 31e923bec99af..9990957264e3c 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -45,7 +45,7 @@ void signalfd_cleanup(struct sighand_struct *sighand)
 		return;
 
 	/* wait_queue_entry_t->func(POLLFREE) should do remove_wait_queue() */
-	wake_up_poll(wqh, POLLHUP | POLLFREE);
+	wake_up_poll(wqh, EPOLLHUP | POLLFREE);
 }
 
 struct signalfd_ctx {
@@ -69,7 +69,7 @@ static __poll_t signalfd_poll(struct file *file, poll_table *wait)
 	if (next_signal(&current->pending, &ctx->sigmask) ||
 	    next_signal(&current->signal->shared_pending,
 			&ctx->sigmask))
-		events |= POLLIN;
+		events |= EPOLLIN;
 	spin_unlock_irq(&current->sighand->siglock);
 
 	return events;
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 0510717f3a535..cdad49da3ff71 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -237,7 +237,7 @@ static __poll_t timerfd_poll(struct file *file, poll_table *wait)
 
 	spin_lock_irqsave(&ctx->wqh.lock, flags);
 	if (ctx->ticks)
-		events |= POLLIN;
+		events |= EPOLLIN;
 	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
 	return events;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 87a13a7c8270b..cec550c8468f4 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -483,7 +483,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	if (likely(must_wait && !READ_ONCE(ctx->released) &&
 		   (return_to_userland ? !signal_pending(current) :
 		    !fatal_signal_pending(current)))) {
-		wake_up_poll(&ctx->fd_wqh, POLLIN);
+		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
 		schedule();
 		ret |= VM_FAULT_MAJOR;
 
@@ -614,7 +614,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 
 		spin_unlock(&ctx->event_wqh.lock);
 
-		wake_up_poll(&ctx->fd_wqh, POLLIN);
+		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
 		schedule();
 
 		spin_lock(&ctx->event_wqh.lock);
@@ -904,7 +904,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
 	/* Flush pending events that may still wait on event_wqh */
 	wake_up_all(&ctx->event_wqh);
 
-	wake_up_poll(&ctx->fd_wqh, POLLHUP);
+	wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
 	userfaultfd_ctx_put(ctx);
 	return 0;
 }
@@ -949,14 +949,14 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
 
 	switch (ctx->state) {
 	case UFFD_STATE_WAIT_API:
-		return POLLERR;
+		return EPOLLERR;
 	case UFFD_STATE_RUNNING:
 		/*
 		 * poll() never guarantees that read won't block.
 		 * userfaults can be waken before they're read().
 		 */
 		if (unlikely(!(file->f_flags & O_NONBLOCK)))
-			return POLLERR;
+			return EPOLLERR;
 		/*
 		 * lockless access to see if there are pending faults
 		 * __pollwait last action is the add_wait_queue but
@@ -970,14 +970,14 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
 		ret = 0;
 		smp_mb();
 		if (waitqueue_active(&ctx->fault_pending_wqh))
-			ret = POLLIN;
+			ret = EPOLLIN;
 		else if (waitqueue_active(&ctx->event_wqh))
-			ret = POLLIN;
+			ret = EPOLLIN;
 
 		return ret;
 	default:
 		WARN_ON_ONCE(1);
-		return POLLERR;
+		return EPOLLERR;
 	}
 }
 
diff --git a/include/linux/scif.h b/include/linux/scif.h
index 7046111b8d0aa..eeb250b73c4b9 100644
--- a/include/linux/scif.h
+++ b/include/linux/scif.h
@@ -1266,8 +1266,8 @@ int scif_put_pages(struct scif_range *pages);
  * events is a bitmask specifying the events which the application is
  * interested in. The field revents is an output parameter, filled by the
  * kernel with the events that actually occurred. The bits returned in revents
- * can include any of those specified in events, or one of the values POLLERR,
- * POLLHUP, or POLLNVAL. (These three bits are meaningless in the events
+ * can include any of those specified in events, or one of the values EPOLLERR,
+ * EPOLLHUP, or EPOLLNVAL. (These three bits are meaningless in the events
  * field, and will be set in the revents field whenever the corresponding
  * condition is true.)
  *
@@ -1279,20 +1279,20 @@ int scif_put_pages(struct scif_range *pages);
  * timeout means an infinite timeout.
  *
  * The following bits may be set in events and returned in revents.
- * POLLIN - Data may be received without blocking. For a connected
+ * EPOLLIN - Data may be received without blocking. For a connected
  * endpoint, this means that scif_recv() may be called without blocking. For a
  * listening endpoint, this means that scif_accept() may be called without
  * blocking.
- * POLLOUT - Data may be sent without blocking. For a connected endpoint, this
- * means that scif_send() may be called without blocking. POLLOUT may also be
+ * EPOLLOUT - Data may be sent without blocking. For a connected endpoint, this
+ * means that scif_send() may be called without blocking. EPOLLOUT may also be
  * used to block waiting for a non-blocking connect to complete. This bit value
  * has no meaning for a listening endpoint and is ignored if specified.
  *
  * The following bits are only returned in revents, and are ignored if set in
  * events.
- * POLLERR - An error occurred on the endpoint
- * POLLHUP - The connection to the peer endpoint was disconnected
- * POLLNVAL - The specified endpoint descriptor is invalid.
+ * EPOLLERR - An error occurred on the endpoint
+ * EPOLLHUP - The connection to the peer endpoint was disconnected
+ * EPOLLNVAL - The specified endpoint descriptor is invalid.
  *
  * Return:
  * Upon successful completion, scif_poll() returns a non-negative value. A
diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h
index aa16c064294f0..5b6c541e4e1b1 100644
--- a/include/media/videobuf2-core.h
+++ b/include/media/videobuf2-core.h
@@ -443,7 +443,7 @@ struct vb2_buf_ops {
  * @fileio_read_once:		report EOF after reading the first buffer
  * @fileio_write_immediately:	queue buffer after each write() call
  * @allow_zero_bytesused:	allow bytesused == 0 to be passed to the driver
- * @quirk_poll_must_check_waiting_for_buffers: Return %POLLERR at poll when QBUF
+ * @quirk_poll_must_check_waiting_for_buffers: Return %EPOLLERR at poll when QBUF
  *              has not been called. This is a vb1 idiom that has been adopted
  *              also by vb2.
  * @lock:	pointer to a mutex that protects the &struct vb2_queue. The
@@ -493,7 +493,7 @@ struct vb2_buf_ops {
  * @error:	a fatal error occurred on the queue
  * @waiting_for_buffers: used in poll() to check if vb2 is still waiting for
  *		buffers. Only set for capture queues if qbuf has not yet been
- *		called since poll() needs to return %POLLERR in that situation.
+ *		called since poll() needs to return %EPOLLERR in that situation.
  * @is_multiplanar: set if buffer type is multiplanar
  * @is_output:	set if buffer type is output
  * @copy_timestamp: set if vb2-core should set timestamps
@@ -869,7 +869,7 @@ void vb2_core_queue_release(struct vb2_queue *q);
  * @q:		pointer to &struct vb2_queue with videobuf2 queue.
  *
  * Flag that a fatal unrecoverable error has occurred and wake up all processes
- * waiting on the queue. Polling will now set %POLLERR and queuing and dequeuing
+ * waiting on the queue. Polling will now set %EPOLLERR and queuing and dequeuing
  * buffers will return %-EIO.
  *
  * The error flag will be cleared when canceling the queue, either from
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 6692d67e92450..c1a93ce35e623 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -310,7 +310,7 @@ void inet_csk_prepare_forced_close(struct sock *sk);
 static inline __poll_t inet_csk_listen_poll(const struct sock *sk)
 {
 	return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ?
-			(POLLIN | POLLRDNORM) : 0;
+			(EPOLLIN | EPOLLRDNORM) : 0;
 }
 
 int inet_csk_listen_start(struct sock *sk, int backlog);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 360e564ae7d1f..d7f309f74dec2 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -578,10 +578,10 @@ static __poll_t mqueue_poll_file(struct file *filp, struct poll_table_struct *po
 
 	spin_lock(&info->lock);
 	if (info->attr.mq_curmsgs)
-		retval = POLLIN | POLLRDNORM;
+		retval = EPOLLIN | EPOLLRDNORM;
 
 	if (info->attr.mq_curmsgs < info->attr.mq_maxmsg)
-		retval |= POLLOUT | POLLWRNORM;
+		retval |= EPOLLOUT | EPOLLWRNORM;
 	spin_unlock(&info->lock);
 
 	return retval;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f0549e79978b2..96db9ae5d5af7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4524,7 +4524,7 @@ static __poll_t perf_poll(struct file *file, poll_table *wait)
 {
 	struct perf_event *event = file->private_data;
 	struct ring_buffer *rb;
-	__poll_t events = POLLHUP;
+	__poll_t events = EPOLLHUP;
 
 	poll_wait(file, &event->waitq, wait);
 
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 141aa2ca87288..6c6b3c48db715 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -19,7 +19,7 @@
 
 static void perf_output_wakeup(struct perf_output_handle *handle)
 {
-	atomic_set(&handle->rb->poll, POLLIN);
+	atomic_set(&handle->rb->poll, EPOLLIN);
 
 	handle->event->pending_wakeup = 1;
 	irq_work_queue(&handle->event->pending);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index db4b9b8929ebf..fc1123583fa6e 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -930,7 +930,7 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
 	__poll_t ret = 0;
 
 	if (!user)
-		return POLLERR|POLLNVAL;
+		return EPOLLERR|EPOLLNVAL;
 
 	poll_wait(file, &log_wait, wait);
 
@@ -938,9 +938,9 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
 	if (user->seq < log_next_seq) {
 		/* return error when data has vanished underneath us */
 		if (user->seq < log_first_seq)
-			ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
+			ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
 		else
-			ret = POLLIN|POLLRDNORM;
+			ret = EPOLLIN|EPOLLRDNORM;
 	}
 	logbuf_unlock_irq();
 
diff --git a/kernel/relay.c b/kernel/relay.c
index f7f40a6e63528..c3029402f15c3 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -924,12 +924,12 @@ static __poll_t relay_file_poll(struct file *filp, poll_table *wait)
 	struct rchan_buf *buf = filp->private_data;
 
 	if (buf->finalized)
-		return POLLERR;
+		return EPOLLERR;
 
 	if (filp->f_mode & FMODE_READ) {
 		poll_wait(filp, &buf->read_wait, wait);
 		if (!relay_buf_empty(buf))
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 	}
 
 	return mask;
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 94ad46d50b56d..fe56c4e06c514 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -74,7 +74,7 @@ static __poll_t posix_clock_poll(struct file *fp, poll_table *wait)
 	__poll_t result = 0;
 
 	if (!clk)
-		return POLLERR;
+		return EPOLLERR;
 
 	if (clk->ops.poll)
 		result = clk->ops.poll(clk, fp, wait);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ca6930e0d25e9..dcf1c4dd3efe6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -627,7 +627,7 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
  * as data is added to any of the @buffer's cpu buffers. Otherwise
  * it will wait for data to be added to a specific cpu buffer.
  *
- * Returns POLLIN | POLLRDNORM if data exists in the buffers,
+ * Returns EPOLLIN | EPOLLRDNORM if data exists in the buffers,
  * zero otherwise.
  */
 __poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
@@ -665,7 +665,7 @@ __poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
 
 	if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
 	    (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	return 0;
 }
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 56608538a4ad6..20a2300ae4e86 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5623,13 +5623,13 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl
 
 	/* Iterators are static, they should be filled or empty */
 	if (trace_buffer_iter(iter, iter->cpu_file))
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 
 	if (tr->trace_flags & TRACE_ITER_BLOCK)
 		/*
 		 * Always select as readable when in blocking mode
 		 */
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	else
 		return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file,
 					     filp, poll_table);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 13b35ffa021e3..670e99b68aa60 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3688,7 +3688,7 @@ static void memcg_event_remove(struct work_struct *work)
 }
 
 /*
- * Gets called on POLLHUP on eventfd when user closes it.
+ * Gets called on EPOLLHUP on eventfd when user closes it.
  *
  * Called with wqh->lock held and interrupts disabled.
  */
@@ -3700,7 +3700,7 @@ static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
 	struct mem_cgroup *memcg = event->memcg;
 	__poll_t flags = key_to_poll(key);
 
-	if (flags & POLLHUP) {
+	if (flags & EPOLLHUP) {
 		/*
 		 * If the event has been detached at cgroup removal, we
 		 * can simply return knowing the other side will cleanup
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 42fe5653814ad..c7a33717d079b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2705,10 +2705,10 @@ static __poll_t swaps_poll(struct file *file, poll_table *wait)
 
 	if (seq->poll_event != atomic_read(&proc_poll_event)) {
 		seq->poll_event = atomic_read(&proc_poll_event);
-		return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
+		return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
 	}
 
-	return POLLIN | POLLRDNORM;
+	return EPOLLIN | EPOLLRDNORM;
 }
 
 /* iterator */
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index d6f7f7cb79c45..0cfba919d167d 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -240,7 +240,7 @@ p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt, int *err)
 	if (!ts) {
 		if (err)
 			*err = -EREMOTEIO;
-		return POLLERR;
+		return EPOLLERR;
 	}
 
 	if (!ts->rd->f_op->poll)
@@ -253,7 +253,7 @@ p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt, int *err)
 			n = DEFAULT_POLLMASK;
 		else
 			n = ts->wr->f_op->poll(ts->wr, pt);
-		ret = (ret & ~POLLOUT) | (n & ~POLLIN);
+		ret = (ret & ~EPOLLOUT) | (n & ~EPOLLIN);
 	}
 
 	return ret;
@@ -396,11 +396,11 @@ static void p9_read_work(struct work_struct *work)
 
 	if (!list_empty(&m->req_list)) {
 		if (test_and_clear_bit(Rpending, &m->wsched))
-			n = POLLIN;
+			n = EPOLLIN;
 		else
 			n = p9_fd_poll(m->client, NULL, NULL);
 
-		if ((n & POLLIN) && !test_and_set_bit(Rworksched, &m->wsched)) {
+		if ((n & EPOLLIN) && !test_and_set_bit(Rworksched, &m->wsched)) {
 			p9_debug(P9_DEBUG_TRANS, "sched read work %p\n", m);
 			schedule_work(&m->rq);
 		}
@@ -505,11 +505,11 @@ static void p9_write_work(struct work_struct *work)
 
 	if (m->wsize || !list_empty(&m->unsent_req_list)) {
 		if (test_and_clear_bit(Wpending, &m->wsched))
-			n = POLLOUT;
+			n = EPOLLOUT;
 		else
 			n = p9_fd_poll(m->client, NULL, NULL);
 
-		if ((n & POLLOUT) &&
+		if ((n & EPOLLOUT) &&
 		   !test_and_set_bit(Wworksched, &m->wsched)) {
 			p9_debug(P9_DEBUG_TRANS, "sched write work %p\n", m);
 			schedule_work(&m->wq);
@@ -599,12 +599,12 @@ static void p9_conn_create(struct p9_client *client)
 	init_poll_funcptr(&m->pt, p9_pollwait);
 
 	n = p9_fd_poll(client, &m->pt, NULL);
-	if (n & POLLIN) {
+	if (n & EPOLLIN) {
 		p9_debug(P9_DEBUG_TRANS, "mux %p can read\n", m);
 		set_bit(Rpending, &m->wsched);
 	}
 
-	if (n & POLLOUT) {
+	if (n & EPOLLOUT) {
 		p9_debug(P9_DEBUG_TRANS, "mux %p can write\n", m);
 		set_bit(Wpending, &m->wsched);
 	}
@@ -625,12 +625,12 @@ static void p9_poll_mux(struct p9_conn *m)
 		return;
 
 	n = p9_fd_poll(m->client, NULL, &err);
-	if (n & (POLLERR | POLLHUP | POLLNVAL)) {
+	if (n & (EPOLLERR | EPOLLHUP | EPOLLNVAL)) {
 		p9_debug(P9_DEBUG_TRANS, "error mux %p err %d\n", m, n);
 		p9_conn_cancel(m, err);
 	}
 
-	if (n & POLLIN) {
+	if (n & EPOLLIN) {
 		set_bit(Rpending, &m->wsched);
 		p9_debug(P9_DEBUG_TRANS, "mux %p can read\n", m);
 		if (!test_and_set_bit(Rworksched, &m->wsched)) {
@@ -639,7 +639,7 @@ static void p9_poll_mux(struct p9_conn *m)
 		}
 	}
 
-	if (n & POLLOUT) {
+	if (n & EPOLLOUT) {
 		set_bit(Wpending, &m->wsched);
 		p9_debug(P9_DEBUG_TRANS, "mux %p can write\n", m);
 		if ((m->wsize || !list_empty(&m->unsent_req_list)) &&
@@ -678,11 +678,11 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req)
 	spin_unlock(&client->lock);
 
 	if (test_and_clear_bit(Wpending, &m->wsched))
-		n = POLLOUT;
+		n = EPOLLOUT;
 	else
 		n = p9_fd_poll(m->client, NULL, NULL);
 
-	if (n & POLLOUT && !test_and_set_bit(Wworksched, &m->wsched))
+	if (n & EPOLLOUT && !test_and_set_bit(Wworksched, &m->wsched))
 		schedule_work(&m->wq);
 
 	return 0;
diff --git a/net/atm/common.c b/net/atm/common.c
index 6523f38c4957f..fc78a0508ae1b 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -661,15 +661,15 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
 
 	/* exceptional events */
 	if (sk->sk_err)
-		mask = POLLERR;
+		mask = EPOLLERR;
 
 	if (test_bit(ATM_VF_RELEASED, &vcc->flags) ||
 	    test_bit(ATM_VF_CLOSE, &vcc->flags))
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 
 	/* readable? */
 	if (!skb_queue_empty(&sk->sk_receive_queue))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* writable? */
 	if (sock->state == SS_CONNECTING &&
@@ -678,7 +678,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
 
 	if (vcc->qos.txtp.traffic_class != ATM_NONE &&
 	    vcc_writable(sk))
-		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
 
 	return mask;
 }
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index 581375d0eed26..e91f29c7c638a 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -304,7 +304,7 @@ static __poll_t batadv_socket_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &socket_client->queue_wait, wait);
 
 	if (socket_client->queue_len > 0)
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 
 	return 0;
 }
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index 9be74a44e99d0..dc9fa37ddd141 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -193,7 +193,7 @@ static __poll_t batadv_log_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &debug_log->queue_wait, wait);
 
 	if (!batadv_log_empty(debug_log))
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 
 	return 0;
 }
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index f897681780db7..84d92a0778343 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -431,7 +431,7 @@ static inline __poll_t bt_accept_poll(struct sock *parent)
 		if (sk->sk_state == BT_CONNECTED ||
 		    (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags) &&
 		     sk->sk_state == BT_CONNECT2))
-			return POLLIN | POLLRDNORM;
+			return EPOLLIN | EPOLLRDNORM;
 	}
 
 	return 0;
@@ -451,20 +451,20 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock,
 		return bt_accept_poll(sk);
 
 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
-		mask |= POLLERR |
-			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
+		mask |= EPOLLERR |
+			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
 
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
+		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
 
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 
 	if (!skb_queue_empty(&sk->sk_receive_queue))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	if (sk->sk_state == BT_CLOSED)
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 
 	if (sk->sk_state == BT_CONNECT ||
 			sk->sk_state == BT_CONNECT2 ||
@@ -472,7 +472,7 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock,
 		return mask;
 
 	if (!test_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags) && sock_writeable(sk))
-		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
 	else
 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index b109445a1df92..a6fb1b3bcad9b 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -924,7 +924,7 @@ static int caif_release(struct socket *sock)
 
 	caif_disconnect_client(sock_net(sk), &cf_sk->layer);
 	cf_sk->sk.sk_socket->state = SS_DISCONNECTING;
-	wake_up_interruptible_poll(sk_sleep(sk), POLLERR|POLLHUP);
+	wake_up_interruptible_poll(sk_sleep(sk), EPOLLERR|EPOLLHUP);
 
 	sock_orphan(sk);
 	sk_stream_kill_queues(&cf_sk->sk);
@@ -946,23 +946,23 @@ static __poll_t caif_poll(struct file *file,
 
 	/* exceptional events? */
 	if (sk->sk_err)
-		mask |= POLLERR;
+		mask |= EPOLLERR;
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		mask |= POLLRDHUP;
+		mask |= EPOLLRDHUP;
 
 	/* readable? */
 	if (!skb_queue_empty(&sk->sk_receive_queue) ||
 		(sk->sk_shutdown & RCV_SHUTDOWN))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/*
 	 * we set writable also when the other side has shut down the
 	 * connection. This prevents stuck sockets.
 	 */
 	if (sock_writeable(sk) && tx_flow_is_on(cf_sk))
-		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
 
 	return mask;
 }
diff --git a/net/core/datagram.c b/net/core/datagram.c
index b7d9293940b5a..9938952c5c78f 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -75,7 +75,7 @@ static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, i
 	/*
 	 * Avoid a wakeup if event not interesting for us
 	 */
-	if (key && !(key_to_poll(key) & (POLLIN | POLLERR)))
+	if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR)))
 		return 0;
 	return autoremove_wake_function(wait, mode, sync, key);
 }
@@ -842,22 +842,22 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,
 
 	/* exceptional events? */
 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
-		mask |= POLLERR |
-			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
+		mask |= EPOLLERR |
+			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
 
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
+		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 
 	/* readable? */
 	if (!skb_queue_empty(&sk->sk_receive_queue))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* Connection-based need to check for termination and startup */
 	if (connection_based(sk)) {
 		if (sk->sk_state == TCP_CLOSE)
-			mask |= POLLHUP;
+			mask |= EPOLLHUP;
 		/* connection hasn't started yet? */
 		if (sk->sk_state == TCP_SYN_SENT)
 			return mask;
@@ -865,7 +865,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,
 
 	/* writable? */
 	if (sock_writeable(sk))
-		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
 	else
 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
diff --git a/net/core/sock.c b/net/core/sock.c
index b026e1717df4e..c501499a04fe9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2619,7 +2619,7 @@ static void sock_def_error_report(struct sock *sk)
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
 	if (skwq_has_sleeper(wq))
-		wake_up_interruptible_poll(&wq->wait, POLLERR);
+		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
 	rcu_read_unlock();
 }
@@ -2631,8 +2631,8 @@ static void sock_def_readable(struct sock *sk)
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
 	if (skwq_has_sleeper(wq))
-		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
-						POLLRDNORM | POLLRDBAND);
+		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
+						EPOLLRDNORM | EPOLLRDBAND);
 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
 	rcu_read_unlock();
 }
@@ -2649,8 +2649,8 @@ static void sock_def_write_space(struct sock *sk)
 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
 		wq = rcu_dereference(sk->sk_wq);
 		if (skwq_has_sleeper(wq))
-			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
-						POLLWRNORM | POLLWRBAND);
+			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
+						EPOLLWRNORM | EPOLLWRBAND);
 
 		/* Should agree with poll, otherwise some programs break */
 		if (sock_writeable(sk))
diff --git a/net/core/stream.c b/net/core/stream.c
index 1cff9c6270c6b..7d329fb1f553a 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -38,8 +38,8 @@ void sk_stream_write_space(struct sock *sk)
 		rcu_read_lock();
 		wq = rcu_dereference(sk->sk_wq);
 		if (skwq_has_sleeper(wq))
-			wake_up_interruptible_poll(&wq->wait, POLLOUT |
-						POLLWRNORM | POLLWRBAND);
+			wake_up_interruptible_poll(&wq->wait, EPOLLOUT |
+						EPOLLWRNORM | EPOLLWRBAND);
 		if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
 			sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
 		rcu_read_unlock();
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 74685fecfdb96..15bdc002d90c0 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -338,21 +338,21 @@ __poll_t dccp_poll(struct file *file, struct socket *sock,
 
 	mask = 0;
 	if (sk->sk_err)
-		mask = POLLERR;
+		mask = EPOLLERR;
 
 	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		mask |= POLLIN | POLLRDNORM | POLLRDHUP;
+		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
 
 	/* Connected? */
 	if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
 		if (atomic_read(&sk->sk_rmem_alloc) > 0)
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 
 		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
 			if (sk_stream_is_writeable(sk)) {
-				mask |= POLLOUT | POLLWRNORM;
+				mask |= EPOLLOUT | EPOLLWRNORM;
 			} else {  /* send SIGIO later */
 				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
@@ -362,7 +362,7 @@ __poll_t dccp_poll(struct file *file, struct socket *sock,
 				 * IO signal will be lost.
 				 */
 				if (sk_stream_is_writeable(sk))
-					mask |= POLLOUT | POLLWRNORM;
+					mask |= EPOLLOUT | EPOLLWRNORM;
 			}
 		}
 	}
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index cc1b505453a8a..91dd09f798089 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1216,7 +1216,7 @@ static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table  *wai
 	__poll_t mask = datagram_poll(file, sock, wait);
 
 	if (!skb_queue_empty(&scp->other_receive_queue))
-		mask |= POLLRDBAND;
+		mask |= EPOLLRDBAND;
 
 	return mask;
 }
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c24008daa3d87..e4329e1619436 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -828,7 +828,7 @@ int inet_shutdown(struct socket *sock, int how)
 	case TCP_CLOSE:
 		err = -ENOTCONN;
 		/* Hack to wake up other listeners, who can poll for
-		   POLLHUP, even on eg. unconnected UDP sockets -- RR */
+		   EPOLLHUP, even on eg. unconnected UDP sockets -- RR */
 		/* fall through */
 	default:
 		sk->sk_shutdown |= how;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c059aa7df0a94..48636aee23c31 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -512,36 +512,36 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	mask = 0;
 
 	/*
-	 * POLLHUP is certainly not done right. But poll() doesn't
+	 * EPOLLHUP is certainly not done right. But poll() doesn't
 	 * have a notion of HUP in just one direction, and for a
 	 * socket the read side is more interesting.
 	 *
-	 * Some poll() documentation says that POLLHUP is incompatible
-	 * with the POLLOUT/POLLWR flags, so somebody should check this
+	 * Some poll() documentation says that EPOLLHUP is incompatible
+	 * with the EPOLLOUT/POLLWR flags, so somebody should check this
 	 * all. But careful, it tends to be safer to return too many
 	 * bits than too few, and you can easily break real applications
 	 * if you don't tell them that something has hung up!
 	 *
 	 * Check-me.
 	 *
-	 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
+	 * Check number 1. EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and
 	 * our fs/select.c). It means that after we received EOF,
 	 * poll always returns immediately, making impossible poll() on write()
-	 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
+	 * in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP
 	 * if and only if shutdown has been made in both directions.
 	 * Actually, it is interesting to look how Solaris and DUX
-	 * solve this dilemma. I would prefer, if POLLHUP were maskable,
+	 * solve this dilemma. I would prefer, if EPOLLHUP were maskable,
 	 * then we could set it on SND_SHUTDOWN. BTW examples given
 	 * in Stevens' books assume exactly this behaviour, it explains
-	 * why POLLHUP is incompatible with POLLOUT.	--ANK
+	 * why EPOLLHUP is incompatible with EPOLLOUT.	--ANK
 	 *
 	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
 	 * blocking on fresh not-connected or disconnected socket. --ANK
 	 */
 	if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		mask |= POLLIN | POLLRDNORM | POLLRDHUP;
+		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
 
 	/* Connected or passive Fast Open socket? */
 	if (state != TCP_SYN_SENT &&
@@ -554,11 +554,11 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 			target++;
 
 		if (tp->rcv_nxt - tp->copied_seq >= target)
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 
 		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
 			if (sk_stream_is_writeable(sk)) {
-				mask |= POLLOUT | POLLWRNORM;
+				mask |= EPOLLOUT | EPOLLWRNORM;
 			} else {  /* send SIGIO later */
 				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
@@ -570,24 +570,24 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 				 */
 				smp_mb__after_atomic();
 				if (sk_stream_is_writeable(sk))
-					mask |= POLLOUT | POLLWRNORM;
+					mask |= EPOLLOUT | EPOLLWRNORM;
 			}
 		} else
-			mask |= POLLOUT | POLLWRNORM;
+			mask |= EPOLLOUT | EPOLLWRNORM;
 
 		if (tp->urg_data & TCP_URG_VALID)
-			mask |= POLLPRI;
+			mask |= EPOLLPRI;
 	} else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
 		/* Active TCP fastopen socket with defer_connect
-		 * Return POLLOUT so application can call write()
+		 * Return EPOLLOUT so application can call write()
 		 * in order for kernel to generate SYN+data
 		 */
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 	}
 	/* This barrier is coupled with smp_wmb() in tcp_reset() */
 	smp_rmb();
 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
-		mask |= POLLERR;
+		mask |= EPOLLERR;
 
 	return mask;
 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index cfa51cfd2d999..575d3c1fb6e83 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -315,7 +315,7 @@ static void tcp_sndbuf_expand(struct sock *sk)
 
 	/* Fast Recovery (RFC 5681 3.2) :
 	 * Cubic needs 1.7 factor, rounded to 2 to include
-	 * extra cushion (application might react slowly to POLLOUT)
+	 * extra cushion (application might react slowly to EPOLLOUT)
 	 */
 	sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
 	sndmem *= nr_segs * per_mss;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f81f969f9c065..bfaefe560b5ce 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2501,12 +2501,12 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	struct sock *sk = sock->sk;
 
 	if (!skb_queue_empty(&udp_sk(sk)->reader_queue))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* Check for false positives due to checksum errors */
-	if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
+	if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
 	    !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
-		mask &= ~(POLLIN | POLLRDNORM);
+		mask &= ~(EPOLLIN | EPOLLRDNORM);
 
 	return mask;
 
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 64331158d6938..1e8cc7bcbca3a 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1483,7 +1483,7 @@ static inline __poll_t iucv_accept_poll(struct sock *parent)
 		sk = (struct sock *) isk;
 
 		if (sk->sk_state == IUCV_CONNECTED)
-			return POLLIN | POLLRDNORM;
+			return EPOLLIN | EPOLLRDNORM;
 	}
 
 	return 0;
@@ -1501,27 +1501,27 @@ __poll_t iucv_sock_poll(struct file *file, struct socket *sock,
 		return iucv_accept_poll(sk);
 
 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
-		mask |= POLLERR |
-			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
+		mask |= EPOLLERR |
+			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
 
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		mask |= POLLRDHUP;
+		mask |= EPOLLRDHUP;
 
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 
 	if (!skb_queue_empty(&sk->sk_receive_queue) ||
 	    (sk->sk_shutdown & RCV_SHUTDOWN))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	if (sk->sk_state == IUCV_CLOSED)
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 
 	if (sk->sk_state == IUCV_DISCONN)
-		mask |= POLLIN;
+		mask |= EPOLLIN;
 
 	if (sock_writeable(sk) && iucv_below_msglim(sk))
-		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
 	else
 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 4a8d407f89022..f297d53a11aa0 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -396,8 +396,8 @@ static int kcm_read_sock_done(struct strparser *strp, int err)
 
 static void psock_state_change(struct sock *sk)
 {
-	/* TCP only does a POLLIN for a half close. Do a POLLHUP here
-	 * since application will normally not poll with POLLIN
+	/* TCP only does a EPOLLIN for a half close. Do a EPOLLHUP here
+	 * since application will normally not poll with EPOLLIN
 	 * on the TCP sockets.
 	 */
 
@@ -1338,7 +1338,7 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
 
 	/* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so
 	 * we set sk_state, otherwise epoll_wait always returns right away with
-	 * POLLHUP
+	 * EPOLLHUP
 	 */
 	kcm->sk.sk_state = TCP_ESTABLISHED;
 
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index 985909f105eb7..376040092142d 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -543,7 +543,7 @@ static inline __poll_t llcp_accept_poll(struct sock *parent)
 		sk = &llcp_sock->sk;
 
 		if (sk->sk_state == LLCP_CONNECTED)
-			return POLLIN | POLLRDNORM;
+			return EPOLLIN | EPOLLRDNORM;
 	}
 
 	return 0;
@@ -563,23 +563,23 @@ static __poll_t llcp_sock_poll(struct file *file, struct socket *sock,
 		return llcp_accept_poll(sk);
 
 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
-		mask |= POLLERR |
-			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
+		mask |= EPOLLERR |
+			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
 
 	if (!skb_queue_empty(&sk->sk_receive_queue))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	if (sk->sk_state == LLCP_CLOSED)
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
+		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
 
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 
 	if (sock_writeable(sk) && sk->sk_state == LLCP_CONNECTED)
-		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
 	else
 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 1d1483007e461..e0f3f4aeeb4fb 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -4085,7 +4085,7 @@ static __poll_t packet_poll(struct file *file, struct socket *sock,
 	if (po->rx_ring.pg_vec) {
 		if (!packet_previous_rx_frame(po, &po->rx_ring,
 			TP_STATUS_KERNEL))
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 	}
 	if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
 		po->pressure = 0;
@@ -4093,7 +4093,7 @@ static __poll_t packet_poll(struct file *file, struct socket *sock,
 	spin_lock_bh(&sk->sk_write_queue.lock);
 	if (po->tx_ring.pg_vec) {
 		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
-			mask |= POLLOUT | POLLWRNORM;
+			mask |= EPOLLOUT | EPOLLWRNORM;
 	}
 	spin_unlock_bh(&sk->sk_write_queue.lock);
 	return mask;
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index 08f6751d2030c..fffcd69f63ff4 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -351,18 +351,18 @@ static __poll_t pn_socket_poll(struct file *file, struct socket *sock,
 	poll_wait(file, sk_sleep(sk), wait);
 
 	if (sk->sk_state == TCP_CLOSE)
-		return POLLERR;
+		return EPOLLERR;
 	if (!skb_queue_empty(&sk->sk_receive_queue))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	if (!skb_queue_empty(&pn->ctrlreq_queue))
-		mask |= POLLPRI;
+		mask |= EPOLLPRI;
 	if (!mask && sk->sk_state == TCP_CLOSE_WAIT)
-		return POLLHUP;
+		return EPOLLHUP;
 
 	if (sk->sk_state == TCP_ESTABLISHED &&
 		refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf &&
 		atomic_read(&pn->tx_credits))
-		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
 
 	return mask;
 }
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 88aa8ad0f5b67..744c637c86b01 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -137,17 +137,17 @@ static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
 
 /*
  * RDS' poll is without a doubt the least intuitive part of the interface,
- * as POLLIN and POLLOUT do not behave entirely as you would expect from
+ * as EPOLLIN and EPOLLOUT do not behave entirely as you would expect from
  * a network protocol.
  *
- * POLLIN is asserted if
+ * EPOLLIN is asserted if
  *  -	there is data on the receive queue.
  *  -	to signal that a previously congested destination may have become
  *	uncongested
  *  -	A notification has been queued to the socket (this can be a congestion
  *	update, or a RDMA completion).
  *
- * POLLOUT is asserted if there is room on the send queue. This does not mean
+ * EPOLLOUT is asserted if there is room on the send queue. This does not mean
  * however, that the next sendmsg() call will succeed. If the application tries
  * to send to a congested destination, the system call may still fail (and
  * return ENOBUFS).
@@ -167,22 +167,22 @@ static __poll_t rds_poll(struct file *file, struct socket *sock,
 
 	read_lock_irqsave(&rs->rs_recv_lock, flags);
 	if (!rs->rs_cong_monitor) {
-		/* When a congestion map was updated, we signal POLLIN for
+		/* When a congestion map was updated, we signal EPOLLIN for
 		 * "historical" reasons. Applications can also poll for
 		 * WRBAND instead. */
 		if (rds_cong_updated_since(&rs->rs_cong_track))
-			mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
+			mask |= (EPOLLIN | EPOLLRDNORM | EPOLLWRBAND);
 	} else {
 		spin_lock(&rs->rs_lock);
 		if (rs->rs_cong_notify)
-			mask |= (POLLIN | POLLRDNORM);
+			mask |= (EPOLLIN | EPOLLRDNORM);
 		spin_unlock(&rs->rs_lock);
 	}
 	if (!list_empty(&rs->rs_recv_queue) ||
 	    !list_empty(&rs->rs_notify_queue))
-		mask |= (POLLIN | POLLRDNORM);
+		mask |= (EPOLLIN | EPOLLRDNORM);
 	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
-		mask |= (POLLOUT | POLLWRNORM);
+		mask |= (EPOLLOUT | EPOLLWRNORM);
 	read_unlock_irqrestore(&rs->rs_recv_lock, flags);
 
 	/* clear state any time we wake a seen-congested socket */
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 124c77e9d0582..59d0eb9602758 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -1142,13 +1142,13 @@ static int rfkill_fop_open(struct inode *inode, struct file *file)
 static __poll_t rfkill_fop_poll(struct file *file, poll_table *wait)
 {
 	struct rfkill_data *data = file->private_data;
-	__poll_t res = POLLOUT | POLLWRNORM;
+	__poll_t res = EPOLLOUT | EPOLLWRNORM;
 
 	poll_wait(file, &data->read_wait, wait);
 
 	mutex_lock(&data->mtx);
 	if (!list_empty(&data->events))
-		res = POLLIN | POLLRDNORM;
+		res = EPOLLIN | EPOLLRDNORM;
 	mutex_unlock(&data->mtx);
 
 	return res;
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 21ad6a3a465ce..0c9c18aa7c77b 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -742,13 +742,13 @@ static __poll_t rxrpc_poll(struct file *file, struct socket *sock,
 	/* the socket is readable if there are any messages waiting on the Rx
 	 * queue */
 	if (!list_empty(&rx->recvmsg_q))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* the socket is writable if there is space to add new data to the
 	 * socket; there is no guarantee that any particular call in progress
 	 * on the socket may have space in the Tx ACK window */
 	if (rxrpc_writable(sk))
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 
 	return mask;
 }
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index ebb8cb9eb0bd8..bf271f8c2dc9b 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7602,22 +7602,22 @@ __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	 */
 	if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))
 		return (!list_empty(&sp->ep->asocs)) ?
-			(POLLIN | POLLRDNORM) : 0;
+			(EPOLLIN | EPOLLRDNORM) : 0;
 
 	mask = 0;
 
 	/* Is there any exceptional events?  */
 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
-		mask |= POLLERR |
-			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
+		mask |= EPOLLERR |
+			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
+		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 
 	/* Is it readable?  Reconsider this code with TCP-style support.  */
 	if (!skb_queue_empty(&sk->sk_receive_queue))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* The association is either gone or not ready.  */
 	if (!sctp_style(sk, UDP) && sctp_sstate(sk, CLOSED))
@@ -7625,7 +7625,7 @@ __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
 
 	/* Is it writable?  */
 	if (sctp_writeable(sk)) {
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 	} else {
 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 		/*
@@ -7637,7 +7637,7 @@ __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
 		 * in the following code to cover it as well.
 		 */
 		if (sctp_writeable(sk))
-			mask |= POLLOUT | POLLWRNORM;
+			mask |= EPOLLOUT | EPOLLWRNORM;
 	}
 	return mask;
 }
@@ -8161,8 +8161,8 @@ void sctp_data_ready(struct sock *sk)
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
 	if (skwq_has_sleeper(wq))
-		wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
-						POLLRDNORM | POLLRDBAND);
+		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN |
+						EPOLLRDNORM | EPOLLRDBAND);
 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
 	rcu_read_unlock();
 }
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index ba4b84debc5a0..da1a5cdefd13e 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1145,7 +1145,7 @@ static __poll_t smc_accept_poll(struct sock *parent)
 
 	spin_lock(&isk->accept_q_lock);
 	if (!list_empty(&isk->accept_q))
-		mask = POLLIN | POLLRDNORM;
+		mask = EPOLLIN | EPOLLRDNORM;
 	spin_unlock(&isk->accept_q_lock);
 
 	return mask;
@@ -1160,7 +1160,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
 	int rc;
 
 	if (!sk)
-		return POLLNVAL;
+		return EPOLLNVAL;
 
 	smc = smc_sk(sock->sk);
 	sock_hold(sk);
@@ -1171,16 +1171,16 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
 		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
 		/* if non-blocking connect finished ... */
 		lock_sock(sk);
-		if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
+		if ((sk->sk_state == SMC_INIT) && (mask & EPOLLOUT)) {
 			sk->sk_err = smc->clcsock->sk->sk_err;
 			if (sk->sk_err) {
-				mask |= POLLERR;
+				mask |= EPOLLERR;
 			} else {
 				rc = smc_connect_rdma(smc);
 				if (rc < 0)
-					mask |= POLLERR;
+					mask |= EPOLLERR;
 				/* success cases including fallback */
-				mask |= POLLOUT | POLLWRNORM;
+				mask |= EPOLLOUT | EPOLLWRNORM;
 			}
 		}
 	} else {
@@ -1190,27 +1190,27 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
 			lock_sock(sk);
 		}
 		if (sk->sk_err)
-			mask |= POLLERR;
+			mask |= EPOLLERR;
 		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
 		    (sk->sk_state == SMC_CLOSED))
-			mask |= POLLHUP;
+			mask |= EPOLLHUP;
 		if (sk->sk_state == SMC_LISTEN) {
 			/* woken up by sk_data_ready in smc_listen_work() */
 			mask = smc_accept_poll(sk);
 		} else {
 			if (atomic_read(&smc->conn.sndbuf_space) ||
 			    sk->sk_shutdown & SEND_SHUTDOWN) {
-				mask |= POLLOUT | POLLWRNORM;
+				mask |= EPOLLOUT | EPOLLWRNORM;
 			} else {
 				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 			}
 			if (atomic_read(&smc->conn.bytes_to_rcv))
-				mask |= POLLIN | POLLRDNORM;
+				mask |= EPOLLIN | EPOLLRDNORM;
 			if (sk->sk_shutdown & RCV_SHUTDOWN)
-				mask |= POLLIN | POLLRDNORM | POLLRDHUP;
+				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
 			if (sk->sk_state == SMC_APPCLOSEWAIT1)
-				mask |= POLLIN;
+				mask |= EPOLLIN;
 		}
 
 	}
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index 9dc392ca06bfb..eff4e0d0bb310 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -35,8 +35,8 @@ static void smc_rx_data_ready(struct sock *sk)
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
 	if (skwq_has_sleeper(wq))
-		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
-						POLLRDNORM | POLLRDBAND);
+		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
+						EPOLLRDNORM | EPOLLRDBAND);
 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
 	if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
 	    (sk->sk_state == SMC_CLOSED))
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 838bce20c3610..72f004c9c9b13 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -46,8 +46,8 @@ static void smc_tx_write_space(struct sock *sk)
 		wq = rcu_dereference(sk->sk_wq);
 		if (skwq_has_sleeper(wq))
 			wake_up_interruptible_poll(&wq->wait,
-						   POLLOUT | POLLWRNORM |
-						   POLLWRBAND);
+						   EPOLLOUT | EPOLLWRNORM |
+						   EPOLLWRBAND);
 		if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
 			sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
 		rcu_read_unlock();
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index aa36dad32db16..8a7e1c774f9c5 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -940,7 +940,7 @@ static __poll_t cache_poll(struct file *filp, poll_table *wait,
 	poll_wait(filp, &queue_wait, wait);
 
 	/* alway allow write */
-	mask = POLLOUT | POLLWRNORM;
+	mask = EPOLLOUT | EPOLLWRNORM;
 
 	if (!rp)
 		return mask;
@@ -950,7 +950,7 @@ static __poll_t cache_poll(struct file *filp, poll_table *wait,
 	for (cq= &rp->q; &cq->list != &cd->queue;
 	     cq = list_entry(cq->list.next, struct cache_queue, list))
 		if (!cq->reader) {
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 			break;
 		}
 	spin_unlock(&queue_lock);
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 5c43303257873..fc97fc3ed637a 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -345,15 +345,15 @@ rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait)
 {
 	struct inode *inode = file_inode(filp);
 	struct rpc_inode *rpci = RPC_I(inode);
-	__poll_t mask = POLLOUT | POLLWRNORM;
+	__poll_t mask = EPOLLOUT | EPOLLWRNORM;
 
 	poll_wait(filp, &rpci->waitq, wait);
 
 	inode_lock(inode);
 	if (rpci->pipe == NULL)
-		mask |= POLLERR | POLLHUP;
+		mask |= EPOLLERR | EPOLLHUP;
 	else if (filp->private_data || !list_empty(&rpci->pipe->pipe))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	inode_unlock(inode);
 	return mask;
 }
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 163f3a5475014..b0323ec7971ed 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -721,31 +721,31 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock,
 	sock_poll_wait(file, sk_sleep(sk), wait);
 
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		revents |= POLLRDHUP | POLLIN | POLLRDNORM;
+		revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
-		revents |= POLLHUP;
+		revents |= EPOLLHUP;
 
 	switch (sk->sk_state) {
 	case TIPC_ESTABLISHED:
 	case TIPC_CONNECTING:
 		if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))
-			revents |= POLLOUT;
+			revents |= EPOLLOUT;
 		/* fall thru' */
 	case TIPC_LISTEN:
 		if (!skb_queue_empty(&sk->sk_receive_queue))
-			revents |= POLLIN | POLLRDNORM;
+			revents |= EPOLLIN | EPOLLRDNORM;
 		break;
 	case TIPC_OPEN:
 		if (tsk->group_is_open && !tsk->cong_link_cnt)
-			revents |= POLLOUT;
+			revents |= EPOLLOUT;
 		if (!tipc_sk_type_connectionless(sk))
 			break;
 		if (skb_queue_empty(&sk->sk_receive_queue))
 			break;
-		revents |= POLLIN | POLLRDNORM;
+		revents |= EPOLLIN | EPOLLRDNORM;
 		break;
 	case TIPC_DISCONNECTING:
-		revents = POLLIN | POLLRDNORM | POLLHUP;
+		revents = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
 		break;
 	}
 	return revents;
@@ -1897,8 +1897,8 @@ static void tipc_write_space(struct sock *sk)
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
 	if (skwq_has_sleeper(wq))
-		wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
-						POLLWRNORM | POLLWRBAND);
+		wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
+						EPOLLWRNORM | EPOLLWRBAND);
 	rcu_read_unlock();
 }
 
@@ -1914,8 +1914,8 @@ static void tipc_data_ready(struct sock *sk)
 	rcu_read_lock();
 	wq = rcu_dereference(sk->sk_wq);
 	if (skwq_has_sleeper(wq))
-		wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
-						POLLRDNORM | POLLRDBAND);
+		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN |
+						EPOLLRDNORM | EPOLLRDBAND);
 	rcu_read_unlock();
 }
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 0214acbd6bffb..d545e1d0dea22 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -415,9 +415,9 @@ static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
 {
 	unix_dgram_peer_wake_disconnect(sk, other);
 	wake_up_interruptible_poll(sk_sleep(sk),
-				   POLLOUT |
-				   POLLWRNORM |
-				   POLLWRBAND);
+				   EPOLLOUT |
+				   EPOLLWRNORM |
+				   EPOLLWRBAND);
 }
 
 /* preconditions:
@@ -454,7 +454,7 @@ static void unix_write_space(struct sock *sk)
 		wq = rcu_dereference(sk->sk_wq);
 		if (skwq_has_sleeper(wq))
 			wake_up_interruptible_sync_poll(&wq->wait,
-				POLLOUT | POLLWRNORM | POLLWRBAND);
+				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 	rcu_read_unlock();
@@ -2129,8 +2129,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 
 	if (wq_has_sleeper(&u->peer_wait))
 		wake_up_interruptible_sync_poll(&u->peer_wait,
-						POLLOUT | POLLWRNORM |
-						POLLWRBAND);
+						EPOLLOUT | EPOLLWRNORM |
+						EPOLLWRBAND);
 
 	if (msg->msg_name)
 		unix_copy_addr(msg, skb->sk);
@@ -2650,27 +2650,27 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa
 
 	/* exceptional events? */
 	if (sk->sk_err)
-		mask |= POLLERR;
+		mask |= EPOLLERR;
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
+		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
 
 	/* readable? */
 	if (!skb_queue_empty(&sk->sk_receive_queue))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* Connection-based need to check for termination and startup */
 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
 	    sk->sk_state == TCP_CLOSE)
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 
 	/*
 	 * we set writable also when the other side has shut down the
 	 * connection. This prevents stuck sockets.
 	 */
 	if (unix_writable(sk))
-		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
 
 	return mask;
 }
@@ -2687,29 +2687,29 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
 
 	/* exceptional events? */
 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
-		mask |= POLLERR |
-			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
+		mask |= EPOLLERR |
+			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
 
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
+		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 
 	/* readable? */
 	if (!skb_queue_empty(&sk->sk_receive_queue))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	/* Connection-based need to check for termination and startup */
 	if (sk->sk_type == SOCK_SEQPACKET) {
 		if (sk->sk_state == TCP_CLOSE)
-			mask |= POLLHUP;
+			mask |= EPOLLHUP;
 		/* connection hasn't started yet? */
 		if (sk->sk_state == TCP_SYN_SENT)
 			return mask;
 	}
 
 	/* No write status requested, avoid expensive OUT tests. */
-	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
+	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
 		return mask;
 
 	writable = unix_writable(sk);
@@ -2726,7 +2726,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
 	}
 
 	if (writable)
-		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
 	else
 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 9d95e773f4c8c..e0fc84daed944 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -865,20 +865,20 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
 
 	if (sk->sk_err)
 		/* Signify that there has been an error on this socket. */
-		mask |= POLLERR;
+		mask |= EPOLLERR;
 
 	/* INET sockets treat local write shutdown and peer write shutdown as a
-	 * case of POLLHUP set.
+	 * case of EPOLLHUP set.
 	 */
 	if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
 	    ((sk->sk_shutdown & SEND_SHUTDOWN) &&
 	     (vsk->peer_shutdown & SEND_SHUTDOWN))) {
-		mask |= POLLHUP;
+		mask |= EPOLLHUP;
 	}
 
 	if (sk->sk_shutdown & RCV_SHUTDOWN ||
 	    vsk->peer_shutdown & SEND_SHUTDOWN) {
-		mask |= POLLRDHUP;
+		mask |= EPOLLRDHUP;
 	}
 
 	if (sock->type == SOCK_DGRAM) {
@@ -888,11 +888,11 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
 		 */
 		if (!skb_queue_empty(&sk->sk_receive_queue) ||
 		    (sk->sk_shutdown & RCV_SHUTDOWN)) {
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 		}
 
 		if (!(sk->sk_shutdown & SEND_SHUTDOWN))
-			mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+			mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
 
 	} else if (sock->type == SOCK_STREAM) {
 		lock_sock(sk);
@@ -902,7 +902,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
 		 */
 		if (sk->sk_state == TCP_LISTEN
 		    && !vsock_is_accept_queue_empty(sk))
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 
 		/* If there is something in the queue then we can read. */
 		if (transport->stream_is_active(vsk) &&
@@ -911,10 +911,10 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
 			int ret = transport->notify_poll_in(
 					vsk, 1, &data_ready_now);
 			if (ret < 0) {
-				mask |= POLLERR;
+				mask |= EPOLLERR;
 			} else {
 				if (data_ready_now)
-					mask |= POLLIN | POLLRDNORM;
+					mask |= EPOLLIN | EPOLLRDNORM;
 
 			}
 		}
@@ -925,7 +925,7 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
 		 */
 		if (sk->sk_shutdown & RCV_SHUTDOWN ||
 		    vsk->peer_shutdown & SEND_SHUTDOWN) {
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 		}
 
 		/* Connected sockets that can produce data can be written. */
@@ -935,25 +935,25 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
 				int ret = transport->notify_poll_out(
 						vsk, 1, &space_avail_now);
 				if (ret < 0) {
-					mask |= POLLERR;
+					mask |= EPOLLERR;
 				} else {
 					if (space_avail_now)
-						/* Remove POLLWRBAND since INET
+						/* Remove EPOLLWRBAND since INET
 						 * sockets are not setting it.
 						 */
-						mask |= POLLOUT | POLLWRNORM;
+						mask |= EPOLLOUT | EPOLLWRNORM;
 
 				}
 			}
 		}
 
 		/* Simulate INET socket poll behaviors, which sets
-		 * POLLOUT|POLLWRNORM when peer is closed and nothing to read,
+		 * EPOLLOUT|EPOLLWRNORM when peer is closed and nothing to read,
 		 * but local send is not shutdown.
 		 */
 		if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_CLOSING) {
 			if (!(sk->sk_shutdown & SEND_SHUTDOWN))
-				mask |= POLLOUT | POLLWRNORM;
+				mask |= EPOLLOUT | EPOLLWRNORM;
 
 		}
 
diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c
index 4d202b73a0e11..a9428daa69f30 100644
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -580,7 +580,7 @@ static __poll_t ns_revision_poll(struct file *file, poll_table *pt)
 		mutex_lock_nested(&rev->ns->lock, rev->ns->level);
 		poll_wait(file, &rev->ns->wait, pt);
 		if (rev->last_read < rev->ns->revision)
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 		mutex_unlock(&rev->ns->lock);
 	}
 
diff --git a/security/tomoyo/audit.c b/security/tomoyo/audit.c
index 558e3076d38c1..479b03a7a17ef 100644
--- a/security/tomoyo/audit.c
+++ b/security/tomoyo/audit.c
@@ -456,14 +456,14 @@ void tomoyo_read_log(struct tomoyo_io_buffer *head)
  * @file: Pointer to "struct file".
  * @wait: Pointer to "poll_table". Maybe NULL.
  *
- * Returns POLLIN | POLLRDNORM when ready to read an audit log.
+ * Returns EPOLLIN | EPOLLRDNORM when ready to read an audit log.
  */
 __poll_t tomoyo_poll_log(struct file *file, poll_table *wait)
 {
 	if (tomoyo_log_count)
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	poll_wait(file, &tomoyo_log_wait, wait);
 	if (tomoyo_log_count)
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	return 0;
 }
diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c
index 70c73bf66c88f..03923a138ef54 100644
--- a/security/tomoyo/common.c
+++ b/security/tomoyo/common.c
@@ -2116,17 +2116,17 @@ static struct tomoyo_domain_info *tomoyo_find_domain_by_qid
  * @file: Pointer to "struct file".
  * @wait: Pointer to "poll_table".
  *
- * Returns POLLIN | POLLRDNORM when ready to read, 0 otherwise.
+ * Returns EPOLLIN | EPOLLRDNORM when ready to read, 0 otherwise.
  *
  * Waits for access requests which violated policy in enforcing mode.
  */
 static __poll_t tomoyo_poll_query(struct file *file, poll_table *wait)
 {
 	if (!list_empty(&tomoyo_query_list))
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	poll_wait(file, &tomoyo_query_wait, wait);
 	if (!list_empty(&tomoyo_query_list))
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 	return 0;
 }
 
@@ -2450,15 +2450,15 @@ int tomoyo_open_control(const u8 type, struct file *file)
  * @file: Pointer to "struct file".
  * @wait: Pointer to "poll_table". Maybe NULL.
  *
- * Returns POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM if ready to read/write,
- * POLLOUT | POLLWRNORM otherwise.
+ * Returns EPOLLIN | EPOLLRDNORM | EPOLLOUT | EPOLLWRNORM if ready to read/write,
+ * EPOLLOUT | EPOLLWRNORM otherwise.
  */
 __poll_t tomoyo_poll_control(struct file *file, poll_table *wait)
 {
 	struct tomoyo_io_buffer *head = file->private_data;
 	if (head->poll)
-		return head->poll(file, wait) | POLLOUT | POLLWRNORM;
-	return POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM;
+		return head->poll(file, wait) | EPOLLOUT | EPOLLWRNORM;
+	return EPOLLIN | EPOLLRDNORM | EPOLLOUT | EPOLLWRNORM;
 }
 
 /**
diff --git a/security/tomoyo/securityfs_if.c b/security/tomoyo/securityfs_if.c
index fb9bf99deb35d..1d3d7e7a1f055 100644
--- a/security/tomoyo/securityfs_if.c
+++ b/security/tomoyo/securityfs_if.c
@@ -154,8 +154,8 @@ static int tomoyo_release(struct inode *inode, struct file *file)
  * @file: Pointer to "struct file".
  * @wait: Pointer to "poll_table". Maybe NULL.
  *
- * Returns POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM if ready to read/write,
- * POLLOUT | POLLWRNORM otherwise.
+ * Returns EPOLLIN | EPOLLRDNORM | EPOLLOUT | EPOLLWRNORM if ready to read/write,
+ * EPOLLOUT | EPOLLWRNORM otherwise.
  */
 static __poll_t tomoyo_poll(struct file *file, poll_table *wait)
 {
diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c
index a12b9555e9107..4563432badba5 100644
--- a/sound/core/compress_offload.c
+++ b/sound/core/compress_offload.c
@@ -399,9 +399,9 @@ static int snd_compr_mmap(struct file *f, struct vm_area_struct *vma)
 static __poll_t snd_compr_get_poll(struct snd_compr_stream *stream)
 {
 	if (stream->direction == SND_COMPRESS_PLAYBACK)
-		return POLLOUT | POLLWRNORM;
+		return EPOLLOUT | EPOLLWRNORM;
 	else
-		return POLLIN | POLLRDNORM;
+		return EPOLLIN | EPOLLRDNORM;
 }
 
 static __poll_t snd_compr_poll(struct file *f, poll_table *wait)
@@ -412,7 +412,7 @@ static __poll_t snd_compr_poll(struct file *f, poll_table *wait)
 	__poll_t retval = 0;
 
 	if (snd_BUG_ON(!data))
-		return POLLERR;
+		return EPOLLERR;
 
 	stream = &data->stream;
 
@@ -421,7 +421,7 @@ static __poll_t snd_compr_poll(struct file *f, poll_table *wait)
 	switch (stream->runtime->state) {
 	case SNDRV_PCM_STATE_OPEN:
 	case SNDRV_PCM_STATE_XRUN:
-		retval = snd_compr_get_poll(stream) | POLLERR;
+		retval = snd_compr_get_poll(stream) | EPOLLERR;
 		goto out;
 	default:
 		break;
@@ -447,7 +447,7 @@ static __poll_t snd_compr_poll(struct file *f, poll_table *wait)
 			retval = snd_compr_get_poll(stream);
 		break;
 	default:
-		retval = snd_compr_get_poll(stream) | POLLERR;
+		retval = snd_compr_get_poll(stream) | EPOLLERR;
 		break;
 	}
 out:
diff --git a/sound/core/control.c b/sound/core/control.c
index 50fa16022f1fe..0b3026d937b10 100644
--- a/sound/core/control.c
+++ b/sound/core/control.c
@@ -1679,7 +1679,7 @@ static __poll_t snd_ctl_poll(struct file *file, poll_table * wait)
 
 	mask = 0;
 	if (!list_empty(&ctl->events))
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 
 	return mask;
 }
diff --git a/sound/core/info.c b/sound/core/info.c
index aa86f3f8e0565..4b36767af9e15 100644
--- a/sound/core/info.c
+++ b/sound/core/info.c
@@ -214,9 +214,9 @@ static __poll_t snd_info_entry_poll(struct file *file, poll_table *wait)
 					  data->file_private_data,
 					  file, wait);
 	if (entry->c.ops->read)
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	if (entry->c.ops->write)
-		mask |= POLLOUT | POLLWRNORM;
+		mask |= EPOLLOUT | EPOLLWRNORM;
 	return mask;
 }
 
diff --git a/sound/core/init.c b/sound/core/init.c
index 8753440c3a6ea..4fa5dd955740a 100644
--- a/sound/core/init.c
+++ b/sound/core/init.c
@@ -346,7 +346,7 @@ static int snd_disconnect_release(struct inode *inode, struct file *file)
 
 static __poll_t snd_disconnect_poll(struct file * file, poll_table * wait)
 {
-	return POLLERR | POLLNVAL;
+	return EPOLLERR | EPOLLNVAL;
 }
 
 static long snd_disconnect_ioctl(struct file *file,
diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c
index 3ebba9c7f86e0..b044c0a5a674b 100644
--- a/sound/core/oss/pcm_oss.c
+++ b/sound/core/oss/pcm_oss.c
@@ -2705,7 +2705,7 @@ static __poll_t snd_pcm_oss_poll(struct file *file, poll_table * wait)
 		if (runtime->status->state != SNDRV_PCM_STATE_DRAINING &&
 		    (runtime->status->state != SNDRV_PCM_STATE_RUNNING ||
 		     snd_pcm_oss_playback_ready(psubstream)))
-			mask |= POLLOUT | POLLWRNORM;
+			mask |= EPOLLOUT | EPOLLWRNORM;
 		snd_pcm_stream_unlock_irq(psubstream);
 	}
 	if (csubstream != NULL) {
@@ -2715,7 +2715,7 @@ static __poll_t snd_pcm_oss_poll(struct file *file, poll_table * wait)
 		snd_pcm_stream_lock_irq(csubstream);
 		if ((ostate = runtime->status->state) != SNDRV_PCM_STATE_RUNNING ||
 		    snd_pcm_oss_capture_ready(csubstream))
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 		snd_pcm_stream_unlock_irq(csubstream);
 		if (ostate != SNDRV_PCM_STATE_RUNNING && runtime->oss.trigger) {
 			struct snd_pcm_oss_file ofile;
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 51104df924e10..77ba50ddcf9e3 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -3147,7 +3147,7 @@ static __poll_t snd_pcm_playback_poll(struct file *file, poll_table * wait)
 
 	substream = pcm_file->substream;
 	if (PCM_RUNTIME_CHECK(substream))
-		return POLLOUT | POLLWRNORM | POLLERR;
+		return EPOLLOUT | EPOLLWRNORM | EPOLLERR;
 	runtime = substream->runtime;
 
 	poll_wait(file, &runtime->sleep, wait);
@@ -3159,7 +3159,7 @@ static __poll_t snd_pcm_playback_poll(struct file *file, poll_table * wait)
 	case SNDRV_PCM_STATE_PREPARED:
 	case SNDRV_PCM_STATE_PAUSED:
 		if (avail >= runtime->control->avail_min) {
-			mask = POLLOUT | POLLWRNORM;
+			mask = EPOLLOUT | EPOLLWRNORM;
 			break;
 		}
 		/* Fall through */
@@ -3167,7 +3167,7 @@ static __poll_t snd_pcm_playback_poll(struct file *file, poll_table * wait)
 		mask = 0;
 		break;
 	default:
-		mask = POLLOUT | POLLWRNORM | POLLERR;
+		mask = EPOLLOUT | EPOLLWRNORM | EPOLLERR;
 		break;
 	}
 	snd_pcm_stream_unlock_irq(substream);
@@ -3186,7 +3186,7 @@ static __poll_t snd_pcm_capture_poll(struct file *file, poll_table * wait)
 
 	substream = pcm_file->substream;
 	if (PCM_RUNTIME_CHECK(substream))
-		return POLLIN | POLLRDNORM | POLLERR;
+		return EPOLLIN | EPOLLRDNORM | EPOLLERR;
 	runtime = substream->runtime;
 
 	poll_wait(file, &runtime->sleep, wait);
@@ -3198,19 +3198,19 @@ static __poll_t snd_pcm_capture_poll(struct file *file, poll_table * wait)
 	case SNDRV_PCM_STATE_PREPARED:
 	case SNDRV_PCM_STATE_PAUSED:
 		if (avail >= runtime->control->avail_min) {
-			mask = POLLIN | POLLRDNORM;
+			mask = EPOLLIN | EPOLLRDNORM;
 			break;
 		}
 		mask = 0;
 		break;
 	case SNDRV_PCM_STATE_DRAINING:
 		if (avail > 0) {
-			mask = POLLIN | POLLRDNORM;
+			mask = EPOLLIN | EPOLLRDNORM;
 			break;
 		}
 		/* Fall through */
 	default:
-		mask = POLLIN | POLLRDNORM | POLLERR;
+		mask = EPOLLIN | EPOLLRDNORM | EPOLLERR;
 		break;
 	}
 	snd_pcm_stream_unlock_irq(substream);
diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c
index fae21311723f1..69616d00481c2 100644
--- a/sound/core/rawmidi.c
+++ b/sound/core/rawmidi.c
@@ -1385,11 +1385,11 @@ static __poll_t snd_rawmidi_poll(struct file *file, poll_table * wait)
 	mask = 0;
 	if (rfile->input != NULL) {
 		if (snd_rawmidi_ready(rfile->input))
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 	}
 	if (rfile->output != NULL) {
 		if (snd_rawmidi_ready(rfile->output))
-			mask |= POLLOUT | POLLWRNORM;
+			mask |= EPOLLOUT | EPOLLWRNORM;
 	}
 	return mask;
 }
diff --git a/sound/core/seq/oss/seq_oss_rw.c b/sound/core/seq/oss/seq_oss_rw.c
index c538e78ca3102..30886f5fb1001 100644
--- a/sound/core/seq/oss/seq_oss_rw.c
+++ b/sound/core/seq/oss/seq_oss_rw.c
@@ -204,13 +204,13 @@ snd_seq_oss_poll(struct seq_oss_devinfo *dp, struct file *file, poll_table * wai
 	/* input */
 	if (dp->readq && is_read_mode(dp->file_mode)) {
 		if (snd_seq_oss_readq_poll(dp->readq, file, wait))
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 	}
 
 	/* output */
 	if (dp->writeq && is_write_mode(dp->file_mode)) {
 		if (snd_seq_kernel_client_write_poll(dp->cseq, file, wait))
-			mask |= POLLOUT | POLLWRNORM;
+			mask |= EPOLLOUT | EPOLLWRNORM;
 	}
 	return mask;
 }
diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c
index b611deef81f5b..60db32785f622 100644
--- a/sound/core/seq/seq_clientmgr.c
+++ b/sound/core/seq/seq_clientmgr.c
@@ -1101,7 +1101,7 @@ static __poll_t snd_seq_poll(struct file *file, poll_table * wait)
 
 		/* check if data is available in the outqueue */
 		if (snd_seq_fifo_poll_wait(client->data.user.fifo, file, wait))
-			mask |= POLLIN | POLLRDNORM;
+			mask |= EPOLLIN | EPOLLRDNORM;
 	}
 
 	if (snd_seq_file_flags(file) & SNDRV_SEQ_LFLG_OUTPUT) {
@@ -1109,7 +1109,7 @@ static __poll_t snd_seq_poll(struct file *file, poll_table * wait)
 		/* check if data is available in the pool */
 		if (!snd_seq_write_pool_allocated(client) ||
 		    snd_seq_pool_poll_wait(client->pool, file, wait))
-			mask |= POLLOUT | POLLWRNORM;
+			mask |= EPOLLOUT | EPOLLWRNORM;
 	}
 
 	return mask;
diff --git a/sound/core/timer.c b/sound/core/timer.c
index da05e314917f9..dc87728c5b745 100644
--- a/sound/core/timer.c
+++ b/sound/core/timer.c
@@ -2084,9 +2084,9 @@ static __poll_t snd_timer_user_poll(struct file *file, poll_table * wait)
 	mask = 0;
 	spin_lock_irq(&tu->qlock);
 	if (tu->qused)
-		mask |= POLLIN | POLLRDNORM;
+		mask |= EPOLLIN | EPOLLRDNORM;
 	if (tu->disconnected)
-		mask |= POLLERR;
+		mask |= EPOLLERR;
 	spin_unlock_irq(&tu->qlock);
 
 	return mask;
diff --git a/sound/firewire/bebob/bebob_hwdep.c b/sound/firewire/bebob/bebob_hwdep.c
index 83e791810c52e..04c321e08c626 100644
--- a/sound/firewire/bebob/bebob_hwdep.c
+++ b/sound/firewire/bebob/bebob_hwdep.c
@@ -63,7 +63,7 @@ hwdep_poll(struct snd_hwdep *hwdep, struct file *file, poll_table *wait)
 
 	spin_lock_irq(&bebob->lock);
 	if (bebob->dev_lock_changed)
-		events = POLLIN | POLLRDNORM;
+		events = EPOLLIN | EPOLLRDNORM;
 	else
 		events = 0;
 	spin_unlock_irq(&bebob->lock);
diff --git a/sound/firewire/dice/dice-hwdep.c b/sound/firewire/dice/dice-hwdep.c
index 7a8af0f91c96e..6498bf6909ba9 100644
--- a/sound/firewire/dice/dice-hwdep.c
+++ b/sound/firewire/dice/dice-hwdep.c
@@ -62,7 +62,7 @@ static __poll_t hwdep_poll(struct snd_hwdep *hwdep, struct file *file,
 
 	spin_lock_irq(&dice->lock);
 	if (dice->dev_lock_changed || dice->notification_bits != 0)
-		events = POLLIN | POLLRDNORM;
+		events = EPOLLIN | EPOLLRDNORM;
 	else
 		events = 0;
 	spin_unlock_irq(&dice->lock);
diff --git a/sound/firewire/digi00x/digi00x-hwdep.c b/sound/firewire/digi00x/digi00x-hwdep.c
index a084c2a834db9..426cd39e02338 100644
--- a/sound/firewire/digi00x/digi00x-hwdep.c
+++ b/sound/firewire/digi00x/digi00x-hwdep.c
@@ -70,7 +70,7 @@ static __poll_t hwdep_poll(struct snd_hwdep *hwdep, struct file *file,
 
 	spin_lock_irq(&dg00x->lock);
 	if (dg00x->dev_lock_changed || dg00x->msg)
-		events = POLLIN | POLLRDNORM;
+		events = EPOLLIN | EPOLLRDNORM;
 	else
 		events = 0;
 	spin_unlock_irq(&dg00x->lock);
diff --git a/sound/firewire/fireface/ff-hwdep.c b/sound/firewire/fireface/ff-hwdep.c
index 68e273fa5d233..336c0076ec421 100644
--- a/sound/firewire/fireface/ff-hwdep.c
+++ b/sound/firewire/fireface/ff-hwdep.c
@@ -62,7 +62,7 @@ static __poll_t hwdep_poll(struct snd_hwdep *hwdep, struct file *file,
 
 	spin_lock_irq(&ff->lock);
 	if (ff->dev_lock_changed)
-		events = POLLIN | POLLRDNORM;
+		events = EPOLLIN | EPOLLRDNORM;
 	else
 		events = 0;
 	spin_unlock_irq(&ff->lock);
diff --git a/sound/firewire/fireworks/fireworks_hwdep.c b/sound/firewire/fireworks/fireworks_hwdep.c
index e0eff9328ee10..5cac26ab20b71 100644
--- a/sound/firewire/fireworks/fireworks_hwdep.c
+++ b/sound/firewire/fireworks/fireworks_hwdep.c
@@ -194,12 +194,12 @@ hwdep_poll(struct snd_hwdep *hwdep, struct file *file, poll_table *wait)
 
 	spin_lock_irq(&efw->lock);
 	if (efw->dev_lock_changed || efw->pull_ptr != efw->push_ptr)
-		events = POLLIN | POLLRDNORM;
+		events = EPOLLIN | EPOLLRDNORM;
 	else
 		events = 0;
 	spin_unlock_irq(&efw->lock);
 
-	return events | POLLOUT;
+	return events | EPOLLOUT;
 }
 
 static int
diff --git a/sound/firewire/motu/motu-hwdep.c b/sound/firewire/motu/motu-hwdep.c
index 7b6a086866e7a..5f772eab588ba 100644
--- a/sound/firewire/motu/motu-hwdep.c
+++ b/sound/firewire/motu/motu-hwdep.c
@@ -69,12 +69,12 @@ static __poll_t hwdep_poll(struct snd_hwdep *hwdep, struct file *file,
 
 	spin_lock_irq(&motu->lock);
 	if (motu->dev_lock_changed || motu->msg)
-		events = POLLIN | POLLRDNORM;
+		events = EPOLLIN | EPOLLRDNORM;
 	else
 		events = 0;
 	spin_unlock_irq(&motu->lock);
 
-	return events | POLLOUT;
+	return events | EPOLLOUT;
 }
 
 static int hwdep_get_info(struct snd_motu *motu, void __user *arg)
diff --git a/sound/firewire/oxfw/oxfw-hwdep.c b/sound/firewire/oxfw/oxfw-hwdep.c
index 6c1828aff672d..50a1c03b42b9a 100644
--- a/sound/firewire/oxfw/oxfw-hwdep.c
+++ b/sound/firewire/oxfw/oxfw-hwdep.c
@@ -62,7 +62,7 @@ static __poll_t hwdep_poll(struct snd_hwdep *hwdep, struct file *file,
 
 	spin_lock_irq(&oxfw->lock);
 	if (oxfw->dev_lock_changed)
-		events = POLLIN | POLLRDNORM;
+		events = EPOLLIN | EPOLLRDNORM;
 	else
 		events = 0;
 	spin_unlock_irq(&oxfw->lock);
diff --git a/sound/firewire/tascam/tascam-hwdep.c b/sound/firewire/tascam/tascam-hwdep.c
index 37b21647b4716..4e4c1e9020e8c 100644
--- a/sound/firewire/tascam/tascam-hwdep.c
+++ b/sound/firewire/tascam/tascam-hwdep.c
@@ -60,7 +60,7 @@ static __poll_t hwdep_poll(struct snd_hwdep *hwdep, struct file *file,
 
 	spin_lock_irq(&tscm->lock);
 	if (tscm->dev_lock_changed)
-		events = POLLIN | POLLRDNORM;
+		events = EPOLLIN | EPOLLRDNORM;
 	else
 		events = 0;
 	spin_unlock_irq(&tscm->lock);
diff --git a/sound/oss/dmasound/dmasound_core.c b/sound/oss/dmasound/dmasound_core.c
index 6b57f8aac1b70..8c0f8a9ee0bae 100644
--- a/sound/oss/dmasound/dmasound_core.c
+++ b/sound/oss/dmasound/dmasound_core.c
@@ -684,7 +684,7 @@ static __poll_t sq_poll(struct file *file, struct poll_table_struct *wait)
 		poll_wait(file, &write_sq.action_queue, wait);
 	if (file->f_mode & FMODE_WRITE)
 		if (write_sq.count < write_sq.max_active || write_sq.block_size - write_sq.rear_size > 0)
-			mask |= POLLOUT | POLLWRNORM;
+			mask |= EPOLLOUT | EPOLLWRNORM;
 	return mask;
 
 }
diff --git a/sound/usb/mixer_quirks.c b/sound/usb/mixer_quirks.c
index 05ccc7fdcc09f..56537a1565800 100644
--- a/sound/usb/mixer_quirks.c
+++ b/sound/usb/mixer_quirks.c
@@ -246,7 +246,7 @@ static __poll_t snd_usb_sbrc_hwdep_poll(struct snd_hwdep *hw, struct file *file,
 	struct usb_mixer_interface *mixer = hw->private_data;
 
 	poll_wait(file, &mixer->rc_waitq, wait);
-	return mixer->rc_code ? POLLIN | POLLRDNORM : 0;
+	return mixer->rc_code ? EPOLLIN | EPOLLRDNORM : 0;
 }
 
 static int snd_usb_soundblaster_remote_init(struct usb_mixer_interface *mixer)
diff --git a/sound/usb/usx2y/us122l.c b/sound/usb/usx2y/us122l.c
index e2be10d171183..ebcab5c5465d2 100644
--- a/sound/usb/usx2y/us122l.c
+++ b/sound/usb/usx2y/us122l.c
@@ -280,7 +280,7 @@ static __poll_t usb_stream_hwdep_poll(struct snd_hwdep *hw,
 
 	poll_wait(file, &us122l->sk.sleep, wait);
 
-	mask = POLLIN | POLLOUT | POLLWRNORM | POLLERR;
+	mask = EPOLLIN | EPOLLOUT | EPOLLWRNORM | EPOLLERR;
 	if (mutex_trylock(&us122l->mutex)) {
 		struct usb_stream *s = us122l->sk.s;
 		if (s && s->state == usb_stream_ready) {
@@ -290,7 +290,7 @@ static __poll_t usb_stream_hwdep_poll(struct snd_hwdep *hw,
 				polled = &us122l->second_periods_polled;
 			if (*polled != s->periods_done) {
 				*polled = s->periods_done;
-				mask = POLLIN | POLLOUT | POLLWRNORM;
+				mask = EPOLLIN | EPOLLOUT | EPOLLWRNORM;
 			} else
 				mask = 0;
 		}
diff --git a/sound/usb/usx2y/usX2Yhwdep.c b/sound/usb/usx2y/usX2Yhwdep.c
index 07d15bae75bce..d8bd7c99b48c9 100644
--- a/sound/usb/usx2y/usX2Yhwdep.c
+++ b/sound/usb/usx2y/usX2Yhwdep.c
@@ -92,12 +92,12 @@ static __poll_t snd_us428ctls_poll(struct snd_hwdep *hw, struct file *file, poll
 	struct usX2Ydev	*us428 = hw->private_data;
 	struct us428ctls_sharedmem *shm = us428->us428ctls_sharedmem;
 	if (us428->chip_status & USX2Y_STAT_CHIP_HUP)
-		return POLLHUP;
+		return EPOLLHUP;
 
 	poll_wait(file, &us428->us428ctls_wait_queue_head, wait);
 
 	if (shm != NULL && shm->CtlSnapShotLast != shm->CtlSnapShotRed)
-		mask |= POLLIN;
+		mask |= EPOLLIN;
 
 	return mask;
 }
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index a334399fafec5..6e865e8b5b109 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -194,7 +194,7 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 	unsigned seq;
 	int idx;
 
-	if (flags & POLLIN) {
+	if (flags & EPOLLIN) {
 		idx = srcu_read_lock(&kvm->irq_srcu);
 		do {
 			seq = read_seqcount_begin(&irqfd->irq_entry_sc);
@@ -208,7 +208,7 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 		srcu_read_unlock(&kvm->irq_srcu, idx);
 	}
 
-	if (flags & POLLHUP) {
+	if (flags & EPOLLHUP) {
 		/* The eventfd is closing, detach from KVM */
 		unsigned long flags;
 
@@ -399,12 +399,12 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 	 */
 	events = f.file->f_op->poll(f.file, &irqfd->pt);
 
-	if (events & POLLIN)
+	if (events & EPOLLIN)
 		schedule_work(&irqfd->inject);
 
 	/*
 	 * do not drop the file until the irqfd is fully initialized, otherwise
-	 * we might race against the POLLHUP
+	 * we might race against the EPOLLHUP
 	 */
 	fdput(f);
 #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS

From 7a163b2195cda0cddf47b5caf14a7229d4e2bea3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 1 Feb 2018 15:13:18 -0500
Subject: [PATCH 497/676] unify {de,}mangle_poll(), get rid of kernel-side
 POLL...

except, again, POLLFREE and POLL_BUSY_LOOP.

With this, we finally get to the promised end result:

 - POLL{IN,OUT,...} are plain integers and *not* in __poll_t, so any
   stray instances of ->poll() still using those will be caught by
   sparse.

 - eventpoll.c and select.c warning-free wrt __poll_t

 - no more kernel-side definitions of POLL... - userland ones are
   visible through the entire kernel (and used pretty much only for
   mangle/demangle)

 - same behavior as after the first series (i.e. sparc et.al. epoll(2)
   working correctly).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/blackfin/include/uapi/asm/poll.h | 19 +------------
 arch/frv/include/uapi/asm/poll.h      | 19 +------------
 arch/m68k/include/uapi/asm/poll.h     | 19 +------------
 arch/mips/include/uapi/asm/poll.h     | 19 +------------
 arch/sparc/include/uapi/asm/poll.h    | 28 +++----------------
 arch/xtensa/include/uapi/asm/poll.h   | 21 ++------------
 include/linux/poll.h                  | 24 ++++++++++++++++
 include/uapi/asm-generic/poll.h       | 40 +++++++++------------------
 8 files changed, 47 insertions(+), 142 deletions(-)

diff --git a/arch/blackfin/include/uapi/asm/poll.h b/arch/blackfin/include/uapi/asm/poll.h
index 3b162f2d29703..cd2f1a78aba57 100644
--- a/arch/blackfin/include/uapi/asm/poll.h
+++ b/arch/blackfin/include/uapi/asm/poll.h
@@ -9,25 +9,8 @@
 #ifndef _UAPI__BFIN_POLL_H
 #define _UAPI__BFIN_POLL_H
 
-#ifndef __KERNEL__
 #define POLLWRNORM	POLLOUT
-#define POLLWRBAND	(__force __poll_t)256
-#else
-#define __ARCH_HAS_MANGLED_POLL
-static inline __u16 mangle_poll(__poll_t val)
-{
-	__u16 v = (__force __u16)val;
-	/* bit 9 -> bit 8, bit 8 -> bit 2 */
-	return (v & ~0x300) | ((v & 0x200) >> 1) | ((v & 0x100) >> 6);
-}
-
-static inline __poll_t demangle_poll(__u16 v)
-{
-        /* bit 8 -> bit 9, bit 2 -> bits 2 and 8 */
-	return (__force __poll_t)((v & ~0x100) | ((v & 0x100) << 1) |
-				((v & 4) << 6));
-}
-#endif
+#define POLLWRBAND	256
 
 #include <asm-generic/poll.h>
 
diff --git a/arch/frv/include/uapi/asm/poll.h b/arch/frv/include/uapi/asm/poll.h
index a44c8f0ebee79..f55b45f475eca 100644
--- a/arch/frv/include/uapi/asm/poll.h
+++ b/arch/frv/include/uapi/asm/poll.h
@@ -2,25 +2,8 @@
 #ifndef _ASM_POLL_H
 #define _ASM_POLL_H
 
-#ifndef __KERNEL__
 #define POLLWRNORM	POLLOUT
-#define POLLWRBAND	(__force __poll_t)256
-#else
-#define __ARCH_HAS_MANGLED_POLL
-static inline __u16 mangle_poll(__poll_t val)
-{
-	__u16 v = (__force __u16)val;
-	/* bit 9 -> bit 8, bit 8 -> bit 2 */
-	return (v & ~0x300) | ((v & 0x200) >> 1) | ((v & 0x100) >> 6);
-}
-
-static inline __poll_t demangle_poll(__u16 v)
-{
-        /* bit 8 -> bit 9, bit 2 -> bits 2 and 8 */
-	return (__force __poll_t)((v & ~0x100) | ((v & 0x100) << 1) |
-				((v & 4) << 6));
-}
-#endif
+#define POLLWRBAND	256
 
 #include <asm-generic/poll.h>
 #undef POLLREMOVE
diff --git a/arch/m68k/include/uapi/asm/poll.h b/arch/m68k/include/uapi/asm/poll.h
index d8be239e81417..c3e3fcc15e1dc 100644
--- a/arch/m68k/include/uapi/asm/poll.h
+++ b/arch/m68k/include/uapi/asm/poll.h
@@ -2,25 +2,8 @@
 #ifndef __m68k_POLL_H
 #define __m68k_POLL_H
 
-#ifndef __KERNEL__
 #define POLLWRNORM	POLLOUT
-#define POLLWRBAND	(__force __poll_t)256
-#else
-#define __ARCH_HAS_MANGLED_POLL
-static inline __u16 mangle_poll(__poll_t val)
-{
-	__u16 v = (__force __u16)val;
-	/* bit 9 -> bit 8, bit 8 -> bit 2 */
-	return (v & ~0x300) | ((v & 0x200) >> 1) | ((v & 0x100) >> 6);
-}
-
-static inline __poll_t demangle_poll(__u16 v)
-{
-        /* bit 8 -> bit 9, bit 2 -> bits 2 and 8 */
-	return (__force __poll_t)((v & ~0x100) | ((v & 0x100) << 1) |
-				((v & 4) << 6));
-}
-#endif
+#define POLLWRBAND	256
 
 #include <asm-generic/poll.h>
 
diff --git a/arch/mips/include/uapi/asm/poll.h b/arch/mips/include/uapi/asm/poll.h
index 3173f89171284..ad289d7b74340 100644
--- a/arch/mips/include/uapi/asm/poll.h
+++ b/arch/mips/include/uapi/asm/poll.h
@@ -2,25 +2,8 @@
 #ifndef __ASM_POLL_H
 #define __ASM_POLL_H
 
-#ifndef __KERNEL__
 #define POLLWRNORM	POLLOUT
-#define POLLWRBAND	(__force __poll_t)0x0100
-#else
-#define __ARCH_HAS_MANGLED_POLL
-static inline __u16 mangle_poll(__poll_t val)
-{
-	__u16 v = (__force __u16)val;
-	/* bit 9 -> bit 8, bit 8 -> bit 2 */
-	return (v & ~0x300) | ((v & 0x200) >> 1) | ((v & 0x100) >> 6);
-}
-
-static inline __poll_t demangle_poll(__u16 v)
-{
-        /* bit 8 -> bit 9, bit 2 -> bits 2 and 8 */
-	return (__force __poll_t)((v & ~0x100) | ((v & 0x100) << 1) |
-				((v & 4) << 6));
-}
-#endif
+#define POLLWRBAND	0x0100
 
 #include <asm-generic/poll.h>
 
diff --git a/arch/sparc/include/uapi/asm/poll.h b/arch/sparc/include/uapi/asm/poll.h
index 2a81e79aa3ea6..72356c9991257 100644
--- a/arch/sparc/include/uapi/asm/poll.h
+++ b/arch/sparc/include/uapi/asm/poll.h
@@ -2,31 +2,11 @@
 #ifndef __SPARC_POLL_H
 #define __SPARC_POLL_H
 
-#ifndef __KERNEL__
 #define POLLWRNORM	POLLOUT
-#define POLLWRBAND	(__force __poll_t)256
-#define POLLMSG		(__force __poll_t)512
-#define POLLREMOVE	(__force __poll_t)1024
-#define POLLRDHUP       (__force __poll_t)2048
-#else
-#define __ARCH_HAS_MANGLED_POLL
-static inline __u16 mangle_poll(__poll_t val)
-{
-	__u16 v = (__force __u16)val;
-        /* bit 9 -> bit 8, bit 8 -> bit 2, bit 13 -> bit 11 */
-	return (v & ~0x300) | ((v & 0x200) >> 1) | ((v & 0x100) >> 6) |
-				((v & 0x2000) >> 2);
-
-
-}
-
-static inline __poll_t demangle_poll(__u16 v)
-{
-        /* bit 8 -> bit 9, bit 2 -> bits 2 and 8 */
-	return (__force __poll_t)((v & ~0x100) | ((v & 0x100) << 1) |
-				((v & 4) << 6) | ((v & 0x800) << 2));
-}
-#endif
+#define POLLWRBAND	256
+#define POLLMSG		512
+#define POLLREMOVE	1024
+#define POLLRDHUP       2048
 
 #include <asm-generic/poll.h>
 
diff --git a/arch/xtensa/include/uapi/asm/poll.h b/arch/xtensa/include/uapi/asm/poll.h
index e3246d41182c2..4d249040b33d2 100644
--- a/arch/xtensa/include/uapi/asm/poll.h
+++ b/arch/xtensa/include/uapi/asm/poll.h
@@ -12,26 +12,9 @@
 #ifndef _XTENSA_POLL_H
 #define _XTENSA_POLL_H
 
-#ifndef __KERNEL__
 #define POLLWRNORM	POLLOUT
-#define POLLWRBAND	(__force __poll_t)0x0100
-#define POLLREMOVE	(__force __poll_t)0x0800
-#else
-#define __ARCH_HAS_MANGLED_POLL
-static inline __u16 mangle_poll(__poll_t val)
-{
-	__u16 v = (__force __u16)val;
-	/* bit 9 -> bit 8, bit 8 -> bit 2 */
-	return (v & ~0x300) | ((v & 0x200) >> 1) | ((v & 0x100) >> 6);
-}
-
-static inline __poll_t demangle_poll(__u16 v)
-{
-        /* bit 8 -> bit 9, bit 2 -> bits 2 and 8 */
-	return (__force __poll_t)((v & ~0x100) | ((v & 0x100) << 1) |
-				((v & 4) << 6));
-}
-#endif
+#define POLLWRBAND	0x0100
+#define POLLREMOVE	0x0800
 
 #include <asm-generic/poll.h>
 
diff --git a/include/linux/poll.h b/include/linux/poll.h
index d23104b329317..f45ebd017eaa9 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -108,4 +108,28 @@ extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 extern int poll_select_set_timeout(struct timespec64 *to, time64_t sec,
 				   long nsec);
 
+#define __MAP(v, from, to) \
+	(from < to ? (v & from) * (to/from) : (v & from) / (from/to))
+
+static inline __u16 mangle_poll(__poll_t val)
+{
+	__u16 v = (__force __u16)val;
+#define M(X) __MAP(v, (__force __u16)EPOLL##X, POLL##X)
+	return M(IN) | M(OUT) | M(PRI) | M(ERR) | M(NVAL) |
+		M(RDNORM) | M(RDBAND) | M(WRNORM) | M(WRBAND) |
+		M(HUP) | M(RDHUP) | M(MSG);
+#undef M
+}
+
+static inline __poll_t demangle_poll(u16 val)
+{
+#define M(X) (__force __poll_t)__MAP(val, POLL##X, (__force __u16)EPOLL##X)
+	return M(IN) | M(OUT) | M(PRI) | M(ERR) | M(NVAL) |
+		M(RDNORM) | M(RDBAND) | M(WRNORM) | M(WRBAND) |
+		M(HUP) | M(RDHUP) | M(MSG);
+#undef M
+}
+#undef __MAP
+
+
 #endif /* _LINUX_POLL_H */
diff --git a/include/uapi/asm-generic/poll.h b/include/uapi/asm-generic/poll.h
index 639fade14b234..41b509f410bf9 100644
--- a/include/uapi/asm-generic/poll.h
+++ b/include/uapi/asm-generic/poll.h
@@ -3,50 +3,36 @@
 #define __ASM_GENERIC_POLL_H
 
 /* These are specified by iBCS2 */
-#define POLLIN		(__force __poll_t)0x0001
-#define POLLPRI		(__force __poll_t)0x0002
-#define POLLOUT		(__force __poll_t)0x0004
-#define POLLERR		(__force __poll_t)0x0008
-#define POLLHUP		(__force __poll_t)0x0010
-#define POLLNVAL	(__force __poll_t)0x0020
+#define POLLIN		0x0001
+#define POLLPRI		0x0002
+#define POLLOUT		0x0004
+#define POLLERR		0x0008
+#define POLLHUP		0x0010
+#define POLLNVAL	0x0020
 
 /* The rest seem to be more-or-less nonstandard. Check them! */
-#define POLLRDNORM	(__force __poll_t)0x0040
-#define POLLRDBAND	(__force __poll_t)0x0080
+#define POLLRDNORM	0x0040
+#define POLLRDBAND	0x0080
 #ifndef POLLWRNORM
-#define POLLWRNORM	(__force __poll_t)0x0100
+#define POLLWRNORM	0x0100
 #endif
 #ifndef POLLWRBAND
-#define POLLWRBAND	(__force __poll_t)0x0200
+#define POLLWRBAND	0x0200
 #endif
 #ifndef POLLMSG
-#define POLLMSG		(__force __poll_t)0x0400
+#define POLLMSG		0x0400
 #endif
 #ifndef POLLREMOVE
-#define POLLREMOVE	(__force __poll_t)0x1000
+#define POLLREMOVE	0x1000
 #endif
 #ifndef POLLRDHUP
-#define POLLRDHUP       (__force __poll_t)0x2000
+#define POLLRDHUP       0x2000
 #endif
 
 #define POLLFREE	(__force __poll_t)0x4000	/* currently only for epoll */
 
 #define POLL_BUSY_LOOP	(__force __poll_t)0x8000
 
-#ifdef __KERNEL__
-#ifndef __ARCH_HAS_MANGLED_POLL
-static inline __u16 mangle_poll(__poll_t val)
-{
-	return (__force __u16)val;
-}
-
-static inline __poll_t demangle_poll(__u16 v)
-{
-	return (__force __poll_t)v;
-}
-#endif
-#endif
-
 struct pollfd {
 	int fd;
 	short events;

From 7928b2cbe55b2a410a0f5c1f154610059c57b1b2 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 11 Feb 2018 15:04:29 -0800
Subject: [PATCH 498/676] Linux 4.16-rc1

---
 Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index cd9145c0878d6..79ad2bfa24b68 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
 VERSION = 4
-PATCHLEVEL = 15
+PATCHLEVEL = 16
 SUBLEVEL = 0
-EXTRAVERSION =
+EXTRAVERSION = -rc1
 NAME = Fearless Coyote
 
 # *DOCUMENTATION*

From 1f8ade92a83696986ad34438ce11e38975d1a43d Mon Sep 17 00:00:00 2001
From: Ulf Magnusson <ulfalizer@gmail.com>
Date: Fri, 9 Feb 2018 00:15:36 +0100
Subject: [PATCH 499/676] ALSA: ac97: kconfig: Remove select of undefined
 symbol AC97

The AC97_BUS_NEW Kconfig symbol selects the globally undefined symbol
AC97.

Robert Jarzmik confirmed in https://lkml.org/lkml/2018/2/7/96 that the
select was put in by mistake and can be safely removed, with no other
changes required. Remove it.

Fixes: 74426fbff66e ("ALSA: ac97: add an ac97 bus")
Signed-off-by: Ulf Magnusson <ulfalizer@gmail.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/ac97/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sound/ac97/Kconfig b/sound/ac97/Kconfig
index f8a64e15e5bfb..baa5f8ef89d25 100644
--- a/sound/ac97/Kconfig
+++ b/sound/ac97/Kconfig
@@ -5,7 +5,6 @@
 
 config AC97_BUS_NEW
 	tristate
-	select AC97
 	help
 	  This is the new AC97 bus type, successor of AC97_BUS. The ported
 	  drivers which benefit from the AC97 automatic probing should "select"

From 5e35dc0338d85ccebacf3f77eca1e5dea73155e8 Mon Sep 17 00:00:00 2001
From: Lassi Ylikojola <lassi.ylikojola@gmail.com>
Date: Fri, 9 Feb 2018 16:51:36 +0200
Subject: [PATCH 500/676] ALSA: usb-audio: add implicit fb quirk for Behringer
 UFX1204

Add quirk to ensure a sync endpoint is properly configured.
This patch is a fix for same symptoms on Behringer UFX1204 as patch
from Albertto Aquirre on Dec 8 2016 for Axe-Fx II.

Signed-off-by: Lassi Ylikojola <lassi.ylikojola@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/usb/pcm.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/sound/usb/pcm.c b/sound/usb/pcm.c
index b9c9a19f9588a..3cbfae6604f98 100644
--- a/sound/usb/pcm.c
+++ b/sound/usb/pcm.c
@@ -352,6 +352,15 @@ static int set_sync_ep_implicit_fb_quirk(struct snd_usb_substream *subs,
 		ep = 0x86;
 		iface = usb_ifnum_to_if(dev, 2);
 
+		if (!iface || iface->num_altsetting == 0)
+			return -EINVAL;
+
+		alts = &iface->altsetting[1];
+		goto add_sync_ep;
+	case USB_ID(0x1397, 0x0002):
+		ep = 0x81;
+		iface = usb_ifnum_to_if(dev, 1);
+
 		if (!iface || iface->num_altsetting == 0)
 			return -EINVAL;
 

From 2bda7141b89aa35308da69aac7f486fa81db73ba Mon Sep 17 00:00:00 2001
From: Matthias Lange <matthias.lange@kernkonzept.com>
Date: Wed, 31 Jan 2018 18:39:12 +0100
Subject: [PATCH 501/676] ALSA: ac97: Fix copy and paste typo in documentation

It's 'optional' instead of 'optinal'.

Signed-off-by: Matthias Lange <matthias.lange@kernkonzept.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/ac97/regs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/sound/ac97/regs.h b/include/sound/ac97/regs.h
index 4bb86d379bd58..9a4fa0c3264aa 100644
--- a/include/sound/ac97/regs.h
+++ b/include/sound/ac97/regs.h
@@ -31,7 +31,7 @@
 #define AC97_HEADPHONE		0x04	/* Headphone Volume (optional) */
 #define AC97_MASTER_MONO	0x06	/* Master Volume Mono (optional) */
 #define AC97_MASTER_TONE	0x08	/* Master Tone (Bass & Treble) (optional) */
-#define AC97_PC_BEEP		0x0a	/* PC Beep Volume (optinal) */
+#define AC97_PC_BEEP		0x0a	/* PC Beep Volume (optional) */
 #define AC97_PHONE		0x0c	/* Phone Volume (optional) */
 #define AC97_MIC		0x0e	/* MIC Volume */
 #define AC97_LINE		0x10	/* Line In Volume */

From 447cae58cecd69392b74a4a42cd0ab9cabd816af Mon Sep 17 00:00:00 2001
From: Kirill Marinushkin <k.marinushkin@gmail.com>
Date: Mon, 29 Jan 2018 06:37:55 +0100
Subject: [PATCH 502/676] ALSA: usb-audio: Fix UAC2 get_ctl request with a
 RANGE attribute

The layout of the UAC2 Control request and response varies depending on
the request type. With the current implementation, only the Layout 2
Parameter Block (with the 2-byte sized RANGE attribute) is handled
properly. For the Control requests with the 1-byte sized RANGE attribute
(Bass Control, Mid Control, Tremble Control), the response is parsed
incorrectly.

This commit:
* fixes the wLength field value in the request
* fixes parsing the range values from the response

Fixes: 23caaf19b11e ("ALSA: usb-mixer: Add support for Audio Class v2.0")
Signed-off-by: Kirill Marinushkin <k.marinushkin@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/usb/mixer.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c
index 9afb8ab524c7e..06b22624ab7a0 100644
--- a/sound/usb/mixer.c
+++ b/sound/usb/mixer.c
@@ -347,17 +347,20 @@ static int get_ctl_value_v2(struct usb_mixer_elem_info *cval, int request,
 			    int validx, int *value_ret)
 {
 	struct snd_usb_audio *chip = cval->head.mixer->chip;
-	unsigned char buf[4 + 3 * sizeof(__u32)]; /* enough space for one range */
+	/* enough space for one range */
+	unsigned char buf[sizeof(__u16) + 3 * sizeof(__u32)];
 	unsigned char *val;
-	int idx = 0, ret, size;
+	int idx = 0, ret, val_size, size;
 	__u8 bRequest;
 
+	val_size = uac2_ctl_value_size(cval->val_type);
+
 	if (request == UAC_GET_CUR) {
 		bRequest = UAC2_CS_CUR;
-		size = uac2_ctl_value_size(cval->val_type);
+		size = val_size;
 	} else {
 		bRequest = UAC2_CS_RANGE;
-		size = sizeof(buf);
+		size = sizeof(__u16) + 3 * val_size;
 	}
 
 	memset(buf, 0, sizeof(buf));
@@ -390,16 +393,17 @@ static int get_ctl_value_v2(struct usb_mixer_elem_info *cval, int request,
 		val = buf + sizeof(__u16);
 		break;
 	case UAC_GET_MAX:
-		val = buf + sizeof(__u16) * 2;
+		val = buf + sizeof(__u16) + val_size;
 		break;
 	case UAC_GET_RES:
-		val = buf + sizeof(__u16) * 3;
+		val = buf + sizeof(__u16) + val_size * 2;
 		break;
 	default:
 		return -EINVAL;
 	}
 
-	*value_ret = convert_signed_value(cval, snd_usb_combine_bytes(val, sizeof(__u16)));
+	*value_ret = convert_signed_value(cval,
+					  snd_usb_combine_bytes(val, val_size));
 
 	return 0;
 }

From 7c74866baef1827e18f8269aec85030063520bd4 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Sun, 11 Feb 2018 09:50:27 -0400
Subject: [PATCH 503/676] ALSA: usb: add more device quirks for USB DSD devices

Add some more devices that need quirks to handle DSD modes correctly.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Reported-and-tested-by: Thomas Gresens <tgresens@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/usb/quirks.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c
index a66ef5777887a..ea8f3de92fa4b 100644
--- a/sound/usb/quirks.c
+++ b/sound/usb/quirks.c
@@ -1363,8 +1363,11 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip,
 			return SNDRV_PCM_FMTBIT_DSD_U32_BE;
 		break;
 
-	/* Amanero Combo384 USB interface with native DSD support */
-	case USB_ID(0x16d0, 0x071a):
+	/* Amanero Combo384 USB based DACs with native DSD support */
+	case USB_ID(0x16d0, 0x071a):  /* Amanero - Combo384 */
+	case USB_ID(0x2ab6, 0x0004):  /* T+A DAC8DSD-V2.0, MP1000E-V2.0, MP2000R-V2.0, MP2500R-V2.0, MP3100HV-V2.0 */
+	case USB_ID(0x2ab6, 0x0005):  /* T+A USB HD Audio 1 */
+	case USB_ID(0x2ab6, 0x0006):  /* T+A USB HD Audio 2 */
 		if (fp->altsetting == 2) {
 			switch (le16_to_cpu(chip->dev->descriptor.bcdDevice)) {
 			case 0x199:

From 3cd091a773936c54344a519f7ee1379ccb620bee Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 9 Feb 2018 22:55:28 +0100
Subject: [PATCH 504/676] ACPI / EC: Restore polling during noirq
 suspend/resume phases

Commit 662591461c4b (ACPI / EC: Drop EC noirq hooks to fix a
regression) modified the ACPI EC driver so that it doesn't switch
over to busy polling mode during noirq stages of system suspend and
resume in an attempt to fix an issue resulting from that behavior.

However, that modification introduced a system resume regression on
Thinkpad X240, so make the EC driver switch over to the polling mode
during noirq stages of system suspend and resume again, which
effectively reverts the problematic commit.

Fixes: 662591461c4b (ACPI / EC: Drop EC noirq hooks to fix a regression)
Link: https://bugzilla.kernel.org/show_bug.cgi?id=197863
Reported-by: Markus Demleitner <m@tfiu.de>
Tested-by: Markus Demleitner <m@tfiu.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/ec.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index d9f38c645e4a1..30a5729565575 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -1927,6 +1927,9 @@ static int acpi_ec_suspend_noirq(struct device *dev)
 	    ec->reference_count >= 1)
 		acpi_set_gpe(NULL, ec->gpe, ACPI_GPE_DISABLE);
 
+	if (acpi_sleep_no_ec_events())
+		acpi_ec_enter_noirq(ec);
+
 	return 0;
 }
 
@@ -1934,6 +1937,9 @@ static int acpi_ec_resume_noirq(struct device *dev)
 {
 	struct acpi_ec *ec = acpi_driver_data(to_acpi_device(dev));
 
+	if (acpi_sleep_no_ec_events())
+		acpi_ec_leave_noirq(ec);
+
 	if (ec_no_wakeup && test_bit(EC_FLAGS_STARTED, &ec->flags) &&
 	    ec->reference_count >= 1)
 		acpi_set_gpe(NULL, ec->gpe, ACPI_GPE_ENABLE);

From 5a9e59e8d9dd9586d78c244b9d96fb18156daad3 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Fri, 9 Feb 2018 12:08:21 -0600
Subject: [PATCH 505/676] ACPI: SPCR: Mark expected switch fall-through in
 acpi_parse_spcr

In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Addresses-Coverity-ID: 1465078
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/spcr.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/acpi/spcr.c b/drivers/acpi/spcr.c
index 89e97d21a89ce..9d52743080a4f 100644
--- a/drivers/acpi/spcr.c
+++ b/drivers/acpi/spcr.c
@@ -115,6 +115,7 @@ int __init acpi_parse_spcr(bool enable_earlycon, bool enable_console)
 			table->serial_port.access_width))) {
 		default:
 			pr_err("Unexpected SPCR Access Width.  Defaulting to byte size\n");
+			/* fall through */
 		case 8:
 			iotype = "mmio";
 			break;

From 4a823c0be80fa996234ebb41c80d40458b1bec1e Mon Sep 17 00:00:00 2001
From: Jia-Ju Bai <baijiaju1990@gmail.com>
Date: Fri, 26 Jan 2018 16:48:49 +0800
Subject: [PATCH 506/676] opp: cpu: Replace GFP_ATOMIC with GFP_KERNEL in
 dev_pm_opp_init_cpufreq_table

After checking all possible call chains to
dev_pm_opp_init_cpufreq_table() here,
my tool finds that this function is never called in atomic context,
namely never in an interrupt handler or holding a spinlock.
And dev_pm_opp_init_cpufreq_table() calls dev_pm_opp_get_opp_count(),
which calls mutex_lock that can sleep.
It indicates that atmtcp_v_send() can call functions which may sleep.
Thus GFP_ATOMIC is not necessary, and it can be replaced with GFP_KERNEL.

This is found by a static analysis tool named DCNS written by myself.

Signed-off-by: Jia-Ju Bai <baijiaju1990@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/cpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/opp/cpu.c b/drivers/opp/cpu.c
index 2d87bc1adf38b..0c09107094350 100644
--- a/drivers/opp/cpu.c
+++ b/drivers/opp/cpu.c
@@ -55,7 +55,7 @@ int dev_pm_opp_init_cpufreq_table(struct device *dev,
 	if (max_opps <= 0)
 		return max_opps ? max_opps : -ENODATA;
 
-	freq_table = kcalloc((max_opps + 1), sizeof(*freq_table), GFP_ATOMIC);
+	freq_table = kcalloc((max_opps + 1), sizeof(*freq_table), GFP_KERNEL);
 	if (!freq_table)
 		return -ENOMEM;
 

From 4222f38ca3b7ae30ace582077677cec8b88fac36 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 9 Feb 2018 17:38:33 +0200
Subject: [PATCH 507/676] ACPI / bus: Do not traverse through non-existed
 device table

When __acpi_match_device() is called it would be possible to have
ACPI ID table a NULL pointer. To avoid potential dereference,
check for this before traverse.

While here, remove redundant 'else'.

Note, this patch implies a bit of refactoring acpi_of_match_device()
to return pointer to OF ID when matched followed by refactoring
__acpi_match_device() to return either ACPI or OF ID when matches.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c | 63 ++++++++++++++++++++++++++++------------------
 1 file changed, 38 insertions(+), 25 deletions(-)

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 676c9788e1c82..f1384e107eed5 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -660,13 +660,15 @@ struct acpi_device *acpi_companion_match(const struct device *dev)
  * acpi_of_match_device - Match device object using the "compatible" property.
  * @adev: ACPI device object to match.
  * @of_match_table: List of device IDs to match against.
+ * @of_id: OF ID if matched
  *
  * If @dev has an ACPI companion which has ACPI_DT_NAMESPACE_HID in its list of
  * identifiers and a _DSD object with the "compatible" property, use that
  * property to match against the given list of identifiers.
  */
 static bool acpi_of_match_device(struct acpi_device *adev,
-				 const struct of_device_id *of_match_table)
+				 const struct of_device_id *of_match_table,
+				 const struct of_device_id **of_id)
 {
 	const union acpi_object *of_compatible, *obj;
 	int i, nval;
@@ -690,8 +692,11 @@ static bool acpi_of_match_device(struct acpi_device *adev,
 		const struct of_device_id *id;
 
 		for (id = of_match_table; id->compatible[0]; id++)
-			if (!strcasecmp(obj->string.pointer, id->compatible))
+			if (!strcasecmp(obj->string.pointer, id->compatible)) {
+				if (of_id)
+					*of_id = id;
 				return true;
+			}
 	}
 
 	return false;
@@ -762,10 +767,11 @@ static bool __acpi_match_device_cls(const struct acpi_device_id *id,
 	return true;
 }
 
-static const struct acpi_device_id *__acpi_match_device(
-	struct acpi_device *device,
-	const struct acpi_device_id *ids,
-	const struct of_device_id *of_ids)
+static bool __acpi_match_device(struct acpi_device *device,
+				const struct acpi_device_id *acpi_ids,
+				const struct of_device_id *of_ids,
+				const struct acpi_device_id **acpi_id,
+				const struct of_device_id **of_id)
 {
 	const struct acpi_device_id *id;
 	struct acpi_hardware_id *hwid;
@@ -775,30 +781,32 @@ static const struct acpi_device_id *__acpi_match_device(
 	 * driver for it.
 	 */
 	if (!device || !device->status.present)
-		return NULL;
+		return false;
 
 	list_for_each_entry(hwid, &device->pnp.ids, list) {
 		/* First, check the ACPI/PNP IDs provided by the caller. */
-		for (id = ids; id->id[0] || id->cls; id++) {
-			if (id->id[0] && !strcmp((char *) id->id, hwid->id))
-				return id;
-			else if (id->cls && __acpi_match_device_cls(id, hwid))
-				return id;
+		if (acpi_ids) {
+			for (id = acpi_ids; id->id[0] || id->cls; id++) {
+				if (id->id[0] && !strcmp((char *)id->id, hwid->id))
+					goto out_acpi_match;
+				if (id->cls && __acpi_match_device_cls(id, hwid))
+					goto out_acpi_match;
+			}
 		}
 
 		/*
 		 * Next, check ACPI_DT_NAMESPACE_HID and try to match the
 		 * "compatible" property if found.
-		 *
-		 * The id returned by the below is not valid, but the only
-		 * caller passing non-NULL of_ids here is only interested in
-		 * whether or not the return value is NULL.
 		 */
-		if (!strcmp(ACPI_DT_NAMESPACE_HID, hwid->id)
-		    && acpi_of_match_device(device, of_ids))
-			return id;
+		if (!strcmp(ACPI_DT_NAMESPACE_HID, hwid->id))
+			return acpi_of_match_device(device, of_ids, of_id);
 	}
-	return NULL;
+	return false;
+
+out_acpi_match:
+	if (acpi_id)
+		*acpi_id = id;
+	return true;
 }
 
 /**
@@ -815,7 +823,10 @@ static const struct acpi_device_id *__acpi_match_device(
 const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids,
 					       const struct device *dev)
 {
-	return __acpi_match_device(acpi_companion_match(dev), ids, NULL);
+	const struct acpi_device_id *id = NULL;
+
+	__acpi_match_device(acpi_companion_match(dev), ids, NULL, &id, NULL);
+	return id;
 }
 EXPORT_SYMBOL_GPL(acpi_match_device);
 
@@ -840,7 +851,7 @@ EXPORT_SYMBOL_GPL(acpi_get_match_data);
 int acpi_match_device_ids(struct acpi_device *device,
 			  const struct acpi_device_id *ids)
 {
-	return __acpi_match_device(device, ids, NULL) ? 0 : -ENOENT;
+	return __acpi_match_device(device, ids, NULL, NULL, NULL) ? 0 : -ENOENT;
 }
 EXPORT_SYMBOL(acpi_match_device_ids);
 
@@ -849,10 +860,12 @@ bool acpi_driver_match_device(struct device *dev,
 {
 	if (!drv->acpi_match_table)
 		return acpi_of_match_device(ACPI_COMPANION(dev),
-					    drv->of_match_table);
+					    drv->of_match_table,
+					    NULL);
 
-	return !!__acpi_match_device(acpi_companion_match(dev),
-				     drv->acpi_match_table, drv->of_match_table);
+	return __acpi_match_device(acpi_companion_match(dev),
+				   drv->acpi_match_table, drv->of_match_table,
+				   NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(acpi_driver_match_device);
 

From 8ff277c5bf87d750a44a656d4f113462493acbfc Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 9 Feb 2018 17:38:34 +0200
Subject: [PATCH 508/676] ACPI / bus: Remove checks in acpi_get_match_data()

As well as its sibling of_device_get_match_data() has no such checks,
no need to do it in acpi_get_match_data().

First of all, we are not supposed to call fwnode API like this without
driver attached.

Second, since __acpi_match_device() does check input parameter there is
no need to duplicate it outside.

And last but not least one, the API should still serve the cases when
ACPI device is enumerated via PRP0001. In such case driver has neither
ACPI table nor driver data there.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index f1384e107eed5..ca4af098b1bfb 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -834,12 +834,6 @@ void *acpi_get_match_data(const struct device *dev)
 {
 	const struct acpi_device_id *match;
 
-	if (!dev->driver)
-		return NULL;
-
-	if (!dev->driver->acpi_match_table)
-		return NULL;
-
 	match = acpi_match_device(dev->driver->acpi_match_table, dev);
 	if (!match)
 		return NULL;

From 29d5325a14ab49375476e3a6442ff40a008a8c9a Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 9 Feb 2018 17:38:35 +0200
Subject: [PATCH 509/676] ACPI / bus: Rename acpi_get_match_data() to
 acpi_device_get_match_data()

Do the renaming to be consistent with its sibling, i.e.
of_device_get_match_data().

No functional change.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c      | 4 ++--
 drivers/acpi/property.c | 2 +-
 include/linux/acpi.h    | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index ca4af098b1bfb..e6285b5ce0d51 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -830,7 +830,7 @@ const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids,
 }
 EXPORT_SYMBOL_GPL(acpi_match_device);
 
-void *acpi_get_match_data(const struct device *dev)
+void *acpi_device_get_match_data(const struct device *dev)
 {
 	const struct acpi_device_id *match;
 
@@ -840,7 +840,7 @@ void *acpi_get_match_data(const struct device *dev)
 
 	return (void *)match->driver_data;
 }
-EXPORT_SYMBOL_GPL(acpi_get_match_data);
+EXPORT_SYMBOL_GPL(acpi_device_get_match_data);
 
 int acpi_match_device_ids(struct acpi_device *device,
 			  const struct acpi_device_id *ids)
diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c
index 466d1503aba0e..f9b5fa230a861 100644
--- a/drivers/acpi/property.c
+++ b/drivers/acpi/property.c
@@ -1275,7 +1275,7 @@ static void *
 acpi_fwnode_device_get_match_data(const struct fwnode_handle *fwnode,
 				  const struct device *dev)
 {
-	return acpi_get_match_data(dev);
+	return acpi_device_get_match_data(dev);
 }
 
 #define DECLARE_ACPI_FWNODE_OPS(ops) \
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 64e10746f2828..bdf47e0f92e92 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -587,7 +587,7 @@ extern int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *),
 const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids,
 					       const struct device *dev);
 
-void *acpi_get_match_data(const struct device *dev);
+void *acpi_device_get_match_data(const struct device *dev);
 extern bool acpi_driver_match_device(struct device *dev,
 				     const struct device_driver *drv);
 int acpi_device_uevent_modalias(struct device *, struct kobj_uevent_env *);
@@ -766,7 +766,7 @@ static inline const struct acpi_device_id *acpi_match_device(
 	return NULL;
 }
 
-static inline void *acpi_get_match_data(const struct device *dev)
+static inline void *acpi_device_get_match_data(const struct device *dev)
 {
 	return NULL;
 }

From 67dcc26d208ca5578f08c3c78cb254418c24e9ec Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 9 Feb 2018 17:38:36 +0200
Subject: [PATCH 510/676] device property: Constify device_get_match_data()

Constify device_get_match_data() as OF and ACPI variants return
constant value.

Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c       | 4 ++--
 drivers/acpi/property.c  | 2 +-
 drivers/base/property.c  | 5 ++---
 drivers/of/property.c    | 4 ++--
 include/linux/acpi.h     | 4 ++--
 include/linux/fwnode.h   | 4 ++--
 include/linux/property.h | 2 +-
 7 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index e6285b5ce0d51..0dad0bd9327b5 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -830,7 +830,7 @@ const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids,
 }
 EXPORT_SYMBOL_GPL(acpi_match_device);
 
-void *acpi_device_get_match_data(const struct device *dev)
+const void *acpi_device_get_match_data(const struct device *dev)
 {
 	const struct acpi_device_id *match;
 
@@ -838,7 +838,7 @@ void *acpi_device_get_match_data(const struct device *dev)
 	if (!match)
 		return NULL;
 
-	return (void *)match->driver_data;
+	return (const void *)match->driver_data;
 }
 EXPORT_SYMBOL_GPL(acpi_device_get_match_data);
 
diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c
index f9b5fa230a861..5815356ea6ad3 100644
--- a/drivers/acpi/property.c
+++ b/drivers/acpi/property.c
@@ -1271,7 +1271,7 @@ static int acpi_fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode,
 	return 0;
 }
 
-static void *
+static const void *
 acpi_fwnode_device_get_match_data(const struct fwnode_handle *fwnode,
 				  const struct device *dev)
 {
diff --git a/drivers/base/property.c b/drivers/base/property.c
index 302236281d830..8f205f6461ed8 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -1410,9 +1410,8 @@ int fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode,
 }
 EXPORT_SYMBOL(fwnode_graph_parse_endpoint);
 
-void *device_get_match_data(struct device *dev)
+const void *device_get_match_data(struct device *dev)
 {
-	return fwnode_call_ptr_op(dev_fwnode(dev), device_get_match_data,
-				  dev);
+	return fwnode_call_ptr_op(dev_fwnode(dev), device_get_match_data, dev);
 }
 EXPORT_SYMBOL_GPL(device_get_match_data);
diff --git a/drivers/of/property.c b/drivers/of/property.c
index 36ed84e26d9c2..f46828e3b082b 100644
--- a/drivers/of/property.c
+++ b/drivers/of/property.c
@@ -977,11 +977,11 @@ static int of_fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode,
 	return 0;
 }
 
-static void *
+static const void *
 of_fwnode_device_get_match_data(const struct fwnode_handle *fwnode,
 				const struct device *dev)
 {
-	return (void *)of_device_get_match_data(dev);
+	return of_device_get_match_data(dev);
 }
 
 const struct fwnode_operations of_fwnode_ops = {
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index bdf47e0f92e92..968173ec2726d 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -587,7 +587,7 @@ extern int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *),
 const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids,
 					       const struct device *dev);
 
-void *acpi_device_get_match_data(const struct device *dev);
+const void *acpi_device_get_match_data(const struct device *dev);
 extern bool acpi_driver_match_device(struct device *dev,
 				     const struct device_driver *drv);
 int acpi_device_uevent_modalias(struct device *, struct kobj_uevent_env *);
@@ -766,7 +766,7 @@ static inline const struct acpi_device_id *acpi_match_device(
 	return NULL;
 }
 
-static inline void *acpi_device_get_match_data(const struct device *dev)
+static inline const void *acpi_device_get_match_data(const struct device *dev)
 {
 	return NULL;
 }
diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h
index 4fa1a489efe4c..4fe8f289b3f6f 100644
--- a/include/linux/fwnode.h
+++ b/include/linux/fwnode.h
@@ -73,8 +73,8 @@ struct fwnode_operations {
 	struct fwnode_handle *(*get)(struct fwnode_handle *fwnode);
 	void (*put)(struct fwnode_handle *fwnode);
 	bool (*device_is_available)(const struct fwnode_handle *fwnode);
-	void *(*device_get_match_data)(const struct fwnode_handle *fwnode,
-				       const struct device *dev);
+	const void *(*device_get_match_data)(const struct fwnode_handle *fwnode,
+					     const struct device *dev);
 	bool (*property_present)(const struct fwnode_handle *fwnode,
 				 const char *propname);
 	int (*property_read_int_array)(const struct fwnode_handle *fwnode,
diff --git a/include/linux/property.h b/include/linux/property.h
index 769d372c1edf6..2eea4b310fc28 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -283,7 +283,7 @@ bool device_dma_supported(struct device *dev);
 
 enum dev_dma_attr device_get_dma_attr(struct device *dev);
 
-void *device_get_match_data(struct device *dev);
+const void *device_get_match_data(struct device *dev);
 
 int device_get_phy_mode(struct device *dev);
 

From ea56fb282368ea08c2a313af6b55cb597aec4db1 Mon Sep 17 00:00:00 2001
From: Stefan Agner <stefan@agner.ch>
Date: Fri, 9 Feb 2018 13:21:42 +0100
Subject: [PATCH 511/676] mtd: nand: vf610: set correct ooblayout

With commit 3cf32d180227 ("mtd: nand: vf610: switch to
mtd_ooblayout_ops") the driver started to use the NAND cores
default large page ooblayout. However, shortly after commit
6a623e076944 ("mtd: nand: add ooblayout for old hamming layout")
changed the default layout to the old hamming layout, which is
not what vf610_nfc is using. Specify the default large page
layout explicitly.

Fixes: 6a623e076944 ("mtd: nand: add ooblayout for old hamming layout")
Cc: <stable@vger.kernel.org> # v4.12+
Signed-off-by: Stefan Agner <stefan@agner.ch>
Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
---
 drivers/mtd/nand/vf610_nfc.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/mtd/nand/vf610_nfc.c b/drivers/mtd/nand/vf610_nfc.c
index 80d31a58e558c..f367144f3c6f3 100644
--- a/drivers/mtd/nand/vf610_nfc.c
+++ b/drivers/mtd/nand/vf610_nfc.c
@@ -752,10 +752,8 @@ static int vf610_nfc_probe(struct platform_device *pdev)
 		if (mtd->oobsize > 64)
 			mtd->oobsize = 64;
 
-		/*
-		 * mtd->ecclayout is not specified here because we're using the
-		 * default large page ECC layout defined in NAND core.
-		 */
+		/* Use default large page ECC layout defined in NAND core */
+		mtd_set_ooblayout(mtd, &nand_ooblayout_lp_ops);
 		if (chip->ecc.strength == 32) {
 			nfc->ecc_mode = ECC_60_BYTE;
 			chip->ecc.bytes = 60;

From f23def8038611fa362de345c540107c78edaa085 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Tue, 30 Jan 2018 14:23:21 +0100
Subject: [PATCH 512/676] mtd: nand: MTD_NAND_MARVELL should depend on HAS_DMA

If NO_DMA=y:

    ERROR: "bad_dma_ops" [drivers/mtd/nand/marvell_nand.ko] undefined!

Add a dependency on HAS_DMA to fix this.

Fixes: 02f26ecf8c772751 ("mtd: nand: add reworked Marvell NAND controller driver")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Miquel Raynal <miquel.raynal@free-electrons.com>
Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
---
 drivers/mtd/nand/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
index e6b8c59f2c0da..736ac887303c8 100644
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -328,7 +328,7 @@ config MTD_NAND_MARVELL
 	tristate "NAND controller support on Marvell boards"
 	depends on PXA3xx || ARCH_MMP || PLAT_ORION || ARCH_MVEBU || \
 		   COMPILE_TEST
-	depends on HAS_IOMEM
+	depends on HAS_IOMEM && HAS_DMA
 	help
 	  This enables the NAND flash controller driver for Marvell boards,
 	  including:

From b6d8ef86cb7b8b6920b6815ebf1352757d3adb87 Mon Sep 17 00:00:00 2001
From: Aishwarya Pant <aishpant@gmail.com>
Date: Wed, 7 Feb 2018 19:04:36 +0530
Subject: [PATCH 513/676] Documentation/ABI: update cpuidle sysfs documentation

Update cpuidle documentation using git logs and existing documentation
in Documentation/cpuidle/sysfs.txt. This might be useful for scripting
and tracking changes in the ABI.

Signed-off-by: Aishwarya Pant <aishpant@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 .../ABI/testing/sysfs-devices-system-cpu      | 77 ++++++++++++++++++-
 1 file changed, 75 insertions(+), 2 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index bfd29bc8d37af..4ed63b6cfb155 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -108,6 +108,8 @@ Description:	CPU topology files that describe a logical CPU's relationship
 
 What:		/sys/devices/system/cpu/cpuidle/current_driver
 		/sys/devices/system/cpu/cpuidle/current_governer_ro
+		/sys/devices/system/cpu/cpuidle/available_governors
+		/sys/devices/system/cpu/cpuidle/current_governor
 Date:		September 2007
 Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
 Description:	Discover cpuidle policy and mechanism
@@ -119,13 +121,84 @@ Description:	Discover cpuidle policy and mechanism
 		Idle policy (governor) is differentiated from idle mechanism
 		(driver)
 
-		current_driver: displays current idle mechanism
+		current_driver: (RO) displays current idle mechanism
 
-		current_governor_ro: displays current idle policy
+		current_governor_ro: (RO) displays current idle policy
+
+		With the cpuidle_sysfs_switch boot option enabled (meant for
+		developer testing), the following three attributes are visible
+		instead:
+
+		current_driver: same as described above
+
+		available_governors: (RO) displays a space separated list of
+		available governors
+
+		current_governor: (RW) displays current idle policy. Users can
+		switch the governor at runtime by writing to this file.
 
 		See files in Documentation/cpuidle/ for more information.
 
 
+What:		/sys/devices/system/cpu/cpuX/cpuidle/stateN/name
+		/sys/devices/system/cpu/cpuX/cpuidle/stateN/latency
+		/sys/devices/system/cpu/cpuX/cpuidle/stateN/power
+		/sys/devices/system/cpu/cpuX/cpuidle/stateN/time
+		/sys/devices/system/cpu/cpuX/cpuidle/stateN/usage
+Date:		September 2007
+KernelVersion:	v2.6.24
+Contact:	Linux power management list <linux-pm@vger.kernel.org>
+Description:
+		The directory /sys/devices/system/cpu/cpuX/cpuidle contains per
+		logical CPU specific cpuidle information for each online cpu X.
+		The processor idle states which are available for use have the
+		following attributes:
+
+		name: (RO) Name of the idle state (string).
+
+		latency: (RO) The latency to exit out of this idle state (in
+		microseconds).
+
+		power: (RO) The power consumed while in this idle state (in
+		milliwatts).
+
+		time: (RO) The total time spent in this idle state (in microseconds).
+
+		usage: (RO) Number of times this state was entered (a count).
+
+
+What:		/sys/devices/system/cpu/cpuX/cpuidle/stateN/desc
+Date:		February 2008
+KernelVersion:	v2.6.25
+Contact:	Linux power management list <linux-pm@vger.kernel.org>
+Description:
+		(RO) A small description about the idle state (string).
+
+
+What:		/sys/devices/system/cpu/cpuX/cpuidle/stateN/disable
+Date:		March 2012
+KernelVersion:	v3.10
+Contact:	Linux power management list <linux-pm@vger.kernel.org>
+Description:
+		(RW) Option to disable this idle state (bool). The behavior and
+		the effect of the disable variable depends on the implementation
+		of a particular governor. In the ladder governor, for example,
+		it is not coherent, i.e. if one is disabling a light state, then
+		all deeper states are disabled as well, but the disable variable
+		does not reflect it. Likewise, if one enables a deep state but a
+		lighter state still is disabled, then this has no effect.
+
+
+What:		/sys/devices/system/cpu/cpuX/cpuidle/stateN/residency
+Date:		March 2014
+KernelVersion:	v3.15
+Contact:	Linux power management list <linux-pm@vger.kernel.org>
+Description:
+		(RO) Display the target residency i.e. the minimum amount of
+		time (in microseconds) this cpu should spend in this idle state
+		to make the transition worth the effort.
+
+
 What:		/sys/devices/system/cpu/cpu#/cpufreq/*
 Date:		pre-git history
 Contact:	linux-pm@vger.kernel.org

From 69728051f5bf15efaf6edfbcfe1b5a49a2437918 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Fri, 9 Feb 2018 08:11:26 -0800
Subject: [PATCH 514/676] PM / wakeirq: Fix unbalanced IRQ enable for wakeirq

If a device is runtime PM suspended when we enter suspend and has
a dedicated wake IRQ, we can get the following warning:

WARNING: CPU: 0 PID: 108 at kernel/irq/manage.c:526 enable_irq+0x40/0x94
[  102.087860] Unbalanced enable for IRQ 147
...
(enable_irq) from [<c06117a8>] (dev_pm_arm_wake_irq+0x4c/0x60)
(dev_pm_arm_wake_irq) from [<c0618360>]
 (device_wakeup_arm_wake_irqs+0x58/0x9c)
(device_wakeup_arm_wake_irqs) from [<c0615948>]
(dpm_suspend_noirq+0x10/0x48)
(dpm_suspend_noirq) from [<c01ac7ac>]
(suspend_devices_and_enter+0x30c/0xf14)
(suspend_devices_and_enter) from [<c01adf20>]
(enter_state+0xad4/0xbd8)
(enter_state) from [<c01ad3ec>] (pm_suspend+0x38/0x98)
(pm_suspend) from [<c01ab3e8>] (state_store+0x68/0xc8)

This is because the dedicated wake IRQ for the device may have been
already enabled earlier by dev_pm_enable_wake_irq_check().  Fix the
issue by checking for runtime PM suspended status.

This issue can be easily reproduced by setting serial console log level
to zero, letting the serial console idle, and suspend the system from
an ssh terminal.  On resume, dmesg will have the warning above.

The reason why I have not run into this issue earlier has been that I
typically run my PM test cases from on a serial console instead over ssh.

Fixes: c84345597558 (PM / wakeirq: Enable dedicated wakeirq for suspend)
Signed-off-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/wakeirq.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/base/power/wakeirq.c b/drivers/base/power/wakeirq.c
index a8ac86e4d79e7..6637fc319269b 100644
--- a/drivers/base/power/wakeirq.c
+++ b/drivers/base/power/wakeirq.c
@@ -321,7 +321,8 @@ void dev_pm_arm_wake_irq(struct wake_irq *wirq)
 		return;
 
 	if (device_may_wakeup(wirq->dev)) {
-		if (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED)
+		if (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED &&
+		    !pm_runtime_status_suspended(wirq->dev))
 			enable_irq(wirq->irq);
 
 		enable_irq_wake(wirq->irq);
@@ -343,7 +344,8 @@ void dev_pm_disarm_wake_irq(struct wake_irq *wirq)
 	if (device_may_wakeup(wirq->dev)) {
 		disable_irq_wake(wirq->irq);
 
-		if (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED)
+		if (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED &&
+		    !pm_runtime_status_suspended(wirq->dev))
 			disable_irq_nosync(wirq->irq);
 	}
 }

From 433986c2c265d106d6a8e88006e0131fefc92b7b Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Sat, 10 Feb 2018 19:13:58 +0100
Subject: [PATCH 515/676] PM / runtime: Update links_count also if !CONFIG_SRCU

Commit baa8809f6097 (PM / runtime: Optimize the use of device links)
added an invocation of pm_runtime_drop_link() to __device_link_del().
However there are two variants of that function, one for CONFIG_SRCU and
another for !CONFIG_SRCU, and the commit only modified the former.

Fixes: baa8809f6097 (PM / runtime: Optimize the use of device links)
Cc: v4.10+ <stable@vger.kernel.org> # v4.10+
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/base/core.c b/drivers/base/core.c
index b2261f92f2f1c..5847364f25d96 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -310,6 +310,9 @@ static void __device_link_del(struct device_link *link)
 	dev_info(link->consumer, "Dropping the link to %s\n",
 		 dev_name(link->supplier));
 
+	if (link->flags & DL_FLAG_PM_RUNTIME)
+		pm_runtime_drop_link(link->consumer);
+
 	list_del(&link->s_node);
 	list_del(&link->c_node);
 	device_link_free(link);

From 6b4af818c7d7a35a861c94596e05e43596e5fd28 Mon Sep 17 00:00:00 2001
From: Aishwarya Pant <aishpant@gmail.com>
Date: Sat, 10 Feb 2018 14:27:19 +0530
Subject: [PATCH 516/676] ACPI / DPTF: Document dptf_power sysfs atttributes

The descriptions have been collected from git commit logs and reading
through code.

Signed-off-by: Aishwarya Pant <aishpant@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/ABI/testing/sysfs-platform-dptf | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-platform-dptf

diff --git a/Documentation/ABI/testing/sysfs-platform-dptf b/Documentation/ABI/testing/sysfs-platform-dptf
new file mode 100644
index 0000000000000..325dc0667dbb8
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-platform-dptf
@@ -0,0 +1,40 @@
+What:		/sys/bus/platform/devices/INT3407:00/dptf_power/charger_type
+Date:		Jul, 2016
+KernelVersion:	v4.10
+Contact:	linux-acpi@vger.kernel.org
+Description:
+		(RO) The charger type - Traditional, Hybrid or NVDC.
+
+What:		/sys/bus/platform/devices/INT3407:00/dptf_power/adapter_rating_mw
+Date:		Jul, 2016
+KernelVersion:	v4.10
+Contact:	linux-acpi@vger.kernel.org
+Description:
+		(RO) Adapter rating in milliwatts (the maximum Adapter power).
+		Must be 0 if no AC Adaptor is plugged in.
+
+What:		/sys/bus/platform/devices/INT3407:00/dptf_power/max_platform_power_mw
+Date:		Jul, 2016
+KernelVersion:	v4.10
+Contact:	linux-acpi@vger.kernel.org
+Description:
+		(RO) Maximum platform power that can be supported by the battery
+		in milliwatts.
+
+What:		/sys/bus/platform/devices/INT3407:00/dptf_power/platform_power_source
+Date:		Jul, 2016
+KernelVersion:	v4.10
+Contact:	linux-acpi@vger.kernel.org
+Description:
+		(RO) Display the platform power source
+		0x00 = DC
+		0x01 = AC
+		0x02 = USB
+		0x03 = Wireless Charger
+
+What:		/sys/bus/platform/devices/INT3407:00/dptf_power/battery_steady_power
+Date:		Jul, 2016
+KernelVersion:	v4.10
+Contact:	linux-acpi@vger.kernel.org
+Description:
+		(RO) The maximum sustained power for battery in milliwatts.

From 22029845ad81033115910cdef35170de6a10a1eb Mon Sep 17 00:00:00 2001
From: Aishwarya Pant <aishpant@gmail.com>
Date: Sat, 10 Feb 2018 14:27:38 +0530
Subject: [PATCH 517/676] ACPI: dock: document sysfs interface

Description has been collected from git commit history and reading
through code.

Signed-off-by: Aishwarya Pant <aishpant@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 .../ABI/testing/sysfs-devices-platform-dock   | 39 +++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-devices-platform-dock

diff --git a/Documentation/ABI/testing/sysfs-devices-platform-dock b/Documentation/ABI/testing/sysfs-devices-platform-dock
new file mode 100644
index 0000000000000..1d8c18f905c7d
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-platform-dock
@@ -0,0 +1,39 @@
+What:		/sys/devices/platform/dock.N/docked
+Date:		Dec, 2006
+KernelVersion:	2.6.19
+Contact:	linux-acpi@vger.kernel.org
+Description:
+		(RO) Value 1 or 0 indicates whether the software believes the
+		laptop is docked in a docking station.
+
+What:		/sys/devices/platform/dock.N/undock
+Date:		Dec, 2006
+KernelVersion:	2.6.19
+Contact:	linux-acpi@vger.kernel.org
+Description:
+		(WO) Writing to this file causes the software to initiate an
+		undock request to the firmware.
+
+What:		/sys/devices/platform/dock.N/uid
+Date:		Feb, 2007
+KernelVersion:	v2.6.21
+Contact:	linux-acpi@vger.kernel.org
+Description:
+		(RO) Displays the docking station the laptop is docked to.
+
+What:		/sys/devices/platform/dock.N/flags
+Date:		May, 2007
+KernelVersion:	v2.6.21
+Contact:	linux-acpi@vger.kernel.org
+Description:
+		(RO) Show dock station flags, useful for checking if undock
+		request has been made by the user (from the immediate_undock
+		option).
+
+What:		/sys/devices/platform/dock.N/type
+Date:		Aug, 2008
+KernelVersion:	v2.6.27
+Contact:	linux-acpi@vger.kernel.org
+Description:
+		(RO) Display the dock station type- dock_station, ata_bay or
+		battery_bay.

From d7212cfb05ba802bea4dd6c90d61cfe6366ea224 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 12 Feb 2018 11:34:22 +0100
Subject: [PATCH 518/676] PM: cpuidle: Fix cpuidle_poll_state_init() prototype

Commit f85942207516 (x86: PM: Make APM idle driver initialize polling
state) made apm_init() call cpuidle_poll_state_init(), but that only
is defined for CONFIG_CPU_IDLE set, so make the empty stub of it
available for CONFIG_CPU_IDLE unset too to fix the resulting build
issue.

Fixes: f85942207516 (x86: PM: Make APM idle driver initialize polling state)
Cc: 4.14+ <stable@vger.kernel.org> # 4.14+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/cpuidle.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 8f7788d23b573..a6989e02d0a06 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -225,7 +225,7 @@ static inline void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev,
 }
 #endif
 
-#ifdef CONFIG_ARCH_HAS_CPU_RELAX
+#if defined(CONFIG_CPU_IDLE) && defined(CONFIG_ARCH_HAS_CPU_RELAX)
 void cpuidle_poll_state_init(struct cpuidle_driver *drv);
 #else
 static inline void cpuidle_poll_state_init(struct cpuidle_driver *drv) {}

From 16e574d762ac5512eb922ac0ac5eed360b7db9d8 Mon Sep 17 00:00:00 2001
From: Shanker Donthineni <shankerd@codeaurora.org>
Date: Sun, 11 Feb 2018 19:16:15 -0600
Subject: [PATCH 519/676] arm64: Add missing Falkor part number for branch
 predictor hardening

References to CPU part number MIDR_QCOM_FALKOR were dropped from the
mailing list patch due to mainline/arm64 branch dependency. So this
patch adds the missing part number.

Fixes: ec82b567a74f ("arm64: Implement branch predictor hardening for Falkor")
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/cpu_errata.c | 9 +++++++++
 arch/arm64/kvm/hyp/switch.c    | 4 +++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 07823595b7f01..52f15cd896e11 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -406,6 +406,15 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 		.capability = ARM64_HARDEN_BP_POST_GUEST_EXIT,
 		MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR_V1),
 	},
+	{
+		.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
+		MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR),
+		.enable = qcom_enable_link_stack_sanitization,
+	},
+	{
+		.capability = ARM64_HARDEN_BP_POST_GUEST_EXIT,
+		MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR),
+	},
 	{
 		.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
 		MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN),
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 116252a8d3a55..870f4b1587f97 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -407,8 +407,10 @@ int __hyp_text __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 		u32 midr = read_cpuid_id();
 
 		/* Apply BTAC predictors mitigation to all Falkor chips */
-		if ((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR_V1)
+		if (((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR) ||
+		    ((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR_V1)) {
 			__qcom_hyp_sanitize_btac_predictors();
+		}
 	}
 
 	fp_enabled = __fpsimd_enabled();

From 9d37c094dacda531ac3e529dd4dd139e3c0b7811 Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Thu, 8 Feb 2018 19:39:20 +0000
Subject: [PATCH 520/676] ia64: fix build failure with CONFIG_SWIOTLB

arch/ia64/kernel/pci-swiotlb.c is removed in commit 4fac8076df85 ("ia64: clean up swiotlb support")
but pci-swiotlb.o is still present in Makefile, and so build fail when
CONFIG_SWIOTLB is enabled.
Fix the build failure by removing pci-swiotlb.o from makefile

Fixes: 4fac8076df85 ("ia64: clean up swiotlb support")
Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/ia64/kernel/Makefile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index 0b4c65a1af25f..498f3da3f225d 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -41,7 +41,6 @@ ifneq ($(CONFIG_IA64_ESI),)
 obj-y				+= esi_stub.o	# must be in kernel proper
 endif
 obj-$(CONFIG_INTEL_IOMMU)	+= pci-dma.o
-obj-$(CONFIG_SWIOTLB)		+= pci-swiotlb.o
 
 obj-$(CONFIG_BINFMT_ELF)	+= elfcore.o
 

From f25e6f6b4eae7e25e92e91a570cae84bf83e751a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 2 Feb 2018 09:21:07 +0100
Subject: [PATCH 521/676] dma-direct: mark as is_phys

Various PCI_DMA_BUS_IS_PHYS implementations rely on this flag to make proper
decisions for block and networking addressability.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 lib/dma-direct.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/dma-direct.c b/lib/dma-direct.c
index 40b1f92f2214e..fdc733cf9e30a 100644
--- a/lib/dma-direct.c
+++ b/lib/dma-direct.c
@@ -152,5 +152,6 @@ const struct dma_map_ops dma_direct_ops = {
 	.map_sg			= dma_direct_map_sg,
 	.dma_supported		= dma_direct_supported,
 	.mapping_error		= dma_direct_mapping_error,
+	.is_phys		= 1,
 };
 EXPORT_SYMBOL(dma_direct_ops);

From 42ed64524d846b96afaa8b3f9ba045bcaf11ab0f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 2 Feb 2018 09:51:14 +0100
Subject: [PATCH 522/676] dma-direct: comment the dma_direct_free calling
 convention

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 lib/dma-direct.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/dma-direct.c b/lib/dma-direct.c
index fdc733cf9e30a..c9e8e21cb3340 100644
--- a/lib/dma-direct.c
+++ b/lib/dma-direct.c
@@ -84,6 +84,10 @@ void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
 	return page_address(page);
 }
 
+/*
+ * NOTE: this function must never look at the dma_addr argument, because we want
+ * to be able to use it as a helper for iommu implementations as well.
+ */
 void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
 		dma_addr_t dma_addr, unsigned long attrs)
 {

From ecc2dc55ce79945c2e0a04977706a99dc4848229 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 10 Feb 2018 09:43:49 +0100
Subject: [PATCH 523/676] dma-mapping: fix a comment typo

Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-mapping.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 34fe8463d10ea..eb9eab4ecd6d7 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -578,7 +578,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 
 /*
  * This is a hack for the legacy x86 forbid_dac and iommu_sac_force. Please
- * don't use this is new code.
+ * don't use this in new code.
  */
 #ifndef arch_dma_supported
 #define arch_dma_supported(dev, mask)	(1)

From 0a34e4668c508cbbc2d5ef2d9710b145e4c0b27d Mon Sep 17 00:00:00 2001
From: Roland Dreier <roland@purestorage.com>
Date: Thu, 11 Jan 2018 13:38:15 -0800
Subject: [PATCH 524/676] nvme: Don't use a stack buffer for keep-alive command

In nvme_keep_alive() we pass a request with a pointer to an NVMe command on
the stack into blk_execute_rq_nowait().  However, the block layer doesn't
guarantee that the request is fully queued before blk_execute_rq_nowait()
returns.  If not, and the request is queued after nvme_keep_alive() returns,
then we'll end up using stack memory that might have been overwritten to
form the NVMe command we pass to hardware.

Fix this by keeping a special command struct in the nvme_ctrl struct right
next to the delayed work struct used for keep-alives.

Signed-off-by: Roland Dreier <roland@purestorage.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/core.c | 8 +++-----
 drivers/nvme/host/nvme.h | 1 +
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2fd8688cfa474..6d0490b477c96 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -796,13 +796,9 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
 
 static int nvme_keep_alive(struct nvme_ctrl *ctrl)
 {
-	struct nvme_command c;
 	struct request *rq;
 
-	memset(&c, 0, sizeof(c));
-	c.common.opcode = nvme_admin_keep_alive;
-
-	rq = nvme_alloc_request(ctrl->admin_q, &c, BLK_MQ_REQ_RESERVED,
+	rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, BLK_MQ_REQ_RESERVED,
 			NVME_QID_ANY);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
@@ -834,6 +830,8 @@ void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
 		return;
 
 	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
+	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
+	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
 	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
 }
 EXPORT_SYMBOL_GPL(nvme_start_keep_alive);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 27e31c00b306f..0521e4707d1cf 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -183,6 +183,7 @@ struct nvme_ctrl {
 	struct work_struct scan_work;
 	struct work_struct async_event_work;
 	struct delayed_work ka_work;
+	struct nvme_command ka_cmd;
 	struct work_struct fw_act_work;
 
 	/* Power saving configuration */

From aef17ca1271948ee57cc39b2493d31110cc42625 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Wed, 7 Feb 2018 17:49:39 -0800
Subject: [PATCH 525/676] hwmon: (k10temp) Only apply temperature offset if
 result is positive
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A user reports a really bad temperature on Ryzen 1950X.

k10temp-pci-00cb
Adapter: PCI adapter
temp1: +4294948.3°C (high = +70.0°C)

This will happen if the temperature reported by the chip is lower than
the offset temperature. This has been seen in the field if "Sense MI Skew"
and/or "Sense MI Offset" BIOS parameters were set to unexpected values.
Let's report a temperature of 0 degrees C in that case.

Fixes: 1b50b776355f ("hwmon: (k10temp) Add support for temperature offsets")
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/hwmon/k10temp.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c
index 06b4e1c78bd8f..4c6594a4661d7 100644
--- a/drivers/hwmon/k10temp.c
+++ b/drivers/hwmon/k10temp.c
@@ -129,7 +129,10 @@ static ssize_t temp1_input_show(struct device *dev,
 
 	data->read_tempreg(data->pdev, &regval);
 	temp = (regval >> 21) * 125;
-	temp -= data->temp_offset;
+	if (temp > data->temp_offset)
+		temp -= data->temp_offset;
+	else
+		temp = 0;
 
 	return sprintf(buf, "%u\n", temp);
 }

From 75b0e73023ef7994348d619e9adadab0e96bb195 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 8 Feb 2018 10:24:02 +0000
Subject: [PATCH 526/676] drm/i915/perf: Fix compiler warning for string
 truncation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

drivers/gpu/drm/i915/i915_oa_cflgt3.c: In function ‘i915_perf_load_test_config_cflgt3’:
drivers/gpu/drm/i915/i915_oa_cflgt3.c:87:2: error: ‘strncpy’ output truncated before terminating nul copying 36 bytes from a string of the same length [-Werror=stringop-truncation]

v2: strlcpy

Fixes: 4407eaa9b0cc ("drm/i915/perf: add support for Coffeelake GT3")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Matthew Auld <matthew.auld@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180208102403.5587-1-chris@chris-wilson.co.uk
(cherry picked from commit 43df81d324cdd7056ad0ce3df709aff8dce856b7)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/i915/i915_oa_cflgt3.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_oa_cflgt3.c b/drivers/gpu/drm/i915/i915_oa_cflgt3.c
index 42ff06fe54a3a..792facdb6702b 100644
--- a/drivers/gpu/drm/i915/i915_oa_cflgt3.c
+++ b/drivers/gpu/drm/i915/i915_oa_cflgt3.c
@@ -84,9 +84,9 @@ show_test_oa_id(struct device *kdev, struct device_attribute *attr, char *buf)
 void
 i915_perf_load_test_config_cflgt3(struct drm_i915_private *dev_priv)
 {
-	strncpy(dev_priv->perf.oa.test_config.uuid,
+	strlcpy(dev_priv->perf.oa.test_config.uuid,
 		"577e8e2c-3fa0-4875-8743-3538d585e3b0",
-		UUID_STRING_LEN);
+		sizeof(dev_priv->perf.oa.test_config.uuid));
 	dev_priv->perf.oa.test_config.id = 1;
 
 	dev_priv->perf.oa.test_config.mux_regs = mux_config_test_oa;

From 73b0fcd24ef1b8e20b7f6e6babcde540d96d0cb2 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 8 Feb 2018 10:24:03 +0000
Subject: [PATCH 527/676] drm/i915/perf: Fix compiler warning for string
 truncation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

drivers/gpu/drm/i915/i915_oa_cnl.c: In function ‘i915_perf_load_test_config_cnl’:
drivers/gpu/drm/i915/i915_oa_cnl.c:99:2: error: ‘strncpy’ output truncated before terminating nul copying 36 bytes from a string of the same length [-Werror=stringop-truncation]

v2: strlcpy

Fixes: 95690a02fb5d ("drm/i915/perf: enable perf support on CNL")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Matthew Auld <matthew.auld@intel.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180208102403.5587-2-chris@chris-wilson.co.uk
(cherry picked from commit 020580ff8edd50e64ae1bf47e560c61e5e2f29fc)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/i915/i915_oa_cnl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_oa_cnl.c b/drivers/gpu/drm/i915/i915_oa_cnl.c
index ff0ac3627cc4b..ba9140c87cc0b 100644
--- a/drivers/gpu/drm/i915/i915_oa_cnl.c
+++ b/drivers/gpu/drm/i915/i915_oa_cnl.c
@@ -96,9 +96,9 @@ show_test_oa_id(struct device *kdev, struct device_attribute *attr, char *buf)
 void
 i915_perf_load_test_config_cnl(struct drm_i915_private *dev_priv)
 {
-	strncpy(dev_priv->perf.oa.test_config.uuid,
+	strlcpy(dev_priv->perf.oa.test_config.uuid,
 		"db41edd4-d8e7-4730-ad11-b9a2d6833503",
-		UUID_STRING_LEN);
+		sizeof(dev_priv->perf.oa.test_config.uuid));
 	dev_priv->perf.oa.test_config.id = 1;
 
 	dev_priv->perf.oa.test_config.mux_regs = mux_config_test_oa;

From 33afe065b66f226ee5f90ab24ff55799c896e381 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 8 Feb 2018 08:51:51 +0000
Subject: [PATCH 528/676] drm/i915: Avoid truncation before clamping
 userspace's priority value

Userspace provides a 64b value for the priority, we need to be careful
to preserve the full range before validation to prevent truncation (and
letting an illegal value pass).

Reported-by: Antonio Argenziano <antonio.argenziano@intel.com>
Fixes: ac14fbd460d0 ("drm/i915/scheduler: Support user-defined priorities")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Antonio Argenziano <antonio.argenziano@intel.com>
Cc: Michal Winiarski <michal.winiarski@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180208085151.11480-1-chris@chris-wilson.co.uk
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
(cherry picked from commit 11a18f631959fd1ca10856c836a827683536770c)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_context.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 648e7536ff51e..0c963fcf31ffd 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -803,7 +803,7 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 
 	case I915_CONTEXT_PARAM_PRIORITY:
 		{
-			int priority = args->value;
+			s64 priority = args->value;
 
 			if (args->size)
 				ret = -EINVAL;

From 7292b9e6586534fb43e4316ad8b508bf3d1212f7 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 12 Feb 2018 09:39:28 +0000
Subject: [PATCH 529/676] drm/i915: Don't wake the device up to check if the
 engine is asleep

If the entire device is powered off, we can safely assume that the
engine is also asleep (and idle).

Reported-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Fixes: a091d4ee931b ("drm/i915: Hold a wakeref for probing the ring registers")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180212093928.6005-1-chris@chris-wilson.co.uk
(cherry picked from commit 74d00d28a15c8452f65de0a9477b52d95639cc63)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/i915/intel_engine_cs.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index d790bdc227ffb..acc661aa9c0c4 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1458,7 +1458,9 @@ static bool ring_is_idle(struct intel_engine_cs *engine)
 	struct drm_i915_private *dev_priv = engine->i915;
 	bool idle = true;
 
-	intel_runtime_pm_get(dev_priv);
+	/* If the whole device is asleep, the engine must be idle */
+	if (!intel_runtime_pm_get_if_in_use(dev_priv))
+		return true;
 
 	/* First check that no commands are left in the ring */
 	if ((I915_READ_HEAD(engine) & HEAD_ADDR) !=

From d37fc6d360a404b208547ba112e7dabb6533c7fc Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Mon, 12 Feb 2018 15:27:34 +0000
Subject: [PATCH 530/676] x86/speculation: Correct Speculation Control
 microcode blacklist again

Arjan points out that the Intel document only clears the 0xc2 microcode
on *some* parts with CPUID 506E3 (INTEL_FAM6_SKYLAKE_DESKTOP stepping 3).
For the Skylake H/S platform it's OK but for Skylake E3 which has the
same CPUID it isn't (yet) cleared.

So removing it from the blacklist was premature. Put it back for now.

Also, Arjan assures me that the 0x84 microcode for Kaby Lake which was
featured in one of the early revisions of the Intel document was never
released to the public, and won't be until/unless it is also validated
as safe. So those can change to 0x80 which is what all *other* versions
of the doc have identified.

Once the retrospective testing of existing public microcodes is done, we
should be back into a mode where new microcodes are only released in
batches and we shouldn't even need to update the blacklist for those
anyway, so this tweaking of the list isn't expected to be a thing which
keeps happening.

Requested-by: Arjan van de Ven <arjan.van.de.ven@intel.com>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan.van.de.ven@intel.com
Cc: dave.hansen@intel.com
Cc: kvm@vger.kernel.org
Cc: pbonzini@redhat.com
Link: http://lkml.kernel.org/r/1518449255-2182-1-git-send-email-dwmw@amazon.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/intel.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index f73b8148dd551..ef796f14f7ae5 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -116,13 +116,14 @@ struct sku_microcode {
 	u32 microcode;
 };
 static const struct sku_microcode spectre_bad_microcodes[] = {
-	{ INTEL_FAM6_KABYLAKE_DESKTOP,	0x0B,	0x84 },
-	{ INTEL_FAM6_KABYLAKE_DESKTOP,	0x0A,	0x84 },
-	{ INTEL_FAM6_KABYLAKE_DESKTOP,	0x09,	0x84 },
-	{ INTEL_FAM6_KABYLAKE_MOBILE,	0x0A,	0x84 },
-	{ INTEL_FAM6_KABYLAKE_MOBILE,	0x09,	0x84 },
+	{ INTEL_FAM6_KABYLAKE_DESKTOP,	0x0B,	0x80 },
+	{ INTEL_FAM6_KABYLAKE_DESKTOP,	0x0A,	0x80 },
+	{ INTEL_FAM6_KABYLAKE_DESKTOP,	0x09,	0x80 },
+	{ INTEL_FAM6_KABYLAKE_MOBILE,	0x0A,	0x80 },
+	{ INTEL_FAM6_KABYLAKE_MOBILE,	0x09,	0x80 },
 	{ INTEL_FAM6_SKYLAKE_X,		0x03,	0x0100013e },
 	{ INTEL_FAM6_SKYLAKE_X,		0x04,	0x0200003c },
+	{ INTEL_FAM6_SKYLAKE_DESKTOP,	0x03,	0xc2 },
 	{ INTEL_FAM6_BROADWELL_CORE,	0x04,	0x28 },
 	{ INTEL_FAM6_BROADWELL_GT3E,	0x01,	0x1b },
 	{ INTEL_FAM6_BROADWELL_XEON_D,	0x02,	0x14 },

From f208820a321f9b23d77d7eed89945d862d62a3ed Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Sat, 10 Feb 2018 23:39:23 +0000
Subject: [PATCH 531/676] Revert "x86/speculation: Simplify
 indirect_branch_prediction_barrier()"

This reverts commit 64e16720ea0879f8ab4547e3b9758936d483909b.

We cannot call C functions like that, without marking all the
call-clobbered registers as, well, clobbered. We might have got away
with it for now because the __ibp_barrier() function was *fairly*
unlikely to actually use any other registers. But no. Just no.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan.van.de.ven@intel.com
Cc: dave.hansen@intel.com
Cc: jmattson@google.com
Cc: karahmed@amazon.de
Cc: kvm@vger.kernel.org
Cc: pbonzini@redhat.com
Cc: rkrcmar@redhat.com
Cc: sironi@amazon.de
Link: http://lkml.kernel.org/r/1518305967-31356-3-git-send-email-dwmw@amazon.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/nospec-branch.h | 13 +++++++++----
 arch/x86/include/asm/processor.h     |  3 ---
 arch/x86/kernel/cpu/bugs.c           |  6 ------
 3 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 4d57894635f24..300cc159b4a0a 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -164,10 +164,15 @@ static inline void vmexit_fill_RSB(void)
 
 static inline void indirect_branch_prediction_barrier(void)
 {
-	alternative_input("",
-			  "call __ibp_barrier",
-			  X86_FEATURE_USE_IBPB,
-			  ASM_NO_INPUT_CLOBBER("eax", "ecx", "edx", "memory"));
+	asm volatile(ALTERNATIVE("",
+				 "movl %[msr], %%ecx\n\t"
+				 "movl %[val], %%eax\n\t"
+				 "movl $0, %%edx\n\t"
+				 "wrmsr",
+				 X86_FEATURE_USE_IBPB)
+		     : : [msr] "i" (MSR_IA32_PRED_CMD),
+			 [val] "i" (PRED_CMD_IBPB)
+		     : "eax", "ecx", "edx", "memory");
 }
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 513f9604c1929..99799fbd0f7e0 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -969,7 +969,4 @@ bool xen_set_default_idle(void);
 
 void stop_this_cpu(void *dummy);
 void df_debug(struct pt_regs *regs, long error_code);
-
-void __ibp_barrier(void);
-
 #endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 71949bf2de5ad..61152aa533772 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -337,9 +337,3 @@ ssize_t cpu_show_spectre_v2(struct device *dev,
 		       spectre_v2_module_string());
 }
 #endif
-
-void __ibp_barrier(void)
-{
-	__wrmsr(MSR_IA32_PRED_CMD, PRED_CMD_IBPB, 0);
-}
-EXPORT_SYMBOL_GPL(__ibp_barrier);

From 928a4c39484281f8ca366f53a1db79330d058401 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@amazon.co.uk>
Date: Sat, 10 Feb 2018 23:39:24 +0000
Subject: [PATCH 532/676] KVM/x86: Reduce retpoline performance impact in
 slot_handle_level_range(), by always inlining iterator helper methods
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With retpoline, tight loops of "call this function for every XXX" are
very much pessimised by taking a prediction miss *every* time. This one
is by far the biggest contributor to the guest launch time with retpoline.

By marking the iterator slot_handle_…() functions always_inline, we can
ensure that the indirect function call can be optimised away into a
direct call and it actually generates slightly smaller code because
some of the other conditionals can get optimised away too.

Performance is now pretty close to what we see with nospectre_v2 on
the command line.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Tested-by: Filippo Sironi <sironi@amazon.de>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Filippo Sironi <sironi@amazon.de>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan.van.de.ven@intel.com
Cc: dave.hansen@intel.com
Cc: jmattson@google.com
Cc: karahmed@amazon.de
Cc: kvm@vger.kernel.org
Cc: rkrcmar@redhat.com
Link: http://lkml.kernel.org/r/1518305967-31356-4-git-send-email-dwmw@amazon.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kvm/mmu.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2b8eb4da4d082..cc83bdcb65d19 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5058,7 +5058,7 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
 typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
 
 /* The caller should hold mmu-lock before calling this function. */
-static bool
+static __always_inline bool
 slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			slot_level_handler fn, int start_level, int end_level,
 			gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
@@ -5088,7 +5088,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	return flush;
 }
 
-static bool
+static __always_inline bool
 slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 		  slot_level_handler fn, int start_level, int end_level,
 		  bool lock_flush_tlb)
@@ -5099,7 +5099,7 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			lock_flush_tlb);
 }
 
-static bool
+static __always_inline bool
 slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 		      slot_level_handler fn, bool lock_flush_tlb)
 {
@@ -5107,7 +5107,7 @@ slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
 }
 
-static bool
+static __always_inline bool
 slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			slot_level_handler fn, bool lock_flush_tlb)
 {
@@ -5115,7 +5115,7 @@ slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
 				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
 }
 
-static bool
+static __always_inline bool
 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
 		 slot_level_handler fn, bool lock_flush_tlb)
 {

From 206587a9fb764d71f035dc7f6d3b6488f5d5b304 Mon Sep 17 00:00:00 2001
From: KarimAllah Ahmed <karahmed@amazon.de>
Date: Sat, 10 Feb 2018 23:39:25 +0000
Subject: [PATCH 533/676] X86/nVMX: Properly set spec_ctrl and pred_cmd before
 merging MSRs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These two variables should check whether SPEC_CTRL and PRED_CMD are
supposed to be passed through to L2 guests or not. While
msr_write_intercepted_l01 would return 'true' if it is not passed through.

So just invert the result of msr_write_intercepted_l01 to implement the
correct semantics.

Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Reviewed-by: Jim Mattson <jmattson@google.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan.van.de.ven@intel.com
Cc: dave.hansen@intel.com
Cc: kvm@vger.kernel.org
Cc: sironi@amazon.de
Fixes: 086e7d4118cc ("KVM: VMX: Allow direct access to MSR_IA32_SPEC_CTRL")
Link: http://lkml.kernel.org/r/1518305967-31356-5-git-send-email-dwmw@amazon.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kvm/vmx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bee4c49f6dd08..599179bfb87fb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -10219,8 +10219,8 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
 	 *    updated to reflect this when L1 (or its L2s) actually write to
 	 *    the MSR.
 	 */
-	bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
-	bool spec_ctrl = msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
+	bool pred_cmd = !msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
+	bool spec_ctrl = !msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
 
 	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
 	    !pred_cmd && !spec_ctrl)

From 3712caeb14dcb33fb4d5114f14c0beef10aca101 Mon Sep 17 00:00:00 2001
From: KarimAllah Ahmed <karahmed@amazon.de>
Date: Sat, 10 Feb 2018 23:39:26 +0000
Subject: [PATCH 534/676] KVM/nVMX: Set the CPU_BASED_USE_MSR_BITMAPS if we
 have a valid L02 MSR bitmap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We either clear the CPU_BASED_USE_MSR_BITMAPS and end up intercepting all
MSR accesses or create a valid L02 MSR bitmap and use that. This decision
has to be made every time we evaluate whether we are going to generate the
L02 MSR bitmap.

Before commit:

  d28b387fb74d ("KVM/VMX: Allow direct access to MSR_IA32_SPEC_CTRL")

... this was probably OK since the decision was always identical.

This is no longer the case now since the MSR bitmap might actually
change once we decide to not intercept SPEC_CTRL and PRED_CMD.

Signed-off-by: KarimAllah Ahmed <karahmed@amazon.de>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan.van.de.ven@intel.com
Cc: dave.hansen@intel.com
Cc: jmattson@google.com
Cc: kvm@vger.kernel.org
Cc: sironi@amazon.de
Link: http://lkml.kernel.org/r/1518305967-31356-6-git-send-email-dwmw@amazon.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kvm/vmx.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 599179bfb87fb..91e3539cba024 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -10130,7 +10130,8 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
 	if (cpu_has_vmx_msr_bitmap() &&
 	    nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS) &&
 	    nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
-		;
+		vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
+			      CPU_BASED_USE_MSR_BITMAPS);
 	else
 		vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
 				CPU_BASED_USE_MSR_BITMAPS);

From 21e433bdb95bdf3aa48226fd3d33af608437f293 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 13 Feb 2018 09:03:08 +0100
Subject: [PATCH 535/676] x86/speculation: Clean up various Spectre related
 details

Harmonize all the Spectre messages so that a:

    dmesg | grep -i spectre

... gives us most Spectre related kernel boot messages.

Also fix a few other details:

 - clarify a comment about firmware speculation control

 - s/KPTI/PTI

 - remove various line-breaks that made the code uglier

Acked-by: David Woodhouse <dwmw@amazon.co.uk>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/bugs.c | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 61152aa533772..4acf16a76d1e4 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -162,8 +162,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
 	if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
 		return SPECTRE_V2_CMD_NONE;
 	else {
-		ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
-					  sizeof(arg));
+		ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
 		if (ret < 0)
 			return SPECTRE_V2_CMD_AUTO;
 
@@ -175,8 +174,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
 		}
 
 		if (i >= ARRAY_SIZE(mitigation_options)) {
-			pr_err("unknown option (%s). Switching to AUTO select\n",
-			       mitigation_options[i].option);
+			pr_err("unknown option (%s). Switching to AUTO select\n", mitigation_options[i].option);
 			return SPECTRE_V2_CMD_AUTO;
 		}
 	}
@@ -185,8 +183,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
 	     cmd == SPECTRE_V2_CMD_RETPOLINE_AMD ||
 	     cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) &&
 	    !IS_ENABLED(CONFIG_RETPOLINE)) {
-		pr_err("%s selected but not compiled in. Switching to AUTO select\n",
-		       mitigation_options[i].option);
+		pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option);
 		return SPECTRE_V2_CMD_AUTO;
 	}
 
@@ -256,14 +253,14 @@ static void __init spectre_v2_select_mitigation(void)
 			goto retpoline_auto;
 		break;
 	}
-	pr_err("kernel not compiled with retpoline; no mitigation available!");
+	pr_err("Spectre mitigation: kernel not compiled with retpoline; no mitigation available!");
 	return;
 
 retpoline_auto:
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
 	retpoline_amd:
 		if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
-			pr_err("LFENCE not serializing. Switching to generic retpoline\n");
+			pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n");
 			goto retpoline_generic;
 		}
 		mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_AMD :
@@ -281,7 +278,7 @@ static void __init spectre_v2_select_mitigation(void)
 	pr_info("%s\n", spectre_v2_strings[mode]);
 
 	/*
-	 * If neither SMEP or KPTI are available, there is a risk of
+	 * If neither SMEP nor PTI are available, there is a risk of
 	 * hitting userspace addresses in the RSB after a context switch
 	 * from a shallow call stack to a deeper one. To prevent this fill
 	 * the entire RSB, even when using IBRS.
@@ -295,21 +292,20 @@ static void __init spectre_v2_select_mitigation(void)
 	if ((!boot_cpu_has(X86_FEATURE_PTI) &&
 	     !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) {
 		setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
-		pr_info("Filling RSB on context switch\n");
+		pr_info("Spectre v2 mitigation: Filling RSB on context switch\n");
 	}
 
 	/* Initialize Indirect Branch Prediction Barrier if supported */
 	if (boot_cpu_has(X86_FEATURE_IBPB)) {
 		setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
-		pr_info("Enabling Indirect Branch Prediction Barrier\n");
+		pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n");
 	}
 }
 
 #undef pr_fmt
 
 #ifdef CONFIG_SYSFS
-ssize_t cpu_show_meltdown(struct device *dev,
-			  struct device_attribute *attr, char *buf)
+ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
 {
 	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
 		return sprintf(buf, "Not affected\n");
@@ -318,16 +314,14 @@ ssize_t cpu_show_meltdown(struct device *dev,
 	return sprintf(buf, "Vulnerable\n");
 }
 
-ssize_t cpu_show_spectre_v1(struct device *dev,
-			    struct device_attribute *attr, char *buf)
+ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf)
 {
 	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
 		return sprintf(buf, "Not affected\n");
 	return sprintf(buf, "Mitigation: __user pointer sanitization\n");
 }
 
-ssize_t cpu_show_spectre_v2(struct device *dev,
-			    struct device_attribute *attr, char *buf)
+ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf)
 {
 	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
 		return sprintf(buf, "Not affected\n");

From 2e3f0098bc45f710a2f4cbcc94b80a1fae7a99a1 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sun, 11 Feb 2018 11:49:42 +0100
Subject: [PATCH 536/676] x86/entry/64: Merge SAVE_C_REGS and SAVE_EXTRA_REGS,
 remove unused extensions

All current code paths call SAVE_C_REGS and then immediately
SAVE_EXTRA_REGS. Therefore, merge these two macros and order the MOV
sequeneces properly.

While at it, remove the macros to save all except specific registers,
as these macros have been unused for a long time.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dan.j.williams@intel.com
Link: http://lkml.kernel.org/r/20180211104949.12992-2-linux@dominikbrodowski.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/calling.h  | 57 +++++++++++----------------------------
 arch/x86/entry/entry_64.S | 12 +++------
 2 files changed, 19 insertions(+), 50 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index f4b129d4af42f..8907a6593b42a 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -101,49 +101,22 @@ For 32-bit we have the following conventions - kernel is built with
 	addq	$-(15*8), %rsp
 	.endm
 
-	.macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1
-	.if \r11
-	movq %r11, 6*8+\offset(%rsp)
-	.endif
-	.if \r8910
-	movq %r10, 7*8+\offset(%rsp)
-	movq %r9,  8*8+\offset(%rsp)
-	movq %r8,  9*8+\offset(%rsp)
-	.endif
-	.if \rax
-	movq %rax, 10*8+\offset(%rsp)
-	.endif
-	.if \rcx
-	movq %rcx, 11*8+\offset(%rsp)
-	.endif
-	movq %rdx, 12*8+\offset(%rsp)
-	movq %rsi, 13*8+\offset(%rsp)
+	.macro SAVE_REGS offset=0
 	movq %rdi, 14*8+\offset(%rsp)
-	UNWIND_HINT_REGS offset=\offset extra=0
-	.endm
-	.macro SAVE_C_REGS offset=0
-	SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
-	.endm
-	.macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0
-	SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1
-	.endm
-	.macro SAVE_C_REGS_EXCEPT_R891011
-	SAVE_C_REGS_HELPER 0, 1, 1, 0, 0
-	.endm
-	.macro SAVE_C_REGS_EXCEPT_RCX_R891011
-	SAVE_C_REGS_HELPER 0, 1, 0, 0, 0
-	.endm
-	.macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11
-	SAVE_C_REGS_HELPER 0, 0, 0, 1, 0
-	.endm
-
-	.macro SAVE_EXTRA_REGS offset=0
-	movq %r15, 0*8+\offset(%rsp)
-	movq %r14, 1*8+\offset(%rsp)
-	movq %r13, 2*8+\offset(%rsp)
-	movq %r12, 3*8+\offset(%rsp)
-	movq %rbp, 4*8+\offset(%rsp)
+	movq %rsi, 13*8+\offset(%rsp)
+	movq %rdx, 12*8+\offset(%rsp)
+	movq %rcx, 11*8+\offset(%rsp)
+	movq %rax, 10*8+\offset(%rsp)
+	movq %r8,  9*8+\offset(%rsp)
+	movq %r9,  8*8+\offset(%rsp)
+	movq %r10, 7*8+\offset(%rsp)
+	movq %r11, 6*8+\offset(%rsp)
 	movq %rbx, 5*8+\offset(%rsp)
+	movq %rbp, 4*8+\offset(%rsp)
+	movq %r12, 3*8+\offset(%rsp)
+	movq %r13, 2*8+\offset(%rsp)
+	movq %r14, 1*8+\offset(%rsp)
+	movq %r15, 0*8+\offset(%rsp)
 	UNWIND_HINT_REGS offset=\offset
 	.endm
 
@@ -197,7 +170,7 @@ For 32-bit we have the following conventions - kernel is built with
  * is just setting the LSB, which makes it an invalid stack address and is also
  * a signal to the unwinder that it's a pt_regs pointer in disguise.
  *
- * NOTE: This macro must be used *after* SAVE_EXTRA_REGS because it corrupts
+ * NOTE: This macro must be used *after* SAVE_REGS because it corrupts
  * the original rbp.
  */
 .macro ENCODE_FRAME_POINTER ptregs_offset=0
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 932a445febee7..1a6fc0136225d 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -573,8 +573,7 @@ END(irq_entries_start)
 1:
 
 	ALLOC_PT_GPREGS_ON_STACK
-	SAVE_C_REGS
-	SAVE_EXTRA_REGS
+	SAVE_REGS
 	CLEAR_REGS_NOSPEC
 	ENCODE_FRAME_POINTER
 
@@ -1132,8 +1131,7 @@ ENTRY(xen_failsafe_callback)
 	UNWIND_HINT_IRET_REGS
 	pushq	$-1 /* orig_ax = -1 => not a system call */
 	ALLOC_PT_GPREGS_ON_STACK
-	SAVE_C_REGS
-	SAVE_EXTRA_REGS
+	SAVE_REGS
 	CLEAR_REGS_NOSPEC
 	ENCODE_FRAME_POINTER
 	jmp	error_exit
@@ -1178,8 +1176,7 @@ idtentry machine_check		do_mce			has_error_code=0	paranoid=1
 ENTRY(paranoid_entry)
 	UNWIND_HINT_FUNC
 	cld
-	SAVE_C_REGS 8
-	SAVE_EXTRA_REGS 8
+	SAVE_REGS 8
 	CLEAR_REGS_NOSPEC
 	ENCODE_FRAME_POINTER 8
 	movl	$1, %ebx
@@ -1231,8 +1228,7 @@ END(paranoid_exit)
 ENTRY(error_entry)
 	UNWIND_HINT_FUNC
 	cld
-	SAVE_C_REGS 8
-	SAVE_EXTRA_REGS 8
+	SAVE_REGS 8
 	CLEAR_REGS_NOSPEC
 	ENCODE_FRAME_POINTER 8
 	testb	$3, CS+8(%rsp)

From 502af0d70843c2a9405d7ba1f79b4b0305aaf5f5 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sun, 11 Feb 2018 11:49:43 +0100
Subject: [PATCH 537/676] x86/entry/64: Merge the POP_C_REGS and POP_EXTRA_REGS
 macros into a single POP_REGS macro

The two special, opencoded cases for POP_C_REGS can be handled by ASM
macros.

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dan.j.williams@intel.com
Link: http://lkml.kernel.org/r/20180211104949.12992-3-linux@dominikbrodowski.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/calling.h  | 15 +++++++++++----
 arch/x86/entry/entry_64.S | 26 ++++----------------------
 2 files changed, 15 insertions(+), 26 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 8907a6593b42a..3bda31736a7b4 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -139,25 +139,32 @@ For 32-bit we have the following conventions - kernel is built with
 	xorq %r15, %r15
 	.endm
 
-	.macro POP_EXTRA_REGS
+	.macro POP_REGS pop_rdi=1 skip_r11rcx=0
 	popq %r15
 	popq %r14
 	popq %r13
 	popq %r12
 	popq %rbp
 	popq %rbx
-	.endm
-
-	.macro POP_C_REGS
+	.if \skip_r11rcx
+	popq %rsi
+	.else
 	popq %r11
+	.endif
 	popq %r10
 	popq %r9
 	popq %r8
 	popq %rax
+	.if \skip_r11rcx
+	popq %rsi
+	.else
 	popq %rcx
+	.endif
 	popq %rdx
 	popq %rsi
+	.if \pop_rdi
 	popq %rdi
+	.endif
 	.endm
 
 	.macro icebp
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1a6fc0136225d..7351c91fb7df4 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -334,15 +334,7 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
 syscall_return_via_sysret:
 	/* rcx and r11 are already restored (see code above) */
 	UNWIND_HINT_EMPTY
-	POP_EXTRA_REGS
-	popq	%rsi	/* skip r11 */
-	popq	%r10
-	popq	%r9
-	popq	%r8
-	popq	%rax
-	popq	%rsi	/* skip rcx */
-	popq	%rdx
-	popq	%rsi
+	POP_REGS pop_rdi=0 skip_r11rcx=1
 
 	/*
 	 * Now all regs are restored except RSP and RDI.
@@ -635,15 +627,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
 	ud2
 1:
 #endif
-	POP_EXTRA_REGS
-	popq	%r11
-	popq	%r10
-	popq	%r9
-	popq	%r8
-	popq	%rax
-	popq	%rcx
-	popq	%rdx
-	popq	%rsi
+	POP_REGS pop_rdi=0
 
 	/*
 	 * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
@@ -701,8 +685,7 @@ GLOBAL(restore_regs_and_return_to_kernel)
 	ud2
 1:
 #endif
-	POP_EXTRA_REGS
-	POP_C_REGS
+	POP_REGS
 	addq	$8, %rsp	/* skip regs->orig_ax */
 	INTERRUPT_RETURN
 
@@ -1661,8 +1644,7 @@ end_repeat_nmi:
 nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
-	POP_EXTRA_REGS
-	POP_C_REGS
+	POP_REGS
 
 	/*
 	 * Skip orig_ax and the "outermost" frame to point RSP at the "iret"

From f7bafa2b05ef25eda1d9179fd930b0330cf2b7d1 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sun, 11 Feb 2018 11:49:44 +0100
Subject: [PATCH 538/676] x86/entry/64: Interleave XOR register clearing with
 PUSH instructions

Same as is done for syscalls, interleave XOR with PUSH instructions
for exceptions/interrupts, in order to minimize the cost of the
additional instructions required for register clearing.

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dan.j.williams@intel.com
Link: http://lkml.kernel.org/r/20180211104949.12992-4-linux@dominikbrodowski.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/calling.h  | 40 +++++++++++++++++++--------------------
 arch/x86/entry/entry_64.S | 30 ++++++++++++++++++++---------
 2 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 3bda31736a7b4..a05cbb81268d9 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -101,44 +101,42 @@ For 32-bit we have the following conventions - kernel is built with
 	addq	$-(15*8), %rsp
 	.endm
 
-	.macro SAVE_REGS offset=0
+	.macro SAVE_AND_CLEAR_REGS offset=0
+	/*
+	 * Save registers and sanitize registers of values that a
+	 * speculation attack might otherwise want to exploit. The
+	 * lower registers are likely clobbered well before they
+	 * could be put to use in a speculative execution gadget.
+	 * Interleave XOR with PUSH for better uop scheduling:
+	 */
 	movq %rdi, 14*8+\offset(%rsp)
 	movq %rsi, 13*8+\offset(%rsp)
 	movq %rdx, 12*8+\offset(%rsp)
 	movq %rcx, 11*8+\offset(%rsp)
 	movq %rax, 10*8+\offset(%rsp)
 	movq %r8,  9*8+\offset(%rsp)
+	xorq %r8, %r8				/* nospec r8 */
 	movq %r9,  8*8+\offset(%rsp)
+	xorq %r9, %r9				/* nospec r9 */
 	movq %r10, 7*8+\offset(%rsp)
+	xorq %r10, %r10				/* nospec r10 */
 	movq %r11, 6*8+\offset(%rsp)
+	xorq %r11, %r11				/* nospec r11 */
 	movq %rbx, 5*8+\offset(%rsp)
+	xorl %ebx, %ebx				/* nospec rbx */
 	movq %rbp, 4*8+\offset(%rsp)
+	xorl %ebp, %ebp				/* nospec rbp */
 	movq %r12, 3*8+\offset(%rsp)
+	xorq %r12, %r12				/* nospec r12 */
 	movq %r13, 2*8+\offset(%rsp)
+	xorq %r13, %r13				/* nospec r13 */
 	movq %r14, 1*8+\offset(%rsp)
+	xorq %r14, %r14				/* nospec r14 */
 	movq %r15, 0*8+\offset(%rsp)
+	xorq %r15, %r15				/* nospec r15 */
 	UNWIND_HINT_REGS offset=\offset
 	.endm
 
-	/*
-	 * Sanitize registers of values that a speculation attack
-	 * might otherwise want to exploit. The lower registers are
-	 * likely clobbered well before they could be put to use in
-	 * a speculative execution gadget:
-	 */
-	.macro CLEAR_REGS_NOSPEC
-	xorl %ebp, %ebp
-	xorl %ebx, %ebx
-	xorq %r8, %r8
-	xorq %r9, %r9
-	xorq %r10, %r10
-	xorq %r11, %r11
-	xorq %r12, %r12
-	xorq %r13, %r13
-	xorq %r14, %r14
-	xorq %r15, %r15
-	.endm
-
 	.macro POP_REGS pop_rdi=1 skip_r11rcx=0
 	popq %r15
 	popq %r14
@@ -177,7 +175,7 @@ For 32-bit we have the following conventions - kernel is built with
  * is just setting the LSB, which makes it an invalid stack address and is also
  * a signal to the unwinder that it's a pt_regs pointer in disguise.
  *
- * NOTE: This macro must be used *after* SAVE_REGS because it corrupts
+ * NOTE: This macro must be used *after* SAVE_AND_CLEAR_REGS because it corrupts
  * the original rbp.
  */
 .macro ENCODE_FRAME_POINTER ptregs_offset=0
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 7351c91fb7df4..07692b44800de 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -565,8 +565,7 @@ END(irq_entries_start)
 1:
 
 	ALLOC_PT_GPREGS_ON_STACK
-	SAVE_REGS
-	CLEAR_REGS_NOSPEC
+	SAVE_AND_CLEAR_REGS
 	ENCODE_FRAME_POINTER
 
 	testb	$3, CS(%rsp)
@@ -1114,8 +1113,7 @@ ENTRY(xen_failsafe_callback)
 	UNWIND_HINT_IRET_REGS
 	pushq	$-1 /* orig_ax = -1 => not a system call */
 	ALLOC_PT_GPREGS_ON_STACK
-	SAVE_REGS
-	CLEAR_REGS_NOSPEC
+	SAVE_AND_CLEAR_REGS
 	ENCODE_FRAME_POINTER
 	jmp	error_exit
 END(xen_failsafe_callback)
@@ -1159,8 +1157,7 @@ idtentry machine_check		do_mce			has_error_code=0	paranoid=1
 ENTRY(paranoid_entry)
 	UNWIND_HINT_FUNC
 	cld
-	SAVE_REGS 8
-	CLEAR_REGS_NOSPEC
+	SAVE_AND_CLEAR_REGS 8
 	ENCODE_FRAME_POINTER 8
 	movl	$1, %ebx
 	movl	$MSR_GS_BASE, %ecx
@@ -1211,8 +1208,7 @@ END(paranoid_exit)
 ENTRY(error_entry)
 	UNWIND_HINT_FUNC
 	cld
-	SAVE_REGS 8
-	CLEAR_REGS_NOSPEC
+	SAVE_AND_CLEAR_REGS 8
 	ENCODE_FRAME_POINTER 8
 	testb	$3, CS+8(%rsp)
 	jz	.Lerror_kernelspace
@@ -1399,18 +1395,34 @@ ENTRY(nmi)
 	pushq   (%rdx)		/* pt_regs->dx */
 	pushq   %rcx		/* pt_regs->cx */
 	pushq   %rax		/* pt_regs->ax */
+	/*
+	 * Sanitize registers of values that a speculation attack
+	 * might otherwise want to exploit. The lower registers are
+	 * likely clobbered well before they could be put to use in
+	 * a speculative execution gadget. Interleave XOR with PUSH
+	 * for better uop scheduling:
+	 */
 	pushq   %r8		/* pt_regs->r8 */
+	xorq    %r8, %r8	/* nospec   r8 */
 	pushq   %r9		/* pt_regs->r9 */
+	xorq    %r9, %r9	/* nospec   r9 */
 	pushq   %r10		/* pt_regs->r10 */
+	xorq    %r10, %r10	/* nospec   r10 */
 	pushq   %r11		/* pt_regs->r11 */
+	xorq    %r11, %r11	/* nospec   r11*/
 	pushq	%rbx		/* pt_regs->rbx */
+	xorl    %ebx, %ebx	/* nospec   rbx*/
 	pushq	%rbp		/* pt_regs->rbp */
+	xorl    %ebp, %ebp	/* nospec   rbp*/
 	pushq	%r12		/* pt_regs->r12 */
+	xorq    %r12, %r12	/* nospec   r12*/
 	pushq	%r13		/* pt_regs->r13 */
+	xorq    %r13, %r13	/* nospec   r13*/
 	pushq	%r14		/* pt_regs->r14 */
+	xorq    %r14, %r14	/* nospec   r14*/
 	pushq	%r15		/* pt_regs->r15 */
+	xorq    %r15, %r15	/* nospec   r15*/
 	UNWIND_HINT_REGS
-	CLEAR_REGS_NOSPEC
 	ENCODE_FRAME_POINTER
 
 	/*

From 3f01daecd545e818098d84fd1ad43e19a508d705 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sun, 11 Feb 2018 11:49:45 +0100
Subject: [PATCH 539/676] x86/entry/64: Introduce the PUSH_AND_CLEAN_REGS macro

Those instances where ALLOC_PT_GPREGS_ON_STACK is called just before
SAVE_AND_CLEAR_REGS can trivially be replaced by PUSH_AND_CLEAN_REGS.
This macro uses PUSH instead of MOV and should therefore be faster, at
least on newer CPUs.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dan.j.williams@intel.com
Link: http://lkml.kernel.org/r/20180211104949.12992-5-linux@dominikbrodowski.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/calling.h  | 36 ++++++++++++++++++++++++++++++++++++
 arch/x86/entry/entry_64.S |  6 ++----
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index a05cbb81268d9..57b1b87a04f05 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -137,6 +137,42 @@ For 32-bit we have the following conventions - kernel is built with
 	UNWIND_HINT_REGS offset=\offset
 	.endm
 
+	.macro PUSH_AND_CLEAR_REGS
+	/*
+	 * Push registers and sanitize registers of values that a
+	 * speculation attack might otherwise want to exploit. The
+	 * lower registers are likely clobbered well before they
+	 * could be put to use in a speculative execution gadget.
+	 * Interleave XOR with PUSH for better uop scheduling:
+	 */
+	pushq   %rdi		/* pt_regs->di */
+	pushq   %rsi		/* pt_regs->si */
+	pushq   %rdx		/* pt_regs->dx */
+	pushq   %rcx		/* pt_regs->cx */
+	pushq   %rax		/* pt_regs->ax */
+	pushq   %r8		/* pt_regs->r8 */
+	xorq    %r8, %r8	/* nospec   r8 */
+	pushq   %r9		/* pt_regs->r9 */
+	xorq    %r9, %r9	/* nospec   r9 */
+	pushq   %r10		/* pt_regs->r10 */
+	xorq    %r10, %r10	/* nospec   r10 */
+	pushq   %r11		/* pt_regs->r11 */
+	xorq    %r11, %r11	/* nospec   r11*/
+	pushq	%rbx		/* pt_regs->rbx */
+	xorl    %ebx, %ebx	/* nospec   rbx*/
+	pushq	%rbp		/* pt_regs->rbp */
+	xorl    %ebp, %ebp	/* nospec   rbp*/
+	pushq	%r12		/* pt_regs->r12 */
+	xorq    %r12, %r12	/* nospec   r12*/
+	pushq	%r13		/* pt_regs->r13 */
+	xorq    %r13, %r13	/* nospec   r13*/
+	pushq	%r14		/* pt_regs->r14 */
+	xorq    %r14, %r14	/* nospec   r14*/
+	pushq	%r15		/* pt_regs->r15 */
+	xorq    %r15, %r15	/* nospec   r15*/
+	UNWIND_HINT_REGS
+	.endm
+
 	.macro POP_REGS pop_rdi=1 skip_r11rcx=0
 	popq %r15
 	popq %r14
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 07692b44800de..cf4a9ae558f3f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -564,8 +564,7 @@ END(irq_entries_start)
 	call	switch_to_thread_stack
 1:
 
-	ALLOC_PT_GPREGS_ON_STACK
-	SAVE_AND_CLEAR_REGS
+	PUSH_AND_CLEAR_REGS
 	ENCODE_FRAME_POINTER
 
 	testb	$3, CS(%rsp)
@@ -1112,8 +1111,7 @@ ENTRY(xen_failsafe_callback)
 	addq	$0x30, %rsp
 	UNWIND_HINT_IRET_REGS
 	pushq	$-1 /* orig_ax = -1 => not a system call */
-	ALLOC_PT_GPREGS_ON_STACK
-	SAVE_AND_CLEAR_REGS
+	PUSH_AND_CLEAR_REGS
 	ENCODE_FRAME_POINTER
 	jmp	error_exit
 END(xen_failsafe_callback)

From 30907fd13bb593202574bb20af58d67c70a1ee14 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sun, 11 Feb 2018 11:49:46 +0100
Subject: [PATCH 540/676] x86/entry/64: Use PUSH_AND_CLEAN_REGS in more cases

entry_SYSCALL_64_after_hwframe() and nmi() can be converted to use
PUSH_AND_CLEAN_REGS instead of opencoded variants thereof. Due to
the interleaving, the additional XOR-based clearing of R8 and R9
in entry_SYSCALL_64_after_hwframe() should not have any noticeable
negative implications.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dan.j.williams@intel.com
Link: http://lkml.kernel.org/r/20180211104949.12992-6-linux@dominikbrodowski.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/calling.h  |  6 ++--
 arch/x86/entry/entry_64.S | 65 ++-------------------------------------
 2 files changed, 6 insertions(+), 65 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 57b1b87a04f05..d6a97e2945ee7 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -137,7 +137,7 @@ For 32-bit we have the following conventions - kernel is built with
 	UNWIND_HINT_REGS offset=\offset
 	.endm
 
-	.macro PUSH_AND_CLEAR_REGS
+	.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax
 	/*
 	 * Push registers and sanitize registers of values that a
 	 * speculation attack might otherwise want to exploit. The
@@ -147,9 +147,9 @@ For 32-bit we have the following conventions - kernel is built with
 	 */
 	pushq   %rdi		/* pt_regs->di */
 	pushq   %rsi		/* pt_regs->si */
-	pushq   %rdx		/* pt_regs->dx */
+	pushq	\rdx		/* pt_regs->dx */
 	pushq   %rcx		/* pt_regs->cx */
-	pushq   %rax		/* pt_regs->ax */
+	pushq   \rax		/* pt_regs->ax */
 	pushq   %r8		/* pt_regs->r8 */
 	xorq    %r8, %r8	/* nospec   r8 */
 	pushq   %r9		/* pt_regs->r9 */
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index cf4a9ae558f3f..b06a4b5864ba4 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -227,35 +227,8 @@ ENTRY(entry_SYSCALL_64)
 	pushq	%rcx				/* pt_regs->ip */
 GLOBAL(entry_SYSCALL_64_after_hwframe)
 	pushq	%rax				/* pt_regs->orig_ax */
-	pushq	%rdi				/* pt_regs->di */
-	pushq	%rsi				/* pt_regs->si */
-	pushq	%rdx				/* pt_regs->dx */
-	pushq	%rcx				/* pt_regs->cx */
-	pushq	$-ENOSYS			/* pt_regs->ax */
-	pushq	%r8				/* pt_regs->r8 */
-	pushq	%r9				/* pt_regs->r9 */
-	pushq	%r10				/* pt_regs->r10 */
-	/*
-	 * Clear extra registers that a speculation attack might
-	 * otherwise want to exploit. Interleave XOR with PUSH
-	 * for better uop scheduling:
-	 */
-	xorq	%r10, %r10			/* nospec   r10 */
-	pushq	%r11				/* pt_regs->r11 */
-	xorq	%r11, %r11			/* nospec   r11 */
-	pushq	%rbx				/* pt_regs->rbx */
-	xorl	%ebx, %ebx			/* nospec   rbx */
-	pushq	%rbp				/* pt_regs->rbp */
-	xorl	%ebp, %ebp			/* nospec   rbp */
-	pushq	%r12				/* pt_regs->r12 */
-	xorq	%r12, %r12			/* nospec   r12 */
-	pushq	%r13				/* pt_regs->r13 */
-	xorq	%r13, %r13			/* nospec   r13 */
-	pushq	%r14				/* pt_regs->r14 */
-	xorq	%r14, %r14			/* nospec   r14 */
-	pushq	%r15				/* pt_regs->r15 */
-	xorq	%r15, %r15			/* nospec   r15 */
-	UNWIND_HINT_REGS
+
+	PUSH_AND_CLEAR_REGS rax=$-ENOSYS
 
 	TRACE_IRQS_OFF
 
@@ -1388,39 +1361,7 @@ ENTRY(nmi)
 	pushq	1*8(%rdx)	/* pt_regs->rip */
 	UNWIND_HINT_IRET_REGS
 	pushq   $-1		/* pt_regs->orig_ax */
-	pushq   %rdi		/* pt_regs->di */
-	pushq   %rsi		/* pt_regs->si */
-	pushq   (%rdx)		/* pt_regs->dx */
-	pushq   %rcx		/* pt_regs->cx */
-	pushq   %rax		/* pt_regs->ax */
-	/*
-	 * Sanitize registers of values that a speculation attack
-	 * might otherwise want to exploit. The lower registers are
-	 * likely clobbered well before they could be put to use in
-	 * a speculative execution gadget. Interleave XOR with PUSH
-	 * for better uop scheduling:
-	 */
-	pushq   %r8		/* pt_regs->r8 */
-	xorq    %r8, %r8	/* nospec   r8 */
-	pushq   %r9		/* pt_regs->r9 */
-	xorq    %r9, %r9	/* nospec   r9 */
-	pushq   %r10		/* pt_regs->r10 */
-	xorq    %r10, %r10	/* nospec   r10 */
-	pushq   %r11		/* pt_regs->r11 */
-	xorq    %r11, %r11	/* nospec   r11*/
-	pushq	%rbx		/* pt_regs->rbx */
-	xorl    %ebx, %ebx	/* nospec   rbx*/
-	pushq	%rbp		/* pt_regs->rbp */
-	xorl    %ebp, %ebp	/* nospec   rbp*/
-	pushq	%r12		/* pt_regs->r12 */
-	xorq    %r12, %r12	/* nospec   r12*/
-	pushq	%r13		/* pt_regs->r13 */
-	xorq    %r13, %r13	/* nospec   r13*/
-	pushq	%r14		/* pt_regs->r14 */
-	xorq    %r14, %r14	/* nospec   r14*/
-	pushq	%r15		/* pt_regs->r15 */
-	xorq    %r15, %r15	/* nospec   r15*/
-	UNWIND_HINT_REGS
+	PUSH_AND_CLEAR_REGS rdx=(%rdx)
 	ENCODE_FRAME_POINTER
 
 	/*

From dde3036d62ba3375840b10ab9ec0d568fd773b07 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sun, 11 Feb 2018 11:49:47 +0100
Subject: [PATCH 541/676] x86/entry/64: Get rid of the ALLOC_PT_GPREGS_ON_STACK
 and SAVE_AND_CLEAR_REGS macros

Previously, error_entry() and paranoid_entry() saved the GP registers
onto stack space previously allocated by its callers. Combine these two
steps in the callers, and use the generic PUSH_AND_CLEAR_REGS macro
for that.

This adds a significant amount ot text size. However, Ingo Molnar points
out that:

	"these numbers also _very_ significantly over-represent the
	extra footprint. The assumptions that resulted in
	us compressing the IRQ entry code have changed very
	significantly with the new x86 IRQ allocation code we
	introduced in the last year:

	- IRQ vectors are usually populated in tightly clustered
	  groups.

	  With our new vector allocator code the typical per CPU
	  allocation percentage on x86 systems is ~3 device vectors
	  and ~10 fixed vectors out of ~220 vectors - i.e. a very
	  low ~6% utilization (!). [...]

	  The days where we allocated a lot of vectors on every
	  CPU and the compression of the IRQ entry code text
	  mattered are over.

	- Another issue is that only a small minority of vectors
	  is frequent enough to actually matter to cache utilization
	  in practice: 3-4 key IPIs and 1-2 device IRQs at most - and
	  those vectors tend to be tightly clustered as well into about
	  two groups, and are probably already on 2-3 cache lines in
	  practice.

	  For the common case of 'cache cold' IRQs it's the depth of
	  the call chain and the fragmentation of the resulting I$
	  that should be the main performance limit - not the overall
	  size of it.

	- The CPU side cost of IRQ delivery is still very expensive
	  even in the best, most cached case, as in 'over a thousand
	  cycles'. So much stuff is done that maybe contemporary x86
	  IRQ entry microcode already prefetches the IDT entry and its
	  expected call target address."[*]

[*] http://lkml.kernel.org/r/20180208094710.qnjixhm6hybebdv7@gmail.com

The "testb $3, CS(%rsp)" instruction in the idtentry macro does not need
modification. Previously, %rsp was manually decreased by 15*8; with
this patch, %rsp is decreased by 15 pushq instructions.

[jpoimboe@redhat.com: unwind hint improvements]

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dan.j.williams@intel.com
Link: http://lkml.kernel.org/r/20180211104949.12992-7-linux@dominikbrodowski.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/calling.h  | 42 +--------------------------------------
 arch/x86/entry/entry_64.S | 20 +++++++++----------
 2 files changed, 10 insertions(+), 52 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index d6a97e2945ee7..59675010c9a06 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -97,46 +97,6 @@ For 32-bit we have the following conventions - kernel is built with
 
 #define SIZEOF_PTREGS	21*8
 
-	.macro ALLOC_PT_GPREGS_ON_STACK
-	addq	$-(15*8), %rsp
-	.endm
-
-	.macro SAVE_AND_CLEAR_REGS offset=0
-	/*
-	 * Save registers and sanitize registers of values that a
-	 * speculation attack might otherwise want to exploit. The
-	 * lower registers are likely clobbered well before they
-	 * could be put to use in a speculative execution gadget.
-	 * Interleave XOR with PUSH for better uop scheduling:
-	 */
-	movq %rdi, 14*8+\offset(%rsp)
-	movq %rsi, 13*8+\offset(%rsp)
-	movq %rdx, 12*8+\offset(%rsp)
-	movq %rcx, 11*8+\offset(%rsp)
-	movq %rax, 10*8+\offset(%rsp)
-	movq %r8,  9*8+\offset(%rsp)
-	xorq %r8, %r8				/* nospec r8 */
-	movq %r9,  8*8+\offset(%rsp)
-	xorq %r9, %r9				/* nospec r9 */
-	movq %r10, 7*8+\offset(%rsp)
-	xorq %r10, %r10				/* nospec r10 */
-	movq %r11, 6*8+\offset(%rsp)
-	xorq %r11, %r11				/* nospec r11 */
-	movq %rbx, 5*8+\offset(%rsp)
-	xorl %ebx, %ebx				/* nospec rbx */
-	movq %rbp, 4*8+\offset(%rsp)
-	xorl %ebp, %ebp				/* nospec rbp */
-	movq %r12, 3*8+\offset(%rsp)
-	xorq %r12, %r12				/* nospec r12 */
-	movq %r13, 2*8+\offset(%rsp)
-	xorq %r13, %r13				/* nospec r13 */
-	movq %r14, 1*8+\offset(%rsp)
-	xorq %r14, %r14				/* nospec r14 */
-	movq %r15, 0*8+\offset(%rsp)
-	xorq %r15, %r15				/* nospec r15 */
-	UNWIND_HINT_REGS offset=\offset
-	.endm
-
 	.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax
 	/*
 	 * Push registers and sanitize registers of values that a
@@ -211,7 +171,7 @@ For 32-bit we have the following conventions - kernel is built with
  * is just setting the LSB, which makes it an invalid stack address and is also
  * a signal to the unwinder that it's a pt_regs pointer in disguise.
  *
- * NOTE: This macro must be used *after* SAVE_AND_CLEAR_REGS because it corrupts
+ * NOTE: This macro must be used *after* PUSH_AND_CLEAR_REGS because it corrupts
  * the original rbp.
  */
 .macro ENCODE_FRAME_POINTER ptregs_offset=0
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index b06a4b5864ba4..cfbf433667314 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -871,7 +871,9 @@ ENTRY(\sym)
 	pushq	$-1				/* ORIG_RAX: no syscall to restart */
 	.endif
 
-	ALLOC_PT_GPREGS_ON_STACK
+	/* Save all registers in pt_regs */
+	PUSH_AND_CLEAR_REGS
+	ENCODE_FRAME_POINTER
 
 	.if \paranoid < 2
 	testb	$3, CS(%rsp)			/* If coming from userspace, switch stacks */
@@ -1121,15 +1123,12 @@ idtentry machine_check		do_mce			has_error_code=0	paranoid=1
 #endif
 
 /*
- * Save all registers in pt_regs, and switch gs if needed.
+ * Switch gs if needed.
  * Use slow, but surefire "are we in kernel?" check.
  * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
  */
 ENTRY(paranoid_entry)
-	UNWIND_HINT_FUNC
 	cld
-	SAVE_AND_CLEAR_REGS 8
-	ENCODE_FRAME_POINTER 8
 	movl	$1, %ebx
 	movl	$MSR_GS_BASE, %ecx
 	rdmsr
@@ -1142,7 +1141,7 @@ ENTRY(paranoid_entry)
 	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
 
 	ret
-END(paranoid_entry)
+ENDPROC(paranoid_entry)
 
 /*
  * "Paranoid" exit path from exception stack.  This is invoked
@@ -1173,14 +1172,12 @@ ENTRY(paranoid_exit)
 END(paranoid_exit)
 
 /*
- * Save all registers in pt_regs, and switch gs if needed.
+ * Switch gs if needed.
  * Return: EBX=0: came from user mode; EBX=1: otherwise
  */
 ENTRY(error_entry)
-	UNWIND_HINT_FUNC
+	UNWIND_HINT_REGS offset=8
 	cld
-	SAVE_AND_CLEAR_REGS 8
-	ENCODE_FRAME_POINTER 8
 	testb	$3, CS+8(%rsp)
 	jz	.Lerror_kernelspace
 
@@ -1571,7 +1568,8 @@ end_repeat_nmi:
 	 * frame to point back to repeat_nmi.
 	 */
 	pushq	$-1				/* ORIG_RAX: no syscall to restart */
-	ALLOC_PT_GPREGS_ON_STACK
+	PUSH_AND_CLEAR_REGS
+	ENCODE_FRAME_POINTER
 
 	/*
 	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit

From 92816f571af81e9a71cc6f3dc8ce1e2fcdf7b6b8 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sun, 11 Feb 2018 11:49:48 +0100
Subject: [PATCH 542/676] x86/entry/64: Indent PUSH_AND_CLEAR_REGS and POP_REGS
 properly

... same as the other macros in arch/x86/entry/calling.h

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dan.j.williams@intel.com
Link: http://lkml.kernel.org/r/20180211104949.12992-8-linux@dominikbrodowski.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/calling.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 59675010c9a06..6985440c68fa7 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -97,7 +97,7 @@ For 32-bit we have the following conventions - kernel is built with
 
 #define SIZEOF_PTREGS	21*8
 
-	.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax
+.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax
 	/*
 	 * Push registers and sanitize registers of values that a
 	 * speculation attack might otherwise want to exploit. The
@@ -131,9 +131,9 @@ For 32-bit we have the following conventions - kernel is built with
 	pushq	%r15		/* pt_regs->r15 */
 	xorq    %r15, %r15	/* nospec   r15*/
 	UNWIND_HINT_REGS
-	.endm
+.endm
 
-	.macro POP_REGS pop_rdi=1 skip_r11rcx=0
+.macro POP_REGS pop_rdi=1 skip_r11rcx=0
 	popq %r15
 	popq %r14
 	popq %r13
@@ -163,7 +163,7 @@ For 32-bit we have the following conventions - kernel is built with
 
 	.macro icebp
 	.byte 0xf1
-	.endm
+.endm
 
 /*
  * This is a sneaky trick to help the unwinder find pt_regs on the stack.  The

From b3ccefaed922529e6a67de7b30af5aa38c76ace9 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 12 Feb 2018 11:45:03 -0600
Subject: [PATCH 543/676] x86/entry/64: Fix paranoid_entry() frame pointer
 warning

With the following commit:

  f09d160992d1 ("x86/entry/64: Get rid of the ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS macros")

... one of my suggested improvements triggered a frame pointer warning:

  arch/x86/entry/entry_64.o: warning: objtool: paranoid_entry()+0x11: call without frame pointer save/setup

The warning is correct for the build-time code, but it's actually not
relevant at runtime because of paravirt patching.  The paravirt swapgs
call gets replaced with either a SWAPGS instruction or NOPs at runtime.

Go back to the previous behavior by removing the ELF function annotation
for paranoid_entry() and adding an unwind hint, which effectively
silences the warning.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kbuild-all@01.org
Cc: tipbuild@zytor.com
Fixes: f09d160992d1 ("x86/entry/64: Get rid of the ALLOC_PT_GPREGS_ON_STACK and SAVE_AND_CLEAR_REGS macros")
Link: http://lkml.kernel.org/r/20180212174503.5acbymg5z6p32snu@treble
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_64.S | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index cfbf433667314..1c54204207d8d 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1128,6 +1128,7 @@ idtentry machine_check		do_mce			has_error_code=0	paranoid=1
  * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
  */
 ENTRY(paranoid_entry)
+	UNWIND_HINT_FUNC
 	cld
 	movl	$1, %ebx
 	movl	$MSR_GS_BASE, %ecx
@@ -1141,7 +1142,7 @@ ENTRY(paranoid_entry)
 	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
 
 	ret
-ENDPROC(paranoid_entry)
+END(paranoid_entry)
 
 /*
  * "Paranoid" exit path from exception stack.  This is invoked

From b498c261107461d5c42140dfddd05df83d8ca078 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Mon, 12 Feb 2018 21:13:18 +0100
Subject: [PATCH 544/676] x86/entry/64: Remove the unused 'icebp' macro

That macro was touched around 2.5.8 times, judging by the full history
linux repo, but it was unused even then. Get rid of it already.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux@dominikbrodowski.net
Link: http://lkml.kernel.org/r/20180212201318.GD14640@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/calling.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 6985440c68fa7..dce7092ab24a2 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -159,10 +159,6 @@ For 32-bit we have the following conventions - kernel is built with
 	.if \pop_rdi
 	popq %rdi
 	.endif
-	.endm
-
-	.macro icebp
-	.byte 0xf1
 .endm
 
 /*

From 198ee8e17502da2634f7366395db1d77630e0219 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sun, 11 Feb 2018 12:10:10 +0100
Subject: [PATCH 545/676] selftests/x86: Fix vDSO selftest segfault for
 vsyscall=none

The vDSO selftest tries to execute a vsyscall unconditionally, even if it
is not present on the test system (e.g. if booted with vsyscall=none or
with CONFIG_LEGACY_VSYSCALL_NONE=y set. Fix this by copying (and tweaking)
the vsyscall check from test_vsyscall.c

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Andrew Lutomirski <luto@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kselftest@vger.kernel.org
Cc: shuah@kernel.org
Link: http://lkml.kernel.org/r/20180211111013.16888-3-linux@dominikbrodowski.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/test_vdso.c | 50 +++++++++++++++++++++----
 1 file changed, 43 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/x86/test_vdso.c b/tools/testing/selftests/x86/test_vdso.c
index 29973cde06d3d..558c8207e7b9c 100644
--- a/tools/testing/selftests/x86/test_vdso.c
+++ b/tools/testing/selftests/x86/test_vdso.c
@@ -28,18 +28,52 @@
 
 int nerrs = 0;
 
+typedef long (*getcpu_t)(unsigned *, unsigned *, void *);
+
+getcpu_t vgetcpu;
+getcpu_t vdso_getcpu;
+
+static void *vsyscall_getcpu(void)
+{
 #ifdef __x86_64__
-# define VSYS(x) (x)
+	FILE *maps;
+	char line[128];
+	bool found = false;
+
+	maps = fopen("/proc/self/maps", "r");
+	if (!maps) /* might still be present, but ignore it here, as we test vDSO not vsyscall */
+		return NULL;
+
+	while (fgets(line, sizeof(line), maps)) {
+		char r, x;
+		void *start, *end;
+		char name[128];
+		if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s",
+			   &start, &end, &r, &x, name) != 5)
+			continue;
+
+		if (strcmp(name, "[vsyscall]"))
+			continue;
+
+		/* assume entries are OK, as we test vDSO here not vsyscall */
+		found = true;
+		break;
+	}
+
+	fclose(maps);
+
+	if (!found) {
+		printf("Warning: failed to find vsyscall getcpu\n");
+		return NULL;
+	}
+	return (void *) (0xffffffffff600800);
 #else
-# define VSYS(x) 0
+	return NULL;
 #endif
+}
 
-typedef long (*getcpu_t)(unsigned *, unsigned *, void *);
-
-const getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800);
-getcpu_t vdso_getcpu;
 
-void fill_function_pointers()
+static void fill_function_pointers()
 {
 	void *vdso = dlopen("linux-vdso.so.1",
 			    RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
@@ -54,6 +88,8 @@ void fill_function_pointers()
 	vdso_getcpu = (getcpu_t)dlsym(vdso, "__vdso_getcpu");
 	if (!vdso_getcpu)
 		printf("Warning: failed to find getcpu in vDSO\n");
+
+	vgetcpu = (getcpu_t) vsyscall_getcpu();
 }
 
 static long sys_getcpu(unsigned * cpu, unsigned * node,

From d8e92de8ef952bed88c56c7a44c02d8dcae0984e Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sun, 11 Feb 2018 21:59:24 +0100
Subject: [PATCH 546/676] selftests/x86: Clean up and document sscanf() usage

Replace a couple of magically connected buffer length literal constants with
a common definition that makes their relationship obvious. Also document
why our sscanf() usage is safe.

No intended functional changes.

Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Andrew Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kselftest@vger.kernel.org
Cc: shuah@kernel.org
Link: http://lkml.kernel.org/r/20180211205924.GA23210@light.dominikbrodowski.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/test_vdso.c     | 11 ++++++++---
 tools/testing/selftests/x86/test_vsyscall.c | 11 ++++++++---
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/x86/test_vdso.c b/tools/testing/selftests/x86/test_vdso.c
index 558c8207e7b9c..2352590117042 100644
--- a/tools/testing/selftests/x86/test_vdso.c
+++ b/tools/testing/selftests/x86/test_vdso.c
@@ -26,6 +26,9 @@
 # endif
 #endif
 
+/* max length of lines in /proc/self/maps - anything longer is skipped here */
+#define MAPS_LINE_LEN 128
+
 int nerrs = 0;
 
 typedef long (*getcpu_t)(unsigned *, unsigned *, void *);
@@ -37,17 +40,19 @@ static void *vsyscall_getcpu(void)
 {
 #ifdef __x86_64__
 	FILE *maps;
-	char line[128];
+	char line[MAPS_LINE_LEN];
 	bool found = false;
 
 	maps = fopen("/proc/self/maps", "r");
 	if (!maps) /* might still be present, but ignore it here, as we test vDSO not vsyscall */
 		return NULL;
 
-	while (fgets(line, sizeof(line), maps)) {
+	while (fgets(line, MAPS_LINE_LEN, maps)) {
 		char r, x;
 		void *start, *end;
-		char name[128];
+		char name[MAPS_LINE_LEN];
+
+		/* sscanf() is safe here as strlen(name) >= strlen(line) */
 		if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s",
 			   &start, &end, &r, &x, name) != 5)
 			continue;
diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c
index 7a744fa7b7865..be81621446f01 100644
--- a/tools/testing/selftests/x86/test_vsyscall.c
+++ b/tools/testing/selftests/x86/test_vsyscall.c
@@ -33,6 +33,9 @@
 # endif
 #endif
 
+/* max length of lines in /proc/self/maps - anything longer is skipped here */
+#define MAPS_LINE_LEN 128
+
 static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
 		       int flags)
 {
@@ -98,7 +101,7 @@ static int init_vsys(void)
 #ifdef __x86_64__
 	int nerrs = 0;
 	FILE *maps;
-	char line[128];
+	char line[MAPS_LINE_LEN];
 	bool found = false;
 
 	maps = fopen("/proc/self/maps", "r");
@@ -108,10 +111,12 @@ static int init_vsys(void)
 		return 0;
 	}
 
-	while (fgets(line, sizeof(line), maps)) {
+	while (fgets(line, MAPS_LINE_LEN, maps)) {
 		char r, x;
 		void *start, *end;
-		char name[128];
+		char name[MAPS_LINE_LEN];
+
+		/* sscanf() is safe here as strlen(name) >= strlen(line) */
 		if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s",
 			   &start, &end, &r, &x, name) != 5)
 			continue;

From ce676638fe7b284132a7d7d5e7e7ad81bab9947e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 13 Feb 2018 08:26:17 +0100
Subject: [PATCH 547/676] selftests/x86/pkeys: Remove unused functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This also gets rid of two build warnings:

  protection_keys.c: In function ‘dumpit’:
  protection_keys.c:419:3: warning: ignoring return value of ‘write’, declared with attribute warn_unused_result [-Wunused-result]
     write(1, buf, nr_read);
     ^~~~~~~~~~~~~~~~~~~~~~

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/protection_keys.c | 28 -------------------
 1 file changed, 28 deletions(-)

diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c
index bc1b0735bb50e..f15aa5a76fe34 100644
--- a/tools/testing/selftests/x86/protection_keys.c
+++ b/tools/testing/selftests/x86/protection_keys.c
@@ -393,34 +393,6 @@ pid_t fork_lazy_child(void)
 	return forkret;
 }
 
-void davecmp(void *_a, void *_b, int len)
-{
-	int i;
-	unsigned long *a = _a;
-	unsigned long *b = _b;
-
-	for (i = 0; i < len / sizeof(*a); i++) {
-		if (a[i] == b[i])
-			continue;
-
-		dprintf3("[%3d]: a: %016lx b: %016lx\n", i, a[i], b[i]);
-	}
-}
-
-void dumpit(char *f)
-{
-	int fd = open(f, O_RDONLY);
-	char buf[100];
-	int nr_read;
-
-	dprintf2("maps fd: %d\n", fd);
-	do {
-		nr_read = read(fd, &buf[0], sizeof(buf));
-		write(1, buf, nr_read);
-	} while (nr_read > 0);
-	close(fd);
-}
-
 #define PKEY_DISABLE_ACCESS    0x1
 #define PKEY_DISABLE_WRITE     0x2
 

From 7f95122067ab26fb8344b2a9de64ffbd0fea0010 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sun, 11 Feb 2018 12:10:09 +0100
Subject: [PATCH 548/676] selftests/x86: Fix build bug caused by the 5lvl test
 which has been moved to the VM directory

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kselftest@vger.kernel.org
Cc: shuah@kernel.org
Fixes: 235266b8e11c "selftests/vm: move 128TB mmap boundary test to generic directory"
Link: http://lkml.kernel.org/r/20180211111013.16888-2-linux@dominikbrodowski.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 5d4f10ac2af22..91fbfa8fdc150 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -11,7 +11,7 @@ TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_sysc
 TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
 			test_FCMOV test_FCOMI test_FISTTP \
 			vdso_restorer
-TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip 5lvl
+TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip
 
 TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
 TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY)

From 2cbc0d66de0480449c75636f55697c7ff3af61fc Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Sun, 11 Feb 2018 12:10:11 +0100
Subject: [PATCH 549/676] selftests/x86: Do not rely on "int $0x80" in
 test_mremap_vdso.c

On 64-bit builds, we should not rely on "int $0x80" working (it only does if
CONFIG_IA32_EMULATION=y is enabled).

Without this patch, the move test may succeed, but the "int $0x80" causes
a segfault, resulting in a false negative output of this self-test.

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Dmitry Safonov <dsafonov@virtuozzo.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kselftest@vger.kernel.org
Cc: shuah@kernel.org
Link: http://lkml.kernel.org/r/20180211111013.16888-4-linux@dominikbrodowski.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/test_mremap_vdso.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/testing/selftests/x86/test_mremap_vdso.c b/tools/testing/selftests/x86/test_mremap_vdso.c
index bf0d687c7db75..64f11c8d9b767 100644
--- a/tools/testing/selftests/x86/test_mremap_vdso.c
+++ b/tools/testing/selftests/x86/test_mremap_vdso.c
@@ -90,8 +90,12 @@ int main(int argc, char **argv, char **envp)
 			vdso_size += PAGE_SIZE;
 		}
 
+#ifdef __i386__
 		/* Glibc is likely to explode now - exit with raw syscall */
 		asm volatile ("int $0x80" : : "a" (__NR_exit), "b" (!!ret));
+#else /* __x86_64__ */
+		syscall(SYS_exit, ret);
+#endif
 	} else {
 		int status;
 

From 595dd46ebfc10be041a365d0a3fa99df50b6ba73 Mon Sep 17 00:00:00 2001
From: Jia Zhang <zhang.jia@linux.alibaba.com>
Date: Mon, 12 Feb 2018 22:44:53 +0800
Subject: [PATCH 550/676] vfs/proc/kcore, x86/mm/kcore: Fix SMAP fault when
 dumping vsyscall user page

Commit:

  df04abfd181a ("fs/proc/kcore.c: Add bounce buffer for ktext data")

... introduced a bounce buffer to work around CONFIG_HARDENED_USERCOPY=y.
However, accessing the vsyscall user page will cause an SMAP fault.

Replace memcpy() with copy_from_user() to fix this bug works, but adding
a common way to handle this sort of user page may be useful for future.

Currently, only vsyscall page requires KCORE_USER.

Signed-off-by: Jia Zhang <zhang.jia@linux.alibaba.com>
Reviewed-by: Jiri Olsa <jolsa@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1518446694-21124-2-git-send-email-zhang.jia@linux.alibaba.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/init_64.c | 3 +--
 fs/proc/kcore.c       | 4 ++++
 include/linux/kcore.h | 1 +
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1ab42c8520693..6aa33d1e198f2 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1193,8 +1193,7 @@ void __init mem_init(void)
 	register_page_bootmem_info();
 
 	/* Register memory areas for /proc/kcore */
-	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
-			 PAGE_SIZE, KCORE_OTHER);
+	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);
 
 	mem_init_print_info(NULL);
 }
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e8a93bc8285d8..d1e82761de813 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -510,6 +510,10 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 			/* we have to zero-fill user buffer even if no read */
 			if (copy_to_user(buffer, buf, tsz))
 				return -EFAULT;
+		} else if (m->type == KCORE_USER) {
+			/* User page is handled prior to normal kernel page: */
+			if (copy_to_user(buffer, (char *)start, tsz))
+				return -EFAULT;
 		} else {
 			if (kern_addr_valid(start)) {
 				/*
diff --git a/include/linux/kcore.h b/include/linux/kcore.h
index 7ff25a808feff..80db19d3a5054 100644
--- a/include/linux/kcore.h
+++ b/include/linux/kcore.h
@@ -10,6 +10,7 @@ enum kcore_type {
 	KCORE_VMALLOC,
 	KCORE_RAM,
 	KCORE_VMEMMAP,
+	KCORE_USER,
 	KCORE_OTHER,
 };
 

From cd026ca2861e7f384d677626a483da797c76b9da Mon Sep 17 00:00:00 2001
From: Jia Zhang <zhang.jia@linux.alibaba.com>
Date: Mon, 12 Feb 2018 22:44:54 +0800
Subject: [PATCH 551/676] x86/mm/kcore: Add vsyscall page to /proc/kcore
 conditionally

The vsyscall page should be visible only if vsyscall=emulate/native when dumping /proc/kcore.

Signed-off-by: Jia Zhang <zhang.jia@linux.alibaba.com>
Reviewed-by: Jiri Olsa <jolsa@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1518446694-21124-3-git-send-email-zhang.jia@linux.alibaba.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/init_64.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 6aa33d1e198f2..8ba9c31289473 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1193,7 +1193,8 @@ void __init mem_init(void)
 	register_page_bootmem_info();
 
 	/* Register memory areas for /proc/kcore */
-	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);
+	if (get_gate_vma(&init_mm))
+		kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);
 
 	mem_init_print_info(NULL);
 }

From 6fe0ce1eb04f99a1eb1eb6e7f775666966cf6c80 Mon Sep 17 00:00:00 2001
From: Wen Yang <wen.yang99@zte.com.cn>
Date: Tue, 6 Feb 2018 09:55:48 +0800
Subject: [PATCH 552/676] sched/deadline: Make update_curr_dl() more accurate

rq->clock_task may be updated between the two calls of
rq_clock_task() in update_curr_dl(). Calling rq_clock_task() only
once makes it more accurate and efficient, taking update_curr() as
reference.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Wen Yang <wen.yang99@zte.com.cn>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Jiang Biao <jiang.biao2@zte.com.cn>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: zhong.weidong@zte.com.cn
Link: http://lkml.kernel.org/r/1517882148-44599-1-git-send-email-wen.yang99@zte.com.cn
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9bb0e0c412ec6..9df09782025cb 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1153,6 +1153,7 @@ static void update_curr_dl(struct rq *rq)
 	struct sched_dl_entity *dl_se = &curr->dl;
 	u64 delta_exec, scaled_delta_exec;
 	int cpu = cpu_of(rq);
+	u64 now;
 
 	if (!dl_task(curr) || !on_dl_rq(dl_se))
 		return;
@@ -1165,7 +1166,8 @@ static void update_curr_dl(struct rq *rq)
 	 * natural solution, but the full ramifications of this
 	 * approach need further study.
 	 */
-	delta_exec = rq_clock_task(rq) - curr->se.exec_start;
+	now = rq_clock_task(rq);
+	delta_exec = now - curr->se.exec_start;
 	if (unlikely((s64)delta_exec <= 0)) {
 		if (unlikely(dl_se->dl_yielded))
 			goto throttle;
@@ -1178,7 +1180,7 @@ static void update_curr_dl(struct rq *rq)
 	curr->se.sum_exec_runtime += delta_exec;
 	account_group_exec_runtime(curr, delta_exec);
 
-	curr->se.exec_start = rq_clock_task(rq);
+	curr->se.exec_start = now;
 	cgroup_account_cputime(curr, delta_exec);
 
 	sched_rt_avg_update(rq, delta_exec);

From a7711602c7b79950ea437178f601b52ab08ef659 Mon Sep 17 00:00:00 2001
From: Wen Yang <wen.yang99@zte.com.cn>
Date: Tue, 6 Feb 2018 09:53:28 +0800
Subject: [PATCH 553/676] sched/rt: Make update_curr_rt() more accurate

rq->clock_task may be updated between the two calls of
rq_clock_task() in update_curr_rt(). Calling rq_clock_task() only
once makes it more accurate and efficient, taking update_curr() as
reference.

Signed-off-by: Wen Yang <wen.yang99@zte.com.cn>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Jiang Biao <jiang.biao2@zte.com.cn>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: zhong.weidong@zte.com.cn
Link: http://lkml.kernel.org/r/1517882008-44552-1-git-send-email-wen.yang99@zte.com.cn
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/rt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 663b2355a3aa7..aad49451584e6 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -950,12 +950,13 @@ static void update_curr_rt(struct rq *rq)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_rt_entity *rt_se = &curr->rt;
-	u64 now = rq_clock_task(rq);
 	u64 delta_exec;
+	u64 now;
 
 	if (curr->sched_class != &rt_sched_class)
 		return;
 
+	now = rq_clock_task(rq);
 	delta_exec = now - curr->se.exec_start;
 	if (unlikely((s64)delta_exec <= 0))
 		return;

From 269d599271fa604f09d5cb0093c5dd5d59964dd5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 6 Feb 2018 17:52:13 +0100
Subject: [PATCH 554/676] sched/core: Fix DEBUG_SPINLOCK annotation for
 rq->lock

Mark noticed that he had sporadic "spinlock recursion" warnings from
the DEBUG_SPINLOCK code. Now rq->lock is special in that the owner
changes in the middle of a context switch.

It so happens that we fix up the lock.owner too late, @prev can run
(remotely) the moment prev->on_cpu is cleared, this then allows @prev
to again try and acquire this rq->lock and trigger this warning.

So we have to switch lock.owner before clearing prev->on_cpu.

Do this by moving the DEBUG_SPINLOCK annotation from after switch_to()
to before switch_to() and collect all lockdep annotations there into
prepare_lock_switch() to mirror the existing finish_lock_switch().

Debugged-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bf724c1952eac..e7c535eee0a6d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2601,19 +2601,31 @@ static inline void finish_task(struct task_struct *prev)
 #endif
 }
 
-static inline void finish_lock_switch(struct rq *rq)
+static inline void
+prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
 {
+	/*
+	 * Since the runqueue lock will be released by the next
+	 * task (which is an invalid locking op but in the case
+	 * of the scheduler it's an obvious special-case), so we
+	 * do an early lockdep release here:
+	 */
+	rq_unpin_lock(rq, rf);
+	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 #ifdef CONFIG_DEBUG_SPINLOCK
 	/* this is a valid case when another task releases the spinlock */
-	rq->lock.owner = current;
+	rq->lock.owner = next;
 #endif
+}
+
+static inline void finish_lock_switch(struct rq *rq)
+{
 	/*
 	 * If we are tracking spinlock dependencies then we have to
 	 * fix up the runqueue lock - which gets 'carried over' from
 	 * prev into current:
 	 */
 	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-
 	raw_spin_unlock_irq(&rq->lock);
 }
 
@@ -2844,14 +2856,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
 	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
 
-	/*
-	 * Since the runqueue lock will be released by the next
-	 * task (which is an invalid locking op but in the case
-	 * of the scheduler it's an obvious special-case), so we
-	 * do an early lockdep release here:
-	 */
-	rq_unpin_lock(rq, rf);
-	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+	prepare_lock_switch(rq, next, rf);
 
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);

From c9dccf1d074a67d36c510845f663980d69e3409b Mon Sep 17 00:00:00 2001
From: Sam Bobroff <sam.bobroff@au1.ibm.com>
Date: Mon, 12 Feb 2018 11:19:29 +1100
Subject: [PATCH 555/676] powerpc/pseries: Enable RAS hotplug events later

Currently if the kernel receives a memory hot-unplug event early
enough, it may get stuck in an infinite loop in
dissolve_free_huge_pages(). This appears as a stall just after:

  pseries-hotplug-mem: Attempting to hot-remove XX LMB(s) at YYYYYYYY

It appears to be caused by "minimum_order" being uninitialized, due to
init_ras_IRQ() executing before hugetlb_init().

To correct this, extract the part of init_ras_IRQ() that enables
hotplug event processing and place it in the machine_late_initcall
phase, which is guaranteed to be after hugetlb_init() is called.

Signed-off-by: Sam Bobroff <sam.bobroff@au1.ibm.com>
Acked-by: Balbir Singh <bsingharora@gmail.com>
[mpe: Reorder the functions to make the diff readable]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/ras.c | 31 ++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 81d8614e73790..5e1ef91501820 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -48,6 +48,28 @@ static irqreturn_t ras_epow_interrupt(int irq, void *dev_id);
 static irqreturn_t ras_error_interrupt(int irq, void *dev_id);
 
 
+/*
+ * Enable the hotplug interrupt late because processing them may touch other
+ * devices or systems (e.g. hugepages) that have not been initialized at the
+ * subsys stage.
+ */
+int __init init_ras_hotplug_IRQ(void)
+{
+	struct device_node *np;
+
+	/* Hotplug Events */
+	np = of_find_node_by_path("/event-sources/hot-plug-events");
+	if (np != NULL) {
+		if (dlpar_workqueue_init() == 0)
+			request_event_sources_irqs(np, ras_hotplug_interrupt,
+						   "RAS_HOTPLUG");
+		of_node_put(np);
+	}
+
+	return 0;
+}
+machine_late_initcall(pseries, init_ras_hotplug_IRQ);
+
 /*
  * Initialize handlers for the set of interrupts caused by hardware errors
  * and power system events.
@@ -66,15 +88,6 @@ static int __init init_ras_IRQ(void)
 		of_node_put(np);
 	}
 
-	/* Hotplug Events */
-	np = of_find_node_by_path("/event-sources/hot-plug-events");
-	if (np != NULL) {
-		if (dlpar_workqueue_init() == 0)
-			request_event_sources_irqs(np, ras_hotplug_interrupt,
-					   "RAS_HOTPLUG");
-		of_node_put(np);
-	}
-
 	/* EPOW Events */
 	np = of_find_node_by_path("/event-sources/epow-events");
 	if (np != NULL) {

From b00b62898631b756c3e123542bbb0487aa343dd9 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Thu, 8 Feb 2018 19:18:38 +1000
Subject: [PATCH 556/676] powerpc/vas: Don't set uses_vas for kernel windows

cp_abort is only required for user windows, because kernel context
must not be preempted between a copy/paste pair.

Without this patch, the init task gets used_vas set when it runs the
nx842_powernv_init initcall, which opens windows for kernel usage.

used_vas is then never cleared anywhere, so it gets propagated into
all other tasks. It's a property of the address space, so it should
really be cleared when a new mm is created (or in dup_mmap if the
mmaps are marked as VM_DONTCOPY). For now we seem to have no such
driver, so leave that for another patch.

Fixes: 6c8e6bb2a52d ("powerpc/vas: Add support for user receive window")
Cc: stable@vger.kernel.org # v4.15+
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/vas-window.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
index 2b3eb01ab1107..b7c53a51c31bb 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -1063,16 +1063,16 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
 			rc = PTR_ERR(txwin->paste_kaddr);
 			goto free_window;
 		}
+	} else {
+		/*
+		 * A user mapping must ensure that context switch issues
+		 * CP_ABORT for this thread.
+		 */
+		rc = set_thread_uses_vas();
+		if (rc)
+			goto free_window;
 	}
 
-	/*
-	 * Now that we have a send window, ensure context switch issues
-	 * CP_ABORT for this thread.
-	 */
-	rc = -EINVAL;
-	if (set_thread_uses_vas() < 0)
-		goto free_window;
-
 	set_vinst_win(vinst, txwin);
 
 	return txwin;

From 62e984ddfd6b056d399e24113f5e6a7145e579d8 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Thu, 1 Feb 2018 16:09:44 +1100
Subject: [PATCH 557/676] powerpc/mm: Flush radix process translations when
 setting MMU type

Radix guests do normally invalidate process-scoped translations when a
new pid is allocated but migrated guests do not invalidate these so
migrated guests crash sometime, especially easy to reproduce with
migration happening within first 10 seconds after the guest boot start
on the same machine.

This adds the "Invalidate process-scoped translations" flush to fix
radix guests migration.

Fixes: 2ee13be34b13 ("KVM: PPC: Book3S HV: Update kvmppc_set_arch_compat() for ISA v3.00")
Cc: stable@vger.kernel.org # v4.10+
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Tested-by: Laurent Vivier <lvivier@redhat.com>
Tested-by: Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/pgtable_64.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index c9a623c2d8a27..d75dd5273d15f 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -471,6 +471,8 @@ void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
 	if (old & PATB_HR) {
 		asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
 			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+		asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
+			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
 		trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1);
 	} else {
 		asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :

From fae2211697c9490414e974431051f7fed5506653 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sun, 11 Feb 2018 20:30:06 +0530
Subject: [PATCH 558/676] powerpc/mm: Fix crashes with 16G huge pages

To support memory keys, we moved the hash pte slot information to the
second half of the page table. This was ok with PTE entries at level
4 (PTE page) and level 3 (PMD). We already allocate larger page table
pages at those levels to accomodate extra details. For level 4 we
already have the extra space which was used to track 4k hash page
table entry details and at level 3 the extra space was allocated to
track the THP details.

With hugetlbfs PTE, we used this extra space at the PMD level to store
the slot details. But we also support hugetlbfs PTE at PUD level for
16GB pages and PUD level page didn't allocate extra space. This
resulted in memory corruption.

Fix this by allocating extra space at PUD level when HUGETLB is
enabled.

Fixes: bf9a95f9a648 ("powerpc: Free up four 64K PTE bits in 64K backed HPTE pages")
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/32/pgtable.h  |  1 +
 arch/powerpc/include/asm/book3s/64/hash-64k.h |  5 +++++
 arch/powerpc/include/asm/book3s/64/hash.h     | 10 ++++++++++
 arch/powerpc/include/asm/book3s/64/pgalloc.h  |  6 +++---
 arch/powerpc/include/asm/book3s/64/pgtable.h  |  2 ++
 arch/powerpc/include/asm/nohash/32/pgtable.h  |  1 +
 arch/powerpc/include/asm/nohash/64/pgtable.h  |  1 +
 arch/powerpc/mm/hash_utils_64.c               |  1 +
 arch/powerpc/mm/init-common.c                 |  4 ++--
 arch/powerpc/mm/pgtable-radix.c               |  1 +
 arch/powerpc/mm/pgtable_64.c                  |  2 ++
 11 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 30a155c0a6b07..c615abdce119e 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -16,6 +16,7 @@
 #define PGD_INDEX_SIZE	(32 - PGDIR_SHIFT)
 
 #define PMD_CACHE_INDEX	PMD_INDEX_SIZE
+#define PUD_CACHE_INDEX	PUD_INDEX_SIZE
 
 #ifndef __ASSEMBLY__
 #define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_INDEX_SIZE)
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 338b7da468cef..c08b3b032ec0a 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -146,7 +146,12 @@ static inline int hash__remap_4k_pfn(struct vm_area_struct *vma, unsigned long a
 #else
 #define H_PMD_TABLE_SIZE	(sizeof(pmd_t) << PMD_INDEX_SIZE)
 #endif
+#ifdef CONFIG_HUGETLB_PAGE
+#define H_PUD_TABLE_SIZE	((sizeof(pud_t) << PUD_INDEX_SIZE) +	\
+				 (sizeof(unsigned long) << PUD_INDEX_SIZE))
+#else
 #define H_PUD_TABLE_SIZE	(sizeof(pud_t) << PUD_INDEX_SIZE)
+#endif
 #define H_PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 0920eff731b38..a889457542e86 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -32,6 +32,16 @@
 #else
 #define H_PMD_CACHE_INDEX	H_PMD_INDEX_SIZE
 #endif
+/*
+ * We store the slot details in the second half of page table.
+ * Increase the pud level table so that hugetlb ptes can be stored
+ * at pud level.
+ */
+#if defined(CONFIG_HUGETLB_PAGE) &&  defined(CONFIG_PPC_64K_PAGES)
+#define H_PUD_CACHE_INDEX	(H_PUD_INDEX_SIZE + 1)
+#else
+#define H_PUD_CACHE_INDEX	(H_PUD_INDEX_SIZE)
+#endif
 /*
  * Define the address range of the kernel non-linear virtual area
  */
diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
index 1fcfa425cefaf..827ebce4df90e 100644
--- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
@@ -93,13 +93,13 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE),
+	return kmem_cache_alloc(PGT_CACHE(PUD_CACHE_INDEX),
 		pgtable_gfp_flags(mm, GFP_KERNEL));
 }
 
 static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 {
-	kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud);
+	kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud);
 }
 
 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
@@ -115,7 +115,7 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
 	 * ahead and flush the page walk cache
 	 */
 	flush_tlb_pgtable(tlb, address);
-        pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE);
+	pgtable_free_tlb(tlb, pud, PUD_CACHE_INDEX);
 }
 
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 51017726d4953..1c8c88e905533 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -232,11 +232,13 @@ extern unsigned long __pmd_index_size;
 extern unsigned long __pud_index_size;
 extern unsigned long __pgd_index_size;
 extern unsigned long __pmd_cache_index;
+extern unsigned long __pud_cache_index;
 #define PTE_INDEX_SIZE  __pte_index_size
 #define PMD_INDEX_SIZE  __pmd_index_size
 #define PUD_INDEX_SIZE  __pud_index_size
 #define PGD_INDEX_SIZE  __pgd_index_size
 #define PMD_CACHE_INDEX __pmd_cache_index
+#define PUD_CACHE_INDEX __pud_cache_index
 /*
  * Because of use of pte fragments and THP, size of page table
  * are not always derived out of index size above.
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index 504a3c36ce5c9..03bbd1149530d 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -24,6 +24,7 @@ extern int icache_44x_need_flush;
 #define PGD_INDEX_SIZE	(32 - PGDIR_SHIFT)
 
 #define PMD_CACHE_INDEX	PMD_INDEX_SIZE
+#define PUD_CACHE_INDEX	PUD_INDEX_SIZE
 
 #ifndef __ASSEMBLY__
 #define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_INDEX_SIZE)
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index abddf5830ad55..5c5f75d005ada 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -27,6 +27,7 @@
 #else
 #define PMD_CACHE_INDEX	PMD_INDEX_SIZE
 #endif
+#define PUD_CACHE_INDEX PUD_INDEX_SIZE
 
 /*
  * Define the address range of the kernel non-linear virtual area
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 7d07c7e17db67..cf290d415dcd8 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1008,6 +1008,7 @@ void __init hash__early_init_mmu(void)
 	__pmd_index_size = H_PMD_INDEX_SIZE;
 	__pud_index_size = H_PUD_INDEX_SIZE;
 	__pgd_index_size = H_PGD_INDEX_SIZE;
+	__pud_cache_index = H_PUD_CACHE_INDEX;
 	__pmd_cache_index = H_PMD_CACHE_INDEX;
 	__pte_table_size = H_PTE_TABLE_SIZE;
 	__pmd_table_size = H_PMD_TABLE_SIZE;
diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index eb8c6c8c4851a..2b656e67f2eaa 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -100,6 +100,6 @@ void pgtable_cache_init(void)
 	 * same size as either the pgd or pmd index except with THP enabled
 	 * on book3s 64
 	 */
-	if (PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE))
-		pgtable_cache_add(PUD_INDEX_SIZE, pud_ctor);
+	if (PUD_CACHE_INDEX && !PGT_CACHE(PUD_CACHE_INDEX))
+		pgtable_cache_add(PUD_CACHE_INDEX, pud_ctor);
 }
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 328ff9abc3334..2e10a964e2908 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -553,6 +553,7 @@ void __init radix__early_init_mmu(void)
 	__pmd_index_size = RADIX_PMD_INDEX_SIZE;
 	__pud_index_size = RADIX_PUD_INDEX_SIZE;
 	__pgd_index_size = RADIX_PGD_INDEX_SIZE;
+	__pud_cache_index = RADIX_PUD_INDEX_SIZE;
 	__pmd_cache_index = RADIX_PMD_INDEX_SIZE;
 	__pte_table_size = RADIX_PTE_TABLE_SIZE;
 	__pmd_table_size = RADIX_PMD_TABLE_SIZE;
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index d75dd5273d15f..28c980eb44222 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -82,6 +82,8 @@ unsigned long __pgd_index_size;
 EXPORT_SYMBOL(__pgd_index_size);
 unsigned long __pmd_cache_index;
 EXPORT_SYMBOL(__pmd_cache_index);
+unsigned long __pud_cache_index;
+EXPORT_SYMBOL(__pud_cache_index);
 unsigned long __pte_table_size;
 EXPORT_SYMBOL(__pte_table_size);
 unsigned long __pmd_table_size;

From 4a7aa4fecbbf94b5c6fae8acccc983d919992bde Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sun, 11 Feb 2018 20:30:07 +0530
Subject: [PATCH 559/676] powerpc/mm/hash64: Allocate larger PMD table if
 hugetlb config is enabled

We use the second half of the page table to store slot information, so we must
allocate it always if hugetlb is possible.

Fixes: bf9a95f9a648 ("powerpc: Free up four 64K PTE bits in 64K backed HPTE pages")
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash-64k.h | 2 +-
 arch/powerpc/include/asm/book3s/64/hash.h     | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index c08b3b032ec0a..ee440fb3d240e 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -140,7 +140,7 @@ static inline int hash__remap_4k_pfn(struct vm_area_struct *vma, unsigned long a
 }
 
 #define H_PTE_TABLE_SIZE	PTE_FRAG_SIZE
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined (CONFIG_HUGETLB_PAGE)
 #define H_PMD_TABLE_SIZE	((sizeof(pmd_t) << PMD_INDEX_SIZE) + \
 				 (sizeof(unsigned long) << PMD_INDEX_SIZE))
 #else
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index a889457542e86..935adcd92a816 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -23,7 +23,8 @@
 				 H_PUD_INDEX_SIZE + H_PGD_INDEX_SIZE + PAGE_SHIFT)
 #define H_PGTABLE_RANGE		(ASM_CONST(1) << H_PGTABLE_EADDR_SIZE)
 
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&  defined(CONFIG_PPC_64K_PAGES)
+#if (defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)) && \
+	defined(CONFIG_PPC_64K_PAGES)
 /*
  * only with hash 64k we need to use the second half of pmd page table
  * to store pointer to deposited pgtable_t

From ff31e105464d8c8c973019646827020aed9c2d9f Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sun, 11 Feb 2018 20:30:08 +0530
Subject: [PATCH 560/676] powerpc/mm/hash64: Store the slot information at the
 right offset for hugetlb

The hugetlb pte entries are at the PMD and PUD level, so we can't use
PTRS_PER_PTE to find the second half of the page table. Use the right
offset for PUD/PMD to get to the second half of the table.

Fixes: bf9a95f9a648 ("powerpc: Free up four 64K PTE bits in 64K backed HPTE pages")
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/hash-4k.h  |  3 ++-
 arch/powerpc/include/asm/book3s/64/hash-64k.h |  9 +++++----
 arch/powerpc/include/asm/book3s/64/pgtable.h  |  2 +-
 arch/powerpc/mm/hash64_4k.c                   |  4 ++--
 arch/powerpc/mm/hash64_64k.c                  |  8 ++++----
 arch/powerpc/mm/hugetlbpage-hash64.c          | 10 +++++++---
 arch/powerpc/mm/tlb_hash64.c                  |  9 +++++++--
 7 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 949d691094a46..67c5475311ee6 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -63,7 +63,8 @@ static inline int hash__hugepd_ok(hugepd_t hpd)
  * keeping the prototype consistent across the two formats.
  */
 static inline unsigned long pte_set_hidx(pte_t *ptep, real_pte_t rpte,
-			unsigned int subpg_index, unsigned long hidx)
+					 unsigned int subpg_index, unsigned long hidx,
+					 int offset)
 {
 	return (hidx << H_PAGE_F_GIX_SHIFT) &
 		(H_PAGE_F_SECOND | H_PAGE_F_GIX);
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index ee440fb3d240e..3bcf269f8f554 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -45,7 +45,7 @@
  * generic accessors and iterators here
  */
 #define __real_pte __real_pte
-static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
+static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep, int offset)
 {
 	real_pte_t rpte;
 	unsigned long *hidxp;
@@ -59,7 +59,7 @@ static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
 	 */
 	smp_rmb();
 
-	hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
+	hidxp = (unsigned long *)(ptep + offset);
 	rpte.hidx = *hidxp;
 	return rpte;
 }
@@ -86,9 +86,10 @@ static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index)
  * expected to modify the PTE bits accordingly and commit the PTE to memory.
  */
 static inline unsigned long pte_set_hidx(pte_t *ptep, real_pte_t rpte,
-		unsigned int subpg_index, unsigned long hidx)
+					 unsigned int subpg_index,
+					 unsigned long hidx, int offset)
 {
-	unsigned long *hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
+	unsigned long *hidxp = (unsigned long *)(ptep + offset);
 
 	rpte.hidx &= ~HIDX_BITS(0xfUL, subpg_index);
 	*hidxp = rpte.hidx  | HIDX_BITS(HIDX_SHIFT_BY_ONE(hidx), subpg_index);
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 1c8c88e905533..a6b9f1d746002 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -350,7 +350,7 @@ extern unsigned long pci_io_base;
  */
 #ifndef __real_pte
 
-#define __real_pte(e,p)		((real_pte_t){(e)})
+#define __real_pte(e, p, o)		((real_pte_t){(e)})
 #define __rpte_to_pte(r)	((r).pte)
 #define __rpte_to_hidx(r,index)	(pte_val(__rpte_to_pte(r)) >> H_PAGE_F_GIX_SHIFT)
 
diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c
index 5a69b51d08a36..d573d7d07f25f 100644
--- a/arch/powerpc/mm/hash64_4k.c
+++ b/arch/powerpc/mm/hash64_4k.c
@@ -55,7 +55,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 	 * need to add in 0x1 if it's a read-only user page
 	 */
 	rflags = htab_convert_pte_flags(new_pte);
-	rpte = __real_pte(__pte(old_pte), ptep);
+	rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
 
 	if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
 	    !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
@@ -117,7 +117,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 			return -1;
 		}
 		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
-		new_pte |= pte_set_hidx(ptep, rpte, 0, slot);
+		new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
 	}
 	*ptep = __pte(new_pte & ~H_PAGE_BUSY);
 	return 0;
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
index 2253bbc6a599d..e601d95c3b202 100644
--- a/arch/powerpc/mm/hash64_64k.c
+++ b/arch/powerpc/mm/hash64_64k.c
@@ -86,7 +86,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 
 	subpg_index = (ea & (PAGE_SIZE - 1)) >> shift;
 	vpn  = hpt_vpn(ea, vsid, ssize);
-	rpte = __real_pte(__pte(old_pte), ptep);
+	rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
 	/*
 	 *None of the sub 4k page is hashed
 	 */
@@ -214,7 +214,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 		return -1;
 	}
 
-	new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot);
+	new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE);
 	new_pte |= H_PAGE_HASHPTE;
 
 	*ptep = __pte(new_pte & ~H_PAGE_BUSY);
@@ -262,7 +262,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
 	} while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
 
 	rflags = htab_convert_pte_flags(new_pte);
-	rpte = __real_pte(__pte(old_pte), ptep);
+	rpte = __real_pte(__pte(old_pte), ptep, PTRS_PER_PTE);
 
 	if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
 	    !cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
@@ -327,7 +327,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
 		}
 
 		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
-		new_pte |= pte_set_hidx(ptep, rpte, 0, slot);
+		new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
 	}
 	*ptep = __pte(new_pte & ~H_PAGE_BUSY);
 	return 0;
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index 12511f5a015fc..b320f5097a061 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -27,7 +27,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 	unsigned long vpn;
 	unsigned long old_pte, new_pte;
 	unsigned long rflags, pa, sz;
-	long slot;
+	long slot, offset;
 
 	BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
 
@@ -63,7 +63,11 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 	} while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
 
 	rflags = htab_convert_pte_flags(new_pte);
-	rpte = __real_pte(__pte(old_pte), ptep);
+	if (unlikely(mmu_psize == MMU_PAGE_16G))
+		offset = PTRS_PER_PUD;
+	else
+		offset = PTRS_PER_PMD;
+	rpte = __real_pte(__pte(old_pte), ptep, offset);
 
 	sz = ((1UL) << shift);
 	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
@@ -104,7 +108,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 			return -1;
 		}
 
-		new_pte |= pte_set_hidx(ptep, rpte, 0, slot);
+		new_pte |= pte_set_hidx(ptep, rpte, 0, slot, offset);
 	}
 
 	/*
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 881ebd53ffc27..9b23f12e863cc 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -51,7 +51,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 	unsigned int psize;
 	int ssize;
 	real_pte_t rpte;
-	int i;
+	int i, offset;
 
 	i = batch->index;
 
@@ -67,6 +67,10 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 		psize = get_slice_psize(mm, addr);
 		/* Mask the address for the correct page size */
 		addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1);
+		if (unlikely(psize == MMU_PAGE_16G))
+			offset = PTRS_PER_PUD;
+		else
+			offset = PTRS_PER_PMD;
 #else
 		BUG();
 		psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
@@ -78,6 +82,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 		 * support 64k pages, this might be different from the
 		 * hardware page size encoded in the slice table. */
 		addr &= PAGE_MASK;
+		offset = PTRS_PER_PTE;
 	}
 
 
@@ -91,7 +96,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 	}
 	WARN_ON(vsid == 0);
 	vpn = hpt_vpn(addr, vsid, ssize);
-	rpte = __real_pte(__pte(pte), ptep);
+	rpte = __real_pte(__pte(pte), ptep, offset);
 
 	/*
 	 * Check if we have an active batch on this CPU. If not, just

From fc5c2f4a55a2c258e12013cdf287cf266dbcd2a7 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 13 Feb 2018 16:39:33 +0530
Subject: [PATCH 561/676] powerpc/mm/hash64: Zero PGD pages on allocation

On powerpc we allocate page table pages from slab caches of different
sizes. Currently we have a constructor that zeroes out the objects when
we allocate them for the first time.

We expect the objects to be zeroed out when we free the the object
back to slab cache. This happens in the unmap path. For hugetlb pages
we call huge_pte_get_and_clear() to do that.

With the current configuration of page table size, both PUD and PGD
level tables are allocated from the same slab cache. At the PUD level,
we use the second half of the table to store the slot information. But
we never clear that when unmapping.

When such a freed object is then allocated for a PGD page, the second
half of the page table page will not be zeroed as expected. This
results in a kernel crash.

Fix it by always clearing PGD pages when they're allocated.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
[mpe: Change log wording and formatting, add whitespace]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/pgalloc.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
index 827ebce4df90e..4746bc68d446d 100644
--- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
@@ -73,10 +73,16 @@ static inline void radix__pgd_free(struct mm_struct *mm, pgd_t *pgd)
 
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
+	pgd_t *pgd;
+
 	if (radix_enabled())
 		return radix__pgd_alloc(mm);
-	return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE),
-		pgtable_gfp_flags(mm, GFP_KERNEL));
+
+	pgd = kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE),
+			       pgtable_gfp_flags(mm, GFP_KERNEL));
+	memset(pgd, 0, PGD_TABLE_SIZE);
+
+	return pgd;
 }
 
 static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)

From 82343484a2d4c97a03bfd81303b5493c65f05c50 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Mon, 12 Feb 2018 14:34:08 -0800
Subject: [PATCH 562/676] powerpc/pseries: Fix build break for SPLPAR=n and CPU
 hotplug

Commit e67e02a544e9 ("powerpc/pseries: Fix cpu hotplug crash with
memoryless nodes") adds an unconditional call to
find_and_online_cpu_nid(), which is only declared if CONFIG_PPC_SPLPAR
is enabled. This results in the following build error if this is not
the case.

  arch/powerpc/platforms/pseries/hotplug-cpu.o: In function `dlpar_online_cpu':
  arch/powerpc/platforms/pseries/hotplug-cpu.c:369:
  			undefined reference to `.find_and_online_cpu_nid'

Follow the guideline provided by similar functions and provide a dummy
function if CONFIG_PPC_SPLPAR is not enabled. This also moves the
external function declaration into an include file where it should be.

Fixes: e67e02a544e9 ("powerpc/pseries: Fix cpu hotplug crash with memoryless nodes")
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
[mpe: Change subject to emphasise the build fix]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/topology.h          | 5 +++++
 arch/powerpc/platforms/pseries/hotplug-cpu.c | 2 --
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 1c02e6900f785..5932481109020 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -87,6 +87,7 @@ static inline int numa_update_cpu_topology(bool cpus_locked)
 extern int start_topology_update(void);
 extern int stop_topology_update(void);
 extern int prrn_is_enabled(void);
+extern int find_and_online_cpu_nid(int cpu);
 #else
 static inline int start_topology_update(void)
 {
@@ -100,6 +101,10 @@ static inline int prrn_is_enabled(void)
 {
 	return 0;
 }
+static inline int find_and_online_cpu_nid(int cpu)
+{
+	return 0;
+}
 #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
 
 #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_NEED_MULTIPLE_NODES)
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index f78fd2068d56a..652d3e96b812b 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -342,8 +342,6 @@ static void pseries_remove_processor(struct device_node *np)
 	cpu_maps_update_done();
 }
 
-extern int find_and_online_cpu_nid(int cpu);
-
 static int dlpar_online_cpu(struct device_node *dn)
 {
 	int rc = 0;

From 910961754572a2f4c83ad7e610d180e3e6c29bda Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Mon, 12 Feb 2018 14:34:07 -0800
Subject: [PATCH 563/676] powerpc/kdump: Fix powernv build break when
 KEXEC_CORE=n

If KEXEC_CORE is not enabled, powernv builds fail as follows.

  arch/powerpc/platforms/powernv/smp.c: In function 'pnv_smp_cpu_kill_self':
  arch/powerpc/platforms/powernv/smp.c:236:4: error:
  	implicit declaration of function 'crash_ipi_callback'

Add dummy function calls, similar to kdump_in_progress(), to solve the
problem.

Fixes: 4145f358644b ("powernv/kdump: Fix cases where the kdump kernel can get HMI's")
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Acked-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/kexec.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index 9dcbfa6bbb91e..d8b1e8e7e035b 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -140,6 +140,12 @@ static inline bool kdump_in_progress(void)
 	return false;
 }
 
+static inline void crash_ipi_callback(struct pt_regs *regs) { }
+
+static inline void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
+{
+}
+
 #endif /* CONFIG_KEXEC_CORE */
 #endif /* ! __ASSEMBLY__ */
 #endif /* __KERNEL__ */

From ecdf06e1ea5376bba03c155751f6869d3dfaa210 Mon Sep 17 00:00:00 2001
From: Harish <harish@linux.vnet.ibm.com>
Date: Tue, 13 Feb 2018 12:02:55 +0530
Subject: [PATCH 564/676] selftests/powerpc: Fix to use ucontext_t instead of
 struct ucontext
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With glibc 2.26 'struct ucontext' is removed to improve POSIX
compliance, which breaks powerpc/alignment_handler selftest. Fix the
test by using ucontext_t. Tested on ppc, works with older glibc
versions as well.

Fixes the following:
  alignment_handler.c: In function ‘sighandler’:
  alignment_handler.c:68:5: error: dereferencing pointer to incomplete type ‘struct ucontext’
    ucp->uc_mcontext.gp_regs[PT_NIP] += 4;

Signed-off-by: Harish <harish@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 tools/testing/selftests/powerpc/alignment/alignment_handler.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/powerpc/alignment/alignment_handler.c b/tools/testing/selftests/powerpc/alignment/alignment_handler.c
index 39fd362415cfe..0f2698f9fd6d8 100644
--- a/tools/testing/selftests/powerpc/alignment/alignment_handler.c
+++ b/tools/testing/selftests/powerpc/alignment/alignment_handler.c
@@ -57,7 +57,7 @@ volatile int gotsig;
 
 void sighandler(int sig, siginfo_t *info, void *ctx)
 {
-	struct ucontext *ucp = ctx;
+	ucontext_t *ucp = ctx;
 
 	if (!testing) {
 		signal(sig, SIG_DFL);

From 295cc7eb314eb3321fb6d67ca6f7305f5c50d10f Mon Sep 17 00:00:00 2001
From: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>
Date: Thu, 8 Feb 2018 09:19:08 -0500
Subject: [PATCH 565/676] x86/smpboot: Fix uncore_pci_remove() indexing bug
 when hot-removing a physical CPU

When a physical CPU is hot-removed, the following warning messages
are shown while the uncore device is removed in uncore_pci_remove():

  WARNING: CPU: 120 PID: 5 at arch/x86/events/intel/uncore.c:988
  uncore_pci_remove+0xf1/0x110
  ...
  CPU: 120 PID: 5 Comm: kworker/u1024:0 Not tainted 4.15.0-rc8 #1
  Workqueue: kacpi_hotplug acpi_hotplug_work_fn
  ...
  Call Trace:
  pci_device_remove+0x36/0xb0
  device_release_driver_internal+0x145/0x210
  pci_stop_bus_device+0x76/0xa0
  pci_stop_root_bus+0x44/0x60
  acpi_pci_root_remove+0x1f/0x80
  acpi_bus_trim+0x54/0x90
  acpi_bus_trim+0x2e/0x90
  acpi_device_hotplug+0x2bc/0x4b0
  acpi_hotplug_work_fn+0x1a/0x30
  process_one_work+0x141/0x340
  worker_thread+0x47/0x3e0
  kthread+0xf5/0x130

When uncore_pci_remove() runs, it tries to get the package ID to
clear the value of uncore_extra_pci_dev[].dev[] by using
topology_phys_to_logical_pkg(). The warning messesages are
shown because topology_phys_to_logical_pkg() returns -1.

  arch/x86/events/intel/uncore.c:
  static void uncore_pci_remove(struct pci_dev *pdev)
  {
  ...
          phys_id = uncore_pcibus_to_physid(pdev->bus);
  ...
                  pkg = topology_phys_to_logical_pkg(phys_id); // returns -1
                  for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
                          if (uncore_extra_pci_dev[pkg].dev[i] == pdev) {
                                  uncore_extra_pci_dev[pkg].dev[i] = NULL;
                                  break;
                          }
                  }
                  WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX); // <=========== HERE!!

topology_phys_to_logical_pkg() tries to find
cpuinfo_x86->phys_proc_id that matches the phys_pkg argument.

  arch/x86/kernel/smpboot.c:
  int topology_phys_to_logical_pkg(unsigned int phys_pkg)
  {
          int cpu;

          for_each_possible_cpu(cpu) {
                  struct cpuinfo_x86 *c = &cpu_data(cpu);

                  if (c->initialized && c->phys_proc_id == phys_pkg)
                          return c->logical_proc_id;
          }
          return -1;
  }

However, the phys_proc_id was already set to 0 by remove_siblinginfo()
when the CPU was offlined.

So, topology_phys_to_logical_pkg() cannot find the correct
logical_proc_id and always returns -1.

As the result, uncore_pci_remove() calls WARN_ON_ONCE() and the warning
messages are shown.

What is worse is that the bogus 'pkg' index results in two bugs:

 - We dereference uncore_extra_pci_dev[] with a negative index
 - We fail to clean up a stale pointer in uncore_extra_pci_dev[][]

To fix these bugs, remove the clearing of ->phys_proc_id from remove_siblinginfo().

This should not cause any problems, because ->phys_proc_id is not
used after it is hot-removed and it is re-set while hot-adding.

Signed-off-by: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: yasu.isimatu@gmail.com
Cc: <stable@vger.kernel.org>
Fixes: 30bb9811856f ("x86/topology: Avoid wasting 128k for package id array")
Link: http://lkml.kernel.org/r/ed738d54-0f01-b38b-b794-c31dc118c207@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/smpboot.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6f27facbaa9b0..cfc61e1d45e2d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1430,7 +1430,6 @@ static void remove_siblinginfo(int cpu)
 	cpumask_clear(cpu_llc_shared_mask(cpu));
 	cpumask_clear(topology_sibling_cpumask(cpu));
 	cpumask_clear(topology_core_cpumask(cpu));
-	c->phys_proc_id = 0;
 	c->cpu_core_id = 0;
 	cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
 	recompute_smt_state();

From 627f4a2bdf113ab88abc65cb505c89cbf615eae0 Mon Sep 17 00:00:00 2001
From: Jaedon Shin <jaedon.shin@gmail.com>
Date: Tue, 6 Feb 2018 12:13:21 +0900
Subject: [PATCH 566/676] MIPS: BMIPS: Fix section mismatch warning

Remove the __init annotation from bmips_cpu_setup() to avoid the
following warning.

WARNING: vmlinux.o(.text+0x35c950): Section mismatch in reference from the function brcmstb_pm_s3() to the function .init.text:bmips_cpu_setup()
The function brcmstb_pm_s3() references
the function __init bmips_cpu_setup().
This is often because brcmstb_pm_s3 lacks a __init
annotation or the annotation of bmips_cpu_setup is wrong.

Signed-off-by: Jaedon Shin <jaedon.shin@gmail.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: Kevin Cernekee <cernekee@gmail.com>
Cc: linux-mips@linux-mips.org
Reviewed-by: James Hogan <jhogan@kernel.org>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Patchwork: https://patchwork.linux-mips.org/patch/18589/
Signed-off-by: James Hogan <jhogan@kernel.org>
---
 arch/mips/kernel/smp-bmips.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/kernel/smp-bmips.c b/arch/mips/kernel/smp-bmips.c
index 87dcac2447c8d..9d41732a9146a 100644
--- a/arch/mips/kernel/smp-bmips.c
+++ b/arch/mips/kernel/smp-bmips.c
@@ -572,7 +572,7 @@ asmlinkage void __weak plat_wired_tlb_setup(void)
 	 */
 }
 
-void __init bmips_cpu_setup(void)
+void bmips_cpu_setup(void)
 {
 	void __iomem __maybe_unused *cbr = BMIPS_GET_CBR();
 	u32 __maybe_unused cfg;

From 43d1b29b27c76e7454cd6c85bec4d0e9cbb039f3 Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@linaro.org>
Date: Thu, 8 Feb 2018 21:48:22 +0800
Subject: [PATCH 567/676] sched/cpufreq: Remove unused SUGOV_KTHREAD_PRIORITY
 macro

Since schedutil kernel thread directly set priority to 0, the macro
SUGOV_KTHREAD_PRIORITY is not used.  So remove it.

Signed-off-by: Leo Yan <leo.yan@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vikram Mulukutla <markivx@codeaurora.org>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Link: http://lkml.kernel.org/r/1518097702-9665-1-git-send-email-leo.yan@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cpufreq_schedutil.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index dd062a1c8cf04..7936f548e071e 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -19,8 +19,6 @@
 
 #include "sched.h"
 
-#define SUGOV_KTHREAD_PRIORITY	50
-
 struct sugov_tunables {
 	struct gov_attr_set attr_set;
 	unsigned int rate_limit_us;

From 74eb816b21d520ce37ce8aaf03128ca6067bbe22 Mon Sep 17 00:00:00 2001
From: Progyan Bhattacharya <bprogyan@gmail.com>
Date: Tue, 6 Feb 2018 10:45:23 +0530
Subject: [PATCH 568/676] x86/build: Add arch/x86/tools/insn_decoder_test to
 .gitignore

The file was generated by make command and should not be in the source tree.

Signed-off-by: Progyan Bhattacharya <progyanb@acm.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/.gitignore b/arch/x86/.gitignore
index aff152c87cf4b..5a82bac5e0bc7 100644
--- a/arch/x86/.gitignore
+++ b/arch/x86/.gitignore
@@ -1,6 +1,7 @@
 boot/compressed/vmlinux
 tools/test_get_len
 tools/insn_sanity
+tools/insn_decoder_test
 purgatory/kexec-purgatory.c
 purgatory/purgatory.ro
 

From 67a3ba25aa955198196f40b76b329b3ab9ad415a Mon Sep 17 00:00:00 2001
From: Marcin Nowakowski <marcin.nowakowski@mips.com>
Date: Thu, 1 Feb 2018 12:37:21 +0100
Subject: [PATCH 569/676] MIPS: Fix incorrect mem=X@Y handling

Commit 73fbc1eba7ff ("MIPS: fix mem=X@Y commandline processing") added a
fix to ensure that the memory range between PHYS_OFFSET and low memory
address specified by mem= cmdline argument is not later processed by
free_all_bootmem.  This change was incorrect for systems where the
commandline specifies more than 1 mem argument, as it will cause all
memory between PHYS_OFFSET and each of the memory offsets to be marked
as reserved, which results in parts of the RAM marked as reserved
(Creator CI20's u-boot has a default commandline argument 'mem=256M@0x0
mem=768M@0x30000000').

Change the behaviour to ensure that only the range between PHYS_OFFSET
and the lowest start address of the memories is marked as protected.

This change also ensures that the range is marked protected even if it's
only defined through the devicetree and not only via commandline
arguments.

Reported-by: Mathieu Malaterre <mathieu.malaterre@gmail.com>
Signed-off-by: Marcin Nowakowski <marcin.nowakowski@mips.com>
Fixes: 73fbc1eba7ff ("MIPS: fix mem=X@Y commandline processing")
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: <stable@vger.kernel.org> # v4.11+
Tested-by: Mathieu Malaterre <malat@debian.org>
Patchwork: https://patchwork.linux-mips.org/patch/18562/
Signed-off-by: James Hogan <jhogan@kernel.org>
---
 arch/mips/kernel/setup.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index 85bc601e9a0d4..5f8b0a9e30b3d 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -375,6 +375,7 @@ static void __init bootmem_init(void)
 	unsigned long reserved_end;
 	unsigned long mapstart = ~0UL;
 	unsigned long bootmap_size;
+	phys_addr_t ramstart = (phys_addr_t)ULLONG_MAX;
 	bool bootmap_valid = false;
 	int i;
 
@@ -395,7 +396,8 @@ static void __init bootmem_init(void)
 	max_low_pfn = 0;
 
 	/*
-	 * Find the highest page frame number we have available.
+	 * Find the highest page frame number we have available
+	 * and the lowest used RAM address
 	 */
 	for (i = 0; i < boot_mem_map.nr_map; i++) {
 		unsigned long start, end;
@@ -407,6 +409,8 @@ static void __init bootmem_init(void)
 		end = PFN_DOWN(boot_mem_map.map[i].addr
 				+ boot_mem_map.map[i].size);
 
+		ramstart = min(ramstart, boot_mem_map.map[i].addr);
+
 #ifndef CONFIG_HIGHMEM
 		/*
 		 * Skip highmem here so we get an accurate max_low_pfn if low
@@ -436,6 +440,13 @@ static void __init bootmem_init(void)
 		mapstart = max(reserved_end, start);
 	}
 
+	/*
+	 * Reserve any memory between the start of RAM and PHYS_OFFSET
+	 */
+	if (ramstart > PHYS_OFFSET)
+		add_memory_region(PHYS_OFFSET, ramstart - PHYS_OFFSET,
+				  BOOT_MEM_RESERVED);
+
 	if (min_low_pfn >= max_low_pfn)
 		panic("Incorrect memory mapping !!!");
 	if (min_low_pfn > ARCH_PFN_OFFSET) {
@@ -664,9 +675,6 @@ static int __init early_parse_mem(char *p)
 
 	add_memory_region(start, size, BOOT_MEM_RAM);
 
-	if (start && start > PHYS_OFFSET)
-		add_memory_region(PHYS_OFFSET, start - PHYS_OFFSET,
-				BOOT_MEM_RESERVED);
 	return 0;
 }
 early_param("mem", early_parse_mem);

From c25d99d20ba69824a1e2cc118e04b877cd427afc Mon Sep 17 00:00:00 2001
From: "mike.travis@hpe.com" <mike.travis@hpe.com>
Date: Mon, 5 Feb 2018 16:15:04 -0600
Subject: [PATCH 570/676] x86/platform/UV: Fix GAM Range Table entries less
 than 1GB

The latest UV platforms include the new ApachePass NVDIMMs into the
UV address space.  This has introduced address ranges in the Global
Address Map Table that are less than the previous lowest range, which
was 2GB.  Fix the address calculation so it accommodates address ranges
from bytes to exabytes.

Signed-off-by: Mike Travis <mike.travis@hpe.com>
Reviewed-by: Andrew Banman <andrew.banman@hpe.com>
Reviewed-by: Dimitri Sivanich <dimitri.sivanich@hpe.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Russ Anderson <russ.anderson@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180205221503.190219903@stormcage.americas.sgi.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 46b675aaf20b8..f11910b44638c 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -1176,16 +1176,25 @@ static void __init decode_gam_rng_tbl(unsigned long ptr)
 
 	uv_gre_table = gre;
 	for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
+		unsigned long size = ((unsigned long)(gre->limit - lgre)
+					<< UV_GAM_RANGE_SHFT);
+		int order = 0;
+		char suffix[] = " KMGTPE";
+
+		while (size > 9999 && order < sizeof(suffix)) {
+			size /= 1024;
+			order++;
+		}
+
 		if (!index) {
 			pr_info("UV: GAM Range Table...\n");
 			pr_info("UV:  # %20s %14s %5s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN");
 		}
-		pr_info("UV: %2d: 0x%014lx-0x%014lx %5luG %3d   %04x  %02x %02x\n",
+		pr_info("UV: %2d: 0x%014lx-0x%014lx %5lu%c %3d   %04x  %02x %02x\n",
 			index++,
 			(unsigned long)lgre << UV_GAM_RANGE_SHFT,
 			(unsigned long)gre->limit << UV_GAM_RANGE_SHFT,
-			((unsigned long)(gre->limit - lgre)) >>
-				(30 - UV_GAM_RANGE_SHFT), /* 64M -> 1G */
+			size, suffix[order],
 			gre->type, gre->nasid, gre->sockid, gre->pnode);
 
 		lgre = gre->limit;

From 01684e72f16727e6ae0aeb1392f478e11ec5b8f7 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 2 Feb 2018 15:56:19 +0100
Subject: [PATCH 571/676] x86/error_inject: Make just_return_func() globally
 visible

With link time optimizations enabled, I get a link failure:

  ./ccLbOEHX.ltrans19.ltrans.o: In function `override_function_with_return':
  <artificial>:(.text+0x7f3): undefined reference to `just_return_func'

Marking the symbol .globl makes it work as expected.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicolas Pitre <nico@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Fixes: 540adea3809f ("error-injection: Separate error-injection from kprobe")
Link: http://lkml.kernel.org/r/20180202145634.200291-3-arnd@arndb.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/lib/error-inject.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c
index 7b881d03d0ddd..3cdf06128d13c 100644
--- a/arch/x86/lib/error-inject.c
+++ b/arch/x86/lib/error-inject.c
@@ -7,6 +7,7 @@ asmlinkage void just_return_func(void);
 
 asm(
 	".type just_return_func, @function\n"
+	".globl just_return_func\n"
 	"just_return_func:\n"
 	"	ret\n"
 	".size just_return_func, .-just_return_func\n"

From 95bcade33a8af38755c9b0636e36a36ad3789fe6 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 13 Feb 2018 13:22:56 +0000
Subject: [PATCH 572/676] locking/qspinlock: Ensure node is initialised before
 updating prev->next

When a locker ends up queuing on the qspinlock locking slowpath, we
initialise the relevant mcs node and publish it indirectly by updating
the tail portion of the lock word using xchg_tail. If we find that there
was a pre-existing locker in the queue, we subsequently update their
->next field to point at our node so that we are notified when it's our
turn to take the lock.

This can be roughly illustrated as follows:

  /* Initialise the fields in node and encode a pointer to node in tail */
  tail = initialise_node(node);

  /*
   * Exchange tail into the lockword using an atomic read-modify-write
   * operation with release semantics
   */
  old = xchg_tail(lock, tail);

  /* If there was a pre-existing waiter ... */
  if (old & _Q_TAIL_MASK) {
	prev = decode_tail(old);
	smp_read_barrier_depends();

	/* ... then update their ->next field to point to node.
	WRITE_ONCE(prev->next, node);
  }

The conditional update of prev->next therefore relies on the address
dependency from the result of xchg_tail ensuring order against the
prior initialisation of node. However, since the release semantics of
the xchg_tail operation apply only to the write portion of the RmW,
then this ordering is not guaranteed and it is possible for the CPU
to return old before the writes to node have been published, consequently
allowing us to point prev->next to an uninitialised node.

This patch fixes the problem by making the update of prev->next a RELEASE
operation, which also removes the reliance on dependency ordering.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1518528177-19169-2-git-send-email-will.deacon@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 38ece035039e3..348c8cec10426 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -408,14 +408,15 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 	 */
 	if (old & _Q_TAIL_MASK) {
 		prev = decode_tail(old);
+
 		/*
-		 * The above xchg_tail() is also a load of @lock which
-		 * generates, through decode_tail(), a pointer.  The address
-		 * dependency matches the RELEASE of xchg_tail() such that
-		 * the subsequent access to @prev happens after.
+		 * We must ensure that the stores to @node are observed before
+		 * the write to prev->next. The address dependency from
+		 * xchg_tail is not sufficient to ensure this because the read
+		 * component of xchg_tail is unordered with respect to the
+		 * initialisation of @node.
 		 */
-
-		WRITE_ONCE(prev->next, node);
+		smp_store_release(&prev->next, node);
 
 		pv_wait_node(node, prev);
 		arch_mcs_spin_lock_contended(&node->locked);

From 11dc13224c975efcec96647a4768a6f1bb7a19a8 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 13 Feb 2018 13:22:57 +0000
Subject: [PATCH 573/676] locking/qspinlock: Ensure node->count is updated
 before initialising node

When queuing on the qspinlock, the count field for the current CPU's head
node is incremented. This needn't be atomic because locking in e.g. IRQ
context is balanced and so an IRQ will return with node->count as it
found it.

However, the compiler could in theory reorder the initialisation of
node[idx] before the increment of the head node->count, causing an
IRQ to overwrite the initialised node and potentially corrupt the lock
state.

Avoid the potential for this harmful compiler reordering by placing a
barrier() between the increment of the head node->count and the subsequent
node initialisation.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1518528177-19169-3-git-send-email-will.deacon@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 348c8cec10426..d880296245c59 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -379,6 +379,14 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 	tail = encode_tail(smp_processor_id(), idx);
 
 	node += idx;
+
+	/*
+	 * Ensure that we increment the head node->count before initialising
+	 * the actual node. If the compiler is kind enough to reorder these
+	 * stores, then an IRQ could overwrite our assignments.
+	 */
+	barrier();
+
 	node->locked = 0;
 	node->next = NULL;
 	pv_init_node(node);

From 61e02392d3c7ecac1f91c0a90a8043d67e081846 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 13 Feb 2018 13:30:19 +0000
Subject: [PATCH 574/676] locking/atomic/bitops: Document and clarify ordering
 semantics for failed test_and_{}_bit()

A test_and_{}_bit() operation fails if the value of the bit is such that
the modification does not take place. For example, if test_and_set_bit()
returns 1. In these cases, follow the behaviour of cmpxchg and allow the
operation to be unordered. This also applies to test_and_set_bit_lock()
if the lock is found to be be taken already.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1518528619-20049-1-git-send-email-will.deacon@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/atomic_bitops.txt   | 7 ++++++-
 include/asm-generic/bitops/lock.h | 3 ++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt
index 5550bfdcce5f1..be70b32c95d91 100644
--- a/Documentation/atomic_bitops.txt
+++ b/Documentation/atomic_bitops.txt
@@ -58,7 +58,12 @@ Like with atomic_t, the rule of thumb is:
 
  - RMW operations that have a return value are fully ordered.
 
-Except for test_and_set_bit_lock() which has ACQUIRE semantics and
+ - RMW operations that are conditional are unordered on FAILURE,
+   otherwise the above rules apply. In the case of test_and_{}_bit() operations,
+   if the bit in memory is unchanged by the operation then it is deemed to have
+   failed.
+
+Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics and
 clear_bit_unlock() which has RELEASE semantics.
 
 Since a platform only has a single means of achieving atomic operations
diff --git a/include/asm-generic/bitops/lock.h b/include/asm-generic/bitops/lock.h
index bc397573c43ad..67ab280ad1340 100644
--- a/include/asm-generic/bitops/lock.h
+++ b/include/asm-generic/bitops/lock.h
@@ -7,7 +7,8 @@
  * @nr: Bit to set
  * @addr: Address to count from
  *
- * This operation is atomic and provides acquire barrier semantics.
+ * This operation is atomic and provides acquire barrier semantics if
+ * the returned value is 0.
  * It can be used to implement bit locks.
  */
 #define test_and_set_bit_lock(nr, addr)	test_and_set_bit(nr, addr)

From 2dd6fd2e999774041397f2a7da2e1d30b3a27c3a Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho@tycho.ws>
Date: Thu, 1 Feb 2018 12:41:19 +0100
Subject: [PATCH 575/676] locking/semaphore: Update the file path in
 documentation

While reading this header I noticed that the locking stuff has moved to
kernel/locking/*, so update the path in semaphore.h to point to that.

Signed-off-by: Tycho Andersen <tycho@tycho.ws>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180201114119.1090-1-tycho@tycho.ws
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/semaphore.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h
index dc368b8ce215c..11c86fbfeb985 100644
--- a/include/linux/semaphore.h
+++ b/include/linux/semaphore.h
@@ -4,7 +4,7 @@
  *
  * Distributed under the terms of the GNU GPL, version 2
  *
- * Please see kernel/semaphore.c for documentation of these functions
+ * Please see kernel/locking/semaphore.c for documentation of these functions
  */
 #ifndef __LINUX_SEMAPHORE_H
 #define __LINUX_SEMAPHORE_H

From fd0e786d9d09024f67bd71ec094b110237dc3840 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Thu, 25 Jan 2018 14:23:48 -0800
Subject: [PATCH 576/676] x86/mm, mm/hwpoison: Don't unconditionally unmap
 kernel 1:1 pages

In the following commit:

  ce0fa3e56ad2 ("x86/mm, mm/hwpoison: Clear PRESENT bit for kernel 1:1 mappings of poison pages")

... we added code to memory_failure() to unmap the page from the
kernel 1:1 virtual address space to avoid speculative access to the
page logging additional errors.

But memory_failure() may not always succeed in taking the page offline,
especially if the page belongs to the kernel.  This can happen if
there are too many corrected errors on a page and either mcelog(8)
or drivers/ras/cec.c asks to take a page offline.

Since we remove the 1:1 mapping early in memory_failure(), we can
end up with the page unmapped, but still in use. On the next access
the kernel crashes :-(

There are also various debug paths that call memory_failure() to simulate
occurrence of an error. Since there is no actual error in memory, we
don't need to map out the page for those cases.

Revert most of the previous attempt and keep the solution local to
arch/x86/kernel/cpu/mcheck/mce.c. Unmap the page only when:

	1) there is a real error
	2) memory_failure() succeeds.

All of this only applies to 64-bit systems. 32-bit kernel doesn't map
all of memory into kernel space. It isn't worth adding the code to unmap
the piece that is mapped because nobody would run a 32-bit kernel on a
machine that has recoverable machine checks.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave <dave.hansen@intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robert (Persistent Memory) <elliott@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Cc: stable@vger.kernel.org #v4.14
Fixes: ce0fa3e56ad2 ("x86/mm, mm/hwpoison: Clear PRESENT bit for kernel 1:1 mappings of poison pages")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/page_64.h            |  4 ----
 arch/x86/kernel/cpu/mcheck/mce-internal.h | 15 +++++++++++++++
 arch/x86/kernel/cpu/mcheck/mce.c          | 17 +++++++++++------
 include/linux/mm_inline.h                 |  6 ------
 mm/memory-failure.c                       |  2 --
 5 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 4baa6bceb2325..d652a38080659 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -52,10 +52,6 @@ static inline void clear_page(void *page)
 
 void copy_page(void *to, void *from);
 
-#ifdef CONFIG_X86_MCE
-#define arch_unmap_kpfn arch_unmap_kpfn
-#endif
-
 #endif	/* !__ASSEMBLY__ */
 
 #ifdef CONFIG_X86_VSYSCALL_EMULATION
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index aa0d5df9dc60e..e956eb2670619 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -115,4 +115,19 @@ static inline void mce_unregister_injector_chain(struct notifier_block *nb)	{ }
 
 extern struct mca_config mca_cfg;
 
+#ifndef CONFIG_X86_64
+/*
+ * On 32-bit systems it would be difficult to safely unmap a poison page
+ * from the kernel 1:1 map because there are no non-canonical addresses that
+ * we can use to refer to the address without risking a speculative access.
+ * However, this isn't much of an issue because:
+ * 1) Few unmappable pages are in the 1:1 map. Most are in HIGHMEM which
+ *    are only mapped into the kernel as needed
+ * 2) Few people would run a 32-bit kernel on a machine that supports
+ *    recoverable errors because they have too much memory to boot 32-bit.
+ */
+static inline void mce_unmap_kpfn(unsigned long pfn) {}
+#define mce_unmap_kpfn mce_unmap_kpfn
+#endif
+
 #endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 75f405ac085c5..8ff94d1e2dce5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -105,6 +105,10 @@ static struct irq_work mce_irq_work;
 
 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
 
+#ifndef mce_unmap_kpfn
+static void mce_unmap_kpfn(unsigned long pfn);
+#endif
+
 /*
  * CPU/chipset specific EDAC code can register a notifier call here to print
  * MCE errors in a human-readable form.
@@ -590,7 +594,8 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
 
 	if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
 		pfn = mce->addr >> PAGE_SHIFT;
-		memory_failure(pfn, 0);
+		if (!memory_failure(pfn, 0))
+			mce_unmap_kpfn(pfn);
 	}
 
 	return NOTIFY_OK;
@@ -1057,12 +1062,13 @@ static int do_memory_failure(struct mce *m)
 	ret = memory_failure(m->addr >> PAGE_SHIFT, flags);
 	if (ret)
 		pr_err("Memory error not recovered");
+	else
+		mce_unmap_kpfn(m->addr >> PAGE_SHIFT);
 	return ret;
 }
 
-#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE)
-
-void arch_unmap_kpfn(unsigned long pfn)
+#ifndef mce_unmap_kpfn
+static void mce_unmap_kpfn(unsigned long pfn)
 {
 	unsigned long decoy_addr;
 
@@ -1073,7 +1079,7 @@ void arch_unmap_kpfn(unsigned long pfn)
 	 * We would like to just call:
 	 *	set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
 	 * but doing that would radically increase the odds of a
-	 * speculative access to the posion page because we'd have
+	 * speculative access to the poison page because we'd have
 	 * the virtual address of the kernel 1:1 mapping sitting
 	 * around in registers.
 	 * Instead we get tricky.  We create a non-canonical address
@@ -1098,7 +1104,6 @@ void arch_unmap_kpfn(unsigned long pfn)
 
 	if (set_memory_np(decoy_addr, 1))
 		pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
-
 }
 #endif
 
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index c30b32e3c8624..10191c28fc04c 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -127,10 +127,4 @@ static __always_inline enum lru_list page_lru(struct page *page)
 
 #define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
 
-#ifdef arch_unmap_kpfn
-extern void arch_unmap_kpfn(unsigned long pfn);
-#else
-static __always_inline void arch_unmap_kpfn(unsigned long pfn) { }
-#endif
-
 #endif
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 4b80ccee4535f..8291b75f42c84 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1139,8 +1139,6 @@ int memory_failure(unsigned long pfn, int flags)
 		return 0;
 	}
 
-	arch_unmap_kpfn(pfn);
-
 	orig_head = hpage = compound_head(p);
 	num_poisoned_pages_inc();
 

From 67b4110f8c8d16e588d7730db8e8b01b32c1bd8b Mon Sep 17 00:00:00 2001
From: Nitesh Shetty <nj.shetty@samsung.com>
Date: Tue, 13 Feb 2018 21:18:12 +0530
Subject: [PATCH 577/676] blk: optimization for classic polling

This removes the dependency on interrupts to wake up task. Set task
state as TASK_RUNNING, if need_resched() returns true,
while polling for IO completion.
Earlier, polling task used to sleep, relying on interrupt to wake it up.
This made some IO take very long when interrupt-coalescing is enabled in
NVMe.

Reference:
http://lists.infradead.org/pipermail/linux-nvme/2018-February/015435.html

Changes since v2->v3:
	-using __set_current_state() instead of set_current_state()

Changes since v1->v2:
	-setting task state once in blk_poll, instead of multiple
callers.

Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index df93102e21494..357492712b0ea 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3164,6 +3164,7 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
 		cpu_relax();
 	}
 
+	__set_current_state(TASK_RUNNING);
 	return false;
 }
 

From 7bcfab202ca71bece02b283cdd104301c07eece4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 13 Feb 2018 07:44:34 -0800
Subject: [PATCH 578/676] powerpc/macio: set a proper dma_coherent_mask

We have expected busses to set up a coherent mask to properly use the
common dma mapping code for a long time, and now that I've added a warning
macio turned out to not set one up yet.  This sets it to the same value
as the dma_mask, which seems to be what the drivers expect.

Reported-by: Mathieu Malaterre <malat@debian.org>
Tested-by: Mathieu Malaterre <malat@debian.org>
Reported-by: Meelis Roos <mroos@linux.ee>
Tested-by: Meelis Roos <mroos@linux.ee>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/macintosh/macio_asic.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c
index 62f541f968f6f..07074820a1674 100644
--- a/drivers/macintosh/macio_asic.c
+++ b/drivers/macintosh/macio_asic.c
@@ -375,6 +375,7 @@ static struct macio_dev * macio_add_one_device(struct macio_chip *chip,
 	dev->ofdev.dev.of_node = np;
 	dev->ofdev.archdata.dma_mask = 0xffffffffUL;
 	dev->ofdev.dev.dma_mask = &dev->ofdev.archdata.dma_mask;
+	dev->ofdev.dev.coherent_dma_mask = dev->ofdev.archdata.dma_mask;
 	dev->ofdev.dev.parent = parent;
 	dev->ofdev.dev.bus = &macio_bus_type;
 	dev->ofdev.dev.release = macio_release_dev;

From 49edd5bf429c405b3a7f75503845d9f66a47dd4b Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Tue, 6 Feb 2018 07:20:55 -0700
Subject: [PATCH 579/676] gfs2: Fixes to "Implement iomap for block_map"

It turns out that commit 3974320ca6 "Implement iomap for block_map"
introduced a few bugs that trigger occasional failures with xfstest
generic/476:

In gfs2_iomap_begin, we jump to do_alloc when we determine that we are
beyond the end of the allocated metadata (height > ip->i_height).
There, we can end up calling hole_size with a metapath that doesn't
match the current metadata tree, which doesn't make sense.  After
untangling the code at do_alloc, fix this by checking if the block we
are looking for is within the range of allocated metadata.

In addition, add a BUG() in case gfs2_iomap_begin is accidentally called
for reading stuffed files: this is handled separately.  Make sure we
don't truncate iomap->length for reads beyond the end of the file; in
that case, the entire range counts as a hole.

Finally, revert to taking a bitmap write lock when doing allocations.
It's unclear why that change didn't lead to any failures during testing.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/bmap.c | 43 +++++++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 86863792f36ae..86d6a4435c87c 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -716,7 +716,7 @@ int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
 	__be64 *ptr;
 	sector_t lblock;
 	sector_t lend;
-	int ret;
+	int ret = 0;
 	int eob;
 	unsigned int len;
 	struct buffer_head *bh;
@@ -728,12 +728,14 @@ int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
 		goto out;
 	}
 
-	if ((flags & IOMAP_REPORT) && gfs2_is_stuffed(ip)) {
-		gfs2_stuffed_iomap(inode, iomap);
-		if (pos >= iomap->length)
-			return -ENOENT;
-		ret = 0;
-		goto out;
+	if (gfs2_is_stuffed(ip)) {
+		if (flags & IOMAP_REPORT) {
+			gfs2_stuffed_iomap(inode, iomap);
+			if (pos >= iomap->length)
+				ret = -ENOENT;
+			goto out;
+		}
+		BUG_ON(!(flags & IOMAP_WRITE));
 	}
 
 	lblock = pos >> inode->i_blkbits;
@@ -744,7 +746,7 @@ int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
 	iomap->type = IOMAP_HOLE;
 	iomap->length = (u64)(lend - lblock) << inode->i_blkbits;
 	iomap->flags = IOMAP_F_MERGED;
-	bmap_lock(ip, 0);
+	bmap_lock(ip, flags & IOMAP_WRITE);
 
 	/*
 	 * Directory data blocks have a struct gfs2_meta_header header, so the
@@ -787,27 +789,28 @@ int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
 		iomap->flags |= IOMAP_F_BOUNDARY;
 	iomap->length = (u64)len << inode->i_blkbits;
 
-	ret = 0;
-
 out_release:
 	release_metapath(&mp);
-	bmap_unlock(ip, 0);
+	bmap_unlock(ip, flags & IOMAP_WRITE);
 out:
 	trace_gfs2_iomap_end(ip, iomap, ret);
 	return ret;
 
 do_alloc:
-	if (!(flags & IOMAP_WRITE)) {
-		if (pos >= i_size_read(inode)) {
+	if (flags & IOMAP_WRITE) {
+		ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
+	} else if (flags & IOMAP_REPORT) {
+		loff_t size = i_size_read(inode);
+		if (pos >= size)
 			ret = -ENOENT;
-			goto out_release;
-		}
-		ret = 0;
-		iomap->length = hole_size(inode, lblock, &mp);
-		goto out_release;
+		else if (height <= ip->i_height)
+			iomap->length = hole_size(inode, lblock, &mp);
+		else
+			iomap->length = size - pos;
+	} else {
+		if (height <= ip->i_height)
+			iomap->length = hole_size(inode, lblock, &mp);
 	}
-
-	ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
 	goto out_release;
 }
 

From 3fd176b754e992e1cdf1693ea8184626d1ed7671 Mon Sep 17 00:00:00 2001
From: Jianchao Wang <jianchao.w.wang@oracle.com>
Date: Mon, 12 Feb 2018 20:54:45 +0800
Subject: [PATCH 580/676] nvme: fix the deadlock in nvme_update_formats

nvme_update_formats will invoke nvme_ns_remove under namespaces_mutext.
The will cause deadlock because nvme_ns_remove will also require
the namespaces_mutext. Fix it by getting the ns entries which should
be removed under namespaces_mutext and invoke nvme_ns_remove out of
namespaces_mutext.

Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/core.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 6d0490b477c96..52b3626fb64ed 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1117,14 +1117,19 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 
 static void nvme_update_formats(struct nvme_ctrl *ctrl)
 {
-	struct nvme_ns *ns;
+	struct nvme_ns *ns, *next;
+	LIST_HEAD(rm_list);
 
 	mutex_lock(&ctrl->namespaces_mutex);
 	list_for_each_entry(ns, &ctrl->namespaces, list) {
-		if (ns->disk && nvme_revalidate_disk(ns->disk))
-			nvme_ns_remove(ns);
+		if (ns->disk && nvme_revalidate_disk(ns->disk)) {
+			list_move_tail(&ns->list, &rm_list);
+		}
 	}
 	mutex_unlock(&ctrl->namespaces_mutex);
+
+	list_for_each_entry_safe(ns, next, &rm_list, list)
+		nvme_ns_remove(ns);
 }
 
 static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)

From 815c6704bf9f1c59f3a6be380a4032b9c57b12f1 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Tue, 13 Feb 2018 05:44:44 -0700
Subject: [PATCH 581/676] nvme-pci: Remap CMB SQ entries on every controller
 reset

The controller memory buffer is remapped into a kernel address on each
reset, but the driver was setting the submission queue base address
only on the very first queue creation. The remapped address is likely to
change after a reset, so accessing the old address will hit a kernel bug.

This patch fixes that by setting the queue's CMB base address each time
the queue is created.

Fixes: f63572dff1421 ("nvme: unmap CMB and remove sysfs file in reset path")
Reported-by: Christian Black <christian.d.black@intel.com>
Cc: Jon Derrick <jonathan.derrick@intel.com>
Cc: <stable@vger.kernel.org> # 4.9+
Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/pci.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ab9c19525fa80..b427157af74e0 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1364,18 +1364,14 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
 static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
 				int qid, int depth)
 {
-	if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
-		unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
-						      dev->ctrl.page_size);
-		nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
-		nvmeq->sq_cmds_io = dev->cmb + offset;
-	} else {
-		nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
-					&nvmeq->sq_dma_addr, GFP_KERNEL);
-		if (!nvmeq->sq_cmds)
-			return -ENOMEM;
-	}
+	/* CMB SQEs will be mapped before creation */
+	if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS))
+		return 0;
 
+	nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
+					    &nvmeq->sq_dma_addr, GFP_KERNEL);
+	if (!nvmeq->sq_cmds)
+		return -ENOMEM;
 	return 0;
 }
 
@@ -1449,6 +1445,13 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 	struct nvme_dev *dev = nvmeq->dev;
 	int result;
 
+	if (dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
+		unsigned offset = (qid - 1) * roundup(SQ_SIZE(nvmeq->q_depth),
+						      dev->ctrl.page_size);
+		nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
+		nvmeq->sq_cmds_io = dev->cmb + offset;
+	}
+
 	nvmeq->cq_vector = qid - 1;
 	result = adapter_alloc_cq(dev, qid, nvmeq);
 	if (result < 0)

From 4244140d7b8f406b7edfd01c050dea783aa1efc5 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Thu, 8 Feb 2018 08:55:34 -0700
Subject: [PATCH 582/676] nvme-pci: Fix timeouts in connecting state

We need to halt the controller immediately if we haven't completed
initialization as indicated by the new "connecting" state.

Fixes: ad70062cdb ("nvme-pci: introduce RECONNECTING state to mark initializing procedure")
Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/pci.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b427157af74e0..73036d2fbbd58 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1215,13 +1215,17 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	 * cancellation error. All outstanding requests are completed on
 	 * shutdown, so we return BLK_EH_HANDLED.
 	 */
-	if (dev->ctrl.state == NVME_CTRL_RESETTING) {
+	switch (dev->ctrl.state) {
+	case NVME_CTRL_CONNECTING:
+	case NVME_CTRL_RESETTING:
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, disable controller\n",
 			 req->tag, nvmeq->qid);
 		nvme_dev_disable(dev, false);
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
 		return BLK_EH_HANDLED;
+	default:
+		break;
 	}
 
 	/*

From 117172c8f9d40ba1de8cb35c6e614422faa03330 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Tue, 13 Feb 2018 09:01:54 +0000
Subject: [PATCH 583/676] drm/i915/breadcrumbs: Ignore unsubmitted signalers

When a request is preempted, it is unsubmitted from the HW queue and
removed from the active list of breadcrumbs. In the process, this
however triggers the signaler and it may see the clear rbtree with the
old, and still valid, seqno, or it may match the cleared seqno with the
now zero rq->global_seqno. This confuses the signaler into action and
signaling the fence.

Fixes: d6a2289d9d6b ("drm/i915: Remove the preempted request from the execution queue")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: <stable@vger.kernel.org> # v4.12+
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180206094633.30181-1-chris@chris-wilson.co.uk
(cherry picked from commit fd10e2ce9905030d922e179a8047a4d50daffd8e)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180213090154.17373-1-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/intel_breadcrumbs.c | 29 ++++++++----------------
 1 file changed, 10 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index bd40fea16b4f1..f54ddda9fdada 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -594,29 +594,16 @@ void intel_engine_remove_wait(struct intel_engine_cs *engine,
 	spin_unlock_irq(&b->rb_lock);
 }
 
-static bool signal_valid(const struct drm_i915_gem_request *request)
-{
-	return intel_wait_check_request(&request->signaling.wait, request);
-}
-
 static bool signal_complete(const struct drm_i915_gem_request *request)
 {
 	if (!request)
 		return false;
 
-	/* If another process served as the bottom-half it may have already
-	 * signalled that this wait is already completed.
-	 */
-	if (intel_wait_complete(&request->signaling.wait))
-		return signal_valid(request);
-
-	/* Carefully check if the request is complete, giving time for the
+	/*
+	 * Carefully check if the request is complete, giving time for the
 	 * seqno to be visible or if the GPU hung.
 	 */
-	if (__i915_request_irq_complete(request))
-		return true;
-
-	return false;
+	return __i915_request_irq_complete(request);
 }
 
 static struct drm_i915_gem_request *to_signaler(struct rb_node *rb)
@@ -659,9 +646,13 @@ static int intel_breadcrumbs_signaler(void *arg)
 			request = i915_gem_request_get_rcu(request);
 		rcu_read_unlock();
 		if (signal_complete(request)) {
-			local_bh_disable();
-			dma_fence_signal(&request->fence);
-			local_bh_enable(); /* kick start the tasklets */
+			if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
+				      &request->fence.flags)) {
+				local_bh_disable();
+				dma_fence_signal(&request->fence);
+				GEM_BUG_ON(!i915_gem_request_completed(request));
+				local_bh_enable(); /* kick start the tasklets */
+			}
 
 			spin_lock_irq(&b->rb_lock);
 

From edb76b01ac1629bfe17158bea56fcc16bfb57854 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Tue, 13 Feb 2018 09:57:44 +0000
Subject: [PATCH 584/676] drm/i915: Lock out execlist tasklet while peeking
 inside for busy-stats

In order to prevent a race condition where we may end up overaccounting
the active state and leaving the busy-stats believing the GPU is 100%
busy, lock out the tasklet while we reconstruct the busy state. There is
no direct spinlock guard for the execlists->port[], so we need to
utilise tasklet_disable() as a synchronous barrier to prevent it, the
only writer to execlists->port[], from running at the same time as the
enable.

Fixes: 4900727d35bb ("drm/i915/pmu: Reconstruct active state on starting busy-stats")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180115092041.13509-1-chris@chris-wilson.co.uk
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
(cherry picked from commit 99e48bf98dd036090b480a12c39e8b971731247e)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180213095747.2424-1-tvrtko.ursulin@linux.intel.com
---
 drivers/gpu/drm/i915/intel_engine_cs.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index acc661aa9c0c4..fa960cfd2764f 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1945,16 +1945,22 @@ intel_engine_lookup_user(struct drm_i915_private *i915, u8 class, u8 instance)
  */
 int intel_enable_engine_stats(struct intel_engine_cs *engine)
 {
+	struct intel_engine_execlists *execlists = &engine->execlists;
 	unsigned long flags;
+	int err = 0;
 
 	if (!intel_engine_supports_stats(engine))
 		return -ENODEV;
 
+	tasklet_disable(&execlists->tasklet);
 	spin_lock_irqsave(&engine->stats.lock, flags);
-	if (engine->stats.enabled == ~0)
-		goto busy;
+
+	if (unlikely(engine->stats.enabled == ~0)) {
+		err = -EBUSY;
+		goto unlock;
+	}
+
 	if (engine->stats.enabled++ == 0) {
-		struct intel_engine_execlists *execlists = &engine->execlists;
 		const struct execlist_port *port = execlists->port;
 		unsigned int num_ports = execlists_num_ports(execlists);
 
@@ -1969,14 +1975,12 @@ int intel_enable_engine_stats(struct intel_engine_cs *engine)
 		if (engine->stats.active)
 			engine->stats.start = engine->stats.enabled_at;
 	}
-	spin_unlock_irqrestore(&engine->stats.lock, flags);
 
-	return 0;
-
-busy:
+unlock:
 	spin_unlock_irqrestore(&engine->stats.lock, flags);
+	tasklet_enable(&execlists->tasklet);
 
-	return -EBUSY;
+	return err;
 }
 
 static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine)

From d3f84c8b097001e3f31f584b793493cb0033a7ae Mon Sep 17 00:00:00 2001
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Date: Tue, 13 Feb 2018 09:57:45 +0000
Subject: [PATCH 585/676] drm/i915/pmu: Fix PMU enable vs execlists tasklet
 race

Commit 99e48bf98dd0 ("drm/i915: Lock out execlist tasklet while peeking
inside for busy-stats") added a tasklet_disable call in busy stats
enabling, but we failed to understand that the PMU enable callback runs
as an hard IRQ (IPI).

Consequence of this is that the PMU enable callback can interrupt the
execlists tasklet, and will then deadlock when it calls
intel_engine_stats_enable->tasklet_disable.

To fix this, I realized it is possible to move the engine stats enablement
and disablement to PMU event init and destroy hooks. This allows for much
simpler implementation since those hooks run in normal context (can
sleep).

v2: Extract engine_event_destroy. (Chris Wilson)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Fixes: 99e48bf98dd0 ("drm/i915: Lock out execlist tasklet while peeking inside for busy-stats")
Testcase: igt/perf_pmu/enable-race-*
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: intel-gfx@lists.freedesktop.org
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180205093448.13877-1-tvrtko.ursulin@linux.intel.com
(cherry picked from commit b2f78cda260bc6a1a2d382b1d85a29e69b5b3724)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180213095747.2424-2-tvrtko.ursulin@linux.intel.com
---
 drivers/gpu/drm/i915/i915_pmu.c         | 125 ++++++++++--------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |  14 ---
 2 files changed, 52 insertions(+), 87 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 55a8a1e294248..337eaa6ede527 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -285,26 +285,41 @@ static u64 count_interrupts(struct drm_i915_private *i915)
 	return sum;
 }
 
-static void i915_pmu_event_destroy(struct perf_event *event)
+static void engine_event_destroy(struct perf_event *event)
 {
-	WARN_ON(event->parent);
+	struct drm_i915_private *i915 =
+		container_of(event->pmu, typeof(*i915), pmu.base);
+	struct intel_engine_cs *engine;
+
+	engine = intel_engine_lookup_user(i915,
+					  engine_event_class(event),
+					  engine_event_instance(event));
+	if (WARN_ON_ONCE(!engine))
+		return;
+
+	if (engine_event_sample(event) == I915_SAMPLE_BUSY &&
+	    intel_engine_supports_stats(engine))
+		intel_disable_engine_stats(engine);
 }
 
-static int engine_event_init(struct perf_event *event)
+static void i915_pmu_event_destroy(struct perf_event *event)
 {
-	struct drm_i915_private *i915 =
-		container_of(event->pmu, typeof(*i915), pmu.base);
+	WARN_ON(event->parent);
 
-	if (!intel_engine_lookup_user(i915, engine_event_class(event),
-				      engine_event_instance(event)))
-		return -ENODEV;
+	if (is_engine_event(event))
+		engine_event_destroy(event);
+}
 
-	switch (engine_event_sample(event)) {
+static int
+engine_event_status(struct intel_engine_cs *engine,
+		    enum drm_i915_pmu_engine_sample sample)
+{
+	switch (sample) {
 	case I915_SAMPLE_BUSY:
 	case I915_SAMPLE_WAIT:
 		break;
 	case I915_SAMPLE_SEMA:
-		if (INTEL_GEN(i915) < 6)
+		if (INTEL_GEN(engine->i915) < 6)
 			return -ENODEV;
 		break;
 	default:
@@ -314,6 +329,30 @@ static int engine_event_init(struct perf_event *event)
 	return 0;
 }
 
+static int engine_event_init(struct perf_event *event)
+{
+	struct drm_i915_private *i915 =
+		container_of(event->pmu, typeof(*i915), pmu.base);
+	struct intel_engine_cs *engine;
+	u8 sample;
+	int ret;
+
+	engine = intel_engine_lookup_user(i915, engine_event_class(event),
+					  engine_event_instance(event));
+	if (!engine)
+		return -ENODEV;
+
+	sample = engine_event_sample(event);
+	ret = engine_event_status(engine, sample);
+	if (ret)
+		return ret;
+
+	if (sample == I915_SAMPLE_BUSY && intel_engine_supports_stats(engine))
+		ret = intel_enable_engine_stats(engine);
+
+	return ret;
+}
+
 static int i915_pmu_event_init(struct perf_event *event)
 {
 	struct drm_i915_private *i915 =
@@ -387,7 +426,7 @@ static u64 __i915_pmu_event_read(struct perf_event *event)
 		if (WARN_ON_ONCE(!engine)) {
 			/* Do nothing */
 		} else if (sample == I915_SAMPLE_BUSY &&
-			   engine->pmu.busy_stats) {
+			   intel_engine_supports_stats(engine)) {
 			val = ktime_to_ns(intel_engine_get_busy_time(engine));
 		} else {
 			val = engine->pmu.sample[sample].cur;
@@ -442,12 +481,6 @@ static void i915_pmu_event_read(struct perf_event *event)
 	local64_add(new - prev, &event->count);
 }
 
-static bool engine_needs_busy_stats(struct intel_engine_cs *engine)
-{
-	return intel_engine_supports_stats(engine) &&
-	       (engine->pmu.enable & BIT(I915_SAMPLE_BUSY));
-}
-
 static void i915_pmu_enable(struct perf_event *event)
 {
 	struct drm_i915_private *i915 =
@@ -487,21 +520,7 @@ static void i915_pmu_enable(struct perf_event *event)
 
 		GEM_BUG_ON(sample >= I915_PMU_SAMPLE_BITS);
 		GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0);
-		if (engine->pmu.enable_count[sample]++ == 0) {
-			/*
-			 * Enable engine busy stats tracking if needed or
-			 * alternatively cancel the scheduled disable.
-			 *
-			 * If the delayed disable was pending, cancel it and
-			 * in this case do not enable since it already is.
-			 */
-			if (engine_needs_busy_stats(engine) &&
-			    !engine->pmu.busy_stats) {
-				engine->pmu.busy_stats = true;
-				if (!cancel_delayed_work(&engine->pmu.disable_busy_stats))
-					intel_enable_engine_stats(engine);
-			}
-		}
+		engine->pmu.enable_count[sample]++;
 	}
 
 	/*
@@ -514,14 +533,6 @@ static void i915_pmu_enable(struct perf_event *event)
 	spin_unlock_irqrestore(&i915->pmu.lock, flags);
 }
 
-static void __disable_busy_stats(struct work_struct *work)
-{
-	struct intel_engine_cs *engine =
-	       container_of(work, typeof(*engine), pmu.disable_busy_stats.work);
-
-	intel_disable_engine_stats(engine);
-}
-
 static void i915_pmu_disable(struct perf_event *event)
 {
 	struct drm_i915_private *i915 =
@@ -545,26 +556,8 @@ static void i915_pmu_disable(struct perf_event *event)
 		 * Decrement the reference count and clear the enabled
 		 * bitmask when the last listener on an event goes away.
 		 */
-		if (--engine->pmu.enable_count[sample] == 0) {
+		if (--engine->pmu.enable_count[sample] == 0)
 			engine->pmu.enable &= ~BIT(sample);
-			if (!engine_needs_busy_stats(engine) &&
-			    engine->pmu.busy_stats) {
-				engine->pmu.busy_stats = false;
-				/*
-				 * We request a delayed disable to handle the
-				 * rapid on/off cycles on events, which can
-				 * happen when tools like perf stat start, in a
-				 * nicer way.
-				 *
-				 * In addition, this also helps with busy stats
-				 * accuracy with background CPU offline/online
-				 * migration events.
-				 */
-				queue_delayed_work(system_wq,
-						   &engine->pmu.disable_busy_stats,
-						   round_jiffies_up_relative(HZ));
-			}
-		}
 	}
 
 	GEM_BUG_ON(bit >= I915_PMU_MASK_BITS);
@@ -797,8 +790,6 @@ static void i915_pmu_unregister_cpuhp_state(struct drm_i915_private *i915)
 
 void i915_pmu_register(struct drm_i915_private *i915)
 {
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
 	int ret;
 
 	if (INTEL_GEN(i915) <= 2) {
@@ -820,10 +811,6 @@ void i915_pmu_register(struct drm_i915_private *i915)
 	hrtimer_init(&i915->pmu.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	i915->pmu.timer.function = i915_sample;
 
-	for_each_engine(engine, i915, id)
-		INIT_DELAYED_WORK(&engine->pmu.disable_busy_stats,
-				  __disable_busy_stats);
-
 	ret = perf_pmu_register(&i915->pmu.base, "i915", -1);
 	if (ret)
 		goto err;
@@ -843,9 +830,6 @@ void i915_pmu_register(struct drm_i915_private *i915)
 
 void i915_pmu_unregister(struct drm_i915_private *i915)
 {
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
-
 	if (!i915->pmu.base.event_init)
 		return;
 
@@ -853,11 +837,6 @@ void i915_pmu_unregister(struct drm_i915_private *i915)
 
 	hrtimer_cancel(&i915->pmu.timer);
 
-	for_each_engine(engine, i915, id) {
-		GEM_BUG_ON(engine->pmu.busy_stats);
-		flush_delayed_work(&engine->pmu.disable_busy_stats);
-	}
-
 	i915_pmu_unregister_cpuhp_state(i915);
 
 	perf_pmu_unregister(&i915->pmu.base);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index c5ff203e42d6a..a0e7a6c2a57cd 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -366,20 +366,6 @@ struct intel_engine_cs {
 		 */
 #define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_SEMA + 1)
 		struct i915_pmu_sample sample[I915_ENGINE_SAMPLE_MAX];
-		/**
-		 * @busy_stats: Has enablement of engine stats tracking been
-		 * 		requested.
-		 */
-		bool busy_stats;
-		/**
-		 * @disable_busy_stats: Work item for busy stats disabling.
-		 *
-		 * Same as with @enable_busy_stats action, with the difference
-		 * that we delay it in case there are rapid enable-disable
-		 * actions, which can happen during tool startup (like perf
-		 * stat).
-		 */
-		struct delayed_work disable_busy_stats;
 	} pmu;
 
 	/*

From 4c83f0a788ccf58864f781585d8ae7c7e6a7e07d Mon Sep 17 00:00:00 2001
From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Date: Tue, 13 Feb 2018 09:57:46 +0000
Subject: [PATCH 586/676] drm/i915/pmu: Fix sleep under atomic in RC6 readout

We are not allowed to call intel_runtime_pm_get from the PMU counter read
callback since the former can sleep, and the latter is running under IRQ
context.

To workaround this, we record the last known RC6 and while runtime
suspended estimate its increase by querying the runtime PM core
timestamps.

Downside of this approach is that we can temporarily lose a chunk of RC6
time, from the last PMU read-out to runtime suspend entry, but that will
eventually catch up, once device comes back online and in the presence of
PMU queries.

Also, we have to be careful not to overshoot the RC6 estimate, so once
resumed after a period of approximation, we only update the counter once
it catches up. With the observation that RC6 is increasing while the
device is suspended, this should not pose a problem and can only cause
slight inaccuracies due clock base differences.

v2: Simplify by estimating on top of PM core counters. (Imre)

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104943
Fixes: 6060b6aec03c ("drm/i915/pmu: Add RC6 residency metrics")
Testcase: igt/perf_pmu/rc6-runtime-pm
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Imre Deak <imre.deak@intel.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: David Airlie <airlied@linux.ie>
Cc: intel-gfx@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20180206183311.17924-1-tvrtko.ursulin@linux.intel.com
(cherry picked from commit 1fe699e30113ed6f6e853ff44710d256072ea627)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180213095747.2424-3-tvrtko.ursulin@linux.intel.com
---
 drivers/gpu/drm/i915/i915_pmu.c | 93 +++++++++++++++++++++++++++------
 drivers/gpu/drm/i915/i915_pmu.h |  6 +++
 2 files changed, 84 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 337eaa6ede527..e13859aaa2a31 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -409,7 +409,81 @@ static int i915_pmu_event_init(struct perf_event *event)
 	return 0;
 }
 
-static u64 __i915_pmu_event_read(struct perf_event *event)
+static u64 get_rc6(struct drm_i915_private *i915, bool locked)
+{
+	unsigned long flags;
+	u64 val;
+
+	if (intel_runtime_pm_get_if_in_use(i915)) {
+		val = intel_rc6_residency_ns(i915, IS_VALLEYVIEW(i915) ?
+						   VLV_GT_RENDER_RC6 :
+						   GEN6_GT_GFX_RC6);
+
+		if (HAS_RC6p(i915))
+			val += intel_rc6_residency_ns(i915, GEN6_GT_GFX_RC6p);
+
+		if (HAS_RC6pp(i915))
+			val += intel_rc6_residency_ns(i915, GEN6_GT_GFX_RC6pp);
+
+		intel_runtime_pm_put(i915);
+
+		/*
+		 * If we are coming back from being runtime suspended we must
+		 * be careful not to report a larger value than returned
+		 * previously.
+		 */
+
+		if (!locked)
+			spin_lock_irqsave(&i915->pmu.lock, flags);
+
+		if (val >= i915->pmu.sample[__I915_SAMPLE_RC6_ESTIMATED].cur) {
+			i915->pmu.sample[__I915_SAMPLE_RC6_ESTIMATED].cur = 0;
+			i915->pmu.sample[__I915_SAMPLE_RC6].cur = val;
+		} else {
+			val = i915->pmu.sample[__I915_SAMPLE_RC6_ESTIMATED].cur;
+		}
+
+		if (!locked)
+			spin_unlock_irqrestore(&i915->pmu.lock, flags);
+	} else {
+		struct pci_dev *pdev = i915->drm.pdev;
+		struct device *kdev = &pdev->dev;
+		unsigned long flags2;
+
+		/*
+		 * We are runtime suspended.
+		 *
+		 * Report the delta from when the device was suspended to now,
+		 * on top of the last known real value, as the approximated RC6
+		 * counter value.
+		 */
+		if (!locked)
+			spin_lock_irqsave(&i915->pmu.lock, flags);
+
+		spin_lock_irqsave(&kdev->power.lock, flags2);
+
+		if (!i915->pmu.sample[__I915_SAMPLE_RC6_ESTIMATED].cur)
+			i915->pmu.suspended_jiffies_last =
+						kdev->power.suspended_jiffies;
+
+		val = kdev->power.suspended_jiffies -
+		      i915->pmu.suspended_jiffies_last;
+		val += jiffies - kdev->power.accounting_timestamp;
+
+		spin_unlock_irqrestore(&kdev->power.lock, flags2);
+
+		val = jiffies_to_nsecs(val);
+		val += i915->pmu.sample[__I915_SAMPLE_RC6].cur;
+		i915->pmu.sample[__I915_SAMPLE_RC6_ESTIMATED].cur = val;
+
+		if (!locked)
+			spin_unlock_irqrestore(&i915->pmu.lock, flags);
+	}
+
+	return val;
+}
+
+static u64 __i915_pmu_event_read(struct perf_event *event, bool locked)
 {
 	struct drm_i915_private *i915 =
 		container_of(event->pmu, typeof(*i915), pmu.base);
@@ -447,18 +521,7 @@ static u64 __i915_pmu_event_read(struct perf_event *event)
 			val = count_interrupts(i915);
 			break;
 		case I915_PMU_RC6_RESIDENCY:
-			intel_runtime_pm_get(i915);
-			val = intel_rc6_residency_ns(i915,
-						     IS_VALLEYVIEW(i915) ?
-						     VLV_GT_RENDER_RC6 :
-						     GEN6_GT_GFX_RC6);
-			if (HAS_RC6p(i915))
-				val += intel_rc6_residency_ns(i915,
-							      GEN6_GT_GFX_RC6p);
-			if (HAS_RC6pp(i915))
-				val += intel_rc6_residency_ns(i915,
-							      GEN6_GT_GFX_RC6pp);
-			intel_runtime_pm_put(i915);
+			val = get_rc6(i915, locked);
 			break;
 		}
 	}
@@ -473,7 +536,7 @@ static void i915_pmu_event_read(struct perf_event *event)
 
 again:
 	prev = local64_read(&hwc->prev_count);
-	new = __i915_pmu_event_read(event);
+	new = __i915_pmu_event_read(event, false);
 
 	if (local64_cmpxchg(&hwc->prev_count, prev, new) != prev)
 		goto again;
@@ -528,7 +591,7 @@ static void i915_pmu_enable(struct perf_event *event)
 	 * for all listeners. Even when the event was already enabled and has
 	 * an existing non-zero value.
 	 */
-	local64_set(&event->hw.prev_count, __i915_pmu_event_read(event));
+	local64_set(&event->hw.prev_count, __i915_pmu_event_read(event, true));
 
 	spin_unlock_irqrestore(&i915->pmu.lock, flags);
 }
diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
index 40c154d13565a..bb62df15afa4f 100644
--- a/drivers/gpu/drm/i915/i915_pmu.h
+++ b/drivers/gpu/drm/i915/i915_pmu.h
@@ -27,6 +27,8 @@
 enum {
 	__I915_SAMPLE_FREQ_ACT = 0,
 	__I915_SAMPLE_FREQ_REQ,
+	__I915_SAMPLE_RC6,
+	__I915_SAMPLE_RC6_ESTIMATED,
 	__I915_NUM_PMU_SAMPLERS
 };
 
@@ -94,6 +96,10 @@ struct i915_pmu {
 	 * struct intel_engine_cs.
 	 */
 	struct i915_pmu_sample sample[__I915_NUM_PMU_SAMPLERS];
+	/**
+	 * @suspended_jiffies_last: Cached suspend time from PM core.
+	 */
+	unsigned long suspended_jiffies_last;
 };
 
 #ifdef CONFIG_PERF_EVENTS

From 4b8b41d15d9db54703958fbd2928a2fd319563f6 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Tue, 13 Feb 2018 09:57:47 +0000
Subject: [PATCH 587/676] drm/i915/pmu: Fix building without CONFIG_PM

As we peek inside struct device to query members guarded by CONFIG_PM,
so must be the code.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Fixes: 1fe699e30113 ("drm/i915/pmu: Fix sleep under atomic in RC6 readout")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180207160428.17015-1-chris@chris-wilson.co.uk
(cherry picked from commit 05273c950a3c93c5f96be8807eaf24f2cc9f1c1e)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180213095747.2424-4-tvrtko.ursulin@linux.intel.com
---
 drivers/gpu/drm/i915/i915_pmu.c | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index e13859aaa2a31..0e9b98c32b62b 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -409,22 +409,32 @@ static int i915_pmu_event_init(struct perf_event *event)
 	return 0;
 }
 
-static u64 get_rc6(struct drm_i915_private *i915, bool locked)
+static u64 __get_rc6(struct drm_i915_private *i915)
 {
-	unsigned long flags;
 	u64 val;
 
-	if (intel_runtime_pm_get_if_in_use(i915)) {
-		val = intel_rc6_residency_ns(i915, IS_VALLEYVIEW(i915) ?
-						   VLV_GT_RENDER_RC6 :
-						   GEN6_GT_GFX_RC6);
+	val = intel_rc6_residency_ns(i915,
+				     IS_VALLEYVIEW(i915) ?
+				     VLV_GT_RENDER_RC6 :
+				     GEN6_GT_GFX_RC6);
 
-		if (HAS_RC6p(i915))
-			val += intel_rc6_residency_ns(i915, GEN6_GT_GFX_RC6p);
+	if (HAS_RC6p(i915))
+		val += intel_rc6_residency_ns(i915, GEN6_GT_GFX_RC6p);
+
+	if (HAS_RC6pp(i915))
+		val += intel_rc6_residency_ns(i915, GEN6_GT_GFX_RC6pp);
+
+	return val;
+}
 
-		if (HAS_RC6pp(i915))
-			val += intel_rc6_residency_ns(i915, GEN6_GT_GFX_RC6pp);
+static u64 get_rc6(struct drm_i915_private *i915, bool locked)
+{
+#if IS_ENABLED(CONFIG_PM)
+	unsigned long flags;
+	u64 val;
 
+	if (intel_runtime_pm_get_if_in_use(i915)) {
+		val = __get_rc6(i915);
 		intel_runtime_pm_put(i915);
 
 		/*
@@ -481,6 +491,9 @@ static u64 get_rc6(struct drm_i915_private *i915, bool locked)
 	}
 
 	return val;
+#else
+	return __get_rc6(i915);
+#endif
 }
 
 static u64 __i915_pmu_event_read(struct perf_event *event, bool locked)

From 37ad4e68783088ed61493f54194cfccd3c87ab35 Mon Sep 17 00:00:00 2001
From: Weinan Li <weinan.z.li@intel.com>
Date: Fri, 9 Feb 2018 16:01:34 +0800
Subject: [PATCH 588/676] drm/i915/gvt: add 0xe4f0 into gen9 render list

Guest may set this register on KBL platform, it can impact hardware
behavior, so add it into the gen9 render list. Otherwise gpu hang issue may
happen during different vgpu switch.

v2: separate it from patch set.

Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Signed-off-by: Weinan Li <weinan.z.li@intel.com>
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
---
 drivers/gpu/drm/i915/gvt/mmio_context.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/i915/gvt/mmio_context.c b/drivers/gpu/drm/i915/gvt/mmio_context.c
index 73ad6e90e49db..256f1bb522b7a 100644
--- a/drivers/gpu/drm/i915/gvt/mmio_context.c
+++ b/drivers/gpu/drm/i915/gvt/mmio_context.c
@@ -118,6 +118,7 @@ static struct engine_mmio gen9_engine_mmio_list[] __cacheline_aligned = {
 	{RCS, HALF_SLICE_CHICKEN3, 0xffff, true}, /* 0xe184 */
 	{RCS, GEN9_HALF_SLICE_CHICKEN5, 0xffff, true}, /* 0xe188 */
 	{RCS, GEN9_HALF_SLICE_CHICKEN7, 0xffff, true}, /* 0xe194 */
+	{RCS, GEN8_ROW_CHICKEN, 0xffff, true}, /* 0xe4f0 */
 	{RCS, TRVATTL3PTRDW(0), 0, false}, /* 0x4de0 */
 	{RCS, TRVATTL3PTRDW(1), 0, false}, /* 0x4de4 */
 	{RCS, TRNULLDETCT, 0, false}, /* 0x4de8 */

From a26ca6ad4c4aa4afcbfe4c46c33ad98859736245 Mon Sep 17 00:00:00 2001
From: Tina Zhang <tina.zhang@intel.com>
Date: Sun, 11 Feb 2018 14:59:19 +0800
Subject: [PATCH 589/676] drm/i915/gvt: Support BAR0 8-byte reads/writes

GGTT is in BAR0 with 8 bytes aligned. With a qemu patch (commit:
38d49e8c1523d97d2191190d3f7b4ce7a0ab5aa3), VFIO can use 8-byte reads/
writes to access it.

This patch is to support the 8-byte GGTT reads/writes.

Ideally, we would like to support 8-byte reads/writes for the total BAR0.
But it needs more work for handling 8-byte MMIO reads/writes.

This patch can fix the issue caused by partial updating GGTT entry, during
guest booting up.

v3:
- Use intel_vgpu_get_bar_gpa() stead. (Zhenyu)
- Include all the GGTT checking logic in gtt_entry(). (Zhenyu)

v2:
- Limit to GGTT entry. (Zhenyu)

Signed-off-by: Tina Zhang <tina.zhang@intel.com>
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
---
 drivers/gpu/drm/i915/gvt/kvmgt.c | 51 ++++++++++++++++++++++++++++++--
 1 file changed, 49 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index 909499b73d03a..021f722e24816 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -733,6 +733,25 @@ static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char *buf,
 	return ret == 0 ? count : ret;
 }
 
+static bool gtt_entry(struct mdev_device *mdev, loff_t *ppos)
+{
+	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
+	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+	struct intel_gvt *gvt = vgpu->gvt;
+	int offset;
+
+	/* Only allow MMIO GGTT entry access */
+	if (index != PCI_BASE_ADDRESS_0)
+		return false;
+
+	offset = (u64)(*ppos & VFIO_PCI_OFFSET_MASK) -
+		intel_vgpu_get_bar_gpa(vgpu, PCI_BASE_ADDRESS_0);
+
+	return (offset >= gvt->device_info.gtt_start_offset &&
+		offset < gvt->device_info.gtt_start_offset + gvt_ggtt_sz(gvt)) ?
+			true : false;
+}
+
 static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf,
 			size_t count, loff_t *ppos)
 {
@@ -742,7 +761,21 @@ static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf,
 	while (count) {
 		size_t filled;
 
-		if (count >= 4 && !(*ppos % 4)) {
+		/* Only support GGTT entry 8 bytes read */
+		if (count >= 8 && !(*ppos % 8) &&
+			gtt_entry(mdev, ppos)) {
+			u64 val;
+
+			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
+					ppos, false);
+			if (ret <= 0)
+				goto read_err;
+
+			if (copy_to_user(buf, &val, sizeof(val)))
+				goto read_err;
+
+			filled = 8;
+		} else if (count >= 4 && !(*ppos % 4)) {
 			u32 val;
 
 			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
@@ -802,7 +835,21 @@ static ssize_t intel_vgpu_write(struct mdev_device *mdev,
 	while (count) {
 		size_t filled;
 
-		if (count >= 4 && !(*ppos % 4)) {
+		/* Only support GGTT entry 8 bytes write */
+		if (count >= 8 && !(*ppos % 8) &&
+			gtt_entry(mdev, ppos)) {
+			u64 val;
+
+			if (copy_from_user(&val, buf, sizeof(val)))
+				goto write_err;
+
+			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
+					ppos, true);
+			if (ret <= 0)
+				goto write_err;
+
+			filled = 8;
+		} else if (count >= 4 && !(*ppos % 4)) {
 			u32 val;
 
 			if (copy_from_user(&val, buf, sizeof(val)))

From 3cc7644e4af179e79153b1fd60f9dd937ee32684 Mon Sep 17 00:00:00 2001
From: Weinan Li <weinan.z.li@intel.com>
Date: Mon, 12 Feb 2018 15:28:42 +0800
Subject: [PATCH 590/676] drm/i915/gvt: fix one typo of render_mmio trace

Fix one typo of render_mmio trace, exchange the mmio value of old and new.

Signed-off-by: Weinan Li <weinan.z.li@intel.com>
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
---
 drivers/gpu/drm/i915/gvt/trace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gvt/trace.h b/drivers/gpu/drm/i915/gvt/trace.h
index 7a2511538f340..736bd2bc5127f 100644
--- a/drivers/gpu/drm/i915/gvt/trace.h
+++ b/drivers/gpu/drm/i915/gvt/trace.h
@@ -333,7 +333,7 @@ TRACE_EVENT(render_mmio,
 	TP_PROTO(int old_id, int new_id, char *action, unsigned int reg,
 		 unsigned int old_val, unsigned int new_val),
 
-	TP_ARGS(old_id, new_id, action, reg, new_val, old_val),
+	TP_ARGS(old_id, new_id, action, reg, old_val, new_val),
 
 	TP_STRUCT__entry(
 		__field(int, old_id)

From d15d662e89fc667b90cd294b0eb45694e33144da Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Mon, 12 Feb 2018 15:20:51 +0100
Subject: [PATCH 591/676] ALSA: seq: Fix racy pool initializations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ALSA sequencer core initializes the event pool on demand by invoking
snd_seq_pool_init() when the first write happens and the pool is
empty.  Meanwhile user can reset the pool size manually via ioctl
concurrently, and this may lead to UAF or out-of-bound accesses since
the function tries to vmalloc / vfree the buffer.

A simple fix is to just wrap the snd_seq_pool_init() call with the
recently introduced client->ioctl_mutex; as the calls for
snd_seq_pool_init() from other side are always protected with this
mutex, we can avoid the race.

Reported-by: 范龙飞 <long7573@126.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/core/seq/seq_clientmgr.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c
index 60db32785f622..04d4db44fae5c 100644
--- a/sound/core/seq/seq_clientmgr.c
+++ b/sound/core/seq/seq_clientmgr.c
@@ -1003,7 +1003,7 @@ static ssize_t snd_seq_write(struct file *file, const char __user *buf,
 {
 	struct snd_seq_client *client = file->private_data;
 	int written = 0, len;
-	int err = -EINVAL;
+	int err;
 	struct snd_seq_event event;
 
 	if (!(snd_seq_file_flags(file) & SNDRV_SEQ_LFLG_OUTPUT))
@@ -1018,11 +1018,15 @@ static ssize_t snd_seq_write(struct file *file, const char __user *buf,
 
 	/* allocate the pool now if the pool is not allocated yet */ 
 	if (client->pool->size > 0 && !snd_seq_write_pool_allocated(client)) {
-		if (snd_seq_pool_init(client->pool) < 0)
+		mutex_lock(&client->ioctl_mutex);
+		err = snd_seq_pool_init(client->pool);
+		mutex_unlock(&client->ioctl_mutex);
+		if (err < 0)
 			return -ENOMEM;
 	}
 
 	/* only process whole events */
+	err = -EINVAL;
 	while (count >= sizeof(struct snd_seq_event)) {
 		/* Read in the event header from the user */
 		len = sizeof(event);

From fe0e58048f005fdce315eb4d185e5c160be4ac01 Mon Sep 17 00:00:00 2001
From: Jerome Brunet <jbrunet@baylibre.com>
Date: Mon, 12 Feb 2018 14:13:59 +0100
Subject: [PATCH 592/676] Revert "mmc: meson-gx: include tx phase in the tuning
 process"

This reverts commit 0a44697627d17a66d7dc98f17aeca07ca79c5c20.

This commit was initially intended to fix problems with hs200 and hs400
on some boards, mainly the odroid-c2. The OC2 (Rev 0.2) I have performs
well in this modes, so I could not confirm these issues.

We've had several reports about the issues being still present on (some)
OC2, so apparently, this change does not do what it was supposed to do.
Maybe the eMMC signal quality is on the edge on the board. This may
explain the variability we see in term of stability, but this is just a
guess. Lowering the max_frequency to 100Mhz seems to do trick for those
affected by the issue

Worse, the commit created new issues (CRC errors and hangs) on other
boards, such as the kvim 1 and 2, the p200 or the libretech-cc.

According to amlogic, the Tx phase should not be tuned and left in its
default configuration, so it is best to just revert the commit.

Fixes: 0a44697627d1 ("mmc: meson-gx: include tx phase in the tuning process")
Cc: <stable@vger.kernel.org> # 4.14+
Signed-off-by: Jerome Brunet <jbrunet@baylibre.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/meson-gx-mmc.c | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c
index 22438ebfe4e62..4f972b879fe6f 100644
--- a/drivers/mmc/host/meson-gx-mmc.c
+++ b/drivers/mmc/host/meson-gx-mmc.c
@@ -717,22 +717,6 @@ static int meson_mmc_clk_phase_tuning(struct mmc_host *mmc, u32 opcode,
 static int meson_mmc_execute_tuning(struct mmc_host *mmc, u32 opcode)
 {
 	struct meson_host *host = mmc_priv(mmc);
-	int ret;
-
-	/*
-	 * If this is the initial tuning, try to get a sane Rx starting
-	 * phase before doing the actual tuning.
-	 */
-	if (!mmc->doing_retune) {
-		ret = meson_mmc_clk_phase_tuning(mmc, opcode, host->rx_clk);
-
-		if (ret)
-			return ret;
-	}
-
-	ret = meson_mmc_clk_phase_tuning(mmc, opcode, host->tx_clk);
-	if (ret)
-		return ret;
 
 	return meson_mmc_clk_phase_tuning(mmc, opcode, host->rx_clk);
 }
@@ -763,9 +747,8 @@ static void meson_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
 		if (!IS_ERR(mmc->supply.vmmc))
 			mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, ios->vdd);
 
-		/* Reset phases */
+		/* Reset rx phase */
 		clk_set_phase(host->rx_clk, 0);
-		clk_set_phase(host->tx_clk, 270);
 
 		break;
 

From 118032be389009b07ecb5a03ffe219a89d421def Mon Sep 17 00:00:00 2001
From: Phil Elwell <phil@raspberrypi.org>
Date: Mon, 12 Feb 2018 21:13:44 +0100
Subject: [PATCH 593/676] mmc: bcm2835: Don't overwrite max frequency
 unconditionally

The optional DT parameter max-frequency could init the max bus frequency.
So take care of this, before setting the max bus frequency.

Fixes: 660fc733bd74 ("mmc: bcm2835: Add new driver for the sdhost controller.")
Signed-off-by: Phil Elwell <phil@raspberrypi.org>
Signed-off-by: Stefan Wahren <stefan.wahren@i2se.com>
Cc: <stable@vger.kernel.org> # 4.12+
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/host/bcm2835.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/mmc/host/bcm2835.c b/drivers/mmc/host/bcm2835.c
index 229dc18f0581b..768972af8b853 100644
--- a/drivers/mmc/host/bcm2835.c
+++ b/drivers/mmc/host/bcm2835.c
@@ -1265,7 +1265,8 @@ static int bcm2835_add_host(struct bcm2835_host *host)
 	char pio_limit_string[20];
 	int ret;
 
-	mmc->f_max = host->max_clk;
+	if (!mmc->f_max || mmc->f_max > host->max_clk)
+		mmc->f_max = host->max_clk;
 	mmc->f_min = host->max_clk / SDCDIV_MAX_CDIV;
 
 	mmc->max_busy_timeout = ~0 / (mmc->f_max / 1000);

From fdcc968a3b290407bcba9d4c90e2fba6d8d928f1 Mon Sep 17 00:00:00 2001
From: Jan-Marek Glogowski <glogow@fbihome.de>
Date: Wed, 14 Feb 2018 11:29:15 +0100
Subject: [PATCH 594/676] ALSA: hda/realtek: PCI quirk for Fujitsu U7x7

These laptops have a combined jack to attach headsets, the U727 on
the left, the U757 on the right, but a headsets microphone doesn't
work. Using hdajacksensetest I found that pin 0x19 changed the
present state when plugging the headset, in addition to 0x21, but
didn't have the correct configuration (shown as "Not connected").

So this sets the configuration to the same values as the headphone
pin 0x21 except for the device type microphone, which makes it
work correctly. With the patch the configured pins for U727 are

Pin 0x12 (Internal Mic, Mobile-In): present = No
Pin 0x14 (Internal Speaker): present = No
Pin 0x19 (Black Mic, Left side): present = No
Pin 0x1d (Internal Aux): present = No
Pin 0x21 (Black Headphone, Left side): present = No

Signed-off-by: Jan-Marek Glogowski <glogow@fbihome.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/patch_realtek.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 32938ca8e5e36..ce28f7ce64e63 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -3465,6 +3465,19 @@ static void alc269_fixup_pincfg_no_hp_to_lineout(struct hda_codec *codec,
 		spec->parse_flags = HDA_PINCFG_NO_HP_FIXUP;
 }
 
+static void alc269_fixup_pincfg_U7x7_headset_mic(struct hda_codec *codec,
+						 const struct hda_fixup *fix,
+						 int action)
+{
+	unsigned int cfg_headphone = snd_hda_codec_get_pincfg(codec, 0x21);
+	unsigned int cfg_headset_mic = snd_hda_codec_get_pincfg(codec, 0x19);
+
+	if (cfg_headphone && cfg_headset_mic == 0x411111f0)
+		snd_hda_codec_set_pincfg(codec, 0x19,
+			(cfg_headphone & ~AC_DEFCFG_DEVICE) |
+			(AC_JACK_MIC_IN << AC_DEFCFG_DEVICE_SHIFT));
+}
+
 static void alc269_fixup_hweq(struct hda_codec *codec,
 			       const struct hda_fixup *fix, int action)
 {
@@ -5373,6 +5386,7 @@ enum {
 	ALC269_FIXUP_LIFEBOOK_EXTMIC,
 	ALC269_FIXUP_LIFEBOOK_HP_PIN,
 	ALC269_FIXUP_LIFEBOOK_NO_HP_TO_LINEOUT,
+	ALC255_FIXUP_LIFEBOOK_U7x7_HEADSET_MIC,
 	ALC269_FIXUP_AMIC,
 	ALC269_FIXUP_DMIC,
 	ALC269VB_FIXUP_AMIC,
@@ -5579,6 +5593,10 @@ static const struct hda_fixup alc269_fixups[] = {
 		.type = HDA_FIXUP_FUNC,
 		.v.func = alc269_fixup_pincfg_no_hp_to_lineout,
 	},
+	[ALC255_FIXUP_LIFEBOOK_U7x7_HEADSET_MIC] = {
+		.type = HDA_FIXUP_FUNC,
+		.v.func = alc269_fixup_pincfg_U7x7_headset_mic,
+	},
 	[ALC269_FIXUP_AMIC] = {
 		.type = HDA_FIXUP_PINS,
 		.v.pins = (const struct hda_pintbl[]) {
@@ -6453,6 +6471,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
 	SND_PCI_QUIRK(0x10cf, 0x159f, "Lifebook E780", ALC269_FIXUP_LIFEBOOK_NO_HP_TO_LINEOUT),
 	SND_PCI_QUIRK(0x10cf, 0x15dc, "Lifebook T731", ALC269_FIXUP_LIFEBOOK_HP_PIN),
 	SND_PCI_QUIRK(0x10cf, 0x1757, "Lifebook E752", ALC269_FIXUP_LIFEBOOK_HP_PIN),
+	SND_PCI_QUIRK(0x10cf, 0x1629, "Lifebook U7x7", ALC255_FIXUP_LIFEBOOK_U7x7_HEADSET_MIC),
 	SND_PCI_QUIRK(0x10cf, 0x1845, "Lifebook U904", ALC269_FIXUP_LIFEBOOK_EXTMIC),
 	SND_PCI_QUIRK(0x10ec, 0x10f2, "Intel Reference board", ALC700_FIXUP_INTEL_REFERENCE),
 	SND_PCI_QUIRK(0x144d, 0xc109, "Samsung Ativ book 9 (NP900X3G)", ALC269_FIXUP_INV_DMIC),

From fa08a3b4eba59429cf7e241a7af089103e79160f Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Mon, 18 Dec 2017 17:21:23 +0100
Subject: [PATCH 595/676] virtio/s390: implement PM operations for virtio_ccw

Suspend/Resume to/from disk currently fails. Let us wire
up the necessary callbacks. This is mostly just forwarding
the requests to the virtio drivers. The only thing that
has to be done in virtio_ccw itself is to re-set the
virtio revision.

Suggested-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Message-Id: <20171207141102.70190-2-borntraeger@de.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
[CH: merged <20171218083706.223836-1-borntraeger@de.ibm.com> to fix
!CONFIG_PM configs]
Signed-off-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/s390/virtio/virtio_ccw.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
index ba2e0856d22cd..8f5c1d7f751ae 100644
--- a/drivers/s390/virtio/virtio_ccw.c
+++ b/drivers/s390/virtio/virtio_ccw.c
@@ -1297,6 +1297,9 @@ static int virtio_ccw_cio_notify(struct ccw_device *cdev, int event)
 		vcdev->device_lost = true;
 		rc = NOTIFY_DONE;
 		break;
+	case CIO_OPER:
+		rc = NOTIFY_OK;
+		break;
 	default:
 		rc = NOTIFY_DONE;
 		break;
@@ -1309,6 +1312,27 @@ static struct ccw_device_id virtio_ids[] = {
 	{},
 };
 
+#ifdef CONFIG_PM_SLEEP
+static int virtio_ccw_freeze(struct ccw_device *cdev)
+{
+	struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev);
+
+	return virtio_device_freeze(&vcdev->vdev);
+}
+
+static int virtio_ccw_restore(struct ccw_device *cdev)
+{
+	struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev);
+	int ret;
+
+	ret = virtio_ccw_set_transport_rev(vcdev);
+	if (ret)
+		return ret;
+
+	return virtio_device_restore(&vcdev->vdev);
+}
+#endif
+
 static struct ccw_driver virtio_ccw_driver = {
 	.driver = {
 		.owner = THIS_MODULE,
@@ -1321,6 +1345,11 @@ static struct ccw_driver virtio_ccw_driver = {
 	.set_online = virtio_ccw_online,
 	.notify = virtio_ccw_cio_notify,
 	.int_class = IRQIO_VIR,
+#ifdef CONFIG_PM_SLEEP
+	.freeze = virtio_ccw_freeze,
+	.thaw = virtio_ccw_restore,
+	.restore = virtio_ccw_restore,
+#endif
 };
 
 static int __init pure_hex(char **cp, unsigned int *val, int min_digit,

From 7756f72ccd4359c6df61fc431cd3b5b0a8639837 Mon Sep 17 00:00:00 2001
From: Israel Rukshin <israelr@mellanox.com>
Date: Tue, 30 Jan 2018 10:07:01 +0000
Subject: [PATCH 596/676] nvmet: Change return code of discard command if not
 supported

Execute discard command on block device that doesn't support it
should return success.
Returning internal error while using multi-path fails the path.

Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/target/io-cmd.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
index 0a4372a016f21..28bbdff4a88ba 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd.c
@@ -105,10 +105,13 @@ static void nvmet_execute_flush(struct nvmet_req *req)
 static u16 nvmet_discard_range(struct nvmet_ns *ns,
 		struct nvme_dsm_range *range, struct bio **bio)
 {
-	if (__blkdev_issue_discard(ns->bdev,
+	int ret;
+
+	ret = __blkdev_issue_discard(ns->bdev,
 			le64_to_cpu(range->slba) << (ns->blksize_shift - 9),
 			le32_to_cpu(range->nlb) << (ns->blksize_shift - 9),
-			GFP_KERNEL, 0, bio))
+			GFP_KERNEL, 0, bio);
+	if (ret && ret != -EOPNOTSUPP)
 		return NVME_SC_INTERNAL | NVME_SC_DNR;
 	return 0;
 }

From 8000d1fdb07e365e6565c2415aefdfed15413794 Mon Sep 17 00:00:00 2001
From: Nitzan Carmi <nitzanc@mellanox.com>
Date: Wed, 17 Jan 2018 11:01:14 +0000
Subject: [PATCH 597/676] nvme-rdma: fix sysfs invoked reset_ctrl error flow

When reset_controller that is invoked by sysfs fails,
it enters an error flow which practically removes the
nvme ctrl entirely (similar to delete_ctrl flow). It
causes the system to hang, since a sysfs attribute cannot
be unregistered by one of its own methods.

This can be fixed by calling delete_ctrl as a work rather
than sequential code. In addition, it should give the ctrl
a chance to recover using reconnection mechanism (consistant
with FC reset_ctrl error flow). Also, while we're here, return
suitable errno in case the reset ended with non live ctrl.

Signed-off-by: Nitzan Carmi <nitzanc@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/core.c | 6 +++++-
 drivers/nvme/host/rdma.c | 7 ++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 52b3626fb64ed..0fe7ea35c2217 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -120,8 +120,12 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
 	int ret;
 
 	ret = nvme_reset_ctrl(ctrl);
-	if (!ret)
+	if (!ret) {
 		flush_work(&ctrl->reset_work);
+		if (ctrl->state != NVME_CTRL_LIVE)
+			ret = -ENETRESET;
+	}
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 5e2cc4f0d207f..3a51ed50eff24 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1784,11 +1784,8 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
 	return;
 
 out_fail:
-	dev_warn(ctrl->ctrl.device, "Removing after reset failure\n");
-	nvme_remove_namespaces(&ctrl->ctrl);
-	nvme_rdma_shutdown_ctrl(ctrl, true);
-	nvme_uninit_ctrl(&ctrl->ctrl);
-	nvme_put_ctrl(&ctrl->ctrl);
+	++ctrl->ctrl.nr_reconnects;
+	nvme_rdma_reconnect_or_remove(ctrl);
 }
 
 static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {

From 2ce77f6d8a9ae9ce6d80397d88bdceb84a2004cd Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 13 Feb 2018 13:14:09 +0000
Subject: [PATCH 598/676] arm64: proc: Set PTE_NG for table entries to avoid
 traversing them twice

When KASAN is enabled, the swapper page table contains many identical
mappings of the zero page, which can lead to a stall during boot whilst
the G -> nG code continually walks the same page table entries looking
for global mappings.

This patch sets the nG bit (bit 11, which is IGNORED) in table entries
after processing the subtree so we can easily skip them if we see them
a second time.

Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/mm/proc.S | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 71baed7e592a4..c0af476172998 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -205,7 +205,8 @@ ENDPROC(idmap_cpu_replace_ttbr1)
 	dc	cvac, cur_\()\type\()p		// Ensure any existing dirty
 	dmb	sy				// lines are written back before
 	ldr	\type, [cur_\()\type\()p]	// loading the entry
-	tbz	\type, #0, next_\()\type	// Skip invalid entries
+	tbz	\type, #0, skip_\()\type	// Skip invalid and
+	tbnz	\type, #11, skip_\()\type	// non-global entries
 	.endm
 
 	.macro __idmap_kpti_put_pgtable_ent_ng, type
@@ -265,8 +266,9 @@ ENTRY(idmap_kpti_install_ng_mappings)
 	add	end_pgdp, cur_pgdp, #(PTRS_PER_PGD * 8)
 do_pgd:	__idmap_kpti_get_pgtable_ent	pgd
 	tbnz	pgd, #1, walk_puds
-	__idmap_kpti_put_pgtable_ent_ng	pgd
 next_pgd:
+	__idmap_kpti_put_pgtable_ent_ng	pgd
+skip_pgd:
 	add	cur_pgdp, cur_pgdp, #8
 	cmp	cur_pgdp, end_pgdp
 	b.ne	do_pgd
@@ -294,8 +296,9 @@ walk_puds:
 	add	end_pudp, cur_pudp, #(PTRS_PER_PUD * 8)
 do_pud:	__idmap_kpti_get_pgtable_ent	pud
 	tbnz	pud, #1, walk_pmds
-	__idmap_kpti_put_pgtable_ent_ng	pud
 next_pud:
+	__idmap_kpti_put_pgtable_ent_ng	pud
+skip_pud:
 	add	cur_pudp, cur_pudp, 8
 	cmp	cur_pudp, end_pudp
 	b.ne	do_pud
@@ -314,8 +317,9 @@ walk_pmds:
 	add	end_pmdp, cur_pmdp, #(PTRS_PER_PMD * 8)
 do_pmd:	__idmap_kpti_get_pgtable_ent	pmd
 	tbnz	pmd, #1, walk_ptes
-	__idmap_kpti_put_pgtable_ent_ng	pmd
 next_pmd:
+	__idmap_kpti_put_pgtable_ent_ng	pmd
+skip_pmd:
 	add	cur_pmdp, cur_pmdp, #8
 	cmp	cur_pmdp, end_pmdp
 	b.ne	do_pmd
@@ -333,7 +337,7 @@ walk_ptes:
 	add	end_ptep, cur_ptep, #(PTRS_PER_PTE * 8)
 do_pte:	__idmap_kpti_get_pgtable_ent	pte
 	__idmap_kpti_put_pgtable_ent_ng	pte
-next_pte:
+skip_pte:
 	add	cur_ptep, cur_ptep, #8
 	cmp	cur_ptep, end_ptep
 	b.ne	do_pte

From 405cacc947f7b58969b2a8ab1568c2d98b245308 Mon Sep 17 00:00:00 2001
From: Hans de Goede <j.w.r.degoede@gmail.com>
Date: Wed, 20 Dec 2017 11:50:17 +0100
Subject: [PATCH 599/676] drm/i915/vlv: Add cdclk workaround for DSI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

At least on the Chuwi Vi8 (non pro/plus) the LCD panel will show an image
shifted aprox. 20% to the left (with wraparound) and sometimes also wrong
colors, showing that the panel controller is starting with sampling the
datastream somewhere mid-line. This happens after the first blanking and
re-init of the panel.

After looking at drm.debug output I noticed that initially we inherit the
cdclk of 333333 KHz set by the GOP, but after the re-init we picked 266667
KHz, which turns out to be the cause of this problem, a quick hack to hard
code the cdclk to 333333 KHz makes the problem go away.

I've tested this on various Bay Trail devices, to make sure this not does
cause regressions on other devices and the higher cdclk does not cause
any problems on the following devices:
-GP-electronic T701      1024x600   333333 KHz cdclk after this patch
-PEAQ C1010              1920x1200  333333 KHz cdclk after this patch
-PoV mobii-wintab-800w    800x1280  333333 KHz cdclk after this patch
-Asus Transformer-T100TA 1368x768   320000 KHz cdclk after this patch

Also interesting wrt this is the comment in vlv_calc_cdclk about the
existing workaround to avoid 200 Mhz as clock because that causes issues
in some cases.

This commit extends the "do not use 200 Mhz" workaround with an extra
check to require atleast 320000 KHz (avoiding 266667 KHz) when a DSI
panel is active.

Changes in v2:
-Change the commit message and the code comment to not treat the GOP as
 a reference, the GOP should not be treated as a reference

Acked-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20171220105017.11259-1-hdegoede@redhat.com
(cherry picked from commit c8dae55a8ced625038d52d26e48273707fab2688)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/i915/intel_cdclk.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/i915/intel_cdclk.c b/drivers/gpu/drm/i915/intel_cdclk.c
index 5dc118f26b51b..1704c8897afd0 100644
--- a/drivers/gpu/drm/i915/intel_cdclk.c
+++ b/drivers/gpu/drm/i915/intel_cdclk.c
@@ -1952,6 +1952,14 @@ int intel_crtc_compute_min_cdclk(const struct intel_crtc_state *crtc_state)
 	if (crtc_state->has_audio && INTEL_GEN(dev_priv) >= 9)
 		min_cdclk = max(2 * 96000, min_cdclk);
 
+	/*
+	 * On Valleyview some DSI panels lose (v|h)sync when the clock is lower
+	 * than 320000KHz.
+	 */
+	if (intel_crtc_has_type(crtc_state, INTEL_OUTPUT_DSI) &&
+	    IS_VALLEYVIEW(dev_priv))
+		min_cdclk = max(320000, min_cdclk);
+
 	if (min_cdclk > dev_priv->max_cdclk_freq) {
 		DRM_DEBUG_KMS("required cdclk (%d kHz) exceeds max (%d kHz)\n",
 			      min_cdclk, dev_priv->max_cdclk_freq);

From 7928e9bb09dc7f108a1a2b589ef1c7b86843569c Mon Sep 17 00:00:00 2001
From: Hans de Goede <j.w.r.degoede@gmail.com>
Date: Wed, 14 Feb 2018 09:21:49 +0100
Subject: [PATCH 600/676] drm/i915: Add intel_bios_cleanup() function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an intel_bios_cleanup() function to act as counterpart of
intel_bios_init() and move the cleanup of vbt related resources there,
putting it in the same file as the allocation.

Changed in v2:
-While touching the code anyways, remove the unnecessary:
 if (dev_priv->vbt.child_dev) done before kfree(dev_priv->vbt.child_dev)

Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180214082151.25015-1-hdegoede@redhat.com
(cherry picked from commit 785f076b3ba781804f2b22b347b4431e3efb0ab3)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c   | 14 +-------------
 drivers/gpu/drm/i915/i915_drv.h   |  1 +
 drivers/gpu/drm/i915/intel_bios.c | 15 +++++++++++++++
 3 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 173d0095e3b21..2f5209de03915 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1433,19 +1433,7 @@ void i915_driver_unload(struct drm_device *dev)
 
 	intel_modeset_cleanup(dev);
 
-	/*
-	 * free the memory space allocated for the child device
-	 * config parsed from VBT
-	 */
-	if (dev_priv->vbt.child_dev && dev_priv->vbt.child_dev_num) {
-		kfree(dev_priv->vbt.child_dev);
-		dev_priv->vbt.child_dev = NULL;
-		dev_priv->vbt.child_dev_num = 0;
-	}
-	kfree(dev_priv->vbt.sdvo_lvds_vbt_mode);
-	dev_priv->vbt.sdvo_lvds_vbt_mode = NULL;
-	kfree(dev_priv->vbt.lfp_lvds_vbt_mode);
-	dev_priv->vbt.lfp_lvds_vbt_mode = NULL;
+	intel_bios_cleanup(dev_priv);
 
 	vga_switcheroo_unregister_client(pdev);
 	vga_client_register(pdev, NULL, NULL, NULL);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a42deebedb0f1..d2fc519bc592e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3657,6 +3657,7 @@ extern void intel_i2c_reset(struct drm_i915_private *dev_priv);
 
 /* intel_bios.c */
 void intel_bios_init(struct drm_i915_private *dev_priv);
+void intel_bios_cleanup(struct drm_i915_private *dev_priv);
 bool intel_bios_is_valid_vbt(const void *buf, size_t size);
 bool intel_bios_is_tv_present(struct drm_i915_private *dev_priv);
 bool intel_bios_is_lvds_present(struct drm_i915_private *dev_priv, u8 *i2c_pin);
diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c
index f7f771749e480..57db816f962be 100644
--- a/drivers/gpu/drm/i915/intel_bios.c
+++ b/drivers/gpu/drm/i915/intel_bios.c
@@ -1588,6 +1588,21 @@ void intel_bios_init(struct drm_i915_private *dev_priv)
 		pci_unmap_rom(pdev, bios);
 }
 
+/**
+ * intel_bios_cleanup - Free any resources allocated by intel_bios_init()
+ * @dev_priv: i915 device instance
+ */
+void intel_bios_cleanup(struct drm_i915_private *dev_priv)
+{
+	kfree(dev_priv->vbt.child_dev);
+	dev_priv->vbt.child_dev = NULL;
+	dev_priv->vbt.child_dev_num = 0;
+	kfree(dev_priv->vbt.sdvo_lvds_vbt_mode);
+	dev_priv->vbt.sdvo_lvds_vbt_mode = NULL;
+	kfree(dev_priv->vbt.lfp_lvds_vbt_mode);
+	dev_priv->vbt.lfp_lvds_vbt_mode = NULL;
+}
+
 /**
  * intel_bios_is_tv_present - is integrated TV present in VBT
  * @dev_priv:	i915 device instance

From ed0545a7fbb5241a27f45a084dd71522cdaea5b9 Mon Sep 17 00:00:00 2001
From: Hans de Goede <j.w.r.degoede@gmail.com>
Date: Wed, 14 Feb 2018 09:21:50 +0100
Subject: [PATCH 601/676] drm/i915: Free memdup-ed DSI VBT data structures on
 driver_unload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make intel_bios_cleanup function free the DSI VBT data structures which
are memdup-ed by parse_mipi_config() and parse_mipi_sequence().

Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180214082151.25015-2-hdegoede@redhat.com
(cherry picked from commit e1b86c85f6c2029c31dba99823b6f3d9e15eaacd)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/i915/intel_bios.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c
index 57db816f962be..9a9b62c93889f 100644
--- a/drivers/gpu/drm/i915/intel_bios.c
+++ b/drivers/gpu/drm/i915/intel_bios.c
@@ -1601,6 +1601,12 @@ void intel_bios_cleanup(struct drm_i915_private *dev_priv)
 	dev_priv->vbt.sdvo_lvds_vbt_mode = NULL;
 	kfree(dev_priv->vbt.lfp_lvds_vbt_mode);
 	dev_priv->vbt.lfp_lvds_vbt_mode = NULL;
+	kfree(dev_priv->vbt.dsi.data);
+	dev_priv->vbt.dsi.data = NULL;
+	kfree(dev_priv->vbt.dsi.pps);
+	dev_priv->vbt.dsi.pps = NULL;
+	kfree(dev_priv->vbt.dsi.config);
+	dev_priv->vbt.dsi.config = NULL;
 }
 
 /**

From ee622fe757f6de612dad0f01805eea815a5b3025 Mon Sep 17 00:00:00 2001
From: Hans de Goede <j.w.r.degoede@gmail.com>
Date: Wed, 14 Feb 2018 09:21:51 +0100
Subject: [PATCH 602/676] drm/i915: Fix DSI panels with v1 MIPI sequences
 without a DEASSERT sequence v3
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

So far models of the Dell Venue 8 Pro, with a panel with MIPI panel
index = 3, one of which has been kindly provided to me by Jan Brummer,
where not working with the i915 driver, giving a black screen on the
first modeset.

The problem with at least these Dells is that their VBT defines a MIPI
ASSERT sequence, but not a DEASSERT sequence. Instead they DEASSERT the
reset in their INIT_OTP sequence, but the deassert must be done before
calling intel_dsi_device_ready(), so that is too late.

Simply doing the INIT_OTP sequence earlier is not enough to fix this,
because the INIT_OTP sequence also sends various MIPI packets to the
panel, which can only happen after calling intel_dsi_device_ready().

This commit fixes this by splitting the INIT_OTP sequence into everything
before the first DSI packet and everything else, including the first DSI
packet. The first part (everything before the first DSI packet) is then
used as deassert sequence.

Changed in v2:
-Split the init OTP sequence into a deassert reset and the actual init
 OTP sequence, instead of calling it earlier and then having the first
 mipi_exec_send_packet() call call intel_dsi_device_ready().

Changes in v3:
-Move the whole shebang to intel_bios.c

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=82880
References: https://bugs.freedesktop.org/show_bug.cgi?id=101205
Cc: Jan-Michael Brummer <jan.brummer@tabos.org>
Reported-by: Jan-Michael Brummer <jan.brummer@tabos.org>
Tested-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Acked-by: Jani Nikula <jani.nikula@intel.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180214082151.25015-3-hdegoede@redhat.com
(cherry picked from commit fb38e7ade9af4f3e96f5916c3f6f19bfc7d5f961)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h   |  1 +
 drivers/gpu/drm/i915/intel_bios.c | 84 +++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d2fc519bc592e..d307429a5ae0a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1349,6 +1349,7 @@ struct intel_vbt_data {
 		u32 size;
 		u8 *data;
 		const u8 *sequence[MIPI_SEQ_MAX];
+		u8 *deassert_seq; /* Used by fixup_mipi_sequences() */
 	} dsi;
 
 	int crt_ddc_pin;
diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c
index 9a9b62c93889f..b49a2df444301 100644
--- a/drivers/gpu/drm/i915/intel_bios.c
+++ b/drivers/gpu/drm/i915/intel_bios.c
@@ -947,6 +947,86 @@ static int goto_next_sequence_v3(const u8 *data, int index, int total)
 	return 0;
 }
 
+/*
+ * Get len of pre-fixed deassert fragment from a v1 init OTP sequence,
+ * skip all delay + gpio operands and stop at the first DSI packet op.
+ */
+static int get_init_otp_deassert_fragment_len(struct drm_i915_private *dev_priv)
+{
+	const u8 *data = dev_priv->vbt.dsi.sequence[MIPI_SEQ_INIT_OTP];
+	int index, len;
+
+	if (WARN_ON(!data || dev_priv->vbt.dsi.seq_version != 1))
+		return 0;
+
+	/* index = 1 to skip sequence byte */
+	for (index = 1; data[index] != MIPI_SEQ_ELEM_END; index += len) {
+		switch (data[index]) {
+		case MIPI_SEQ_ELEM_SEND_PKT:
+			return index == 1 ? 0 : index;
+		case MIPI_SEQ_ELEM_DELAY:
+			len = 5; /* 1 byte for operand + uint32 */
+			break;
+		case MIPI_SEQ_ELEM_GPIO:
+			len = 3; /* 1 byte for op, 1 for gpio_nr, 1 for value */
+			break;
+		default:
+			return 0;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Some v1 VBT MIPI sequences do the deassert in the init OTP sequence.
+ * The deassert must be done before calling intel_dsi_device_ready, so for
+ * these devices we split the init OTP sequence into a deassert sequence and
+ * the actual init OTP part.
+ */
+static void fixup_mipi_sequences(struct drm_i915_private *dev_priv)
+{
+	u8 *init_otp;
+	int len;
+
+	/* Limit this to VLV for now. */
+	if (!IS_VALLEYVIEW(dev_priv))
+		return;
+
+	/* Limit this to v1 vid-mode sequences */
+	if (dev_priv->vbt.dsi.config->is_cmd_mode ||
+	    dev_priv->vbt.dsi.seq_version != 1)
+		return;
+
+	/* Only do this if there are otp and assert seqs and no deassert seq */
+	if (!dev_priv->vbt.dsi.sequence[MIPI_SEQ_INIT_OTP] ||
+	    !dev_priv->vbt.dsi.sequence[MIPI_SEQ_ASSERT_RESET] ||
+	    dev_priv->vbt.dsi.sequence[MIPI_SEQ_DEASSERT_RESET])
+		return;
+
+	/* The deassert-sequence ends at the first DSI packet */
+	len = get_init_otp_deassert_fragment_len(dev_priv);
+	if (!len)
+		return;
+
+	DRM_DEBUG_KMS("Using init OTP fragment to deassert reset\n");
+
+	/* Copy the fragment, update seq byte and terminate it */
+	init_otp = (u8 *)dev_priv->vbt.dsi.sequence[MIPI_SEQ_INIT_OTP];
+	dev_priv->vbt.dsi.deassert_seq = kmemdup(init_otp, len + 1, GFP_KERNEL);
+	if (!dev_priv->vbt.dsi.deassert_seq)
+		return;
+	dev_priv->vbt.dsi.deassert_seq[0] = MIPI_SEQ_DEASSERT_RESET;
+	dev_priv->vbt.dsi.deassert_seq[len] = MIPI_SEQ_ELEM_END;
+	/* Use the copy for deassert */
+	dev_priv->vbt.dsi.sequence[MIPI_SEQ_DEASSERT_RESET] =
+		dev_priv->vbt.dsi.deassert_seq;
+	/* Replace the last byte of the fragment with init OTP seq byte */
+	init_otp[len - 1] = MIPI_SEQ_INIT_OTP;
+	/* And make MIPI_MIPI_SEQ_INIT_OTP point to it */
+	dev_priv->vbt.dsi.sequence[MIPI_SEQ_INIT_OTP] = init_otp + len - 1;
+}
+
 static void
 parse_mipi_sequence(struct drm_i915_private *dev_priv,
 		    const struct bdb_header *bdb)
@@ -1016,6 +1096,8 @@ parse_mipi_sequence(struct drm_i915_private *dev_priv,
 	dev_priv->vbt.dsi.size = seq_size;
 	dev_priv->vbt.dsi.seq_version = sequence->version;
 
+	fixup_mipi_sequences(dev_priv);
+
 	DRM_DEBUG_DRIVER("MIPI related VBT parsing complete\n");
 	return;
 
@@ -1607,6 +1689,8 @@ void intel_bios_cleanup(struct drm_i915_private *dev_priv)
 	dev_priv->vbt.dsi.pps = NULL;
 	kfree(dev_priv->vbt.dsi.config);
 	dev_priv->vbt.dsi.config = NULL;
+	kfree(dev_priv->vbt.dsi.deassert_seq);
+	dev_priv->vbt.dsi.deassert_seq = NULL;
 }
 
 /**

From c134f0d57a47b7f8704dee1cefc246f9471f3e80 Mon Sep 17 00:00:00 2001
From: Cyril Bur <cyrilbur@gmail.com>
Date: Wed, 14 Feb 2018 14:27:06 +1100
Subject: [PATCH 603/676] powerpc: Expose TSCR via sysfs only on powernv

The TSCR can only be accessed in hypervisor mode.

Fixes: 88b5e12eeb11 ("powerpc: Expose TSCR via sysfs")
Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/sysfs.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 5a8bfee6e1877..04d0bbd7a1dd0 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -788,7 +788,8 @@ static int register_cpu_online(unsigned int cpu)
 	if (cpu_has_feature(CPU_FTR_PPCAS_ARCH_V2))
 		device_create_file(s, &dev_attr_pir);
 
-	if (cpu_has_feature(CPU_FTR_ARCH_206))
+	if (cpu_has_feature(CPU_FTR_ARCH_206) &&
+		!firmware_has_feature(FW_FEATURE_LPAR))
 		device_create_file(s, &dev_attr_tscr);
 #endif /* CONFIG_PPC64 */
 
@@ -873,7 +874,8 @@ static int unregister_cpu_online(unsigned int cpu)
 	if (cpu_has_feature(CPU_FTR_PPCAS_ARCH_V2))
 		device_remove_file(s, &dev_attr_pir);
 
-	if (cpu_has_feature(CPU_FTR_ARCH_206))
+	if (cpu_has_feature(CPU_FTR_ARCH_206) &&
+		!firmware_has_feature(FW_FEATURE_LPAR))
 		device_remove_file(s, &dev_attr_tscr);
 #endif /* CONFIG_PPC64 */
 

From 8e036c8d30a2cd9d8fc7442fbf6824e0a3e986e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@kaod.org>
Date: Tue, 13 Feb 2018 09:47:12 +0100
Subject: [PATCH 604/676] powerpc/xive: Use hw CPU ids when configuring the CPU
 queues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The CPU event notification queues on sPAPR should be configured using
a hardware CPU identifier.

The problem did not show up on the Power Hypervisor because pHyp
supports 8 threads per core which keeps CPU number contiguous. This is
not the case on all sPAPR virtual machines, some use SMT=1.

Also improve error logging by adding the CPU number.

Fixes: eac1e731b59e ("powerpc/xive: guest exploitation of the XIVE interrupt controller")
Cc: stable@vger.kernel.org # v4.14+
Signed-off-by: Cédric Le Goater <clg@kaod.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/sysdev/xive/spapr.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c
index d9c4c93660491..091f1d0d0af19 100644
--- a/arch/powerpc/sysdev/xive/spapr.c
+++ b/arch/powerpc/sysdev/xive/spapr.c
@@ -356,7 +356,8 @@ static int xive_spapr_configure_queue(u32 target, struct xive_q *q, u8 prio,
 
 	rc = plpar_int_get_queue_info(0, target, prio, &esn_page, &esn_size);
 	if (rc) {
-		pr_err("Error %lld getting queue info prio %d\n", rc, prio);
+		pr_err("Error %lld getting queue info CPU %d prio %d\n", rc,
+		       target, prio);
 		rc = -EIO;
 		goto fail;
 	}
@@ -370,7 +371,8 @@ static int xive_spapr_configure_queue(u32 target, struct xive_q *q, u8 prio,
 	/* Configure and enable the queue in HW */
 	rc = plpar_int_set_queue_config(flags, target, prio, qpage_phys, order);
 	if (rc) {
-		pr_err("Error %lld setting queue for prio %d\n", rc, prio);
+		pr_err("Error %lld setting queue for CPU %d prio %d\n", rc,
+		       target, prio);
 		rc = -EIO;
 	} else {
 		q->qpage = qpage;
@@ -389,8 +391,8 @@ static int xive_spapr_setup_queue(unsigned int cpu, struct xive_cpu *xc,
 	if (IS_ERR(qpage))
 		return PTR_ERR(qpage);
 
-	return xive_spapr_configure_queue(cpu, q, prio, qpage,
-					  xive_queue_shift);
+	return xive_spapr_configure_queue(get_hard_smp_processor_id(cpu),
+					  q, prio, qpage, xive_queue_shift);
 }
 
 static void xive_spapr_cleanup_queue(unsigned int cpu, struct xive_cpu *xc,
@@ -399,10 +401,12 @@ static void xive_spapr_cleanup_queue(unsigned int cpu, struct xive_cpu *xc,
 	struct xive_q *q = &xc->queue[prio];
 	unsigned int alloc_order;
 	long rc;
+	int hw_cpu = get_hard_smp_processor_id(cpu);
 
-	rc = plpar_int_set_queue_config(0, cpu, prio, 0, 0);
+	rc = plpar_int_set_queue_config(0, hw_cpu, prio, 0, 0);
 	if (rc)
-		pr_err("Error %ld setting queue for prio %d\n", rc, prio);
+		pr_err("Error %ld setting queue for CPU %d prio %d\n", rc,
+		       hw_cpu, prio);
 
 	alloc_order = xive_alloc_order(xive_queue_shift);
 	free_pages((unsigned long)q->qpage, alloc_order);

From e7bde88cdb4f0e432398a7d29ca2a15d2c18952a Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 13 Feb 2018 17:45:11 +1000
Subject: [PATCH 605/676] powerpc/powernv: IMC fix out of bounds memory access
 at shutdown

The OPAL IMC driver's shutdown handler disables nest PMU counters by
walking nodes and taking the first CPU out of their cpumask, which is
used to index into the paca (get_hard_smp_processor_id()). This does
not always do the right thing, and in particular for CPU-less nodes it
returns NR_CPUS and that overruns the paca and dereferences random
memory.

Fix it by being more careful about checking returned CPU, and only
using online CPUs. It's not clear this shutdown code makes sense after
commit 885dcd709b ("powerpc/perf: Add nest IMC PMU support"), but this
should not make things worse

Currently the bug causes us to call OPAL with a junk CPU number. A
separate patch in development to change the way pacas are allocated
escalates this bug into a crash:

  Unable to handle kernel paging request for data at address 0x2a21af1eeb000076
  Faulting instruction address: 0xc0000000000a5468
  Oops: Kernel access of bad area, sig: 11 [#1]
  ...
  NIP opal_imc_counters_shutdown+0x148/0x1d0
  LR  opal_imc_counters_shutdown+0x134/0x1d0
  Call Trace:
   opal_imc_counters_shutdown+0x134/0x1d0 (unreliable)
   platform_drv_shutdown+0x44/0x60
   device_shutdown+0x1f8/0x350
   kernel_restart_prepare+0x54/0x70
   kernel_restart+0x28/0xc0
   SyS_reboot+0x1d0/0x2c0
   system_call+0x58/0x6c

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal-imc.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index dd4c9b8b8a81e..f6f55ab4980e7 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -199,9 +199,11 @@ static void disable_nest_pmu_counters(void)
 	const struct cpumask *l_cpumask;
 
 	get_online_cpus();
-	for_each_online_node(nid) {
+	for_each_node_with_cpus(nid) {
 		l_cpumask = cpumask_of_node(nid);
-		cpu = cpumask_first(l_cpumask);
+		cpu = cpumask_first_and(l_cpumask, cpu_online_mask);
+		if (cpu >= nr_cpu_ids)
+			continue;
 		opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST,
 				       get_hard_smp_processor_id(cpu));
 	}

From c1e150ceb61e4a585bad156da15c33bfe89f5858 Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe@baylibre.com>
Date: Wed, 14 Feb 2018 12:17:47 +0000
Subject: [PATCH 606/676] powerpc/pseries: Add empty
 update_numa_cpu_lookup_table() for NUMA=n
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When CONFIG_NUMA is not set, the build fails with:

  arch/powerpc/platforms/pseries/hotplug-cpu.c:335:4:
  error: déclaration implicite de la fonction « update_numa_cpu_lookup_table »

So we have to add update_numa_cpu_lookup_table() as an empty function
when CONFIG_NUMA is not set.

Fixes: 1d9a090783be ("powerpc/numa: Invalidate numa_cpu_lookup_table on cpu remove")
Signed-off-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/topology.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 5932481109020..9f421641a35c8 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -81,6 +81,9 @@ static inline int numa_update_cpu_topology(bool cpus_locked)
 {
 	return 0;
 }
+
+static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) {}
+
 #endif /* CONFIG_NUMA */
 
 #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)

From 4105c69703cdeba76f384b901712c9397b04e9c2 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Tue, 13 Feb 2018 09:13:21 +0100
Subject: [PATCH 607/676] selftests/x86: Do not rely on "int $0x80" in
 single_step_syscall.c

On 64-bit builds, we should not rely on "int $0x80" working (it only does if
CONFIG_IA32_EMULATION=y is enabled). To keep the "Set TF and check int80"
test running on 64-bit installs with CONFIG_IA32_EMULATION=y enabled, build
this test only if we can also build 32-bit binaries (which should be a
good approximation for that).

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Dmitry Safonov <dsafonov@virtuozzo.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kselftest@vger.kernel.org
Cc: shuah@kernel.org
Link: http://lkml.kernel.org/r/20180211111013.16888-5-linux@dominikbrodowski.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/Makefile              | 2 ++
 tools/testing/selftests/x86/single_step_syscall.c | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 91fbfa8fdc150..73b8ef665c987 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -30,11 +30,13 @@ CAN_BUILD_X86_64 := $(shell ./check_cc.sh $(CC) trivial_64bit_program.c)
 ifeq ($(CAN_BUILD_I386),1)
 all: all_32
 TEST_PROGS += $(BINARIES_32)
+EXTRA_CFLAGS += -DCAN_BUILD_32
 endif
 
 ifeq ($(CAN_BUILD_X86_64),1)
 all: all_64
 TEST_PROGS += $(BINARIES_64)
+EXTRA_CFLAGS += -DCAN_BUILD_64
 endif
 
 all_32: $(BINARIES_32)
diff --git a/tools/testing/selftests/x86/single_step_syscall.c b/tools/testing/selftests/x86/single_step_syscall.c
index a48da95c18fdf..ddfdd635de16c 100644
--- a/tools/testing/selftests/x86/single_step_syscall.c
+++ b/tools/testing/selftests/x86/single_step_syscall.c
@@ -119,7 +119,9 @@ static void check_result(void)
 
 int main()
 {
+#ifdef CAN_BUILD_32
 	int tmp;
+#endif
 
 	sethandler(SIGTRAP, sigtrap, 0);
 
@@ -139,12 +141,13 @@ int main()
 		      : : "c" (post_nop) : "r11");
 	check_result();
 #endif
-
+#ifdef CAN_BUILD_32
 	printf("[RUN]\tSet TF and check int80\n");
 	set_eflags(get_eflags() | X86_EFLAGS_TF);
 	asm volatile ("int $0x80" : "=a" (tmp) : "a" (SYS_getpid)
 			: INT80_CLOBBERS);
 	check_result();
+#endif
 
 	/*
 	 * This test is particularly interesting if fast syscalls use

From 9279ddf23ce78ff2676e8e8e19fec0f022c26d04 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Tue, 13 Feb 2018 09:15:19 +0100
Subject: [PATCH 608/676] selftests/x86: Disable tests requiring 32-bit support
 on pure 64-bit systems

The ldt_gdt and ptrace_syscall selftests, even in their 64-bit variant, use
hard-coded 32-bit syscall numbers and call "int $0x80".

This will fail on 64-bit systems with CONFIG_IA32_EMULATION=y disabled.

Therefore, do not build these tests if we cannot build 32-bit binaries
(which should be a good approximation for CONFIG_IA32_EMULATION=y being enabled).

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Dmitry Safonov <dsafonov@virtuozzo.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kselftest@vger.kernel.org
Cc: shuah@kernel.org
Link: http://lkml.kernel.org/r/20180211111013.16888-6-linux@dominikbrodowski.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/Makefile | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 73b8ef665c987..aa6e2d7f6a1fd 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -5,16 +5,26 @@ include ../lib.mk
 
 .PHONY: all all_32 all_64 warn_32bit_failure clean
 
-TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \
-			check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test ioperm \
+UNAME_M := $(shell uname -m)
+CAN_BUILD_I386 := $(shell ./check_cc.sh $(CC) trivial_32bit_program.c -m32)
+CAN_BUILD_X86_64 := $(shell ./check_cc.sh $(CC) trivial_64bit_program.c)
+
+TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \
+			check_initial_reg_state sigreturn iopl mpx-mini-test ioperm \
 			protection_keys test_vdso test_vsyscall
 TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
 			test_FCMOV test_FCOMI test_FISTTP \
 			vdso_restorer
 TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip
+# Some selftests require 32bit support enabled also on 64bit systems
+TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall
 
-TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
+TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) $(TARGETS_C_32BIT_NEEDED)
 TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY)
+ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),11)
+TARGETS_C_64BIT_ALL += $(TARGETS_C_32BIT_NEEDED)
+endif
+
 BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
 BINARIES_64 := $(TARGETS_C_64BIT_ALL:%=%_64)
 
@@ -23,10 +33,6 @@ BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64))
 
 CFLAGS := -O2 -g -std=gnu99 -pthread -Wall -no-pie
 
-UNAME_M := $(shell uname -m)
-CAN_BUILD_I386 := $(shell ./check_cc.sh $(CC) trivial_32bit_program.c -m32)
-CAN_BUILD_X86_64 := $(shell ./check_cc.sh $(CC) trivial_64bit_program.c)
-
 ifeq ($(CAN_BUILD_I386),1)
 all: all_32
 TEST_PROGS += $(BINARIES_32)

From fe24e27128252c230a34a6c628da2bf1676781ea Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Thu, 8 Feb 2018 17:09:25 -0600
Subject: [PATCH 609/676] objtool: Fix segfault in ignore_unreachable_insn()

Peter Zijlstra's patch for converting WARN() to use UD2 triggered a
bunch of false "unreachable instruction" warnings, which then triggered
a seg fault in ignore_unreachable_insn().

The seg fault happened when it tried to dereference a NULL 'insn->func'
pointer.  Thanks to static_cpu_has(), some functions can jump to a
non-function area in the .altinstr_aux section.  That breaks
ignore_unreachable_insn()'s assumption that it's always inside the
original function.

Make sure ignore_unreachable_insn() only follows jumps within the
current function.

Reported-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kbuild test robot <fengguang.wu@intel.com>
Link: http://lkml.kernel.org/r/bace77a60d5af9b45eddb8f8fb9c776c8de657ef.1518130694.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/objtool/check.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 2e458eb45586c..c7fb5c2392ee6 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1935,13 +1935,19 @@ static bool ignore_unreachable_insn(struct instruction *insn)
 		if (is_kasan_insn(insn) || is_ubsan_insn(insn))
 			return true;
 
-		if (insn->type == INSN_JUMP_UNCONDITIONAL && insn->jump_dest) {
-			insn = insn->jump_dest;
-			continue;
+		if (insn->type == INSN_JUMP_UNCONDITIONAL) {
+			if (insn->jump_dest &&
+			    insn->jump_dest->func == insn->func) {
+				insn = insn->jump_dest;
+				continue;
+			}
+
+			break;
 		}
 
 		if (insn->offset + insn->len >= insn->func->offset + insn->func->len)
 			break;
+
 		insn = list_next_entry(insn, list);
 	}
 

From 2b5db66862b95532cb6cca8165ae6eb73633cf85 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Thu, 8 Feb 2018 17:09:26 -0600
Subject: [PATCH 610/676] x86/debug, objtool: Annotate WARN()-related UD2 as
 reachable

By default, objtool assumes that a UD2 is a dead end.  This is mainly
because GCC 7+ sometimes inserts a UD2 when it detects a divide-by-zero
condition.

Now that WARN() is moving back to UD2, annotate the code after it as
reachable so objtool can follow the code flow.

Reported-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kbuild test robot <fengguang.wu@intel.com>
Link: http://lkml.kernel.org/r/0e483379275a42626ba8898117f918e1bf661e40.1518130694.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/bug.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 34d99af439944..71e6f4bf9161d 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -77,7 +77,11 @@ do {								\
 	unreachable();						\
 } while (0)
 
-#define __WARN_FLAGS(flags)	_BUG_FLAGS(ASM_UD0, BUGFLAG_WARNING|(flags))
+#define __WARN_FLAGS(flags)					\
+do {								\
+	_BUG_FLAGS(ASM_UD0, BUGFLAG_WARNING|(flags));		\
+	annotate_reachable();					\
+} while (0)
 
 #include <asm-generic/bug.h>
 

From 3b3a371cc9bc980429baabe0a8e5f307f3d1f463 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 9 Feb 2018 13:16:59 +0100
Subject: [PATCH 611/676] x86/debug: Use UD2 for WARN()

Since the Intel SDM added an ModR/M byte to UD0 and binutils followed
that specification, we now cannot disassemble our kernel anymore.

This now means Intel and AMD disagree on the encoding of UD0. And instead
of playing games with additional bytes that are valid ModR/M and single
byte instructions (0xd6 for instance), simply use UD2 for both WARN() and
BUG().

Requested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180208194406.GD25181@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/bug.h | 15 ++++++---------
 arch/x86/kernel/traps.c    |  2 +-
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 71e6f4bf9161d..6804d66427673 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -5,23 +5,20 @@
 #include <linux/stringify.h>
 
 /*
- * Since some emulators terminate on UD2, we cannot use it for WARN.
- * Since various instruction decoders disagree on the length of UD1,
- * we cannot use it either. So use UD0 for WARN.
+ * Despite that some emulators terminate on UD2, we use it for WARN().
  *
- * (binutils knows about "ud1" but {en,de}codes it as 2 bytes, whereas
- *  our kernel decoder thinks it takes a ModRM byte, which seems consistent
- *  with various things like the Intel SDM instruction encoding rules)
+ * Since various instruction decoders/specs disagree on the encoding of
+ * UD0/UD1.
  */
 
-#define ASM_UD0		".byte 0x0f, 0xff"
+#define ASM_UD0		".byte 0x0f, 0xff" /* + ModRM (for Intel) */
 #define ASM_UD1		".byte 0x0f, 0xb9" /* + ModRM */
 #define ASM_UD2		".byte 0x0f, 0x0b"
 
 #define INSN_UD0	0xff0f
 #define INSN_UD2	0x0b0f
 
-#define LEN_UD0		2
+#define LEN_UD2		2
 
 #ifdef CONFIG_GENERIC_BUG
 
@@ -79,7 +76,7 @@ do {								\
 
 #define __WARN_FLAGS(flags)					\
 do {								\
-	_BUG_FLAGS(ASM_UD0, BUGFLAG_WARNING|(flags));		\
+	_BUG_FLAGS(ASM_UD2, BUGFLAG_WARNING|(flags));		\
 	annotate_reachable();					\
 } while (0)
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 446c9ef8cfc32..3d9b2308e7fad 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -181,7 +181,7 @@ int fixup_bug(struct pt_regs *regs, int trapnr)
 		break;
 
 	case BUG_TRAP_TYPE_WARN:
-		regs->ip += LEN_UD0;
+		regs->ip += LEN_UD2;
 		return 1;
 	}
 

From be3233fbfcb8f5acb6e3bcd0895c3ef9e100d470 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 6 Feb 2018 18:22:40 -0800
Subject: [PATCH 612/676] x86/speculation: Fix up array_index_nospec_mask() asm
 constraint

Allow the compiler to handle @size as an immediate value or memory
directly rather than allocating a register.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/151797010204.1289.1510000292250184993.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/barrier.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 30d4061460164..e1259f043ae99 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -40,7 +40,7 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
 
 	asm ("cmp %1,%2; sbb %0,%0;"
 			:"=r" (mask)
-			:"r"(size),"r" (index)
+			:"g"(size),"r" (index)
 			:"cc");
 	return mask;
 }

From 8fa80c503b484ddc1abbd10c7cb2ab81f3824a50 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Mon, 5 Feb 2018 14:16:06 +0000
Subject: [PATCH 613/676] nospec: Move array_index_nospec() parameter checking
 into separate macro

For architectures providing their own implementation of
array_index_mask_nospec() in asm/barrier.h, attempting to use WARN_ONCE() to
complain about out-of-range parameters using WARN_ON() results in a mess
of mutually-dependent include files.

Rather than unpick the dependencies, simply have the core code in nospec.h
perform the checking for us.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1517840166-15399-1-git-send-email-will.deacon@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nospec.h | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/include/linux/nospec.h b/include/linux/nospec.h
index b99bced39ac2f..fbc98e2c8228d 100644
--- a/include/linux/nospec.h
+++ b/include/linux/nospec.h
@@ -19,20 +19,6 @@
 static inline unsigned long array_index_mask_nospec(unsigned long index,
 						    unsigned long size)
 {
-	/*
-	 * Warn developers about inappropriate array_index_nospec() usage.
-	 *
-	 * Even if the CPU speculates past the WARN_ONCE branch, the
-	 * sign bit of @index is taken into account when generating the
-	 * mask.
-	 *
-	 * This warning is compiled out when the compiler can infer that
-	 * @index and @size are less than LONG_MAX.
-	 */
-	if (WARN_ONCE(index > LONG_MAX || size > LONG_MAX,
-			"array_index_nospec() limited to range of [0, LONG_MAX]\n"))
-		return 0;
-
 	/*
 	 * Always calculate and emit the mask even if the compiler
 	 * thinks the mask is not needed. The compiler does not take
@@ -43,6 +29,26 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
 }
 #endif
 
+/*
+ * Warn developers about inappropriate array_index_nospec() usage.
+ *
+ * Even if the CPU speculates past the WARN_ONCE branch, the
+ * sign bit of @index is taken into account when generating the
+ * mask.
+ *
+ * This warning is compiled out when the compiler can infer that
+ * @index and @size are less than LONG_MAX.
+ */
+#define array_index_mask_nospec_check(index, size)				\
+({										\
+	if (WARN_ONCE(index > LONG_MAX || size > LONG_MAX,			\
+	    "array_index_nospec() limited to range of [0, LONG_MAX]\n"))	\
+		_mask = 0;							\
+	else									\
+		_mask = array_index_mask_nospec(index, size);			\
+	_mask;									\
+})
+
 /*
  * array_index_nospec - sanitize an array index after a bounds check
  *
@@ -61,7 +67,7 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
 ({									\
 	typeof(index) _i = (index);					\
 	typeof(size) _s = (size);					\
-	unsigned long _mask = array_index_mask_nospec(_i, _s);		\
+	unsigned long _mask = array_index_mask_nospec_check(_i, _s);	\
 									\
 	BUILD_BUG_ON(sizeof(_i) > sizeof(long));			\
 	BUILD_BUG_ON(sizeof(_s) > sizeof(long));			\

From ea00f301285ea2f07393678cd2b6057878320c9d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 13 Feb 2018 14:28:19 +0100
Subject: [PATCH 614/676] x86/speculation: Add <asm/msr-index.h> dependency

Joe Konno reported a compile failure resulting from using an MSR
without inclusion of <asm/msr-index.h>, and while the current code builds
fine (by accident) this needs fixing for future patches.

Reported-by: Joe Konno <joe.konno@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan@linux.intel.com
Cc: bp@alien8.de
Cc: dan.j.williams@intel.com
Cc: dave.hansen@linux.intel.com
Cc: dwmw2@infradead.org
Cc: dwmw@amazon.co.uk
Cc: gregkh@linuxfoundation.org
Cc: hpa@zytor.com
Cc: jpoimboe@redhat.com
Cc: linux-tip-commits@vger.kernel.org
Cc: luto@kernel.org
Fixes: 20ffa1caecca ("x86/speculation: Add basic IBPB (Indirect Branch Prediction Barrier) support")
Link: http://lkml.kernel.org/r/20180213132819.GJ25201@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/nospec-branch.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 300cc159b4a0a..76b058533e473 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -6,6 +6,7 @@
 #include <asm/alternative.h>
 #include <asm/alternative-asm.h>
 #include <asm/cpufeatures.h>
+#include <asm/msr-index.h>
 
 #ifdef __ASSEMBLY__
 

From 1299ef1d8870d2d9f09a5aadf2f8b2c887c2d033 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 31 Jan 2018 08:03:10 -0800
Subject: [PATCH 615/676] x86/mm: Rename flush_tlb_single() and flush_tlb_one()
 to __flush_tlb_one_[user|kernel]()

flush_tlb_single() and flush_tlb_one() sound almost identical, but
they really mean "flush one user translation" and "flush one kernel
translation".  Rename them to flush_tlb_one_user() and
flush_tlb_one_kernel() to make the semantics more obvious.

[ I was looking at some PTI-related code, and the flush-one-address code
  is unnecessarily hard to understand because the names of the helpers are
  uninformative.  This came up during PTI review, but no one got around to
  doing it. ]

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Eduardo Valentin <eduval@amazon.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Kees Cook <keescook@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Linux-MM <linux-mm@kvack.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Link: http://lkml.kernel.org/r/3303b02e3c3d049dc5235d5651e0ae6d29a34354.1517414378.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/paravirt.h       |  4 ++--
 arch/x86/include/asm/paravirt_types.h |  2 +-
 arch/x86/include/asm/pgtable_32.h     |  2 +-
 arch/x86/include/asm/tlbflush.h       | 27 ++++++++++++++++++++-------
 arch/x86/kernel/paravirt.c            |  6 +++---
 arch/x86/mm/init_64.c                 |  2 +-
 arch/x86/mm/ioremap.c                 |  2 +-
 arch/x86/mm/kmmio.c                   |  2 +-
 arch/x86/mm/pgtable_32.c              |  2 +-
 arch/x86/mm/tlb.c                     |  6 +++---
 arch/x86/platform/uv/tlb_uv.c         |  2 +-
 arch/x86/xen/mmu_pv.c                 |  6 +++---
 include/trace/events/xen.h            |  2 +-
 13 files changed, 39 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 892df375b6155..554841fab717a 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -297,9 +297,9 @@ static inline void __flush_tlb_global(void)
 {
 	PVOP_VCALL0(pv_mmu_ops.flush_tlb_kernel);
 }
-static inline void __flush_tlb_single(unsigned long addr)
+static inline void __flush_tlb_one_user(unsigned long addr)
 {
-	PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr);
+	PVOP_VCALL1(pv_mmu_ops.flush_tlb_one_user, addr);
 }
 
 static inline void flush_tlb_others(const struct cpumask *cpumask,
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 6ec54d01972dc..f624f1f10316c 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -217,7 +217,7 @@ struct pv_mmu_ops {
 	/* TLB operations */
 	void (*flush_tlb_user)(void);
 	void (*flush_tlb_kernel)(void);
-	void (*flush_tlb_single)(unsigned long addr);
+	void (*flush_tlb_one_user)(unsigned long addr);
 	void (*flush_tlb_others)(const struct cpumask *cpus,
 				 const struct flush_tlb_info *info);
 
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index e67c0620aec2a..e55466760ff8e 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -61,7 +61,7 @@ void paging_init(void);
 #define kpte_clear_flush(ptep, vaddr)		\
 do {						\
 	pte_clear(&init_mm, (vaddr), (ptep));	\
-	__flush_tlb_one((vaddr));		\
+	__flush_tlb_one_kernel((vaddr));		\
 } while (0)
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 2b8f18ca58747..84137c22fdfad 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -140,7 +140,7 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
 #else
 #define __flush_tlb() __native_flush_tlb()
 #define __flush_tlb_global() __native_flush_tlb_global()
-#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
+#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
 #endif
 
 static inline bool tlb_defer_switch_to_init_mm(void)
@@ -400,7 +400,7 @@ static inline void __native_flush_tlb_global(void)
 /*
  * flush one page in the user mapping
  */
-static inline void __native_flush_tlb_single(unsigned long addr)
+static inline void __native_flush_tlb_one_user(unsigned long addr)
 {
 	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
 
@@ -437,18 +437,31 @@ static inline void __flush_tlb_all(void)
 /*
  * flush one page in the kernel mapping
  */
-static inline void __flush_tlb_one(unsigned long addr)
+static inline void __flush_tlb_one_kernel(unsigned long addr)
 {
 	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
-	__flush_tlb_single(addr);
+
+	/*
+	 * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
+	 * paravirt equivalent.  Even with PCID, this is sufficient: we only
+	 * use PCID if we also use global PTEs for the kernel mapping, and
+	 * INVLPG flushes global translations across all address spaces.
+	 *
+	 * If PTI is on, then the kernel is mapped with non-global PTEs, and
+	 * __flush_tlb_one_user() will flush the given address for the current
+	 * kernel address space and for its usermode counterpart, but it does
+	 * not flush it for other address spaces.
+	 */
+	__flush_tlb_one_user(addr);
 
 	if (!static_cpu_has(X86_FEATURE_PTI))
 		return;
 
 	/*
-	 * __flush_tlb_single() will have cleared the TLB entry for this ASID,
-	 * but since kernel space is replicated across all, we must also
-	 * invalidate all others.
+	 * See above.  We need to propagate the flush to all other address
+	 * spaces.  In principle, we only need to propagate it to kernelmode
+	 * address spaces, but the extra bookkeeping we would need is not
+	 * worth it.
 	 */
 	invalidate_other_asid();
 }
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 041096bdef860..99dc79e76bdc5 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -200,9 +200,9 @@ static void native_flush_tlb_global(void)
 	__native_flush_tlb_global();
 }
 
-static void native_flush_tlb_single(unsigned long addr)
+static void native_flush_tlb_one_user(unsigned long addr)
 {
-	__native_flush_tlb_single(addr);
+	__native_flush_tlb_one_user(addr);
 }
 
 struct static_key paravirt_steal_enabled;
@@ -401,7 +401,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
 
 	.flush_tlb_user = native_flush_tlb,
 	.flush_tlb_kernel = native_flush_tlb_global,
-	.flush_tlb_single = native_flush_tlb_single,
+	.flush_tlb_one_user = native_flush_tlb_one_user,
 	.flush_tlb_others = native_flush_tlb_others,
 
 	.pgd_alloc = __paravirt_pgd_alloc,
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 4a837289f2add..60ae1fe3609fc 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -256,7 +256,7 @@ static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
 	 * It's enough to flush this one mapping.
 	 * (PGE mappings get flushed as well)
 	 */
-	__flush_tlb_one(vaddr);
+	__flush_tlb_one_kernel(vaddr);
 }
 
 void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c45b6ec5357bc..e2db83bebc3b7 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -820,5 +820,5 @@ void __init __early_set_fixmap(enum fixed_addresses idx,
 		set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
 	else
 		pte_clear(&init_mm, addr, pte);
-	__flush_tlb_one(addr);
+	__flush_tlb_one_kernel(addr);
 }
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 58477ec3d66d0..7c86867096361 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -168,7 +168,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
 		return -1;
 	}
 
-	__flush_tlb_one(f->addr);
+	__flush_tlb_one_kernel(f->addr);
 	return 0;
 }
 
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index c3c5274410a90..9bb7f0ab9fe62 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -63,7 +63,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 	 * It's enough to flush this one mapping.
 	 * (PGE mappings get flushed as well)
 	 */
-	__flush_tlb_one(vaddr);
+	__flush_tlb_one_kernel(vaddr);
 }
 
 unsigned long __FIXADDR_TOP = 0xfffff000;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 012d026248489..0c936435ea939 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -492,7 +492,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 	 *    flush that changes context.tlb_gen from 2 to 3.  If they get
 	 *    processed on this CPU in reverse order, we'll see
 	 *     local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
-	 *    If we were to use __flush_tlb_single() and set local_tlb_gen to
+	 *    If we were to use __flush_tlb_one_user() and set local_tlb_gen to
 	 *    3, we'd be break the invariant: we'd update local_tlb_gen above
 	 *    1 without the full flush that's needed for tlb_gen 2.
 	 *
@@ -513,7 +513,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 
 		addr = f->start;
 		while (addr < f->end) {
-			__flush_tlb_single(addr);
+			__flush_tlb_one_user(addr);
 			addr += PAGE_SIZE;
 		}
 		if (local)
@@ -660,7 +660,7 @@ static void do_kernel_range_flush(void *info)
 
 	/* flush range by one by one 'invlpg' */
 	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
-		__flush_tlb_one(addr);
+		__flush_tlb_one_kernel(addr);
 }
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 8538a6723171a..7d5d53f36a7ab 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -299,7 +299,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
 		local_flush_tlb();
 		stat->d_alltlb++;
 	} else {
-		__flush_tlb_single(msg->address);
+		__flush_tlb_one_user(msg->address);
 		stat->d_onetlb++;
 	}
 	stat->d_requestee++;
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index d85076223a696..aae88fec9941a 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1300,12 +1300,12 @@ static void xen_flush_tlb(void)
 	preempt_enable();
 }
 
-static void xen_flush_tlb_single(unsigned long addr)
+static void xen_flush_tlb_one_user(unsigned long addr)
 {
 	struct mmuext_op *op;
 	struct multicall_space mcs;
 
-	trace_xen_mmu_flush_tlb_single(addr);
+	trace_xen_mmu_flush_tlb_one_user(addr);
 
 	preempt_disable();
 
@@ -2370,7 +2370,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
 
 	.flush_tlb_user = xen_flush_tlb,
 	.flush_tlb_kernel = xen_flush_tlb,
-	.flush_tlb_single = xen_flush_tlb_single,
+	.flush_tlb_one_user = xen_flush_tlb_one_user,
 	.flush_tlb_others = xen_flush_tlb_others,
 
 	.pgd_alloc = xen_pgd_alloc,
diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h
index b8adf05c534e7..7dd8f34c37dfe 100644
--- a/include/trace/events/xen.h
+++ b/include/trace/events/xen.h
@@ -368,7 +368,7 @@ TRACE_EVENT(xen_mmu_flush_tlb,
 	    TP_printk("%s", "")
 	);
 
-TRACE_EVENT(xen_mmu_flush_tlb_single,
+TRACE_EVENT(xen_mmu_flush_tlb_one_user,
 	    TP_PROTO(unsigned long addr),
 	    TP_ARGS(addr),
 	    TP_STRUCT__entry(

From 961888b1d76d84efc66a8f5604b06ac12ac2f978 Mon Sep 17 00:00:00 2001
From: Rui Wang <rui.y.wang@intel.com>
Date: Mon, 18 Dec 2017 16:34:10 +0800
Subject: [PATCH 616/676] selftests/x86/mpx: Fix incorrect bounds with old
 _sigfault

For distributions with old userspace header files, the _sigfault
structure is different. mpx-mini-test fails with the following
error:

  [root@Purley]# mpx-mini-test_64 tabletest
  XSAVE is supported by HW & OS
  XSAVE processor supported state mask: 0x2ff
  XSAVE OS supported state mask: 0x2ff
   BNDREGS: size: 64 user: 1 supervisor: 0 aligned: 0
    BNDCSR: size: 64 user: 1 supervisor: 0 aligned: 0
  starting mpx bounds table test
  ERROR: siginfo bounds do not match shadow bounds for register 0

Fix it by using the correct offset of _lower/_upper in _sigfault.
RHEL needs this patch to work.

Signed-off-by: Rui Wang <rui.y.wang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dave.hansen@linux.intel.com
Fixes: e754aedc26ef ("x86/mpx, selftests: Add MPX self test")
Link: http://lkml.kernel.org/r/1513586050-1641-1-git-send-email-rui.y.wang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/testing/selftests/x86/mpx-mini-test.c | 32 +++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/x86/mpx-mini-test.c b/tools/testing/selftests/x86/mpx-mini-test.c
index ec0f6b45ce8b4..9c0325e1ea684 100644
--- a/tools/testing/selftests/x86/mpx-mini-test.c
+++ b/tools/testing/selftests/x86/mpx-mini-test.c
@@ -315,11 +315,39 @@ static inline void *__si_bounds_upper(siginfo_t *si)
 	return si->si_upper;
 }
 #else
+
+/*
+ * This deals with old version of _sigfault in some distros:
+ *
+
+old _sigfault:
+        struct {
+            void *si_addr;
+	} _sigfault;
+
+new _sigfault:
+	struct {
+		void __user *_addr;
+		int _trapno;
+		short _addr_lsb;
+		union {
+			struct {
+				void __user *_lower;
+				void __user *_upper;
+			} _addr_bnd;
+			__u32 _pkey;
+		};
+	} _sigfault;
+ *
+ */
+
 static inline void **__si_bounds_hack(siginfo_t *si)
 {
 	void *sigfault = &si->_sifields._sigfault;
 	void *end_sigfault = sigfault + sizeof(si->_sifields._sigfault);
-	void **__si_lower = end_sigfault;
+	int *trapno = (int*)end_sigfault;
+	/* skip _trapno and _addr_lsb */
+	void **__si_lower = (void**)(trapno + 2);
 
 	return __si_lower;
 }
@@ -331,7 +359,7 @@ static inline void *__si_bounds_lower(siginfo_t *si)
 
 static inline void *__si_bounds_upper(siginfo_t *si)
 {
-	return (*__si_bounds_hack(si)) + sizeof(void *);
+	return *(__si_bounds_hack(si) + 1);
 }
 #endif
 

From b399151cb48db30ad1e0e93dd40d68c6d007b637 Mon Sep 17 00:00:00 2001
From: Jia Zhang <qianyue.zj@alibaba-inc.com>
Date: Mon, 1 Jan 2018 09:52:10 +0800
Subject: [PATCH 617/676] x86/cpu: Rename cpu_data.x86_mask to
 cpu_data.x86_stepping

x86_mask is a confusing name which is hard to associate with the
processor's stepping.

Additionally, correct an indent issue in lib/cpu.c.

Signed-off-by: Jia Zhang <qianyue.zj@alibaba-inc.com>
[ Updated it to more recent kernels. ]
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bp@alien8.de
Cc: tony.luck@intel.com
Link: http://lkml.kernel.org/r/1514771530-70829-1-git-send-email-qianyue.zj@alibaba-inc.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/intel/core.c          |  2 +-
 arch/x86/events/intel/lbr.c           |  2 +-
 arch/x86/events/intel/p6.c            |  2 +-
 arch/x86/include/asm/acpi.h           |  2 +-
 arch/x86/include/asm/processor.h      |  2 +-
 arch/x86/kernel/amd_nb.c              |  2 +-
 arch/x86/kernel/apic/apic.c           |  6 +++---
 arch/x86/kernel/asm-offsets_32.c      |  2 +-
 arch/x86/kernel/cpu/amd.c             | 28 +++++++++++++--------------
 arch/x86/kernel/cpu/centaur.c         |  4 ++--
 arch/x86/kernel/cpu/common.c          |  8 ++++----
 arch/x86/kernel/cpu/cyrix.c           |  2 +-
 arch/x86/kernel/cpu/intel.c           | 18 ++++++++---------
 arch/x86/kernel/cpu/intel_rdt.c       |  2 +-
 arch/x86/kernel/cpu/microcode/intel.c |  4 ++--
 arch/x86/kernel/cpu/mtrr/generic.c    |  2 +-
 arch/x86/kernel/cpu/mtrr/main.c       |  4 ++--
 arch/x86/kernel/cpu/proc.c            |  4 ++--
 arch/x86/kernel/head_32.S             |  4 ++--
 arch/x86/kernel/mpparse.c             |  2 +-
 arch/x86/lib/cpu.c                    |  2 +-
 drivers/char/hw_random/via-rng.c      |  2 +-
 drivers/cpufreq/acpi-cpufreq.c        |  2 +-
 drivers/cpufreq/longhaul.c            |  6 +++---
 drivers/cpufreq/p4-clockmod.c         |  2 +-
 drivers/cpufreq/powernow-k7.c         |  2 +-
 drivers/cpufreq/speedstep-centrino.c  |  4 ++--
 drivers/cpufreq/speedstep-lib.c       |  6 +++---
 drivers/crypto/padlock-aes.c          |  2 +-
 drivers/edac/amd64_edac.c             |  2 +-
 drivers/hwmon/coretemp.c              |  6 +++---
 drivers/hwmon/hwmon-vid.c             |  2 +-
 drivers/hwmon/k10temp.c               |  2 +-
 drivers/hwmon/k8temp.c                |  2 +-
 drivers/video/fbdev/geode/video_gx.c  |  2 +-
 35 files changed, 73 insertions(+), 73 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 731153a4681e7..56457cb73448b 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3559,7 +3559,7 @@ static int intel_snb_pebs_broken(int cpu)
 		break;
 
 	case INTEL_FAM6_SANDYBRIDGE_X:
-		switch (cpu_data(cpu).x86_mask) {
+		switch (cpu_data(cpu).x86_stepping) {
 		case 6: rev = 0x618; break;
 		case 7: rev = 0x70c; break;
 		}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index ae64d0b69729d..cf372b90557ed 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -1186,7 +1186,7 @@ void __init intel_pmu_lbr_init_atom(void)
 	 * on PMU interrupt
 	 */
 	if (boot_cpu_data.x86_model == 28
-	    && boot_cpu_data.x86_mask < 10) {
+	    && boot_cpu_data.x86_stepping < 10) {
 		pr_cont("LBR disabled due to erratum");
 		return;
 	}
diff --git a/arch/x86/events/intel/p6.c b/arch/x86/events/intel/p6.c
index a5604c3529308..408879b0c0d4e 100644
--- a/arch/x86/events/intel/p6.c
+++ b/arch/x86/events/intel/p6.c
@@ -234,7 +234,7 @@ static __initconst const struct x86_pmu p6_pmu = {
 
 static __init void p6_pmu_rdpmc_quirk(void)
 {
-	if (boot_cpu_data.x86_mask < 9) {
+	if (boot_cpu_data.x86_stepping < 9) {
 		/*
 		 * PPro erratum 26; fixed in stepping 9 and above.
 		 */
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 8d0ec9df1cbeb..f077401869ee2 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -94,7 +94,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
 	if (boot_cpu_data.x86 == 0x0F &&
 	    boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
 	    boot_cpu_data.x86_model <= 0x05 &&
-	    boot_cpu_data.x86_mask < 0x0A)
+	    boot_cpu_data.x86_stepping < 0x0A)
 		return 1;
 	else if (boot_cpu_has(X86_BUG_AMD_APIC_C1E))
 		return 1;
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 99799fbd0f7e0..b7c8583328c77 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -91,7 +91,7 @@ struct cpuinfo_x86 {
 	__u8			x86;		/* CPU family */
 	__u8			x86_vendor;	/* CPU vendor */
 	__u8			x86_model;
-	__u8			x86_mask;
+	__u8			x86_stepping;
 #ifdef CONFIG_X86_64
 	/* Number of 4K pages in DTLB/ITLB combined(in pages): */
 	int			x86_tlbsize;
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 6db28f17ff288..c88e0b127810f 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -235,7 +235,7 @@ int amd_cache_northbridges(void)
 	if (boot_cpu_data.x86 == 0x10 &&
 	    boot_cpu_data.x86_model >= 0x8 &&
 	    (boot_cpu_data.x86_model > 0x9 ||
-	     boot_cpu_data.x86_mask >= 0x1))
+	     boot_cpu_data.x86_stepping >= 0x1))
 		amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
 
 	if (boot_cpu_data.x86 == 0x15)
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 25ddf02598d20..b203af0855b57 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -546,7 +546,7 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
 
 static u32 hsx_deadline_rev(void)
 {
-	switch (boot_cpu_data.x86_mask) {
+	switch (boot_cpu_data.x86_stepping) {
 	case 0x02: return 0x3a; /* EP */
 	case 0x04: return 0x0f; /* EX */
 	}
@@ -556,7 +556,7 @@ static u32 hsx_deadline_rev(void)
 
 static u32 bdx_deadline_rev(void)
 {
-	switch (boot_cpu_data.x86_mask) {
+	switch (boot_cpu_data.x86_stepping) {
 	case 0x02: return 0x00000011;
 	case 0x03: return 0x0700000e;
 	case 0x04: return 0x0f00000c;
@@ -568,7 +568,7 @@ static u32 bdx_deadline_rev(void)
 
 static u32 skx_deadline_rev(void)
 {
-	switch (boot_cpu_data.x86_mask) {
+	switch (boot_cpu_data.x86_stepping) {
 	case 0x03: return 0x01000136;
 	case 0x04: return 0x02000014;
 	}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index fa1261eefa16e..f91ba53e06c8b 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -18,7 +18,7 @@ void foo(void)
 	OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
 	OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
 	OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
-	OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask);
+	OFFSET(CPUINFO_x86_stepping, cpuinfo_x86, x86_stepping);
 	OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level);
 	OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability);
 	OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index ea831c8581958..e7d5a7883632c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -119,7 +119,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
 		return;
 	}
 
-	if (c->x86_model == 6 && c->x86_mask == 1) {
+	if (c->x86_model == 6 && c->x86_stepping == 1) {
 		const int K6_BUG_LOOP = 1000000;
 		int n;
 		void (*f_vide)(void);
@@ -149,7 +149,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
 
 	/* K6 with old style WHCR */
 	if (c->x86_model < 8 ||
-	   (c->x86_model == 8 && c->x86_mask < 8)) {
+	   (c->x86_model == 8 && c->x86_stepping < 8)) {
 		/* We can only write allocate on the low 508Mb */
 		if (mbytes > 508)
 			mbytes = 508;
@@ -168,7 +168,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
 		return;
 	}
 
-	if ((c->x86_model == 8 && c->x86_mask > 7) ||
+	if ((c->x86_model == 8 && c->x86_stepping > 7) ||
 	     c->x86_model == 9 || c->x86_model == 13) {
 		/* The more serious chips .. */
 
@@ -221,7 +221,7 @@ static void init_amd_k7(struct cpuinfo_x86 *c)
 	 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
 	 * As per AMD technical note 27212 0.2
 	 */
-	if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
+	if ((c->x86_model == 8 && c->x86_stepping >= 1) || (c->x86_model > 8)) {
 		rdmsr(MSR_K7_CLK_CTL, l, h);
 		if ((l & 0xfff00000) != 0x20000000) {
 			pr_info("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
@@ -241,12 +241,12 @@ static void init_amd_k7(struct cpuinfo_x86 *c)
 	 * but they are not certified as MP capable.
 	 */
 	/* Athlon 660/661 is valid. */
-	if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
-	    (c->x86_mask == 1)))
+	if ((c->x86_model == 6) && ((c->x86_stepping == 0) ||
+	    (c->x86_stepping == 1)))
 		return;
 
 	/* Duron 670 is valid */
-	if ((c->x86_model == 7) && (c->x86_mask == 0))
+	if ((c->x86_model == 7) && (c->x86_stepping == 0))
 		return;
 
 	/*
@@ -256,8 +256,8 @@ static void init_amd_k7(struct cpuinfo_x86 *c)
 	 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
 	 * more.
 	 */
-	if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
-	    ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
+	if (((c->x86_model == 6) && (c->x86_stepping >= 2)) ||
+	    ((c->x86_model == 7) && (c->x86_stepping >= 1)) ||
 	     (c->x86_model > 7))
 		if (cpu_has(c, X86_FEATURE_MP))
 			return;
@@ -583,7 +583,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)
 	/*  Set MTRR capability flag if appropriate */
 	if (c->x86 == 5)
 		if (c->x86_model == 13 || c->x86_model == 9 ||
-		    (c->x86_model == 8 && c->x86_mask >= 8))
+		    (c->x86_model == 8 && c->x86_stepping >= 8))
 			set_cpu_cap(c, X86_FEATURE_K6_MTRR);
 #endif
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
@@ -769,7 +769,7 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
 	 * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects
 	 * all up to and including B1.
 	 */
-	if (c->x86_model <= 1 && c->x86_mask <= 1)
+	if (c->x86_model <= 1 && c->x86_stepping <= 1)
 		set_cpu_cap(c, X86_FEATURE_CPB);
 }
 
@@ -880,11 +880,11 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 	/* AMD errata T13 (order #21922) */
 	if ((c->x86 == 6)) {
 		/* Duron Rev A0 */
-		if (c->x86_model == 3 && c->x86_mask == 0)
+		if (c->x86_model == 3 && c->x86_stepping == 0)
 			size = 64;
 		/* Tbird rev A1/A2 */
 		if (c->x86_model == 4 &&
-			(c->x86_mask == 0 || c->x86_mask == 1))
+			(c->x86_stepping == 0 || c->x86_stepping == 1))
 			size = 256;
 	}
 	return size;
@@ -1021,7 +1021,7 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
 	}
 
 	/* OSVW unavailable or ID unknown, match family-model-stepping range */
-	ms = (cpu->x86_model << 4) | cpu->x86_mask;
+	ms = (cpu->x86_model << 4) | cpu->x86_stepping;
 	while ((range = *erratum++))
 		if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
 		    (ms >= AMD_MODEL_RANGE_START(range)) &&
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 68bc6d9b31326..595be776727d8 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -136,7 +136,7 @@ static void init_centaur(struct cpuinfo_x86 *c)
 			clear_cpu_cap(c, X86_FEATURE_TSC);
 			break;
 		case 8:
-			switch (c->x86_mask) {
+			switch (c->x86_stepping) {
 			default:
 			name = "2";
 				break;
@@ -211,7 +211,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 	 *  - Note, it seems this may only be in engineering samples.
 	 */
 	if ((c->x86 == 6) && (c->x86_model == 9) &&
-				(c->x86_mask == 1) && (size == 65))
+				(c->x86_stepping == 1) && (size == 65))
 		size -= 1;
 	return size;
 }
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d63f4b5706e4d..a7d8df641a4c0 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -731,7 +731,7 @@ void cpu_detect(struct cpuinfo_x86 *c)
 		cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
 		c->x86		= x86_family(tfms);
 		c->x86_model	= x86_model(tfms);
-		c->x86_mask	= x86_stepping(tfms);
+		c->x86_stepping	= x86_stepping(tfms);
 
 		if (cap0 & (1<<19)) {
 			c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
@@ -1186,7 +1186,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 	c->loops_per_jiffy = loops_per_jiffy;
 	c->x86_cache_size = -1;
 	c->x86_vendor = X86_VENDOR_UNKNOWN;
-	c->x86_model = c->x86_mask = 0;	/* So far unknown... */
+	c->x86_model = c->x86_stepping = 0;	/* So far unknown... */
 	c->x86_vendor_id[0] = '\0'; /* Unset */
 	c->x86_model_id[0] = '\0';  /* Unset */
 	c->x86_max_cores = 1;
@@ -1378,8 +1378,8 @@ void print_cpu_info(struct cpuinfo_x86 *c)
 
 	pr_cont(" (family: 0x%x, model: 0x%x", c->x86, c->x86_model);
 
-	if (c->x86_mask || c->cpuid_level >= 0)
-		pr_cont(", stepping: 0x%x)\n", c->x86_mask);
+	if (c->x86_stepping || c->cpuid_level >= 0)
+		pr_cont(", stepping: 0x%x)\n", c->x86_stepping);
 	else
 		pr_cont(")\n");
 }
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 6b4bb335641f3..8949b7ae6d925 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -215,7 +215,7 @@ static void init_cyrix(struct cpuinfo_x86 *c)
 
 	/* common case step number/rev -- exceptions handled below */
 	c->x86_model = (dir1 >> 4) + 1;
-	c->x86_mask = dir1 & 0xf;
+	c->x86_stepping = dir1 & 0xf;
 
 	/* Now cook; the original recipe is by Channing Corn, from Cyrix.
 	 * We do the same thing for each generation: we work out
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index ef796f14f7ae5..d19e903214b40 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -146,7 +146,7 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
 
 	for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
 		if (c->x86_model == spectre_bad_microcodes[i].model &&
-		    c->x86_mask == spectre_bad_microcodes[i].stepping)
+		    c->x86_stepping == spectre_bad_microcodes[i].stepping)
 			return (c->microcode <= spectre_bad_microcodes[i].microcode);
 	}
 	return false;
@@ -193,7 +193,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 	 * need the microcode to have already been loaded... so if it is
 	 * not, recommend a BIOS update and disable large pages.
 	 */
-	if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 &&
+	if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_stepping <= 2 &&
 	    c->microcode < 0x20e) {
 		pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n");
 		clear_cpu_cap(c, X86_FEATURE_PSE);
@@ -209,7 +209,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 
 	/* CPUID workaround for 0F33/0F34 CPU */
 	if (c->x86 == 0xF && c->x86_model == 0x3
-	    && (c->x86_mask == 0x3 || c->x86_mask == 0x4))
+	    && (c->x86_stepping == 0x3 || c->x86_stepping == 0x4))
 		c->x86_phys_bits = 36;
 
 	/*
@@ -307,7 +307,7 @@ int ppro_with_ram_bug(void)
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
 	    boot_cpu_data.x86 == 6 &&
 	    boot_cpu_data.x86_model == 1 &&
-	    boot_cpu_data.x86_mask < 8) {
+	    boot_cpu_data.x86_stepping < 8) {
 		pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n");
 		return 1;
 	}
@@ -324,7 +324,7 @@ static void intel_smp_check(struct cpuinfo_x86 *c)
 	 * Mask B, Pentium, but not Pentium MMX
 	 */
 	if (c->x86 == 5 &&
-	    c->x86_mask >= 1 && c->x86_mask <= 4 &&
+	    c->x86_stepping >= 1 && c->x86_stepping <= 4 &&
 	    c->x86_model <= 3) {
 		/*
 		 * Remember we have B step Pentia with bugs
@@ -367,7 +367,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
 	 * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
 	 * model 3 mask 3
 	 */
-	if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
+	if ((c->x86<<8 | c->x86_model<<4 | c->x86_stepping) < 0x633)
 		clear_cpu_cap(c, X86_FEATURE_SEP);
 
 	/*
@@ -385,7 +385,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
 	 * P4 Xeon erratum 037 workaround.
 	 * Hardware prefetcher may cause stale data to be loaded into the cache.
 	 */
-	if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
+	if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_stepping == 1)) {
 		if (msr_set_bit(MSR_IA32_MISC_ENABLE,
 				MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) {
 			pr_info("CPU: C0 stepping P4 Xeon detected.\n");
@@ -400,7 +400,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
 	 * Specification Update").
 	 */
 	if (boot_cpu_has(X86_FEATURE_APIC) && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
-	    (c->x86_mask < 0x6 || c->x86_mask == 0xb))
+	    (c->x86_stepping < 0x6 || c->x86_stepping == 0xb))
 		set_cpu_bug(c, X86_BUG_11AP);
 
 
@@ -647,7 +647,7 @@ static void init_intel(struct cpuinfo_x86 *c)
 		case 6:
 			if (l2 == 128)
 				p = "Celeron (Mendocino)";
-			else if (c->x86_mask == 0 || c->x86_mask == 5)
+			else if (c->x86_stepping == 0 || c->x86_stepping == 5)
 				p = "Celeron-A";
 			break;
 
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index 99442370de40d..18dd8f22e353a 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -771,7 +771,7 @@ static __init void rdt_quirks(void)
 			cache_alloc_hsw_probe();
 		break;
 	case INTEL_FAM6_SKYLAKE_X:
-		if (boot_cpu_data.x86_mask <= 4)
+		if (boot_cpu_data.x86_stepping <= 4)
 			set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
 	}
 }
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index f7c55b0e753ad..b94279bb5c046 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -921,7 +921,7 @@ static bool is_blacklisted(unsigned int cpu)
 	 */
 	if (c->x86 == 6 &&
 	    c->x86_model == INTEL_FAM6_BROADWELL_X &&
-	    c->x86_mask == 0x01 &&
+	    c->x86_stepping == 0x01 &&
 	    llc_size_per_core > 2621440 &&
 	    c->microcode < 0x0b000021) {
 		pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
@@ -944,7 +944,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device,
 		return UCODE_NFOUND;
 
 	sprintf(name, "intel-ucode/%02x-%02x-%02x",
-		c->x86, c->x86_model, c->x86_mask);
+		c->x86, c->x86_model, c->x86_stepping);
 
 	if (request_firmware_direct(&firmware, name, device)) {
 		pr_debug("data file %s load failed\n", name);
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index fdc55215d44d0..e12ee86906c62 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -859,7 +859,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size,
 	 */
 	if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 &&
 	    boot_cpu_data.x86_model == 1 &&
-	    boot_cpu_data.x86_mask <= 7) {
+	    boot_cpu_data.x86_stepping <= 7) {
 		if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
 			pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
 			return -EINVAL;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 40d5a8a752125..7468de4290873 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -711,8 +711,8 @@ void __init mtrr_bp_init(void)
 			if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
 			    boot_cpu_data.x86 == 0xF &&
 			    boot_cpu_data.x86_model == 0x3 &&
-			    (boot_cpu_data.x86_mask == 0x3 ||
-			     boot_cpu_data.x86_mask == 0x4))
+			    (boot_cpu_data.x86_stepping == 0x3 ||
+			     boot_cpu_data.x86_stepping == 0x4))
 				phys_addr = 36;
 
 			size_or_mask = SIZE_OR_MASK_BITS(phys_addr);
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index e7ecedafa1c8f..ee4cc388e8d30 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -72,8 +72,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		   c->x86_model,
 		   c->x86_model_id[0] ? c->x86_model_id : "unknown");
 
-	if (c->x86_mask || c->cpuid_level >= 0)
-		seq_printf(m, "stepping\t: %d\n", c->x86_mask);
+	if (c->x86_stepping || c->cpuid_level >= 0)
+		seq_printf(m, "stepping\t: %d\n", c->x86_stepping);
 	else
 		seq_puts(m, "stepping\t: unknown\n");
 	if (c->microcode)
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index c29020907886a..b59e4fb40fd99 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -37,7 +37,7 @@
 #define X86		new_cpu_data+CPUINFO_x86
 #define X86_VENDOR	new_cpu_data+CPUINFO_x86_vendor
 #define X86_MODEL	new_cpu_data+CPUINFO_x86_model
-#define X86_MASK	new_cpu_data+CPUINFO_x86_mask
+#define X86_STEPPING	new_cpu_data+CPUINFO_x86_stepping
 #define X86_HARD_MATH	new_cpu_data+CPUINFO_hard_math
 #define X86_CPUID	new_cpu_data+CPUINFO_cpuid_level
 #define X86_CAPABILITY	new_cpu_data+CPUINFO_x86_capability
@@ -332,7 +332,7 @@ ENTRY(startup_32_smp)
 	shrb $4,%al
 	movb %al,X86_MODEL
 	andb $0x0f,%cl		# mask mask revision
-	movb %cl,X86_MASK
+	movb %cl,X86_STEPPING
 	movl %edx,X86_CAPABILITY
 
 .Lis486:
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 3a4b12809ab5f..bc6bc6689e68a 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -407,7 +407,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
 	processor.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
 	processor.cpuflag = CPU_ENABLED;
 	processor.cpufeature = (boot_cpu_data.x86 << 8) |
-	    (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
+	    (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_stepping;
 	processor.featureflag = boot_cpu_data.x86_capability[CPUID_1_EDX];
 	processor.reserved[0] = 0;
 	processor.reserved[1] = 0;
diff --git a/arch/x86/lib/cpu.c b/arch/x86/lib/cpu.c
index d6f848d1211d4..2dd1fe13a37b3 100644
--- a/arch/x86/lib/cpu.c
+++ b/arch/x86/lib/cpu.c
@@ -18,7 +18,7 @@ unsigned int x86_model(unsigned int sig)
 {
 	unsigned int fam, model;
 
-	 fam = x86_family(sig);
+	fam = x86_family(sig);
 
 	model = (sig >> 4) & 0xf;
 
diff --git a/drivers/char/hw_random/via-rng.c b/drivers/char/hw_random/via-rng.c
index d1f5bb534e0e3..6e9df558325be 100644
--- a/drivers/char/hw_random/via-rng.c
+++ b/drivers/char/hw_random/via-rng.c
@@ -162,7 +162,7 @@ static int via_rng_init(struct hwrng *rng)
 	/* Enable secondary noise source on CPUs where it is present. */
 
 	/* Nehemiah stepping 8 and higher */
-	if ((c->x86_model == 9) && (c->x86_mask > 7))
+	if ((c->x86_model == 9) && (c->x86_stepping > 7))
 		lo |= VIA_NOISESRC2;
 
 	/* Esther */
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index 3a2ca0f79daf2..d0c34df0529c8 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -629,7 +629,7 @@ static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
 	if (c->x86_vendor == X86_VENDOR_INTEL) {
 		if ((c->x86 == 15) &&
 		    (c->x86_model == 6) &&
-		    (c->x86_mask == 8)) {
+		    (c->x86_stepping == 8)) {
 			pr_info("Intel(R) Xeon(R) 7100 Errata AL30, processors may lock up on frequency changes: disabling acpi-cpufreq\n");
 			return -ENODEV;
 		    }
diff --git a/drivers/cpufreq/longhaul.c b/drivers/cpufreq/longhaul.c
index c46a12df40dd8..d5e27bc7585a9 100644
--- a/drivers/cpufreq/longhaul.c
+++ b/drivers/cpufreq/longhaul.c
@@ -775,7 +775,7 @@ static int longhaul_cpu_init(struct cpufreq_policy *policy)
 		break;
 
 	case 7:
-		switch (c->x86_mask) {
+		switch (c->x86_stepping) {
 		case 0:
 			longhaul_version = TYPE_LONGHAUL_V1;
 			cpu_model = CPU_SAMUEL2;
@@ -787,7 +787,7 @@ static int longhaul_cpu_init(struct cpufreq_policy *policy)
 			break;
 		case 1 ... 15:
 			longhaul_version = TYPE_LONGHAUL_V2;
-			if (c->x86_mask < 8) {
+			if (c->x86_stepping < 8) {
 				cpu_model = CPU_SAMUEL2;
 				cpuname = "C3 'Samuel 2' [C5B]";
 			} else {
@@ -814,7 +814,7 @@ static int longhaul_cpu_init(struct cpufreq_policy *policy)
 		numscales = 32;
 		memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults));
 		memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr));
-		switch (c->x86_mask) {
+		switch (c->x86_stepping) {
 		case 0 ... 1:
 			cpu_model = CPU_NEHEMIAH;
 			cpuname = "C3 'Nehemiah A' [C5XLOE]";
diff --git a/drivers/cpufreq/p4-clockmod.c b/drivers/cpufreq/p4-clockmod.c
index fd77812313f3e..a25741b1281b4 100644
--- a/drivers/cpufreq/p4-clockmod.c
+++ b/drivers/cpufreq/p4-clockmod.c
@@ -168,7 +168,7 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
 #endif
 
 	/* Errata workaround */
-	cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask;
+	cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_stepping;
 	switch (cpuid) {
 	case 0x0f07:
 	case 0x0f0a:
diff --git a/drivers/cpufreq/powernow-k7.c b/drivers/cpufreq/powernow-k7.c
index 80ac313e6c59c..302e9ce793a01 100644
--- a/drivers/cpufreq/powernow-k7.c
+++ b/drivers/cpufreq/powernow-k7.c
@@ -131,7 +131,7 @@ static int check_powernow(void)
 		return 0;
 	}
 
-	if ((c->x86_model == 6) && (c->x86_mask == 0)) {
+	if ((c->x86_model == 6) && (c->x86_stepping == 0)) {
 		pr_info("K7 660[A0] core detected, enabling errata workarounds\n");
 		have_a0 = 1;
 	}
diff --git a/drivers/cpufreq/speedstep-centrino.c b/drivers/cpufreq/speedstep-centrino.c
index 41bc5397f4bbb..4fa5adf16c701 100644
--- a/drivers/cpufreq/speedstep-centrino.c
+++ b/drivers/cpufreq/speedstep-centrino.c
@@ -37,7 +37,7 @@ struct cpu_id
 {
 	__u8	x86;            /* CPU family */
 	__u8	x86_model;	/* model */
-	__u8	x86_mask;	/* stepping */
+	__u8	x86_stepping;	/* stepping */
 };
 
 enum {
@@ -277,7 +277,7 @@ static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
 {
 	if ((c->x86 == x->x86) &&
 	    (c->x86_model == x->x86_model) &&
-	    (c->x86_mask == x->x86_mask))
+	    (c->x86_stepping == x->x86_stepping))
 		return 1;
 	return 0;
 }
diff --git a/drivers/cpufreq/speedstep-lib.c b/drivers/cpufreq/speedstep-lib.c
index 8085ec9000d19..e3a9962ee4109 100644
--- a/drivers/cpufreq/speedstep-lib.c
+++ b/drivers/cpufreq/speedstep-lib.c
@@ -272,9 +272,9 @@ unsigned int speedstep_detect_processor(void)
 		ebx = cpuid_ebx(0x00000001);
 		ebx &= 0x000000FF;
 
-		pr_debug("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask);
+		pr_debug("ebx value is %x, x86_stepping is %x\n", ebx, c->x86_stepping);
 
-		switch (c->x86_mask) {
+		switch (c->x86_stepping) {
 		case 4:
 			/*
 			 * B-stepping [M-P4-M]
@@ -361,7 +361,7 @@ unsigned int speedstep_detect_processor(void)
 				msr_lo, msr_hi);
 		if ((msr_hi & (1<<18)) &&
 		    (relaxed_check ? 1 : (msr_hi & (3<<24)))) {
-			if (c->x86_mask == 0x01) {
+			if (c->x86_stepping == 0x01) {
 				pr_debug("early PIII version\n");
 				return SPEEDSTEP_CPU_PIII_C_EARLY;
 			} else
diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c
index 4b6642a25df51..1c6cbda56afe9 100644
--- a/drivers/crypto/padlock-aes.c
+++ b/drivers/crypto/padlock-aes.c
@@ -512,7 +512,7 @@ static int __init padlock_init(void)
 
 	printk(KERN_NOTICE PFX "Using VIA PadLock ACE for AES algorithm.\n");
 
-	if (c->x86 == 6 && c->x86_model == 15 && c->x86_mask == 2) {
+	if (c->x86 == 6 && c->x86_model == 15 && c->x86_stepping == 2) {
 		ecb_fetch_blocks = MAX_ECB_FETCH_BLOCKS;
 		cbc_fetch_blocks = MAX_CBC_FETCH_BLOCKS;
 		printk(KERN_NOTICE PFX "VIA Nano stepping 2 detected: enabling workaround.\n");
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 8b16ec595fa72..329cb96f886fd 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -3147,7 +3147,7 @@ static struct amd64_family_type *per_family_init(struct amd64_pvt *pvt)
 	struct amd64_family_type *fam_type = NULL;
 
 	pvt->ext_model  = boot_cpu_data.x86_model >> 4;
-	pvt->stepping	= boot_cpu_data.x86_mask;
+	pvt->stepping	= boot_cpu_data.x86_stepping;
 	pvt->model	= boot_cpu_data.x86_model;
 	pvt->fam	= boot_cpu_data.x86;
 
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index c13a4fd86b3cb..a42744c7665b5 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -268,13 +268,13 @@ static int adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *dev)
 	for (i = 0; i < ARRAY_SIZE(tjmax_model_table); i++) {
 		const struct tjmax_model *tm = &tjmax_model_table[i];
 		if (c->x86_model == tm->model &&
-		    (tm->mask == ANY || c->x86_mask == tm->mask))
+		    (tm->mask == ANY || c->x86_stepping == tm->mask))
 			return tm->tjmax;
 	}
 
 	/* Early chips have no MSR for TjMax */
 
-	if (c->x86_model == 0xf && c->x86_mask < 4)
+	if (c->x86_model == 0xf && c->x86_stepping < 4)
 		usemsr_ee = 0;
 
 	if (c->x86_model > 0xe && usemsr_ee) {
@@ -425,7 +425,7 @@ static int chk_ucode_version(unsigned int cpu)
 	 * Readings might stop update when processor visited too deep sleep,
 	 * fixed for stepping D0 (6EC).
 	 */
-	if (c->x86_model == 0xe && c->x86_mask < 0xc && c->microcode < 0x39) {
+	if (c->x86_model == 0xe && c->x86_stepping < 0xc && c->microcode < 0x39) {
 		pr_err("Errata AE18 not fixed, update BIOS or microcode of the CPU!\n");
 		return -ENODEV;
 	}
diff --git a/drivers/hwmon/hwmon-vid.c b/drivers/hwmon/hwmon-vid.c
index ef91b8a675492..84e91286fc4fd 100644
--- a/drivers/hwmon/hwmon-vid.c
+++ b/drivers/hwmon/hwmon-vid.c
@@ -293,7 +293,7 @@ u8 vid_which_vrm(void)
 	if (c->x86 < 6)		/* Any CPU with family lower than 6 */
 		return 0;	/* doesn't have VID */
 
-	vrm_ret = find_vrm(c->x86, c->x86_model, c->x86_mask, c->x86_vendor);
+	vrm_ret = find_vrm(c->x86, c->x86_model, c->x86_stepping, c->x86_vendor);
 	if (vrm_ret == 134)
 		vrm_ret = get_via_model_d_vrm();
 	if (vrm_ret == 0)
diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c
index 0721e175664ae..b960015cb073d 100644
--- a/drivers/hwmon/k10temp.c
+++ b/drivers/hwmon/k10temp.c
@@ -226,7 +226,7 @@ static bool has_erratum_319(struct pci_dev *pdev)
 	 * and AM3 formats, but that's the best we can do.
 	 */
 	return boot_cpu_data.x86_model < 4 ||
-	       (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask <= 2);
+	       (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_stepping <= 2);
 }
 
 static int k10temp_probe(struct pci_dev *pdev,
diff --git a/drivers/hwmon/k8temp.c b/drivers/hwmon/k8temp.c
index 5a632bcf869bb..e59f9113fb93b 100644
--- a/drivers/hwmon/k8temp.c
+++ b/drivers/hwmon/k8temp.c
@@ -187,7 +187,7 @@ static int k8temp_probe(struct pci_dev *pdev,
 		return -ENOMEM;
 
 	model = boot_cpu_data.x86_model;
-	stepping = boot_cpu_data.x86_mask;
+	stepping = boot_cpu_data.x86_stepping;
 
 	/* feature available since SH-C0, exclude older revisions */
 	if ((model == 4 && stepping == 0) ||
diff --git a/drivers/video/fbdev/geode/video_gx.c b/drivers/video/fbdev/geode/video_gx.c
index 6082f653c68a4..67773e8bbb954 100644
--- a/drivers/video/fbdev/geode/video_gx.c
+++ b/drivers/video/fbdev/geode/video_gx.c
@@ -127,7 +127,7 @@ void gx_set_dclk_frequency(struct fb_info *info)
 	int timeout = 1000;
 
 	/* Rev. 1 Geode GXs use a 14 MHz reference clock instead of 48 MHz. */
-	if (cpu_data(0).x86_mask == 1) {
+	if (cpu_data(0).x86_stepping == 1) {
 		pll_table = gx_pll_table_14MHz;
 		pll_table_len = ARRAY_SIZE(gx_pll_table_14MHz);
 	} else {

From 9de29eac8d2189424d81c0d840cd0469aa3d41c8 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 14 Feb 2018 10:14:17 +0300
Subject: [PATCH 618/676] x86/spectre: Fix an error message

If i == ARRAY_SIZE(mitigation_options) then we accidentally print
garbage from one space beyond the end of the mitigation_options[] array.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: David Woodhouse <dwmw@amazon.co.uk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: KarimAllah Ahmed <karahmed@amazon.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-janitors@vger.kernel.org
Fixes: 9005c6834c0f ("x86/spectre: Simplify spectre_v2 command line parsing")
Link: http://lkml.kernel.org/r/20180214071416.GA26677@mwanda
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/bugs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 4acf16a76d1e4..d71c8b54b696d 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -174,7 +174,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
 		}
 
 		if (i >= ARRAY_SIZE(mitigation_options)) {
-			pr_err("unknown option (%s). Switching to AUTO select\n", mitigation_options[i].option);
+			pr_err("unknown option (%s). Switching to AUTO select\n", arg);
 			return SPECTRE_V2_CMD_AUTO;
 		}
 	}

From 24dbc6000f4b9b0ef5a9daecb161f1907733765a Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
Date: Tue, 13 Feb 2018 13:22:08 -0600
Subject: [PATCH 619/676] x86/cpu: Change type of x86_cache_size variable to
 unsigned int

Currently, x86_cache_size is of type int, which makes no sense as we
will never have a valid cache size equal or less than 0. So instead of
initializing this variable to -1, it can perfectly be initialized to 0
and use it as an unsigned variable instead.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Addresses-Coverity-ID: 1464429
Link: http://lkml.kernel.org/r/20180213192208.GA26414@embeddedor.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h      | 2 +-
 arch/x86/kernel/cpu/common.c          | 2 +-
 arch/x86/kernel/cpu/microcode/intel.c | 2 +-
 arch/x86/kernel/cpu/proc.c            | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b7c8583328c77..44c2c4ec6d60e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -109,7 +109,7 @@ struct cpuinfo_x86 {
 	char			x86_vendor_id[16];
 	char			x86_model_id[64];
 	/* in KB - valid for CPUS which support this call: */
-	int			x86_cache_size;
+	unsigned int		x86_cache_size;
 	int			x86_cache_alignment;	/* In bytes */
 	/* Cache QoS architectural values: */
 	int			x86_cache_max_rmid;	/* max index */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a7d8df641a4c0..824aee0117bb5 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1184,7 +1184,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 	int i;
 
 	c->loops_per_jiffy = loops_per_jiffy;
-	c->x86_cache_size = -1;
+	c->x86_cache_size = 0;
 	c->x86_vendor = X86_VENDOR_UNKNOWN;
 	c->x86_model = c->x86_stepping = 0;	/* So far unknown... */
 	c->x86_vendor_id[0] = '\0'; /* Unset */
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index b94279bb5c046..a15db2b4e0d66 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -982,7 +982,7 @@ static struct microcode_ops microcode_intel_ops = {
 
 static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c)
 {
-	u64 llc_size = c->x86_cache_size * 1024;
+	u64 llc_size = c->x86_cache_size * 1024ULL;
 
 	do_div(llc_size, c->x86_max_cores);
 
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index ee4cc388e8d30..2c8522a39ed5d 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -91,8 +91,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 	}
 
 	/* Cache size */
-	if (c->x86_cache_size >= 0)
-		seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
+	if (c->x86_cache_size)
+		seq_printf(m, "cache size\t: %u KB\n", c->x86_cache_size);
 
 	show_cpuinfo_core(m, c, cpu);
 	show_cpuinfo_misc(m, c);

From e48657573481a5dff7cfdc3d57005c80aa816500 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 14 Feb 2018 08:39:11 +0100
Subject: [PATCH 620/676] x86/entry/64: Fix CR3 restore in paranoid_exit()

Josh Poimboeuf noticed the following bug:

 "The paranoid exit code only restores the saved CR3 when it switches back
  to the user GS.  However, even in the kernel GS case, it's possible that
  it needs to restore a user CR3, if for example, the paranoid exception
  occurred in the syscall exit path between SWITCH_TO_USER_CR3_STACK and
  SWAPGS."

Josh also confirmed via targeted testing that it's possible to hit this bug.

Fix the bug by also restoring CR3 in the paranoid_exit_no_swapgs branch.

The reason we haven't seen this bug reported by users yet is probably because
"paranoid" entry points are limited to the following cases:

 idtentry double_fault       do_double_fault  has_error_code=1  paranoid=2
 idtentry debug              do_debug         has_error_code=0  paranoid=1 shift_ist=DEBUG_STACK
 idtentry int3               do_int3          has_error_code=0  paranoid=1 shift_ist=DEBUG_STACK
 idtentry machine_check      do_mce           has_error_code=0  paranoid=1

Amongst those entry points only machine_check is one that will interrupt an
IRQS-off critical section asynchronously - and machine check events are rare.

The other main asynchronous entries are NMI entries, which can be very high-freq
with perf profiling, but they are special: they don't use the 'idtentry' macro but
are open coded and restore user CR3 unconditionally so don't have this bug.

Reported-and-tested-by: Josh Poimboeuf <jpoimboe@redhat.com>
Reviewed-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20180214073910.boevmg65upbk3vqb@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_64.S | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1c54204207d8d..4fd9044e72e78 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1168,6 +1168,7 @@ ENTRY(paranoid_exit)
 	jmp	.Lparanoid_exit_restore
 .Lparanoid_exit_no_swapgs:
 	TRACE_IRQS_IRETQ_DEBUG
+	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
 .Lparanoid_exit_restore:
 	jmp restore_regs_and_return_to_kernel
 END(paranoid_exit)

From 6e1d8ea90932f77843730ada0bfea63093b7212a Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Wed, 14 Feb 2018 14:55:24 +0300
Subject: [PATCH 621/676] platform/x86: wmi: fix off-by-one write in
 wmi_dev_probe()

wmi_dev_probe() allocates one byte less than necessary, thus
subsequent sprintf() call writes trailing zero past the end
of the 'buf':

    BUG: KASAN: slab-out-of-bounds in vsnprintf+0xda4/0x1240
    Write of size 1 at addr ffff880423529caf by task kworker/1:1/32

    Call Trace:
     dump_stack+0xb3/0x14d
     print_address_description+0xd7/0x380
     kasan_report+0x166/0x2b0
     vsnprintf+0xda4/0x1240
     sprintf+0x9b/0xd0
     wmi_dev_probe+0x1c3/0x400
     driver_probe_device+0x5d1/0x990
     bus_for_each_drv+0x109/0x190
     __device_attach+0x217/0x360
     bus_probe_device+0x1ad/0x260
     deferred_probe_work_func+0x10f/0x5d0
     process_one_work+0xa8b/0x1dc0
     worker_thread+0x20d/0x17d0
     kthread+0x311/0x3d0
     ret_from_fork+0x3a/0x50

    Allocated by task 32:
     kasan_kmalloc+0xa0/0xd0
     __kmalloc+0x14f/0x3e0
     wmi_dev_probe+0x182/0x400
     driver_probe_device+0x5d1/0x990
     bus_for_each_drv+0x109/0x190
     __device_attach+0x217/0x360
     bus_probe_device+0x1ad/0x260
     deferred_probe_work_func+0x10f/0x5d0
     process_one_work+0xa8b/0x1dc0
     worker_thread+0x20d/0x17d0
     kthread+0x311/0x3d0
     ret_from_fork+0x3a/0x50

Increment allocation size to fix this.

Fixes: 44b6b7661132 ("platform/x86: wmi: create userspace interface for drivers")
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/platform/x86/wmi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index daa68acbc9003..c0c8945603cbb 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -933,7 +933,7 @@ static int wmi_dev_probe(struct device *dev)
 			goto probe_failure;
 		}
 
-		buf = kmalloc(strlen(wdriver->driver.name) + 4, GFP_KERNEL);
+		buf = kmalloc(strlen(wdriver->driver.name) + 5, GFP_KERNEL);
 		if (!buf) {
 			ret = -ENOMEM;
 			goto probe_string_failure;

From ed5b9ba7bef7f277cbdf315e385b44e0e3b1a9ab Mon Sep 17 00:00:00 2001
From: Aaron Ma <aaron.ma@canonical.com>
Date: Sun, 11 Feb 2018 17:18:49 +0800
Subject: [PATCH 622/676] platform/x86: ideapad-laptop: Increase timeout to
 wait for EC answer

Lenovo E41-20 needs more time than 100ms to read VPC,
the funtion keys always failed responding.
Increase timeout to get the value from VPC, then
the funtion keys like mic mute key work well.

Signed-off-by: Aaron Ma <aaron.ma@canonical.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/platform/x86/ideapad-laptop.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/platform/x86/ideapad-laptop.c b/drivers/platform/x86/ideapad-laptop.c
index 5b6f18b188012..535199c9e6bc6 100644
--- a/drivers/platform/x86/ideapad-laptop.c
+++ b/drivers/platform/x86/ideapad-laptop.c
@@ -113,7 +113,7 @@ MODULE_PARM_DESC(no_bt_rfkill, "No rfkill for bluetooth.");
 /*
  * ACPI Helpers
  */
-#define IDEAPAD_EC_TIMEOUT (100) /* in ms */
+#define IDEAPAD_EC_TIMEOUT (200) /* in ms */
 
 static int read_method_int(acpi_handle handle, const char *method, int *val)
 {

From eca39e7f0cdb9bde4003a29149fa695e876c6f73 Mon Sep 17 00:00:00 2001
From: Laszlo Toth <laszlth@gmail.com>
Date: Tue, 13 Feb 2018 21:43:43 +0100
Subject: [PATCH 623/676] platform/x86: dell-laptop: fix kbd_get_state's
 request value
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 9862b43624a5 ("platform/x86: dell-laptop: Allocate buffer on heap
rather than globally")
broke one request, changed it back to the original value.

Tested on a Dell E6540, backlight came back.

Fixes: 9862b43624a5 ("platform/x86: dell-laptop: Allocate buffer on heap rather than globally")
Signed-off-by: Laszlo Toth <laszlth@gmail.com>
Reviewed-by: Pali Rohár <pali.rohar@gmail.com>
Reviewed-by: Mario Limonciello <mario.limonciello@dell.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/platform/x86/dell-laptop.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/platform/x86/dell-laptop.c b/drivers/platform/x86/dell-laptop.c
index 2a68f59d2228c..a37cff9fd8d4f 100644
--- a/drivers/platform/x86/dell-laptop.c
+++ b/drivers/platform/x86/dell-laptop.c
@@ -1279,7 +1279,7 @@ static int kbd_get_state(struct kbd_state *state)
 	struct calling_interface_buffer buffer;
 	int ret;
 
-	dell_fill_request(&buffer, 0, 0, 0, 0);
+	dell_fill_request(&buffer, 0x1, 0, 0, 0);
 	ret = dell_send_request(&buffer,
 				CLASS_KBD_BACKLIGHT, SELECT_KBD_BACKLIGHT);
 	if (ret)

From c8ba9db2a790c0fcf2f6c4cafd45ff3a0751800e Mon Sep 17 00:00:00 2001
From: Alexander Abrosimov <alexander.n.abrosimov@gmail.com>
Date: Thu, 8 Feb 2018 01:12:26 +0300
Subject: [PATCH 624/676] platform/x86: dell-laptop: Removed duplicates in DMI
 whitelist
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixed a mistake in which several entries were duplicated in the DMI list
from the below commit
fe486138 platform/x86: dell-laptop: Add 2-in-1 devices to the DMI whitelist

Signed-off-by: Alexander Abrosimov <alexander.n.abrosimov@gmail.com>
Reviewed-by: Pali Rohár <pali.rohar@gmail.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/platform/x86/dell-laptop.c | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/drivers/platform/x86/dell-laptop.c b/drivers/platform/x86/dell-laptop.c
index a37cff9fd8d4f..c52c6723374b5 100644
--- a/drivers/platform/x86/dell-laptop.c
+++ b/drivers/platform/x86/dell-laptop.c
@@ -126,24 +126,6 @@ static const struct dmi_system_id dell_device_table[] __initconst = {
 			DMI_MATCH(DMI_CHASSIS_TYPE, "32"), /*Detachable*/
 		},
 	},
-	{
-		.matches = {
-			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-			DMI_MATCH(DMI_CHASSIS_TYPE, "30"), /*Tablet*/
-		},
-	},
-	{
-		.matches = {
-			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-			DMI_MATCH(DMI_CHASSIS_TYPE, "31"), /*Convertible*/
-		},
-	},
-	{
-		.matches = {
-			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-			DMI_MATCH(DMI_CHASSIS_TYPE, "32"), /*Detachable*/
-		},
-	},
 	{
 		.ident = "Dell Computer Corporation",
 		.matches = {

From 0b7c1528fb741803396da68a9d8d285ff7db731c Mon Sep 17 00:00:00 2001
From: William Cohen <wcohen@redhat.com>
Date: Tue, 30 Jan 2018 22:28:13 -0500
Subject: [PATCH 625/676] perf vendor events aarch64: Add JSON metrics for ARM
 Cortex-A53 Processor

Add JSON metrics for ARM Cortex-A53 Processor.

Unlike the Intel processors there isn't a script that automatically
generated these files. The patch was manually generated from the
documentation and the previous oprofile ARM Cortex ac53 event file patch
I made.

The relevant documentation is in the "12.9 Events" section of the ARM
Cortex A53 MPCore Processor Revision: r0p4 Technical Reference Manual.

The ARM Cortex A53 manual is available at:

  http://infocenter.arm.com/help/topic/com.arm.doc.ddi0500g/DDI0500G_cortex_a53_trm.pdf

Use that to look for additional information about the events.

Signed-off-by: William Cohen <wcohen@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20180131032813.9564-1-wcohen@redhat.com
[ Added references provided by William Cohen ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 .../arch/arm64/cortex-a53/branch.json         | 27 ++++++++++
 .../pmu-events/arch/arm64/cortex-a53/bus.json | 22 ++++++++
 .../arch/arm64/cortex-a53/cache.json          | 27 ++++++++++
 .../arch/arm64/cortex-a53/memory.json         | 22 ++++++++
 .../arch/arm64/cortex-a53/other.json          | 32 ++++++++++++
 .../arch/arm64/cortex-a53/pipeline.json       | 52 +++++++++++++++++++
 tools/perf/pmu-events/arch/arm64/mapfile.csv  |  1 +
 7 files changed, 183 insertions(+)
 create mode 100644 tools/perf/pmu-events/arch/arm64/cortex-a53/branch.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/cortex-a53/bus.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/cortex-a53/cache.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/cortex-a53/memory.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/cortex-a53/other.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/cortex-a53/pipeline.json

diff --git a/tools/perf/pmu-events/arch/arm64/cortex-a53/branch.json b/tools/perf/pmu-events/arch/arm64/cortex-a53/branch.json
new file mode 100644
index 0000000000000..3b6208763e50c
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/cortex-a53/branch.json
@@ -0,0 +1,27 @@
+[
+  {,
+    "EventCode": "0x7A",
+    "EventName": "BR_INDIRECT_SPEC",
+    "BriefDescription": "Branch speculatively executed - Indirect branch"
+  },
+  {,
+    "EventCode": "0xC9",
+    "EventName": "BR_COND",
+    "BriefDescription": "Conditional branch executed"
+  },
+  {,
+    "EventCode": "0xCA",
+    "EventName": "BR_INDIRECT_MISPRED",
+    "BriefDescription": "Indirect branch mispredicted"
+  },
+  {,
+    "EventCode": "0xCB",
+    "EventName": "BR_INDIRECT_MISPRED_ADDR",
+    "BriefDescription": "Indirect branch mispredicted because of address miscompare"
+  },
+  {,
+    "EventCode": "0xCC",
+    "EventName": "BR_COND_MISPRED",
+    "BriefDescription": "Conditional branch mispredicted"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/cortex-a53/bus.json b/tools/perf/pmu-events/arch/arm64/cortex-a53/bus.json
new file mode 100644
index 0000000000000..480d9f7460ab2
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/cortex-a53/bus.json
@@ -0,0 +1,22 @@
+[
+  {,
+    "EventCode": "0x60",
+    "EventName": "BUS_ACCESS_LD",
+    "BriefDescription": "Bus access - Read"
+  },
+  {,
+    "EventCode": "0x61",
+    "EventName": "BUS_ACCESS_ST",
+    "BriefDescription": "Bus access - Write"
+  },
+  {,
+    "EventCode": "0xC0",
+    "EventName": "EXT_MEM_REQ",
+    "BriefDescription": "External memory request"
+  },
+  {,
+    "EventCode": "0xC1",
+    "EventName": "EXT_MEM_REQ_NC",
+    "BriefDescription": "Non-cacheable external memory request"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/cortex-a53/cache.json b/tools/perf/pmu-events/arch/arm64/cortex-a53/cache.json
new file mode 100644
index 0000000000000..11baad6344b9d
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/cortex-a53/cache.json
@@ -0,0 +1,27 @@
+[
+  {,
+    "EventCode": "0xC2",
+    "EventName": "PREFETCH_LINEFILL",
+    "BriefDescription": "Linefill because of prefetch"
+  },
+  {,
+    "EventCode": "0xC3",
+    "EventName": "PREFETCH_LINEFILL_DROP",
+    "BriefDescription": "Instruction Cache Throttle occurred"
+  },
+  {,
+    "EventCode": "0xC4",
+    "EventName": "READ_ALLOC_ENTER",
+    "BriefDescription": "Entering read allocate mode"
+  },
+  {,
+    "EventCode": "0xC5",
+    "EventName": "READ_ALLOC",
+    "BriefDescription": "Read allocate mode"
+  },
+  {,
+    "EventCode": "0xC8",
+    "EventName": "EXT_SNOOP",
+    "BriefDescription": "SCU Snooped data from another CPU for this CPU"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/cortex-a53/memory.json b/tools/perf/pmu-events/arch/arm64/cortex-a53/memory.json
new file mode 100644
index 0000000000000..480d9f7460ab2
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/cortex-a53/memory.json
@@ -0,0 +1,22 @@
+[
+  {,
+    "EventCode": "0x60",
+    "EventName": "BUS_ACCESS_LD",
+    "BriefDescription": "Bus access - Read"
+  },
+  {,
+    "EventCode": "0x61",
+    "EventName": "BUS_ACCESS_ST",
+    "BriefDescription": "Bus access - Write"
+  },
+  {,
+    "EventCode": "0xC0",
+    "EventName": "EXT_MEM_REQ",
+    "BriefDescription": "External memory request"
+  },
+  {,
+    "EventCode": "0xC1",
+    "EventName": "EXT_MEM_REQ_NC",
+    "BriefDescription": "Non-cacheable external memory request"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/cortex-a53/other.json b/tools/perf/pmu-events/arch/arm64/cortex-a53/other.json
new file mode 100644
index 0000000000000..73a22402d0033
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/cortex-a53/other.json
@@ -0,0 +1,32 @@
+[
+  {,
+    "EventCode": "0x86",
+    "EventName": "EXC_IRQ",
+    "BriefDescription": "Exception taken, IRQ"
+  },
+  {,
+    "EventCode": "0x87",
+    "EventName": "EXC_FIQ",
+    "BriefDescription": "Exception taken, FIQ"
+  },
+  {,
+    "EventCode": "0xC6",
+    "EventName": "PRE_DECODE_ERR",
+    "BriefDescription": "Pre-decode error"
+  },
+  {,
+    "EventCode": "0xD0",
+    "EventName": "L1I_CACHE_ERR",
+    "BriefDescription": "L1 Instruction Cache (data or tag) memory error"
+  },
+  {,
+    "EventCode": "0xD1",
+    "EventName": "L1D_CACHE_ERR",
+    "BriefDescription": "L1 Data Cache (data, tag or dirty) memory error, correctable or non-correctable"
+  },
+  {,
+    "EventCode": "0xD2",
+    "EventName": "TLB_ERR",
+    "BriefDescription": "TLB memory error"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/cortex-a53/pipeline.json b/tools/perf/pmu-events/arch/arm64/cortex-a53/pipeline.json
new file mode 100644
index 0000000000000..3149fb90555a3
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/cortex-a53/pipeline.json
@@ -0,0 +1,52 @@
+[
+  {,
+    "EventCode": "0xC7",
+    "EventName": "STALL_SB_FULL",
+    "BriefDescription": "Data Write operation that stalls the pipeline because the store buffer is full"
+  },
+  {,
+    "EventCode": "0xE0",
+    "EventName": "OTHER_IQ_DEP_STALL",
+    "BriefDescription": "Cycles that the DPU IQ is empty and that is not because of a recent micro-TLB miss, instruction cache miss or pre-decode error"
+  },
+  {,
+    "EventCode": "0xE1",
+    "EventName": "IC_DEP_STALL",
+    "BriefDescription": "Cycles the DPU IQ is empty and there is an instruction cache miss being processed"
+  },
+  {,
+    "EventCode": "0xE2",
+    "EventName": "IUTLB_DEP_STALL",
+    "BriefDescription": "Cycles the DPU IQ is empty and there is an instruction micro-TLB miss being processed"
+  },
+  {,
+    "EventCode": "0xE3",
+    "EventName": "DECODE_DEP_STALL",
+    "BriefDescription": "Cycles the DPU IQ is empty and there is a pre-decode error being processed"
+  },
+  {,
+    "EventCode": "0xE4",
+    "EventName": "OTHER_INTERLOCK_STALL",
+    "BriefDescription": "Cycles there is an interlock other than  Advanced SIMD/Floating-point instructions or load/store instruction"
+  },
+  {,
+    "EventCode": "0xE5",
+    "EventName": "AGU_DEP_STALL",
+    "BriefDescription": "Cycles there is an interlock for a load/store instruction waiting for data to calculate the address in the AGU"
+  },
+  {,
+    "EventCode": "0xE6",
+    "EventName": "SIMD_DEP_STALL",
+    "BriefDescription": "Cycles there is an interlock for an Advanced SIMD/Floating-point operation."
+  },
+  {,
+    "EventCode": "0xE7",
+    "EventName": "LD_DEP_STALL",
+    "BriefDescription": "Cycles there is a stall in the Wr stage because of a load miss"
+  },
+  {,
+    "EventCode": "0xE8",
+    "EventName": "ST_DEP_STALL",
+    "BriefDescription": "Cycles there is a stall in the Wr stage because of a store"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/mapfile.csv b/tools/perf/pmu-events/arch/arm64/mapfile.csv
index 219d6756134ee..e61c9ca6cf9e6 100644
--- a/tools/perf/pmu-events/arch/arm64/mapfile.csv
+++ b/tools/perf/pmu-events/arch/arm64/mapfile.csv
@@ -13,3 +13,4 @@
 #
 #Family-model,Version,Filename,EventType
 0x00000000420f5160,v1,cavium,core
+0x00000000410fd03[[:xdigit:]],v1,cortex-a53,core

From 6888ff66c44ffa3077ed69e978902d0ff4b84ae1 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Thu, 18 Jan 2018 13:26:16 -0800
Subject: [PATCH 626/676] perf evlist: Remove stale mmap read for backward

perf_evlist__mmap_read_catchup() and perf_evlist__mmap_read_backward()
are only for overwrite mode.

But they read the evlist->mmap buffer which is for non-overwrite mode.

It did not bring any serious problem yet, because there is no one use
it.

Remove the unused interfaces.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Wang Nan <wangnan0@huawei.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1516310792-208685-2-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evlist.c | 17 -----------------
 tools/perf/util/evlist.h |  4 ----
 2 files changed, 21 deletions(-)

diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index ac35cd214feb2..e5fc14e53c051 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -715,28 +715,11 @@ union perf_event *perf_evlist__mmap_read_forward(struct perf_evlist *evlist, int
 	return perf_mmap__read_forward(md);
 }
 
-union perf_event *perf_evlist__mmap_read_backward(struct perf_evlist *evlist, int idx)
-{
-	struct perf_mmap *md = &evlist->mmap[idx];
-
-	/*
-	 * No need to check messup for backward ring buffer:
-	 * We can always read arbitrary long data from a backward
-	 * ring buffer unless we forget to pause it before reading.
-	 */
-	return perf_mmap__read_backward(md);
-}
-
 union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx)
 {
 	return perf_evlist__mmap_read_forward(evlist, idx);
 }
 
-void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx)
-{
-	perf_mmap__read_catchup(&evlist->mmap[idx]);
-}
-
 void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx)
 {
 	perf_mmap__consume(&evlist->mmap[idx], false);
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 75f8e0ad5d765..336b838e6957e 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -133,10 +133,6 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx);
 
 union perf_event *perf_evlist__mmap_read_forward(struct perf_evlist *evlist,
 						 int idx);
-union perf_event *perf_evlist__mmap_read_backward(struct perf_evlist *evlist,
-						  int idx);
-void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx);
-
 void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx);
 
 int perf_evlist__open(struct perf_evlist *evlist);

From dc6c35c679e96987dc83a003f30bc2cc33c84c00 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Thu, 18 Jan 2018 13:26:17 -0800
Subject: [PATCH 627/676] perf mmap: Recalculate size for overwrite mode

In perf_mmap__push(), the 'size' need to be recalculated, otherwise the
invalid data might be pushed to the record in overwrite mode.

The issue is introduced by commit 7fb4b407a124 ("perf mmap: Don't
discard prev in backward mode").

When the ring buffer is full in overwrite mode, backward_rb_find_range()
will be called to recalculate the 'start' and 'end'. The 'size' needs to
be recalculated accordingly.

Unconditionally recalculate the 'size', not just for full ring buffer in
overwrite mode. Because:

- There is no harmful to recalculate the 'size' for other cases.
- The code of calculating 'start' and 'end' will be factored out later.
  The new function does not need to return 'size'.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Fixes: 7fb4b407a124 ("perf mmap: Don't discard prev in backward mode")
Link: http://lkml.kernel.org/r/1516310792-208685-3-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/mmap.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index 05076e6839382..97cf4fab564b4 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -302,6 +302,8 @@ int perf_mmap__push(struct perf_mmap *md, bool overwrite,
 			return -1;
 	}
 
+	size = end - start;
+
 	if ((start & md->mask) + size != (end & md->mask)) {
 		buf = &data[start & md->mask];
 		size = md->mask + 1 - (start & md->mask);

From f92c8cbe597a5a2ccec702dff824f3fe0f3623eb Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Thu, 18 Jan 2018 13:26:18 -0800
Subject: [PATCH 628/676] perf mmap: Cleanup perf_mmap__push()

The first assignment for 'start' and 'end' is redundant.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1516310792-208685-4-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/mmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index 97cf4fab564b4..fbbbe87f03082 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -272,7 +272,7 @@ int perf_mmap__push(struct perf_mmap *md, bool overwrite,
 {
 	u64 head = perf_mmap__read_head(md);
 	u64 old = md->prev;
-	u64 end = head, start = old;
+	u64 end, start;
 	unsigned char *data = md->base + page_size;
 	unsigned long size;
 	void *buf;

From 8872481bd04850b19e053dc579de5a11b83b16fc Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Thu, 18 Jan 2018 13:26:19 -0800
Subject: [PATCH 629/676] perf mmap: Introduce perf_mmap__read_init()

The new function perf_mmap__read_init() is factored out from
perf_mmap__push().

It is to calculate the 'start' and 'end' of the available data in
ringbuffer.

No functional change.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1516310792-208685-5-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/mmap.c | 37 +++++++++++++++++++++++++++----------
 tools/perf/util/mmap.h |  2 ++
 2 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index fbbbe87f03082..c19a4e640e8e8 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -267,24 +267,24 @@ static int overwrite_rb_find_range(void *buf, int mask, u64 head, u64 *start, u6
 	return -1;
 }
 
-int perf_mmap__push(struct perf_mmap *md, bool overwrite,
-		    void *to, int push(void *to, void *buf, size_t size))
+/*
+ * Report the start and end of the available data in ringbuffer
+ */
+int perf_mmap__read_init(struct perf_mmap *md, bool overwrite,
+			 u64 *startp, u64 *endp)
 {
 	u64 head = perf_mmap__read_head(md);
 	u64 old = md->prev;
-	u64 end, start;
 	unsigned char *data = md->base + page_size;
 	unsigned long size;
-	void *buf;
-	int rc = 0;
 
-	start = overwrite ? head : old;
-	end = overwrite ? old : head;
+	*startp = overwrite ? head : old;
+	*endp = overwrite ? old : head;
 
-	if (start == end)
+	if (*startp == *endp)
 		return 0;
 
-	size = end - start;
+	size = *endp - *startp;
 	if (size > (unsigned long)(md->mask) + 1) {
 		if (!overwrite) {
 			WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n");
@@ -298,10 +298,27 @@ int perf_mmap__push(struct perf_mmap *md, bool overwrite,
 		 * Backward ring buffer is full. We still have a chance to read
 		 * most of data from it.
 		 */
-		if (overwrite_rb_find_range(data, md->mask, head, &start, &end))
+		if (overwrite_rb_find_range(data, md->mask, head, startp, endp))
 			return -1;
 	}
 
+	return 1;
+}
+
+int perf_mmap__push(struct perf_mmap *md, bool overwrite,
+		    void *to, int push(void *to, void *buf, size_t size))
+{
+	u64 head = perf_mmap__read_head(md);
+	u64 end, start;
+	unsigned char *data = md->base + page_size;
+	unsigned long size;
+	void *buf;
+	int rc = 0;
+
+	rc = perf_mmap__read_init(md, overwrite, &start, &end);
+	if (rc < 1)
+		return rc;
+
 	size = end - start;
 
 	if ((start & md->mask) + size != (end & md->mask)) {
diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
index e43d7b55a55f6..9ab2b48df65be 100644
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -94,4 +94,6 @@ int perf_mmap__push(struct perf_mmap *md, bool backward,
 
 size_t perf_mmap__mmap_len(struct perf_mmap *map);
 
+int perf_mmap__read_init(struct perf_mmap *md, bool overwrite,
+			 u64 *startp, u64 *endp);
 #endif /*__PERF_MMAP_H */

From 189f2cc91f9f2efef5d5f4dde43684c01b5f6f2f Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Thu, 18 Jan 2018 13:26:20 -0800
Subject: [PATCH 630/676] perf mmap: Add new return value logic for
 perf_mmap__read_init()

Improve the readability by using meaningful enum (-EAGAIN, -EINVAL and
0) to replace the three returning states (0, -1 and 1).

Suggested-by: Wang Nan <wangnan0@huawei.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1516310792-208685-6-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/mmap.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index c19a4e640e8e8..38fa69dc635e6 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -282,7 +282,7 @@ int perf_mmap__read_init(struct perf_mmap *md, bool overwrite,
 	*endp = overwrite ? old : head;
 
 	if (*startp == *endp)
-		return 0;
+		return -EAGAIN;
 
 	size = *endp - *startp;
 	if (size > (unsigned long)(md->mask) + 1) {
@@ -291,7 +291,7 @@ int perf_mmap__read_init(struct perf_mmap *md, bool overwrite,
 
 			md->prev = head;
 			perf_mmap__consume(md, overwrite);
-			return 0;
+			return -EAGAIN;
 		}
 
 		/*
@@ -299,10 +299,10 @@ int perf_mmap__read_init(struct perf_mmap *md, bool overwrite,
 		 * most of data from it.
 		 */
 		if (overwrite_rb_find_range(data, md->mask, head, startp, endp))
-			return -1;
+			return -EINVAL;
 	}
 
-	return 1;
+	return 0;
 }
 
 int perf_mmap__push(struct perf_mmap *md, bool overwrite,
@@ -316,8 +316,8 @@ int perf_mmap__push(struct perf_mmap *md, bool overwrite,
 	int rc = 0;
 
 	rc = perf_mmap__read_init(md, overwrite, &start, &end);
-	if (rc < 1)
-		return rc;
+	if (rc < 0)
+		return (rc == -EAGAIN) ? 0 : -1;
 
 	size = end - start;
 

From b4b036b4c76341a5034e872aca3727c4988a7304 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Thu, 18 Jan 2018 13:26:21 -0800
Subject: [PATCH 631/676] perf mmap: Discard 'prev' in perf_mmap__read()

The 'start' and 'prev' variables are duplicates in perf_mmap__read().

Use 'map->prev' to replace 'start' in perf_mmap__read_*().

Suggested-by: Wang Nan <wangnan0@huawei.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1516310792-208685-7-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/mmap.c | 28 ++++++++++------------------
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index 38fa69dc635e6..125bfda9d0375 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -22,29 +22,27 @@ size_t perf_mmap__mmap_len(struct perf_mmap *map)
 
 /* When check_messup is true, 'end' must points to a good entry */
 static union perf_event *perf_mmap__read(struct perf_mmap *map,
-					 u64 start, u64 end, u64 *prev)
+					 u64 *startp, u64 end)
 {
 	unsigned char *data = map->base + page_size;
 	union perf_event *event = NULL;
-	int diff = end - start;
+	int diff = end - *startp;
 
 	if (diff >= (int)sizeof(event->header)) {
 		size_t size;
 
-		event = (union perf_event *)&data[start & map->mask];
+		event = (union perf_event *)&data[*startp & map->mask];
 		size = event->header.size;
 
-		if (size < sizeof(event->header) || diff < (int)size) {
-			event = NULL;
-			goto broken_event;
-		}
+		if (size < sizeof(event->header) || diff < (int)size)
+			return NULL;
 
 		/*
 		 * Event straddles the mmap boundary -- header should always
 		 * be inside due to u64 alignment of output.
 		 */
-		if ((start & map->mask) + size != ((start + size) & map->mask)) {
-			unsigned int offset = start;
+		if ((*startp & map->mask) + size != ((*startp + size) & map->mask)) {
+			unsigned int offset = *startp;
 			unsigned int len = min(sizeof(*event), size), cpy;
 			void *dst = map->event_copy;
 
@@ -59,20 +57,15 @@ static union perf_event *perf_mmap__read(struct perf_mmap *map,
 			event = (union perf_event *)map->event_copy;
 		}
 
-		start += size;
+		*startp += size;
 	}
 
-broken_event:
-	if (prev)
-		*prev = start;
-
 	return event;
 }
 
 union perf_event *perf_mmap__read_forward(struct perf_mmap *map)
 {
 	u64 head;
-	u64 old = map->prev;
 
 	/*
 	 * Check if event was unmapped due to a POLLHUP/POLLERR.
@@ -82,13 +75,12 @@ union perf_event *perf_mmap__read_forward(struct perf_mmap *map)
 
 	head = perf_mmap__read_head(map);
 
-	return perf_mmap__read(map, old, head, &map->prev);
+	return perf_mmap__read(map, &map->prev, head);
 }
 
 union perf_event *perf_mmap__read_backward(struct perf_mmap *map)
 {
 	u64 head, end;
-	u64 start = map->prev;
 
 	/*
 	 * Check if event was unmapped due to a POLLHUP/POLLERR.
@@ -118,7 +110,7 @@ union perf_event *perf_mmap__read_backward(struct perf_mmap *map)
 	else
 		end = head + map->mask + 1;
 
-	return perf_mmap__read(map, start, end, &map->prev);
+	return perf_mmap__read(map, &map->prev, end);
 }
 
 void perf_mmap__read_catchup(struct perf_mmap *map)

From ee023de05f35484691f7d9e5c1f92195ac4d64d2 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Thu, 18 Jan 2018 13:26:22 -0800
Subject: [PATCH 632/676] perf mmap: Introduce perf_mmap__read_done()

The direction of overwrite mode is backward. The last perf_mmap__read()
will set tail to map->prev. Need to correct the map->prev to head which
is the end of next read.

It will be used later.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1516310792-208685-8-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/mmap.c | 11 +++++++++++
 tools/perf/util/mmap.h |  1 +
 2 files changed, 12 insertions(+)

diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index 125bfda9d0375..4f59eaefc706b 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -338,3 +338,14 @@ int perf_mmap__push(struct perf_mmap *md, bool overwrite,
 out:
 	return rc;
 }
+
+/*
+ * Mandatory for overwrite mode
+ * The direction of overwrite mode is backward.
+ * The last perf_mmap__read() will set tail to map->prev.
+ * Need to correct the map->prev to head which is the end of next read.
+ */
+void perf_mmap__read_done(struct perf_mmap *map)
+{
+	map->prev = perf_mmap__read_head(map);
+}
diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
index 9ab2b48df65be..95549d4af9430 100644
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -96,4 +96,5 @@ size_t perf_mmap__mmap_len(struct perf_mmap *map);
 
 int perf_mmap__read_init(struct perf_mmap *md, bool overwrite,
 			 u64 *startp, u64 *endp);
+void perf_mmap__read_done(struct perf_mmap *map);
 #endif /*__PERF_MMAP_H */

From 7bb45972952db9298fe5cc440160dcad1a66bfbc Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Thu, 18 Jan 2018 13:26:23 -0800
Subject: [PATCH 633/676] perf mmap: Introduce perf_mmap__read_event()

Except for 'perf record', the other perf tools read events one by one
from the ring buffer using perf_mmap__read_forward(). But it only
supports non-overwrite mode.

Introduce perf_mmap__read_event() to support both non-overwrite and
overwrite mode.

Usage:
perf_mmap__read_init()
while(event = perf_mmap__read_event()) {
        //process the event
        perf_mmap__consume()
}
perf_mmap__read_done()

It cannot use perf_mmap__read_backward(). Because it always reads the
stale buffer which is already processed. Furthermore, the forward and
backward concepts have been removed. The perf_mmap__read_backward() will
be replaced and discarded later.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1516310792-208685-9-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/mmap.c | 39 +++++++++++++++++++++++++++++++++++++++
 tools/perf/util/mmap.h |  4 ++++
 2 files changed, 43 insertions(+)

diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index 4f59eaefc706b..f804926778b78 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -113,6 +113,45 @@ union perf_event *perf_mmap__read_backward(struct perf_mmap *map)
 	return perf_mmap__read(map, &map->prev, end);
 }
 
+/*
+ * Read event from ring buffer one by one.
+ * Return one event for each call.
+ *
+ * Usage:
+ * perf_mmap__read_init()
+ * while(event = perf_mmap__read_event()) {
+ *	//process the event
+ *	perf_mmap__consume()
+ * }
+ * perf_mmap__read_done()
+ */
+union perf_event *perf_mmap__read_event(struct perf_mmap *map,
+					bool overwrite,
+					u64 *startp, u64 end)
+{
+	union perf_event *event;
+
+	/*
+	 * Check if event was unmapped due to a POLLHUP/POLLERR.
+	 */
+	if (!refcount_read(&map->refcnt))
+		return NULL;
+
+	if (startp == NULL)
+		return NULL;
+
+	/* non-overwirte doesn't pause the ringbuffer */
+	if (!overwrite)
+		end = perf_mmap__read_head(map);
+
+	event = perf_mmap__read(map, startp, end);
+
+	if (!overwrite)
+		map->prev = *startp;
+
+	return event;
+}
+
 void perf_mmap__read_catchup(struct perf_mmap *map)
 {
 	u64 head;
diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
index 95549d4af9430..28718543dd42f 100644
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -89,6 +89,10 @@ static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail)
 union perf_event *perf_mmap__read_forward(struct perf_mmap *map);
 union perf_event *perf_mmap__read_backward(struct perf_mmap *map);
 
+union perf_event *perf_mmap__read_event(struct perf_mmap *map,
+					bool overwrite,
+					u64 *startp, u64 end);
+
 int perf_mmap__push(struct perf_mmap *md, bool backward,
 		    void *to, int push(void *to, void *buf, size_t size));
 

From 600a7cfe88de2c6e44e23d61dd721b996b790eb2 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Thu, 18 Jan 2018 13:26:24 -0800
Subject: [PATCH 634/676] perf test: Update mmap read functions for
 backward-ring-buffer test

Use the new perf_mmap__read_* interfaces for overwrite ringbuffer test.

Commiter notes:

Testing:

  [root@seventh ~]# perf test -v backward
  48: Read backward ring buffer                             :
  --- start ---
  test child forked, pid 8309
  Using CPUID GenuineIntel-6-9E
  mmap size 1052672B
  mmap size 8192B
  Finished reading overwrite ring buffer: rewind
  test child finished with 0
  ---- end ----
  Read backward ring buffer: Ok
  [root@seventh ~]#

Signed-off-by: Kan Liang <kan.liang@intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1516310792-208685-10-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/backward-ring-buffer.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c
index 4035d43523c3e..e0b1b414d466b 100644
--- a/tools/perf/tests/backward-ring-buffer.c
+++ b/tools/perf/tests/backward-ring-buffer.c
@@ -31,10 +31,12 @@ static int count_samples(struct perf_evlist *evlist, int *sample_count,
 	int i;
 
 	for (i = 0; i < evlist->nr_mmaps; i++) {
+		struct perf_mmap *map = &evlist->overwrite_mmap[i];
 		union perf_event *event;
+		u64 start, end;
 
-		perf_mmap__read_catchup(&evlist->overwrite_mmap[i]);
-		while ((event = perf_mmap__read_backward(&evlist->overwrite_mmap[i])) != NULL) {
+		perf_mmap__read_init(map, true, &start, &end);
+		while ((event = perf_mmap__read_event(map, true, &start, end)) != NULL) {
 			const u32 type = event->header.type;
 
 			switch (type) {
@@ -49,6 +51,7 @@ static int count_samples(struct perf_evlist *evlist, int *sample_count,
 				return TEST_FAIL;
 			}
 		}
+		perf_mmap__read_done(map);
 	}
 	return TEST_OK;
 }

From 3effc2f165a842d640873e29d4c5cc1650143aef Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Thu, 18 Jan 2018 13:26:25 -0800
Subject: [PATCH 635/676] perf mmap: Discard legacy interface for mmap read

Discards perf_mmap__read_backward() and perf_mmap__read_catchup(). No
tools use them.

There are tools still use perf_mmap__read_forward(). Keep it, but add
comments to point to the new interface for future use.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1516310792-208685-11-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/mmap.c | 50 ++++--------------------------------------
 tools/perf/util/mmap.h |  3 ---
 2 files changed, 4 insertions(+), 49 deletions(-)

diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index f804926778b78..91531a7c8fbf3 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -63,6 +63,10 @@ static union perf_event *perf_mmap__read(struct perf_mmap *map,
 	return event;
 }
 
+/*
+ * legacy interface for mmap read.
+ * Don't use it. Use perf_mmap__read_event().
+ */
 union perf_event *perf_mmap__read_forward(struct perf_mmap *map)
 {
 	u64 head;
@@ -78,41 +82,6 @@ union perf_event *perf_mmap__read_forward(struct perf_mmap *map)
 	return perf_mmap__read(map, &map->prev, head);
 }
 
-union perf_event *perf_mmap__read_backward(struct perf_mmap *map)
-{
-	u64 head, end;
-
-	/*
-	 * Check if event was unmapped due to a POLLHUP/POLLERR.
-	 */
-	if (!refcount_read(&map->refcnt))
-		return NULL;
-
-	head = perf_mmap__read_head(map);
-	if (!head)
-		return NULL;
-
-	/*
-	 * 'head' pointer starts from 0. Kernel minus sizeof(record) form
-	 * it each time when kernel writes to it, so in fact 'head' is
-	 * negative. 'end' pointer is made manually by adding the size of
-	 * the ring buffer to 'head' pointer, means the validate data can
-	 * read is the whole ring buffer. If 'end' is positive, the ring
-	 * buffer has not fully filled, so we must adjust 'end' to 0.
-	 *
-	 * However, since both 'head' and 'end' is unsigned, we can't
-	 * simply compare 'end' against 0. Here we compare '-head' and
-	 * the size of the ring buffer, where -head is the number of bytes
-	 * kernel write to the ring buffer.
-	 */
-	if (-head < (u64)(map->mask + 1))
-		end = 0;
-	else
-		end = head + map->mask + 1;
-
-	return perf_mmap__read(map, &map->prev, end);
-}
-
 /*
  * Read event from ring buffer one by one.
  * Return one event for each call.
@@ -152,17 +121,6 @@ union perf_event *perf_mmap__read_event(struct perf_mmap *map,
 	return event;
 }
 
-void perf_mmap__read_catchup(struct perf_mmap *map)
-{
-	u64 head;
-
-	if (!refcount_read(&map->refcnt))
-		return;
-
-	head = perf_mmap__read_head(map);
-	map->prev = head;
-}
-
 static bool perf_mmap__empty(struct perf_mmap *map)
 {
 	return perf_mmap__read_head(map) == map->prev && !map->auxtrace_mmap.base;
diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
index 28718543dd42f..ec7d3a24e276f 100644
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -65,8 +65,6 @@ void perf_mmap__put(struct perf_mmap *map);
 
 void perf_mmap__consume(struct perf_mmap *map, bool overwrite);
 
-void perf_mmap__read_catchup(struct perf_mmap *md);
-
 static inline u64 perf_mmap__read_head(struct perf_mmap *mm)
 {
 	struct perf_event_mmap_page *pc = mm->base;
@@ -87,7 +85,6 @@ static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail)
 }
 
 union perf_event *perf_mmap__read_forward(struct perf_mmap *map);
-union perf_event *perf_mmap__read_backward(struct perf_mmap *map);
 
 union perf_event *perf_mmap__read_event(struct perf_mmap *map,
 					bool overwrite,

From 63878a53cedc3df31bd4ba8740a49fa0fc116ac6 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Thu, 18 Jan 2018 13:26:26 -0800
Subject: [PATCH 636/676] perf top: Check per-event overwrite term

Per-event overwrite term is not forbidden in 'perf top', which can bring
problems. Because 'perf top' only support non-overwrite mode now.

Add new rules and check regarding to overwrite term for 'perf top'.
- All events either have same per-event term or don't have per-event
  mode setting. Otherwise, it will error out.
- Per-event overwrite term should be consistent as opts->overwrite.
  If not, updating the opts->overwrite according to per-event term.

Make it possible to support either non-overwrite or overwrite mode.
The overwrite mode is forbidden now, which will be removed when the
overwrite mode is supported later.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1516310792-208685-12-git-send-email-kan.liang@intel.com
[ Renamed perf_top_overwrite_check to perf_top__overwrite_check, to follow existing convention ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-top.c | 73 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index c6ccda52117d3..17783798924a4 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -881,6 +881,68 @@ static void perf_top__mmap_read(struct perf_top *top)
 		perf_top__mmap_read_idx(top, i);
 }
 
+/*
+ * Check per-event overwrite term.
+ * perf top should support consistent term for all events.
+ * - All events don't have per-event term
+ *   E.g. "cpu/cpu-cycles/,cpu/instructions/"
+ *   Nothing change, return 0.
+ * - All events have same per-event term
+ *   E.g. "cpu/cpu-cycles,no-overwrite/,cpu/instructions,no-overwrite/
+ *   Using the per-event setting to replace the opts->overwrite if
+ *   they are different, then return 0.
+ * - Events have different per-event term
+ *   E.g. "cpu/cpu-cycles,overwrite/,cpu/instructions,no-overwrite/"
+ *   Return -1
+ * - Some of the event set per-event term, but some not.
+ *   E.g. "cpu/cpu-cycles/,cpu/instructions,no-overwrite/"
+ *   Return -1
+ */
+static int perf_top__overwrite_check(struct perf_top *top)
+{
+	struct record_opts *opts = &top->record_opts;
+	struct perf_evlist *evlist = top->evlist;
+	struct perf_evsel_config_term *term;
+	struct list_head *config_terms;
+	struct perf_evsel *evsel;
+	int set, overwrite = -1;
+
+	evlist__for_each_entry(evlist, evsel) {
+		set = -1;
+		config_terms = &evsel->config_terms;
+		list_for_each_entry(term, config_terms, list) {
+			if (term->type == PERF_EVSEL__CONFIG_TERM_OVERWRITE)
+				set = term->val.overwrite ? 1 : 0;
+		}
+
+		/* no term for current and previous event (likely) */
+		if ((overwrite < 0) && (set < 0))
+			continue;
+
+		/* has term for both current and previous event, compare */
+		if ((overwrite >= 0) && (set >= 0) && (overwrite != set))
+			return -1;
+
+		/* no term for current event but has term for previous one */
+		if ((overwrite >= 0) && (set < 0))
+			return -1;
+
+		/* has term for current event */
+		if ((overwrite < 0) && (set >= 0)) {
+			/* if it's first event, set overwrite */
+			if (evsel == perf_evlist__first(evlist))
+				overwrite = set;
+			else
+				return -1;
+		}
+	}
+
+	if ((overwrite >= 0) && (opts->overwrite != overwrite))
+		opts->overwrite = overwrite;
+
+	return 0;
+}
+
 static int perf_top__start_counters(struct perf_top *top)
 {
 	char msg[BUFSIZ];
@@ -888,6 +950,17 @@ static int perf_top__start_counters(struct perf_top *top)
 	struct perf_evlist *evlist = top->evlist;
 	struct record_opts *opts = &top->record_opts;
 
+	if (perf_top__overwrite_check(top)) {
+		ui__error("perf top only support consistent per-event "
+			  "overwrite setting for all events\n");
+		goto out_err;
+	}
+
+	if (opts->overwrite) {
+		ui__error("not support overwrite mode yet\n");
+		goto out_err;
+	}
+
 	perf_evlist__config(evlist, opts, &callchain_param);
 
 	evlist__for_each_entry(evlist, counter) {

From 9a831b3a32c5daf5d7cc672334d51930f78e4ea3 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 2 Feb 2018 11:27:25 -0300
Subject: [PATCH 637/676] perf evsel: Expose the perf_missing_features struct

As tools may need to adjust to missing features, as 'perf top' will, in
the next csets, to cope with a missing 'write_backward' feature.

Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: https://lkml.kernel.org/n/tip-jelngl9q1ooaizvkcput9tic@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/evsel.c | 12 +-----------
 tools/perf/util/evsel.h | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index ff359c9ece2e7..ef351688b7979 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -41,17 +41,7 @@
 
 #include "sane_ctype.h"
 
-static struct {
-	bool sample_id_all;
-	bool exclude_guest;
-	bool mmap2;
-	bool cloexec;
-	bool clockid;
-	bool clockid_wrong;
-	bool lbr_flags;
-	bool write_backward;
-	bool group_read;
-} perf_missing_features;
+struct perf_missing_features perf_missing_features;
 
 static clockid_t clockid;
 
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 846e416445254..a7487c6d18660 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -149,6 +149,20 @@ union u64_swap {
 	u32 val32[2];
 };
 
+struct perf_missing_features {
+	bool sample_id_all;
+	bool exclude_guest;
+	bool mmap2;
+	bool cloexec;
+	bool clockid;
+	bool clockid_wrong;
+	bool lbr_flags;
+	bool write_backward;
+	bool group_read;
+};
+
+extern struct perf_missing_features perf_missing_features;
+
 struct cpu_map;
 struct target;
 struct thread_map;

From 204721d7eabe6ee98aafce791ce3efdbc4715834 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Thu, 18 Jan 2018 13:26:28 -0800
Subject: [PATCH 638/676] perf top: Add overwrite fall back

Switch to non-overwrite mode if kernel doesnot support overwrite
ringbuffer.

It's only effect when overwrite mode is supported.  No change to current
behavior.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1516310792-208685-14-git-send-email-kan.liang@intel.com
[ Use perf_missing_features.write_backward instead of the non merged is_write_backward_fail() ]
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-top.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 17783798924a4..ee4bba1e282c2 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -943,6 +943,27 @@ static int perf_top__overwrite_check(struct perf_top *top)
 	return 0;
 }
 
+static int perf_top_overwrite_fallback(struct perf_top *top,
+				       struct perf_evsel *evsel)
+{
+	struct record_opts *opts = &top->record_opts;
+	struct perf_evlist *evlist = top->evlist;
+	struct perf_evsel *counter;
+
+	if (!opts->overwrite)
+		return 0;
+
+	/* only fall back when first event fails */
+	if (evsel != perf_evlist__first(evlist))
+		return 0;
+
+	evlist__for_each_entry(evlist, counter)
+		counter->attr.write_backward = false;
+	opts->overwrite = false;
+	ui__warning("fall back to non-overwrite mode\n");
+	return 1;
+}
+
 static int perf_top__start_counters(struct perf_top *top)
 {
 	char msg[BUFSIZ];
@@ -967,6 +988,21 @@ static int perf_top__start_counters(struct perf_top *top)
 try_again:
 		if (perf_evsel__open(counter, top->evlist->cpus,
 				     top->evlist->threads) < 0) {
+
+			/*
+			 * Specially handle overwrite fall back.
+			 * Because perf top is the only tool which has
+			 * overwrite mode by default, support
+			 * both overwrite and non-overwrite mode, and
+			 * require consistent mode for all events.
+			 *
+			 * May move it to generic code with more tools
+			 * have similar attribute.
+			 */
+			if (perf_missing_features.write_backward &&
+			    perf_top_overwrite_fallback(top, counter))
+				goto try_again;
+
 			if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) {
 				if (verbose > 0)
 					ui__warning("%s\n", msg);

From 06cc1a470ab237b991901729b125404c164f3660 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Thu, 18 Jan 2018 13:26:29 -0800
Subject: [PATCH 639/676] perf hists browser: Add parameter to disable lost
 event warning

For overwrite mode, the ringbuffer will be paused. The event lost is
expected. It needs a way to notify the browser not print the warning.

It will be used later for perf top to disable lost event warning in
overwrite mode. There is no behavior change for now.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1516310792-208685-15-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-c2c.c       |  4 ++--
 tools/perf/builtin-report.c    |  3 ++-
 tools/perf/builtin-top.c       |  2 +-
 tools/perf/ui/browsers/hists.c | 38 ++++++++++++++++++++++------------
 tools/perf/ui/browsers/hists.h |  3 ++-
 tools/perf/util/hist.h         |  6 ++++--
 6 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
index c0815a37fdb5a..539c3d4601586 100644
--- a/tools/perf/builtin-c2c.c
+++ b/tools/perf/builtin-c2c.c
@@ -2245,7 +2245,7 @@ static int perf_c2c__browse_cacheline(struct hist_entry *he)
 	c2c_browser__update_nr_entries(browser);
 
 	while (1) {
-		key = hist_browser__run(browser, "? - help");
+		key = hist_browser__run(browser, "? - help", true);
 
 		switch (key) {
 		case 's':
@@ -2314,7 +2314,7 @@ static int perf_c2c__hists_browse(struct hists *hists)
 	c2c_browser__update_nr_entries(browser);
 
 	while (1) {
-		key = hist_browser__run(browser, "? - help");
+		key = hist_browser__run(browser, "? - help", true);
 
 		switch (key) {
 		case 'q':
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 42a52dcc41cd4..4ad5dc649716e 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -530,7 +530,8 @@ static int report__browse_hists(struct report *rep)
 	case 1:
 		ret = perf_evlist__tui_browse_hists(evlist, help, NULL,
 						    rep->min_percent,
-						    &session->header.env);
+						    &session->header.env,
+						    true);
 		/*
 		 * Usually "ret" is the last pressed key, and we only
 		 * care if the key notifies us to switch data file.
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index ee4bba1e282c2..7def861a9ec41 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -611,7 +611,7 @@ static void *display_thread_tui(void *arg)
 
 	perf_evlist__tui_browse_hists(top->evlist, help, &hbt,
 				      top->min_percent,
-				      &top->session->header.env);
+				      &top->session->header.env, true);
 
 	done = 1;
 	return NULL;
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index 68146f4620a57..6495ee55d9c38 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -608,7 +608,8 @@ static int hist_browser__title(struct hist_browser *browser, char *bf, size_t si
 	return browser->title ? browser->title(browser, bf, size) : 0;
 }
 
-int hist_browser__run(struct hist_browser *browser, const char *help)
+int hist_browser__run(struct hist_browser *browser, const char *help,
+		      bool warn_lost_event)
 {
 	int key;
 	char title[160];
@@ -638,8 +639,9 @@ int hist_browser__run(struct hist_browser *browser, const char *help)
 			nr_entries = hist_browser__nr_entries(browser);
 			ui_browser__update_nr_entries(&browser->b, nr_entries);
 
-			if (browser->hists->stats.nr_lost_warned !=
-			    browser->hists->stats.nr_events[PERF_RECORD_LOST]) {
+			if (warn_lost_event &&
+			    (browser->hists->stats.nr_lost_warned !=
+			    browser->hists->stats.nr_events[PERF_RECORD_LOST])) {
 				browser->hists->stats.nr_lost_warned =
 					browser->hists->stats.nr_events[PERF_RECORD_LOST];
 				ui_browser__warn_lost_events(&browser->b);
@@ -2763,7 +2765,8 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 				    bool left_exits,
 				    struct hist_browser_timer *hbt,
 				    float min_pcnt,
-				    struct perf_env *env)
+				    struct perf_env *env,
+				    bool warn_lost_event)
 {
 	struct hists *hists = evsel__hists(evsel);
 	struct hist_browser *browser = perf_evsel_browser__new(evsel, hbt, env);
@@ -2844,7 +2847,8 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events,
 
 		nr_options = 0;
 
-		key = hist_browser__run(browser, helpline);
+		key = hist_browser__run(browser, helpline,
+					warn_lost_event);
 
 		if (browser->he_selection != NULL) {
 			thread = hist_browser__selected_thread(browser);
@@ -3184,7 +3188,8 @@ static void perf_evsel_menu__write(struct ui_browser *browser,
 
 static int perf_evsel_menu__run(struct perf_evsel_menu *menu,
 				int nr_events, const char *help,
-				struct hist_browser_timer *hbt)
+				struct hist_browser_timer *hbt,
+				bool warn_lost_event)
 {
 	struct perf_evlist *evlist = menu->b.priv;
 	struct perf_evsel *pos;
@@ -3203,7 +3208,9 @@ static int perf_evsel_menu__run(struct perf_evsel_menu *menu,
 		case K_TIMER:
 			hbt->timer(hbt->arg);
 
-			if (!menu->lost_events_warned && menu->lost_events) {
+			if (!menu->lost_events_warned &&
+			    menu->lost_events &&
+			    warn_lost_event) {
 				ui_browser__warn_lost_events(&menu->b);
 				menu->lost_events_warned = true;
 			}
@@ -3224,7 +3231,8 @@ static int perf_evsel_menu__run(struct perf_evsel_menu *menu,
 			key = perf_evsel__hists_browse(pos, nr_events, help,
 						       true, hbt,
 						       menu->min_pcnt,
-						       menu->env);
+						       menu->env,
+						       warn_lost_event);
 			ui_browser__show_title(&menu->b, title);
 			switch (key) {
 			case K_TAB:
@@ -3282,7 +3290,8 @@ static int __perf_evlist__tui_browse_hists(struct perf_evlist *evlist,
 					   int nr_entries, const char *help,
 					   struct hist_browser_timer *hbt,
 					   float min_pcnt,
-					   struct perf_env *env)
+					   struct perf_env *env,
+					   bool warn_lost_event)
 {
 	struct perf_evsel *pos;
 	struct perf_evsel_menu menu = {
@@ -3309,13 +3318,15 @@ static int __perf_evlist__tui_browse_hists(struct perf_evlist *evlist,
 			menu.b.width = line_len;
 	}
 
-	return perf_evsel_menu__run(&menu, nr_entries, help, hbt);
+	return perf_evsel_menu__run(&menu, nr_entries, help,
+				    hbt, warn_lost_event);
 }
 
 int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help,
 				  struct hist_browser_timer *hbt,
 				  float min_pcnt,
-				  struct perf_env *env)
+				  struct perf_env *env,
+				  bool warn_lost_event)
 {
 	int nr_entries = evlist->nr_entries;
 
@@ -3325,7 +3336,7 @@ int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help,
 
 		return perf_evsel__hists_browse(first, nr_entries, help,
 						false, hbt, min_pcnt,
-						env);
+						env, warn_lost_event);
 	}
 
 	if (symbol_conf.event_group) {
@@ -3342,5 +3353,6 @@ int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help,
 	}
 
 	return __perf_evlist__tui_browse_hists(evlist, nr_entries, help,
-					       hbt, min_pcnt, env);
+					       hbt, min_pcnt, env,
+					       warn_lost_event);
 }
diff --git a/tools/perf/ui/browsers/hists.h b/tools/perf/ui/browsers/hists.h
index ba431777f5590..9428bee076f24 100644
--- a/tools/perf/ui/browsers/hists.h
+++ b/tools/perf/ui/browsers/hists.h
@@ -28,7 +28,8 @@ struct hist_browser {
 
 struct hist_browser *hist_browser__new(struct hists *hists);
 void hist_browser__delete(struct hist_browser *browser);
-int hist_browser__run(struct hist_browser *browser, const char *help);
+int hist_browser__run(struct hist_browser *browser, const char *help,
+		      bool warn_lost_event);
 void hist_browser__init(struct hist_browser *browser,
 			struct hists *hists);
 #endif /* _PERF_UI_BROWSER_HISTS_H_ */
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index f6630cb95effc..02721b5797464 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -430,7 +430,8 @@ int hist_entry__tui_annotate(struct hist_entry *he, struct perf_evsel *evsel,
 int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help,
 				  struct hist_browser_timer *hbt,
 				  float min_pcnt,
-				  struct perf_env *env);
+				  struct perf_env *env,
+				  bool warn_lost_event);
 int script_browse(const char *script_opt);
 #else
 static inline
@@ -438,7 +439,8 @@ int perf_evlist__tui_browse_hists(struct perf_evlist *evlist __maybe_unused,
 				  const char *help __maybe_unused,
 				  struct hist_browser_timer *hbt __maybe_unused,
 				  float min_pcnt __maybe_unused,
-				  struct perf_env *env __maybe_unused)
+				  struct perf_env *env __maybe_unused,
+				  bool warn_lost_event __maybe_unused)
 {
 	return 0;
 }

From a1ff5b05e988ca3620027148cd61013408ea4194 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Thu, 18 Jan 2018 13:26:30 -0800
Subject: [PATCH 640/676] perf top: Remove lost events checking

There would be some records lost in overwrite mode because of pausing
the ringbuffer. It has little impact for the accuracy of the snapshot
and could be tolerated by 'perf top'.

Remove the lost events checking.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1516310792-208685-16-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-top.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 7def861a9ec41..59653062bb48a 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -283,8 +283,9 @@ static void perf_top__print_sym_table(struct perf_top *top)
 
 	printf("%-*.*s\n", win_width, win_width, graph_dotted_line);
 
-	if (hists->stats.nr_lost_warned !=
-	    hists->stats.nr_events[PERF_RECORD_LOST]) {
+	if (!top->record_opts.overwrite &&
+	    (hists->stats.nr_lost_warned !=
+	    hists->stats.nr_events[PERF_RECORD_LOST])) {
 		hists->stats.nr_lost_warned =
 			      hists->stats.nr_events[PERF_RECORD_LOST];
 		color_fprintf(stdout, PERF_COLOR_RED,
@@ -611,7 +612,8 @@ static void *display_thread_tui(void *arg)
 
 	perf_evlist__tui_browse_hists(top->evlist, help, &hbt,
 				      top->min_percent,
-				      &top->session->header.env, true);
+				      &top->session->header.env,
+				      !top->record_opts.overwrite);
 
 	done = 1;
 	return NULL;

From ebebbf082357f86cc84a4d46ce897a5750e41b7a Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Thu, 18 Jan 2018 13:26:31 -0800
Subject: [PATCH 641/676] perf top: Switch default mode to overwrite mode

perf_top__mmap_read() has a severe performance issue in the Knights
Landing/Mill platform, when monitoring heavy load systems. It costs
several minutes to finish, which is unacceptable.

Currently, 'perf top' uses the non overwrite mode. For non overwrite
mode, it tries to read everything in the ringbuffer and doesn't pause
it. Once there are lots of samples delivered persistently, the
processing time could be very long. Also, the latest samples could be
lost when the ringbuffer is full.

For overwrite mode, it takes a snapshot for the system by pausing the
ringbuffer, which could significantly reduce the processing time.  Also,
the overwrite mode always keep the latest samples.  Considering the real
time requirement for 'perf top', the overwrite mode is more suitable for
it.

Actually, 'perf top' was overwrite mode. It is changed to non overwrite
mode since commit 93fc64f14472 ("perf top: Switch to non overwrite
mode"). It's better to change it back to overwrite mode by default.

For the kernel which doesn't support overwrite mode, it will fall back
to non overwrite mode.

There would be some records lost in overwrite mode because of pausing
the ringbuffer. It has little impact for the accuracy of the snapshot
and can be tolerated.

For overwrite mode, unconditionally wait 100 ms before each snapshot. It
also reduces the overhead caused by pausing ringbuffer, especially on
light load system.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1516310792-208685-17-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-top.c | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 59653062bb48a..2b4914f34ed6a 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -809,15 +809,23 @@ static void perf_event__process_sample(struct perf_tool *tool,
 
 static void perf_top__mmap_read_idx(struct perf_top *top, int idx)
 {
+	struct record_opts *opts = &top->record_opts;
+	struct perf_evlist *evlist = top->evlist;
 	struct perf_sample sample;
 	struct perf_evsel *evsel;
+	struct perf_mmap *md;
 	struct perf_session *session = top->session;
 	union perf_event *event;
 	struct machine *machine;
+	u64 end, start;
 	int ret;
 
-	while ((event = perf_evlist__mmap_read(top->evlist, idx)) != NULL) {
-		ret = perf_evlist__parse_sample(top->evlist, event, &sample);
+	md = opts->overwrite ? &evlist->overwrite_mmap[idx] : &evlist->mmap[idx];
+	if (perf_mmap__read_init(md, opts->overwrite, &start, &end) < 0)
+		return;
+
+	while ((event = perf_mmap__read_event(md, opts->overwrite, &start, end)) != NULL) {
+		ret = perf_evlist__parse_sample(evlist, event, &sample);
 		if (ret) {
 			pr_err("Can't parse sample, err = %d\n", ret);
 			goto next_event;
@@ -871,16 +879,28 @@ static void perf_top__mmap_read_idx(struct perf_top *top, int idx)
 		} else
 			++session->evlist->stats.nr_unknown_events;
 next_event:
-		perf_evlist__mmap_consume(top->evlist, idx);
+		perf_mmap__consume(md, opts->overwrite);
 	}
+
+	perf_mmap__read_done(md);
 }
 
 static void perf_top__mmap_read(struct perf_top *top)
 {
+	bool overwrite = top->record_opts.overwrite;
+	struct perf_evlist *evlist = top->evlist;
 	int i;
 
+	if (overwrite)
+		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_DATA_PENDING);
+
 	for (i = 0; i < top->evlist->nr_mmaps; i++)
 		perf_top__mmap_read_idx(top, i);
+
+	if (overwrite) {
+		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
+		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_RUNNING);
+	}
 }
 
 /*
@@ -979,11 +999,6 @@ static int perf_top__start_counters(struct perf_top *top)
 		goto out_err;
 	}
 
-	if (opts->overwrite) {
-		ui__error("not support overwrite mode yet\n");
-		goto out_err;
-	}
-
 	perf_evlist__config(evlist, opts, &callchain_param);
 
 	evlist__for_each_entry(evlist, counter) {
@@ -1144,7 +1159,7 @@ static int __cmd_top(struct perf_top *top)
 
 		perf_top__mmap_read(top);
 
-		if (hits == top->samples)
+		if (opts->overwrite || (hits == top->samples))
 			ret = perf_evlist__poll(top->evlist, 100);
 
 		if (resize) {
@@ -1238,6 +1253,7 @@ int cmd_top(int argc, const char **argv)
 				.uses_mmap   = true,
 			},
 			.proc_map_timeout    = 500,
+			.overwrite	= 1,
 		},
 		.max_stack	     = sysctl_perf_event_max_stack,
 		.sym_pcnt_filter     = 5,

From 8cc42de736b617827a4e7664fb8d7a325bc125bc Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Thu, 18 Jan 2018 13:26:32 -0800
Subject: [PATCH 642/676] perf top: Check the latency of perf_top__mmap_read()

The latency of perf_top__mmap_read() should be lower than refresh time.
If not, give some hints to reduce the latency.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1516310792-208685-18-git-send-email-kan.liang@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-top.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 2b4914f34ed6a..b7c823ba8374f 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -889,8 +889,10 @@ static void perf_top__mmap_read(struct perf_top *top)
 {
 	bool overwrite = top->record_opts.overwrite;
 	struct perf_evlist *evlist = top->evlist;
+	unsigned long long start, end;
 	int i;
 
+	start = rdclock();
 	if (overwrite)
 		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_DATA_PENDING);
 
@@ -901,6 +903,13 @@ static void perf_top__mmap_read(struct perf_top *top)
 		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
 		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_RUNNING);
 	}
+	end = rdclock();
+
+	if ((end - start) > (unsigned long long)top->delay_secs * NSEC_PER_SEC)
+		ui__warning("Too slow to read ring buffer.\n"
+			    "Please try increasing the period (-c) or\n"
+			    "decreasing the freq (-F) or\n"
+			    "limiting the number of CPUs (-C)\n");
 }
 
 /*

From 6677d26c8befa462eab9be6c5335a939011e7e65 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 29 Jan 2018 15:03:59 +0200
Subject: [PATCH 643/676] perf tools: Substitute yet another strtoull()

Instead of home grown function let's use what library provides us.

Signed-off-by: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: http://lkml.kernel.org/r/20180129130359.1490-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/util.c | 24 ++----------------------
 1 file changed, 2 insertions(+), 22 deletions(-)

diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index 443892dabedbe..1019bbc5dbd8a 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -340,35 +340,15 @@ size_t hex_width(u64 v)
 	return n;
 }
 
-static int hex(char ch)
-{
-	if ((ch >= '0') && (ch <= '9'))
-		return ch - '0';
-	if ((ch >= 'a') && (ch <= 'f'))
-		return ch - 'a' + 10;
-	if ((ch >= 'A') && (ch <= 'F'))
-		return ch - 'A' + 10;
-	return -1;
-}
-
 /*
  * While we find nice hex chars, build a long_val.
  * Return number of chars processed.
  */
 int hex2u64(const char *ptr, u64 *long_val)
 {
-	const char *p = ptr;
-	*long_val = 0;
-
-	while (*p) {
-		const int hex_val = hex(*p);
+	char *p;
 
-		if (hex_val < 0)
-			break;
-
-		*long_val = (*long_val << 4) | hex_val;
-		p++;
-	}
+	*long_val = strtoull(ptr, &p, 16);
 
 	return p - ptr;
 }

From ba7e851642f48002def3450b279598c187721fd0 Mon Sep 17 00:00:00 2001
From: Sangwon Hong <qpakzk@gmail.com>
Date: Mon, 5 Feb 2018 20:48:35 +0900
Subject: [PATCH 644/676] perf data: Document missing --force option

Add the --force option to the man page.

Signed-off-by: Sangwon Hong <qpakzk@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Taeung Song <treeze.taeung@gmail.com>
Link: http://lkml.kernel.org/r/1517831315-31490-1-git-send-email-qpakzk@gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-data.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/perf/Documentation/perf-data.txt b/tools/perf/Documentation/perf-data.txt
index f0796a47dfa30..90bb4aabe4f8d 100644
--- a/tools/perf/Documentation/perf-data.txt
+++ b/tools/perf/Documentation/perf-data.txt
@@ -30,6 +30,10 @@ OPTIONS for 'convert'
 -i::
 	Specify input perf data file path.
 
+-f::
+--force::
+	Don't complain, do it.
+
 -v::
 --verbose::
         Be more verbose (show counter open errors, etc).

From 7a92453620d42c3a5fea94a864dc6aa04c262b93 Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.vnet.ibm.com>
Date: Wed, 17 Jan 2018 09:38:31 +0100
Subject: [PATCH 645/676] perf test: Fix test trace+probe_libc_inet_pton.sh for
 s390x

On Intel test case trace+probe_libc_inet_pton.sh succeeds and the
output is:

[root@f27 perf]# ./perf trace --no-syscalls
                  -e probe_libc:inet_pton/max-stack=3/ ping -6 -c 1 ::1
PING ::1(::1) 56 data bytes
64 bytes from ::1: icmp_seq=1 ttl=64 time=0.037 ms

 --- ::1 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.037/0.037/0.037/0.000 ms
     0.000 probe_libc:inet_pton:(7fa40ac618a0))
              __GI___inet_pton (/usr/lib64/libc-2.26.so)
              getaddrinfo (/usr/lib64/libc-2.26.so)
              main (/usr/bin/ping)

The kernel stack unwinder is used, it is specified implicitly
as call-graph=fp (frame pointer).

On s390x only dwarf is available for stack unwinding. It is also
done in user space. This requires different parameter setup
and result checking for s390x and Intel.

This patch adds separate perf trace setup and result checking
for Intel and s390x. On s390x specify this command line to
get a call-graph and handle the different call graph result
checking:

[root@s35lp76 perf]# ./perf trace --no-syscalls
	-e probe_libc:inet_pton/call-graph=dwarf/ ping -6 -c 1 ::1
PING ::1(::1) 56 data bytes
64 bytes from ::1: icmp_seq=1 ttl=64 time=0.041 ms

 --- ::1 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.041/0.041/0.041/0.000 ms
     0.000 probe_libc:inet_pton:(3ffb9942060))
            __GI___inet_pton (/usr/lib64/libc-2.26.so)
            gaih_inet (inlined)
            __GI_getaddrinfo (inlined)
            main (/usr/bin/ping)
            __libc_start_main (/usr/lib64/libc-2.26.so)
            _start (/usr/bin/ping)
[root@s35lp76 perf]#

Before:
[root@s8360047 perf]# ./perf test -vv 58
58: probe libc's inet_pton & backtrace it with ping       :
 --- start ---
test child forked, pid 26349
PING ::1(::1) 56 data bytes
64 bytes from ::1: icmp_seq=1 ttl=64 time=0.079 ms
 --- ::1 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.079/0.079/0.079/0.000 ms
0.000 probe_libc:inet_pton:(3ff925c2060))
test child finished with -1
 ---- end ----
probe libc's inet_pton & backtrace it with ping: FAILED!
[root@s8360047 perf]#

After:
[root@s35lp76 perf]# ./perf test -vv 57
57: probe libc's inet_pton & backtrace it with ping       :
 --- start ---
test child forked, pid 38708
PING ::1(::1) 56 data bytes
64 bytes from ::1: icmp_seq=1 ttl=64 time=0.038 ms
 --- ::1 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.038/0.038/0.038/0.000 ms
0.000 probe_libc:inet_pton:(3ff87342060))
__GI___inet_pton (/usr/lib64/libc-2.26.so)
gaih_inet (inlined)
__GI_getaddrinfo (inlined)
main (/usr/bin/ping)
__libc_start_main (/usr/lib64/libc-2.26.so)
_start (/usr/bin/ping)
test child finished with 0
 ---- end ----
probe libc's inet_pton & backtrace it with ping: Ok
[root@s35lp76 perf]#

On Intel the test case runs unchanged and succeeds.

Signed-off-by: Thomas Richter <tmricht@linux.vnet.ibm.com>
Reviewed-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Link: http://lkml.kernel.org/r/20180117083831.101001-1-tmricht@linux.vnet.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 .../tests/shell/trace+probe_libc_inet_pton.sh | 23 +++++++++++++++----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/tools/perf/tests/shell/trace+probe_libc_inet_pton.sh b/tools/perf/tests/shell/trace+probe_libc_inet_pton.sh
index 8b3da21a08f19..c446c894b2973 100755
--- a/tools/perf/tests/shell/trace+probe_libc_inet_pton.sh
+++ b/tools/perf/tests/shell/trace+probe_libc_inet_pton.sh
@@ -22,10 +22,23 @@ trace_libc_inet_pton_backtrace() {
 	expected[4]="rtt min.*"
 	expected[5]="[0-9]+\.[0-9]+[[:space:]]+probe_libc:inet_pton:\([[:xdigit:]]+\)"
 	expected[6]=".*inet_pton[[:space:]]\($libc\)$"
-	expected[7]="getaddrinfo[[:space:]]\($libc\)$"
-	expected[8]=".*\(.*/bin/ping.*\)$"
-
-	perf trace --no-syscalls -e probe_libc:inet_pton/max-stack=3/ ping -6 -c 1 ::1 2>&1 | grep -v ^$ | while read line ; do
+	case "$(uname -m)" in
+	s390x)
+		eventattr='call-graph=dwarf'
+		expected[7]="gaih_inet[[:space:]]\(inlined\)$"
+		expected[8]="__GI_getaddrinfo[[:space:]]\(inlined\)$"
+		expected[9]="main[[:space:]]\(.*/bin/ping.*\)$"
+		expected[10]="__libc_start_main[[:space:]]\($libc\)$"
+		expected[11]="_start[[:space:]]\(.*/bin/ping.*\)$"
+		;;
+	*)
+		eventattr='max-stack=3'
+		expected[7]="getaddrinfo[[:space:]]\($libc\)$"
+		expected[8]=".*\(.*/bin/ping.*\)$"
+		;;
+	esac
+
+	perf trace --no-syscalls -e probe_libc:inet_pton/$eventattr/ ping -6 -c 1 ::1 2>&1 | grep -v ^$ | while read line ; do
 		echo $line
 		echo "$line" | egrep -q "${expected[$idx]}"
 		if [ $? -ne 0 ] ; then
@@ -33,7 +46,7 @@ trace_libc_inet_pton_backtrace() {
 			exit 1
 		fi
 		let idx+=1
-		[ $idx -eq 9 ] && break
+		[ -z "${expected[$idx]}" ] && break
 	done
 }
 

From f091f1d6a2b4840c9b631d6138f5354401347863 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 13 Feb 2018 12:54:58 +0100
Subject: [PATCH 646/676] tools/headers: Synchronize kernel ABI headers,
 v4.16-rc1

Sync the following tooling headers with the latest kernel version:

  tools/arch/powerpc/include/uapi/asm/kvm.h
  tools/arch/x86/include/asm/cpufeatures.h
  tools/include/uapi/drm/i915_drm.h
  tools/include/uapi/linux/if_link.h
  tools/include/uapi/linux/kvm.h

All the changes are new ABI additions which don't impact their use
in existing tooling.

Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 tools/arch/powerpc/include/uapi/asm/kvm.h |  2 +
 tools/arch/x86/include/asm/cpufeatures.h  |  1 +
 tools/include/uapi/drm/i915_drm.h         | 77 +++++++++++++++++++
 tools/include/uapi/linux/if_link.h        |  1 +
 tools/include/uapi/linux/kvm.h            | 90 +++++++++++++++++++++++
 5 files changed, 171 insertions(+)

diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h
index 637b7263cb867..833ed9a16adfd 100644
--- a/tools/arch/powerpc/include/uapi/asm/kvm.h
+++ b/tools/arch/powerpc/include/uapi/asm/kvm.h
@@ -632,6 +632,8 @@ struct kvm_ppc_cpu_char {
 #define KVM_REG_PPC_TIDR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbc)
 #define KVM_REG_PPC_PSSCR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbd)
 
+#define KVM_REG_PPC_DEC_EXPIRY	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe)
+
 /* Transactional Memory checkpointed state:
  * This is all GPRs, all VSX regs and a subset of SPRs
  */
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index 1d9199e1c2ad4..0dfe4d3f74e24 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -210,6 +210,7 @@
 
 #define X86_FEATURE_MBA			( 7*32+18) /* Memory Bandwidth Allocation */
 #define X86_FEATURE_RSB_CTXSW		( 7*32+19) /* "" Fill RSB on context switches */
+#define X86_FEATURE_SEV			( 7*32+20) /* AMD Secure Encrypted Virtualization */
 
 #define X86_FEATURE_USE_IBPB		( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
 
diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h
index ac3c6503ca27f..536ee4febd746 100644
--- a/tools/include/uapi/drm/i915_drm.h
+++ b/tools/include/uapi/drm/i915_drm.h
@@ -86,6 +86,62 @@ enum i915_mocs_table_index {
 	I915_MOCS_CACHED,
 };
 
+/*
+ * Different engines serve different roles, and there may be more than one
+ * engine serving each role. enum drm_i915_gem_engine_class provides a
+ * classification of the role of the engine, which may be used when requesting
+ * operations to be performed on a certain subset of engines, or for providing
+ * information about that group.
+ */
+enum drm_i915_gem_engine_class {
+	I915_ENGINE_CLASS_RENDER	= 0,
+	I915_ENGINE_CLASS_COPY		= 1,
+	I915_ENGINE_CLASS_VIDEO		= 2,
+	I915_ENGINE_CLASS_VIDEO_ENHANCE	= 3,
+
+	I915_ENGINE_CLASS_INVALID	= -1
+};
+
+/**
+ * DOC: perf_events exposed by i915 through /sys/bus/event_sources/drivers/i915
+ *
+ */
+
+enum drm_i915_pmu_engine_sample {
+	I915_SAMPLE_BUSY = 0,
+	I915_SAMPLE_WAIT = 1,
+	I915_SAMPLE_SEMA = 2
+};
+
+#define I915_PMU_SAMPLE_BITS (4)
+#define I915_PMU_SAMPLE_MASK (0xf)
+#define I915_PMU_SAMPLE_INSTANCE_BITS (8)
+#define I915_PMU_CLASS_SHIFT \
+	(I915_PMU_SAMPLE_BITS + I915_PMU_SAMPLE_INSTANCE_BITS)
+
+#define __I915_PMU_ENGINE(class, instance, sample) \
+	((class) << I915_PMU_CLASS_SHIFT | \
+	(instance) << I915_PMU_SAMPLE_BITS | \
+	(sample))
+
+#define I915_PMU_ENGINE_BUSY(class, instance) \
+	__I915_PMU_ENGINE(class, instance, I915_SAMPLE_BUSY)
+
+#define I915_PMU_ENGINE_WAIT(class, instance) \
+	__I915_PMU_ENGINE(class, instance, I915_SAMPLE_WAIT)
+
+#define I915_PMU_ENGINE_SEMA(class, instance) \
+	__I915_PMU_ENGINE(class, instance, I915_SAMPLE_SEMA)
+
+#define __I915_PMU_OTHER(x) (__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x))
+
+#define I915_PMU_ACTUAL_FREQUENCY	__I915_PMU_OTHER(0)
+#define I915_PMU_REQUESTED_FREQUENCY	__I915_PMU_OTHER(1)
+#define I915_PMU_INTERRUPTS		__I915_PMU_OTHER(2)
+#define I915_PMU_RC6_RESIDENCY		__I915_PMU_OTHER(3)
+
+#define I915_PMU_LAST I915_PMU_RC6_RESIDENCY
+
 /* Each region is a minimum of 16k, and there are at most 255 of them.
  */
 #define I915_NR_TEX_REGIONS 255	/* table size 2k - maximum due to use
@@ -450,6 +506,27 @@ typedef struct drm_i915_irq_wait {
  */
 #define I915_PARAM_HAS_EXEC_FENCE_ARRAY  49
 
+/*
+ * Query whether every context (both per-file default and user created) is
+ * isolated (insofar as HW supports). If this parameter is not true, then
+ * freshly created contexts may inherit values from an existing context,
+ * rather than default HW values. If true, it also ensures (insofar as HW
+ * supports) that all state set by this context will not leak to any other
+ * context.
+ *
+ * As not every engine across every gen support contexts, the returned
+ * value reports the support of context isolation for individual engines by
+ * returning a bitmask of each engine class set to true if that class supports
+ * isolation.
+ */
+#define I915_PARAM_HAS_CONTEXT_ISOLATION 50
+
+/* Frequency of the command streamer timestamps given by the *_TIMESTAMP
+ * registers. This used to be fixed per platform but from CNL onwards, this
+ * might vary depending on the parts.
+ */
+#define I915_PARAM_CS_TIMESTAMP_FREQUENCY 51
+
 typedef struct drm_i915_getparam {
 	__s32 param;
 	/*
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index 8616131e2c61d..6d9447700e18c 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -163,6 +163,7 @@ enum {
 	IFLA_IF_NETNSID,
 	IFLA_CARRIER_UP_COUNT,
 	IFLA_CARRIER_DOWN_COUNT,
+	IFLA_NEW_IFINDEX,
 	__IFLA_MAX
 };
 
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index 8fb90a0819c39..0fb5ef9397325 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -1362,6 +1362,96 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_S390_CMMA_MIGRATION */
 #define KVM_S390_GET_CMMA_BITS      _IOWR(KVMIO, 0xb8, struct kvm_s390_cmma_log)
 #define KVM_S390_SET_CMMA_BITS      _IOW(KVMIO, 0xb9, struct kvm_s390_cmma_log)
+/* Memory Encryption Commands */
+#define KVM_MEMORY_ENCRYPT_OP      _IOWR(KVMIO, 0xba, unsigned long)
+
+struct kvm_enc_region {
+	__u64 addr;
+	__u64 size;
+};
+
+#define KVM_MEMORY_ENCRYPT_REG_REGION    _IOR(KVMIO, 0xbb, struct kvm_enc_region)
+#define KVM_MEMORY_ENCRYPT_UNREG_REGION  _IOR(KVMIO, 0xbc, struct kvm_enc_region)
+
+/* Secure Encrypted Virtualization command */
+enum sev_cmd_id {
+	/* Guest initialization commands */
+	KVM_SEV_INIT = 0,
+	KVM_SEV_ES_INIT,
+	/* Guest launch commands */
+	KVM_SEV_LAUNCH_START,
+	KVM_SEV_LAUNCH_UPDATE_DATA,
+	KVM_SEV_LAUNCH_UPDATE_VMSA,
+	KVM_SEV_LAUNCH_SECRET,
+	KVM_SEV_LAUNCH_MEASURE,
+	KVM_SEV_LAUNCH_FINISH,
+	/* Guest migration commands (outgoing) */
+	KVM_SEV_SEND_START,
+	KVM_SEV_SEND_UPDATE_DATA,
+	KVM_SEV_SEND_UPDATE_VMSA,
+	KVM_SEV_SEND_FINISH,
+	/* Guest migration commands (incoming) */
+	KVM_SEV_RECEIVE_START,
+	KVM_SEV_RECEIVE_UPDATE_DATA,
+	KVM_SEV_RECEIVE_UPDATE_VMSA,
+	KVM_SEV_RECEIVE_FINISH,
+	/* Guest status and debug commands */
+	KVM_SEV_GUEST_STATUS,
+	KVM_SEV_DBG_DECRYPT,
+	KVM_SEV_DBG_ENCRYPT,
+	/* Guest certificates commands */
+	KVM_SEV_CERT_EXPORT,
+
+	KVM_SEV_NR_MAX,
+};
+
+struct kvm_sev_cmd {
+	__u32 id;
+	__u64 data;
+	__u32 error;
+	__u32 sev_fd;
+};
+
+struct kvm_sev_launch_start {
+	__u32 handle;
+	__u32 policy;
+	__u64 dh_uaddr;
+	__u32 dh_len;
+	__u64 session_uaddr;
+	__u32 session_len;
+};
+
+struct kvm_sev_launch_update_data {
+	__u64 uaddr;
+	__u32 len;
+};
+
+
+struct kvm_sev_launch_secret {
+	__u64 hdr_uaddr;
+	__u32 hdr_len;
+	__u64 guest_uaddr;
+	__u32 guest_len;
+	__u64 trans_uaddr;
+	__u32 trans_len;
+};
+
+struct kvm_sev_launch_measure {
+	__u64 uaddr;
+	__u32 len;
+};
+
+struct kvm_sev_guest_status {
+	__u32 handle;
+	__u32 policy;
+	__u32 state;
+};
+
+struct kvm_sev_dbg {
+	__u64 src_uaddr;
+	__u64 dst_uaddr;
+	__u32 len;
+};
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)

From baa676103037e0dd145bb905eb51bc0b2f48fd49 Mon Sep 17 00:00:00 2001
From: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Date: Thu, 8 Feb 2018 12:47:49 +0100
Subject: [PATCH 647/676] perf s390: Grab a copy of
 arch/s390/kernel/syscall/syscall.tbl

Grab a copy of the s390 system call table file introduced with commit
857f46bfb07f53dc112d69bdfb137cc5ec3da7c5 "s390/syscalls: add system call
table".

Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Michael Petlan <mpetlan@redhat.com>
Cc: Thomas Richter <tmricht@linux.vnet.ibm.com>
Cc: linux-s390@vger.kernel.org
LPU-Reference: 1518090470-2899-3-git-send-email-brueckner@linux.vnet.ibm.com
Link: https://lkml.kernel.org/n/tip-hpw7vdjp7g92ivgpddrp5ydq@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 .../perf/arch/s390/entry/syscalls/syscall.tbl | 390 ++++++++++++++++++
 1 file changed, 390 insertions(+)
 create mode 100644 tools/perf/arch/s390/entry/syscalls/syscall.tbl

diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
new file mode 100644
index 0000000000000..b38d48464368d
--- /dev/null
+++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl
@@ -0,0 +1,390 @@
+# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+#
+# System call table for s390
+#
+# Format:
+#
+# <nr> <abi> <syscall> <entry-64bit> <compat-entry>
+#
+# where <abi> can be common, 64, or 32
+
+1    common	exit			sys_exit			sys_exit
+2    common	fork			sys_fork			sys_fork
+3    common	read			sys_read			compat_sys_s390_read
+4    common	write			sys_write			compat_sys_s390_write
+5    common	open			sys_open			compat_sys_open
+6    common	close			sys_close			sys_close
+7    common	restart_syscall		sys_restart_syscall		sys_restart_syscall
+8    common	creat			sys_creat			compat_sys_creat
+9    common	link			sys_link			compat_sys_link
+10   common	unlink			sys_unlink			compat_sys_unlink
+11   common	execve			sys_execve			compat_sys_execve
+12   common	chdir			sys_chdir			compat_sys_chdir
+13   32		time			-				compat_sys_time
+14   common	mknod			sys_mknod			compat_sys_mknod
+15   common	chmod			sys_chmod			compat_sys_chmod
+16   32		lchown			-				compat_sys_s390_lchown16
+19   common	lseek			sys_lseek			compat_sys_lseek
+20   common	getpid			sys_getpid			sys_getpid
+21   common	mount			sys_mount			compat_sys_mount
+22   common	umount			sys_oldumount			compat_sys_oldumount
+23   32		setuid			-				compat_sys_s390_setuid16
+24   32		getuid			-				compat_sys_s390_getuid16
+25   32		stime			-				compat_sys_stime
+26   common	ptrace			sys_ptrace			compat_sys_ptrace
+27   common	alarm			sys_alarm			sys_alarm
+29   common	pause			sys_pause			sys_pause
+30   common	utime			sys_utime			compat_sys_utime
+33   common	access			sys_access			compat_sys_access
+34   common	nice			sys_nice			sys_nice
+36   common	sync			sys_sync			sys_sync
+37   common	kill			sys_kill			sys_kill
+38   common	rename			sys_rename			compat_sys_rename
+39   common	mkdir			sys_mkdir			compat_sys_mkdir
+40   common	rmdir			sys_rmdir			compat_sys_rmdir
+41   common	dup			sys_dup				sys_dup
+42   common	pipe			sys_pipe			compat_sys_pipe
+43   common	times			sys_times			compat_sys_times
+45   common	brk			sys_brk				compat_sys_brk
+46   32		setgid			-				compat_sys_s390_setgid16
+47   32		getgid			-				compat_sys_s390_getgid16
+48   common	signal			sys_signal			compat_sys_signal
+49   32		geteuid			-				compat_sys_s390_geteuid16
+50   32		getegid			-				compat_sys_s390_getegid16
+51   common	acct			sys_acct			compat_sys_acct
+52   common	umount2			sys_umount			compat_sys_umount
+54   common	ioctl			sys_ioctl			compat_sys_ioctl
+55   common	fcntl			sys_fcntl			compat_sys_fcntl
+57   common	setpgid			sys_setpgid			sys_setpgid
+60   common	umask			sys_umask			sys_umask
+61   common	chroot			sys_chroot			compat_sys_chroot
+62   common	ustat			sys_ustat			compat_sys_ustat
+63   common	dup2			sys_dup2			sys_dup2
+64   common	getppid			sys_getppid			sys_getppid
+65   common	getpgrp			sys_getpgrp			sys_getpgrp
+66   common	setsid			sys_setsid			sys_setsid
+67   common	sigaction		sys_sigaction			compat_sys_sigaction
+70   32		setreuid		-				compat_sys_s390_setreuid16
+71   32		setregid		-				compat_sys_s390_setregid16
+72   common	sigsuspend		sys_sigsuspend			compat_sys_sigsuspend
+73   common	sigpending		sys_sigpending			compat_sys_sigpending
+74   common	sethostname		sys_sethostname			compat_sys_sethostname
+75   common	setrlimit		sys_setrlimit			compat_sys_setrlimit
+76   32		getrlimit		-				compat_sys_old_getrlimit
+77   common	getrusage		sys_getrusage			compat_sys_getrusage
+78   common	gettimeofday		sys_gettimeofday		compat_sys_gettimeofday
+79   common	settimeofday		sys_settimeofday		compat_sys_settimeofday
+80   32		getgroups		-				compat_sys_s390_getgroups16
+81   32		setgroups		-				compat_sys_s390_setgroups16
+83   common	symlink			sys_symlink			compat_sys_symlink
+85   common	readlink		sys_readlink			compat_sys_readlink
+86   common	uselib			sys_uselib			compat_sys_uselib
+87   common	swapon			sys_swapon			compat_sys_swapon
+88   common	reboot			sys_reboot			compat_sys_reboot
+89   common	readdir			-				compat_sys_old_readdir
+90   common	mmap			sys_old_mmap			compat_sys_s390_old_mmap
+91   common	munmap			sys_munmap			compat_sys_munmap
+92   common	truncate		sys_truncate			compat_sys_truncate
+93   common	ftruncate		sys_ftruncate			compat_sys_ftruncate
+94   common	fchmod			sys_fchmod			sys_fchmod
+95   32		fchown			-				compat_sys_s390_fchown16
+96   common	getpriority		sys_getpriority			sys_getpriority
+97   common	setpriority		sys_setpriority			sys_setpriority
+99   common	statfs			sys_statfs			compat_sys_statfs
+100  common	fstatfs			sys_fstatfs			compat_sys_fstatfs
+101  32		ioperm			-				-
+102  common	socketcall		sys_socketcall			compat_sys_socketcall
+103  common	syslog			sys_syslog			compat_sys_syslog
+104  common	setitimer		sys_setitimer			compat_sys_setitimer
+105  common	getitimer		sys_getitimer			compat_sys_getitimer
+106  common	stat			sys_newstat			compat_sys_newstat
+107  common	lstat			sys_newlstat			compat_sys_newlstat
+108  common	fstat			sys_newfstat			compat_sys_newfstat
+110  common	lookup_dcookie		sys_lookup_dcookie		compat_sys_lookup_dcookie
+111  common	vhangup			sys_vhangup			sys_vhangup
+112  common	idle			-				-
+114  common	wait4			sys_wait4			compat_sys_wait4
+115  common	swapoff			sys_swapoff			compat_sys_swapoff
+116  common	sysinfo			sys_sysinfo			compat_sys_sysinfo
+117  common	ipc			sys_s390_ipc			compat_sys_s390_ipc
+118  common	fsync			sys_fsync			sys_fsync
+119  common	sigreturn		sys_sigreturn			compat_sys_sigreturn
+120  common	clone			sys_clone			compat_sys_clone
+121  common	setdomainname		sys_setdomainname		compat_sys_setdomainname
+122  common	uname			sys_newuname			compat_sys_newuname
+124  common	adjtimex		sys_adjtimex			compat_sys_adjtimex
+125  common	mprotect		sys_mprotect			compat_sys_mprotect
+126  common	sigprocmask		sys_sigprocmask			compat_sys_sigprocmask
+127  common	create_module		-				-
+128  common	init_module		sys_init_module			compat_sys_init_module
+129  common	delete_module		sys_delete_module		compat_sys_delete_module
+130  common	get_kernel_syms		-				-
+131  common	quotactl		sys_quotactl			compat_sys_quotactl
+132  common	getpgid			sys_getpgid			sys_getpgid
+133  common	fchdir			sys_fchdir			sys_fchdir
+134  common	bdflush			sys_bdflush			compat_sys_bdflush
+135  common	sysfs			sys_sysfs			compat_sys_sysfs
+136  common	personality		sys_s390_personality		sys_s390_personality
+137  common	afs_syscall		-				-
+138  32		setfsuid		-				compat_sys_s390_setfsuid16
+139  32		setfsgid		-				compat_sys_s390_setfsgid16
+140  32		_llseek			-				compat_sys_llseek
+141  common	getdents		sys_getdents			compat_sys_getdents
+142  32		_newselect		-				compat_sys_select
+142  64		select			sys_select			-
+143  common	flock			sys_flock			sys_flock
+144  common	msync			sys_msync			compat_sys_msync
+145  common	readv			sys_readv			compat_sys_readv
+146  common	writev			sys_writev			compat_sys_writev
+147  common	getsid			sys_getsid			sys_getsid
+148  common	fdatasync		sys_fdatasync			sys_fdatasync
+149  common	_sysctl			sys_sysctl			compat_sys_sysctl
+150  common	mlock			sys_mlock			compat_sys_mlock
+151  common	munlock			sys_munlock			compat_sys_munlock
+152  common	mlockall		sys_mlockall			sys_mlockall
+153  common	munlockall		sys_munlockall			sys_munlockall
+154  common	sched_setparam		sys_sched_setparam		compat_sys_sched_setparam
+155  common	sched_getparam		sys_sched_getparam		compat_sys_sched_getparam
+156  common	sched_setscheduler	sys_sched_setscheduler		compat_sys_sched_setscheduler
+157  common	sched_getscheduler	sys_sched_getscheduler		sys_sched_getscheduler
+158  common	sched_yield		sys_sched_yield			sys_sched_yield
+159  common	sched_get_priority_max	sys_sched_get_priority_max	sys_sched_get_priority_max
+160  common	sched_get_priority_min	sys_sched_get_priority_min	sys_sched_get_priority_min
+161  common	sched_rr_get_interval	sys_sched_rr_get_interval	compat_sys_sched_rr_get_interval
+162  common	nanosleep		sys_nanosleep			compat_sys_nanosleep
+163  common	mremap			sys_mremap			compat_sys_mremap
+164  32		setresuid		-				compat_sys_s390_setresuid16
+165  32		getresuid		-				compat_sys_s390_getresuid16
+167  common	query_module		-				-
+168  common	poll			sys_poll			compat_sys_poll
+169  common	nfsservctl		-				-
+170  32		setresgid		-				compat_sys_s390_setresgid16
+171  32		getresgid		-				compat_sys_s390_getresgid16
+172  common	prctl			sys_prctl			compat_sys_prctl
+173  common	rt_sigreturn		sys_rt_sigreturn		compat_sys_rt_sigreturn
+174  common	rt_sigaction		sys_rt_sigaction		compat_sys_rt_sigaction
+175  common	rt_sigprocmask		sys_rt_sigprocmask		compat_sys_rt_sigprocmask
+176  common	rt_sigpending		sys_rt_sigpending		compat_sys_rt_sigpending
+177  common	rt_sigtimedwait		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait
+178  common	rt_sigqueueinfo		sys_rt_sigqueueinfo		compat_sys_rt_sigqueueinfo
+179  common	rt_sigsuspend		sys_rt_sigsuspend		compat_sys_rt_sigsuspend
+180  common	pread64			sys_pread64			compat_sys_s390_pread64
+181  common	pwrite64		sys_pwrite64			compat_sys_s390_pwrite64
+182  32		chown			-				compat_sys_s390_chown16
+183  common	getcwd			sys_getcwd			compat_sys_getcwd
+184  common	capget			sys_capget			compat_sys_capget
+185  common	capset			sys_capset			compat_sys_capset
+186  common	sigaltstack		sys_sigaltstack			compat_sys_sigaltstack
+187  common	sendfile		sys_sendfile64			compat_sys_sendfile
+188  common	getpmsg			-				-
+189  common	putpmsg			-				-
+190  common	vfork			sys_vfork			sys_vfork
+191  32		ugetrlimit		-				compat_sys_getrlimit
+191  64		getrlimit		sys_getrlimit			-
+192  32		mmap2			-				compat_sys_s390_mmap2
+193  32		truncate64		-				compat_sys_s390_truncate64
+194  32		ftruncate64		-				compat_sys_s390_ftruncate64
+195  32		stat64			-				compat_sys_s390_stat64
+196  32		lstat64			-				compat_sys_s390_lstat64
+197  32		fstat64			-				compat_sys_s390_fstat64
+198  32		lchown32		-				compat_sys_lchown
+198  64		lchown			sys_lchown			-
+199  32		getuid32		-				sys_getuid
+199  64		getuid			sys_getuid			-
+200  32		getgid32		-				sys_getgid
+200  64		getgid			sys_getgid			-
+201  32		geteuid32		-				sys_geteuid
+201  64		geteuid			sys_geteuid			-
+202  32		getegid32		-				sys_getegid
+202  64		getegid			sys_getegid			-
+203  32		setreuid32		-				sys_setreuid
+203  64		setreuid		sys_setreuid			-
+204  32		setregid32		-				sys_setregid
+204  64		setregid		sys_setregid			-
+205  32		getgroups32		-				compat_sys_getgroups
+205  64		getgroups		sys_getgroups			-
+206  32		setgroups32		-				compat_sys_setgroups
+206  64		setgroups		sys_setgroups			-
+207  32		fchown32		-				sys_fchown
+207  64		fchown			sys_fchown			-
+208  32		setresuid32		-				sys_setresuid
+208  64		setresuid		sys_setresuid			-
+209  32		getresuid32		-				compat_sys_getresuid
+209  64		getresuid		sys_getresuid			-
+210  32		setresgid32		-				sys_setresgid
+210  64		setresgid		sys_setresgid			-
+211  32		getresgid32		-				compat_sys_getresgid
+211  64		getresgid		sys_getresgid			-
+212  32		chown32			-				compat_sys_chown
+212  64		chown			sys_chown			-
+213  32		setuid32		-				sys_setuid
+213  64		setuid			sys_setuid			-
+214  32		setgid32		-				sys_setgid
+214  64		setgid			sys_setgid			-
+215  32		setfsuid32		-				sys_setfsuid
+215  64		setfsuid		sys_setfsuid			-
+216  32		setfsgid32		-				sys_setfsgid
+216  64		setfsgid		sys_setfsgid			-
+217  common	pivot_root		sys_pivot_root			compat_sys_pivot_root
+218  common	mincore			sys_mincore			compat_sys_mincore
+219  common	madvise			sys_madvise			compat_sys_madvise
+220  common	getdents64		sys_getdents64			compat_sys_getdents64
+221  32		fcntl64			-				compat_sys_fcntl64
+222  common	readahead		sys_readahead			compat_sys_s390_readahead
+223  32		sendfile64		-				compat_sys_sendfile64
+224  common	setxattr		sys_setxattr			compat_sys_setxattr
+225  common	lsetxattr		sys_lsetxattr			compat_sys_lsetxattr
+226  common	fsetxattr		sys_fsetxattr			compat_sys_fsetxattr
+227  common	getxattr		sys_getxattr			compat_sys_getxattr
+228  common	lgetxattr		sys_lgetxattr			compat_sys_lgetxattr
+229  common	fgetxattr		sys_fgetxattr			compat_sys_fgetxattr
+230  common	listxattr		sys_listxattr			compat_sys_listxattr
+231  common	llistxattr		sys_llistxattr			compat_sys_llistxattr
+232  common	flistxattr		sys_flistxattr			compat_sys_flistxattr
+233  common	removexattr		sys_removexattr			compat_sys_removexattr
+234  common	lremovexattr		sys_lremovexattr		compat_sys_lremovexattr
+235  common	fremovexattr		sys_fremovexattr		compat_sys_fremovexattr
+236  common	gettid			sys_gettid			sys_gettid
+237  common	tkill			sys_tkill			sys_tkill
+238  common	futex			sys_futex			compat_sys_futex
+239  common	sched_setaffinity	sys_sched_setaffinity		compat_sys_sched_setaffinity
+240  common	sched_getaffinity	sys_sched_getaffinity		compat_sys_sched_getaffinity
+241  common	tgkill			sys_tgkill			sys_tgkill
+243  common	io_setup		sys_io_setup			compat_sys_io_setup
+244  common	io_destroy		sys_io_destroy			compat_sys_io_destroy
+245  common	io_getevents		sys_io_getevents		compat_sys_io_getevents
+246  common	io_submit		sys_io_submit			compat_sys_io_submit
+247  common	io_cancel		sys_io_cancel			compat_sys_io_cancel
+248  common	exit_group		sys_exit_group			sys_exit_group
+249  common	epoll_create		sys_epoll_create		sys_epoll_create
+250  common	epoll_ctl		sys_epoll_ctl			compat_sys_epoll_ctl
+251  common	epoll_wait		sys_epoll_wait			compat_sys_epoll_wait
+252  common	set_tid_address		sys_set_tid_address		compat_sys_set_tid_address
+253  common	fadvise64		sys_fadvise64_64		compat_sys_s390_fadvise64
+254  common	timer_create		sys_timer_create		compat_sys_timer_create
+255  common	timer_settime		sys_timer_settime		compat_sys_timer_settime
+256  common	timer_gettime		sys_timer_gettime		compat_sys_timer_gettime
+257  common	timer_getoverrun	sys_timer_getoverrun		sys_timer_getoverrun
+258  common	timer_delete		sys_timer_delete		sys_timer_delete
+259  common	clock_settime		sys_clock_settime		compat_sys_clock_settime
+260  common	clock_gettime		sys_clock_gettime		compat_sys_clock_gettime
+261  common	clock_getres		sys_clock_getres		compat_sys_clock_getres
+262  common	clock_nanosleep		sys_clock_nanosleep		compat_sys_clock_nanosleep
+264  32		fadvise64_64		-				compat_sys_s390_fadvise64_64
+265  common	statfs64		sys_statfs64			compat_sys_statfs64
+266  common	fstatfs64		sys_fstatfs64			compat_sys_fstatfs64
+267  common	remap_file_pages	sys_remap_file_pages		compat_sys_remap_file_pages
+268  common	mbind			sys_mbind			compat_sys_mbind
+269  common	get_mempolicy		sys_get_mempolicy		compat_sys_get_mempolicy
+270  common	set_mempolicy		sys_set_mempolicy		compat_sys_set_mempolicy
+271  common	mq_open			sys_mq_open			compat_sys_mq_open
+272  common	mq_unlink		sys_mq_unlink			compat_sys_mq_unlink
+273  common	mq_timedsend		sys_mq_timedsend		compat_sys_mq_timedsend
+274  common	mq_timedreceive		sys_mq_timedreceive		compat_sys_mq_timedreceive
+275  common	mq_notify		sys_mq_notify			compat_sys_mq_notify
+276  common	mq_getsetattr		sys_mq_getsetattr		compat_sys_mq_getsetattr
+277  common	kexec_load		sys_kexec_load			compat_sys_kexec_load
+278  common	add_key			sys_add_key			compat_sys_add_key
+279  common	request_key		sys_request_key			compat_sys_request_key
+280  common	keyctl			sys_keyctl			compat_sys_keyctl
+281  common	waitid			sys_waitid			compat_sys_waitid
+282  common	ioprio_set		sys_ioprio_set			sys_ioprio_set
+283  common	ioprio_get		sys_ioprio_get			sys_ioprio_get
+284  common	inotify_init		sys_inotify_init		sys_inotify_init
+285  common	inotify_add_watch	sys_inotify_add_watch		compat_sys_inotify_add_watch
+286  common	inotify_rm_watch	sys_inotify_rm_watch		sys_inotify_rm_watch
+287  common	migrate_pages		sys_migrate_pages		compat_sys_migrate_pages
+288  common	openat			sys_openat			compat_sys_openat
+289  common	mkdirat			sys_mkdirat			compat_sys_mkdirat
+290  common	mknodat			sys_mknodat			compat_sys_mknodat
+291  common	fchownat		sys_fchownat			compat_sys_fchownat
+292  common	futimesat		sys_futimesat			compat_sys_futimesat
+293  32		fstatat64		-				compat_sys_s390_fstatat64
+293  64		newfstatat		sys_newfstatat			-
+294  common	unlinkat		sys_unlinkat			compat_sys_unlinkat
+295  common	renameat		sys_renameat			compat_sys_renameat
+296  common	linkat			sys_linkat			compat_sys_linkat
+297  common	symlinkat		sys_symlinkat			compat_sys_symlinkat
+298  common	readlinkat		sys_readlinkat			compat_sys_readlinkat
+299  common	fchmodat		sys_fchmodat			compat_sys_fchmodat
+300  common	faccessat		sys_faccessat			compat_sys_faccessat
+301  common	pselect6		sys_pselect6			compat_sys_pselect6
+302  common	ppoll			sys_ppoll			compat_sys_ppoll
+303  common	unshare			sys_unshare			compat_sys_unshare
+304  common	set_robust_list		sys_set_robust_list		compat_sys_set_robust_list
+305  common	get_robust_list		sys_get_robust_list		compat_sys_get_robust_list
+306  common	splice			sys_splice			compat_sys_splice
+307  common	sync_file_range		sys_sync_file_range		compat_sys_s390_sync_file_range
+308  common	tee			sys_tee				compat_sys_tee
+309  common	vmsplice		sys_vmsplice			compat_sys_vmsplice
+310  common	move_pages		sys_move_pages			compat_sys_move_pages
+311  common	getcpu			sys_getcpu			compat_sys_getcpu
+312  common	epoll_pwait		sys_epoll_pwait			compat_sys_epoll_pwait
+313  common	utimes			sys_utimes			compat_sys_utimes
+314  common	fallocate		sys_fallocate			compat_sys_s390_fallocate
+315  common	utimensat		sys_utimensat			compat_sys_utimensat
+316  common	signalfd		sys_signalfd			compat_sys_signalfd
+317  common	timerfd			-				-
+318  common	eventfd			sys_eventfd			sys_eventfd
+319  common	timerfd_create		sys_timerfd_create		sys_timerfd_create
+320  common	timerfd_settime		sys_timerfd_settime		compat_sys_timerfd_settime
+321  common	timerfd_gettime		sys_timerfd_gettime		compat_sys_timerfd_gettime
+322  common	signalfd4		sys_signalfd4			compat_sys_signalfd4
+323  common	eventfd2		sys_eventfd2			sys_eventfd2
+324  common	inotify_init1		sys_inotify_init1		sys_inotify_init1
+325  common	pipe2			sys_pipe2			compat_sys_pipe2
+326  common	dup3			sys_dup3			sys_dup3
+327  common	epoll_create1		sys_epoll_create1		sys_epoll_create1
+328  common	preadv			sys_preadv			compat_sys_preadv
+329  common	pwritev			sys_pwritev			compat_sys_pwritev
+330  common	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo		compat_sys_rt_tgsigqueueinfo
+331  common	perf_event_open		sys_perf_event_open		compat_sys_perf_event_open
+332  common	fanotify_init		sys_fanotify_init		sys_fanotify_init
+333  common	fanotify_mark		sys_fanotify_mark		compat_sys_fanotify_mark
+334  common	prlimit64		sys_prlimit64			compat_sys_prlimit64
+335  common	name_to_handle_at	sys_name_to_handle_at		compat_sys_name_to_handle_at
+336  common	open_by_handle_at	sys_open_by_handle_at		compat_sys_open_by_handle_at
+337  common	clock_adjtime		sys_clock_adjtime		compat_sys_clock_adjtime
+338  common	syncfs			sys_syncfs			sys_syncfs
+339  common	setns			sys_setns			sys_setns
+340  common	process_vm_readv	sys_process_vm_readv		compat_sys_process_vm_readv
+341  common	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev
+342  common	s390_runtime_instr	sys_s390_runtime_instr		sys_s390_runtime_instr
+343  common	kcmp			sys_kcmp			compat_sys_kcmp
+344  common	finit_module		sys_finit_module		compat_sys_finit_module
+345  common	sched_setattr		sys_sched_setattr		compat_sys_sched_setattr
+346  common	sched_getattr		sys_sched_getattr		compat_sys_sched_getattr
+347  common	renameat2		sys_renameat2			compat_sys_renameat2
+348  common	seccomp			sys_seccomp			compat_sys_seccomp
+349  common	getrandom		sys_getrandom			compat_sys_getrandom
+350  common	memfd_create		sys_memfd_create		compat_sys_memfd_create
+351  common	bpf			sys_bpf				compat_sys_bpf
+352  common	s390_pci_mmio_write	sys_s390_pci_mmio_write		compat_sys_s390_pci_mmio_write
+353  common	s390_pci_mmio_read	sys_s390_pci_mmio_read		compat_sys_s390_pci_mmio_read
+354  common	execveat		sys_execveat			compat_sys_execveat
+355  common	userfaultfd		sys_userfaultfd			sys_userfaultfd
+356  common	membarrier		sys_membarrier			sys_membarrier
+357  common	recvmmsg		sys_recvmmsg			compat_sys_recvmmsg
+358  common	sendmmsg		sys_sendmmsg			compat_sys_sendmmsg
+359  common	socket			sys_socket			sys_socket
+360  common	socketpair		sys_socketpair			compat_sys_socketpair
+361  common	bind			sys_bind			compat_sys_bind
+362  common	connect			sys_connect			compat_sys_connect
+363  common	listen			sys_listen			sys_listen
+364  common	accept4			sys_accept4			compat_sys_accept4
+365  common	getsockopt		sys_getsockopt			compat_sys_getsockopt
+366  common	setsockopt		sys_setsockopt			compat_sys_setsockopt
+367  common	getsockname		sys_getsockname			compat_sys_getsockname
+368  common	getpeername		sys_getpeername			compat_sys_getpeername
+369  common	sendto			sys_sendto			compat_sys_sendto
+370  common	sendmsg			sys_sendmsg			compat_sys_sendmsg
+371  common	recvfrom		sys_recvfrom			compat_sys_recvfrom
+372  common	recvmsg			sys_recvmsg			compat_sys_recvmsg
+373  common	shutdown		sys_shutdown			sys_shutdown
+374  common	mlock2			sys_mlock2			compat_sys_mlock2
+375  common	copy_file_range		sys_copy_file_range		compat_sys_copy_file_range
+376  common	preadv2			sys_preadv2			compat_sys_preadv2
+377  common	pwritev2		sys_pwritev2			compat_sys_pwritev2
+378  common	s390_guarded_storage	sys_s390_guarded_storage	compat_sys_s390_guarded_storage
+379  common	statx			sys_statx			compat_sys_statx
+380  common	s390_sthyi		sys_s390_sthyi			compat_sys_s390_sthyi

From 690d22d9d4423b4522fb44a71145403eef2df834 Mon Sep 17 00:00:00 2001
From: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Date: Thu, 8 Feb 2018 12:47:50 +0100
Subject: [PATCH 648/676] perf s390: Rework system call table creation by using
 syscall.tbl

Recently, s390 uses a syscall.tbl input file to generate its system call
table and unistd uapi header files.  Hence, update mksyscalltbl to use
it as input to create the system table for perf.

Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Michael Petlan <mpetlan@redhat.com>
Cc: Thomas Richter <tmricht@linux.vnet.ibm.com>
Cc: linux-s390@vger.kernel.org
LPU-Reference: 1518090470-2899-4-git-send-email-brueckner@linux.vnet.ibm.com
Link: https://lkml.kernel.org/n/tip-bdyhllhsq1zgxv2qx4m377y6@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/s390/Makefile                  | 10 +++++++---
 .../perf/arch/s390/entry/syscalls/mksyscalltbl | 18 +++++++-----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tools/perf/arch/s390/Makefile b/tools/perf/arch/s390/Makefile
index 48228de415d00..dfa6e31034371 100644
--- a/tools/perf/arch/s390/Makefile
+++ b/tools/perf/arch/s390/Makefile
@@ -10,15 +10,19 @@ PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1
 
 out    := $(OUTPUT)arch/s390/include/generated/asm
 header := $(out)/syscalls_64.c
-sysdef := $(srctree)/tools/arch/s390/include/uapi/asm/unistd.h
-sysprf := $(srctree)/tools/perf/arch/s390/entry/syscalls/
+syskrn := $(srctree)/arch/s390/kernel/syscalls/syscall.tbl
+sysprf := $(srctree)/tools/perf/arch/s390/entry/syscalls
+sysdef := $(sysprf)/syscall.tbl
 systbl := $(sysprf)/mksyscalltbl
 
 # Create output directory if not already present
 _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)')
 
 $(header): $(sysdef) $(systbl)
-	$(Q)$(SHELL) '$(systbl)' '$(CC)' $(sysdef) > $@
+	@(test -d ../../kernel -a -d ../../tools -a -d ../perf && ( \
+        (diff -B $(sysdef) $(syskrn) >/dev/null) \
+        || echo "Warning: Kernel ABI header at '$(sysdef)' differs from latest version at '$(syskrn)'" >&2 )) || true
+	$(Q)$(SHELL) '$(systbl)' $(sysdef) > $@
 
 clean::
 	$(call QUIET_CLEAN, s390) $(RM) $(header)
diff --git a/tools/perf/arch/s390/entry/syscalls/mksyscalltbl b/tools/perf/arch/s390/entry/syscalls/mksyscalltbl
index 7fa0d0abd4196..72ecbb6763707 100755
--- a/tools/perf/arch/s390/entry/syscalls/mksyscalltbl
+++ b/tools/perf/arch/s390/entry/syscalls/mksyscalltbl
@@ -3,25 +3,23 @@
 #
 # Generate system call table for perf
 #
-#
-# Copyright IBM Corp. 2017
+# Copyright IBM Corp. 2017, 2018
 # Author(s):  Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
 #
 
-gcc=$1
-input=$2
+SYSCALL_TBL=$1
 
-if ! test -r $input; then
+if ! test -r $SYSCALL_TBL; then
 	echo "Could not read input file" >&2
 	exit 1
 fi
 
 create_table()
 {
-	local max_nr
+	local max_nr nr abi sc discard
 
 	echo 'static const char *syscalltbl_s390_64[] = {'
-	while read sc nr; do
+	while read nr abi sc discard; do
 		printf '\t[%d] = "%s",\n' $nr $sc
 		max_nr=$nr
 	done
@@ -29,8 +27,6 @@ create_table()
 	echo "#define SYSCALLTBL_S390_64_MAX_ID $max_nr"
 }
 
-
-$gcc -m64 -E -dM -x c  $input	       \
-	|sed -ne 's/^#define __NR_//p' \
-	|sort -t' ' -k2 -nu	       \
+grep -E "^[[:digit:]]+[[:space:]]+(common|64)" $SYSCALL_TBL	\
+	|sort -k1 -n					\
 	|create_table

From f1d0b4cde922863004ce3f5f39e8662cc0686c96 Mon Sep 17 00:00:00 2001
From: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Date: Thu, 8 Feb 2018 12:47:48 +0100
Subject: [PATCH 649/676] Revert "tools include s390: Grab a copy of
 arch/s390/include/uapi/asm/unistd.h"

This reverts commit f120c7b187e6c418238710b48723ce141f467543 which is no
longer required with the introduction of a syscall.tbl on s390.

Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Michael Petlan <mpetlan@redhat.com>
Cc: Thomas Richter <tmricht@linux.vnet.ibm.com>
Cc: linux-s390@vger.kernel.org
LPU-Reference: 1518090470-2899-2-git-send-email-brueckner@linux.vnet.ibm.com
Link: https://lkml.kernel.org/n/tip-q1lg0nvhha1tk39ri9aqalcb@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/arch/s390/include/uapi/asm/unistd.h | 412 ----------------------
 tools/perf/check-headers.sh               |   1 -
 2 files changed, 413 deletions(-)
 delete mode 100644 tools/arch/s390/include/uapi/asm/unistd.h

diff --git a/tools/arch/s390/include/uapi/asm/unistd.h b/tools/arch/s390/include/uapi/asm/unistd.h
deleted file mode 100644
index 725120939051f..0000000000000
--- a/tools/arch/s390/include/uapi/asm/unistd.h
+++ /dev/null
@@ -1,412 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- *  S390 version
- *
- *  Derived from "include/asm-i386/unistd.h"
- */
-
-#ifndef _UAPI_ASM_S390_UNISTD_H_
-#define _UAPI_ASM_S390_UNISTD_H_
-
-/*
- * This file contains the system call numbers.
- */
-
-#define __NR_exit                 1
-#define __NR_fork                 2
-#define __NR_read                 3
-#define __NR_write                4
-#define __NR_open                 5
-#define __NR_close                6
-#define __NR_restart_syscall	  7
-#define __NR_creat                8
-#define __NR_link                 9
-#define __NR_unlink              10
-#define __NR_execve              11
-#define __NR_chdir               12
-#define __NR_mknod               14
-#define __NR_chmod               15
-#define __NR_lseek               19
-#define __NR_getpid              20
-#define __NR_mount               21
-#define __NR_umount              22
-#define __NR_ptrace              26
-#define __NR_alarm               27
-#define __NR_pause               29
-#define __NR_utime               30
-#define __NR_access              33
-#define __NR_nice                34
-#define __NR_sync                36
-#define __NR_kill                37
-#define __NR_rename              38
-#define __NR_mkdir               39
-#define __NR_rmdir               40
-#define __NR_dup                 41
-#define __NR_pipe                42
-#define __NR_times               43
-#define __NR_brk                 45
-#define __NR_signal              48
-#define __NR_acct                51
-#define __NR_umount2             52
-#define __NR_ioctl               54
-#define __NR_fcntl               55
-#define __NR_setpgid             57
-#define __NR_umask               60
-#define __NR_chroot              61
-#define __NR_ustat               62
-#define __NR_dup2                63
-#define __NR_getppid             64
-#define __NR_getpgrp             65
-#define __NR_setsid              66
-#define __NR_sigaction           67
-#define __NR_sigsuspend          72
-#define __NR_sigpending          73
-#define __NR_sethostname         74
-#define __NR_setrlimit           75
-#define __NR_getrusage           77
-#define __NR_gettimeofday        78
-#define __NR_settimeofday        79
-#define __NR_symlink             83
-#define __NR_readlink            85
-#define __NR_uselib              86
-#define __NR_swapon              87
-#define __NR_reboot              88
-#define __NR_readdir             89
-#define __NR_mmap                90
-#define __NR_munmap              91
-#define __NR_truncate            92
-#define __NR_ftruncate           93
-#define __NR_fchmod              94
-#define __NR_getpriority         96
-#define __NR_setpriority         97
-#define __NR_statfs              99
-#define __NR_fstatfs            100
-#define __NR_socketcall         102
-#define __NR_syslog             103
-#define __NR_setitimer          104
-#define __NR_getitimer          105
-#define __NR_stat               106
-#define __NR_lstat              107
-#define __NR_fstat              108
-#define __NR_lookup_dcookie     110
-#define __NR_vhangup            111
-#define __NR_idle               112
-#define __NR_wait4              114
-#define __NR_swapoff            115
-#define __NR_sysinfo            116
-#define __NR_ipc                117
-#define __NR_fsync              118
-#define __NR_sigreturn          119
-#define __NR_clone              120
-#define __NR_setdomainname      121
-#define __NR_uname              122
-#define __NR_adjtimex           124
-#define __NR_mprotect           125
-#define __NR_sigprocmask        126
-#define __NR_create_module      127
-#define __NR_init_module        128
-#define __NR_delete_module      129
-#define __NR_get_kernel_syms    130
-#define __NR_quotactl           131
-#define __NR_getpgid            132
-#define __NR_fchdir             133
-#define __NR_bdflush            134
-#define __NR_sysfs              135
-#define __NR_personality        136
-#define __NR_afs_syscall        137 /* Syscall for Andrew File System */
-#define __NR_getdents           141
-#define __NR_flock              143
-#define __NR_msync              144
-#define __NR_readv              145
-#define __NR_writev             146
-#define __NR_getsid             147
-#define __NR_fdatasync          148
-#define __NR__sysctl            149
-#define __NR_mlock              150
-#define __NR_munlock            151
-#define __NR_mlockall           152
-#define __NR_munlockall         153
-#define __NR_sched_setparam             154
-#define __NR_sched_getparam             155
-#define __NR_sched_setscheduler         156
-#define __NR_sched_getscheduler         157
-#define __NR_sched_yield                158
-#define __NR_sched_get_priority_max     159
-#define __NR_sched_get_priority_min     160
-#define __NR_sched_rr_get_interval      161
-#define __NR_nanosleep          162
-#define __NR_mremap             163
-#define __NR_query_module       167
-#define __NR_poll               168
-#define __NR_nfsservctl         169
-#define __NR_prctl              172
-#define __NR_rt_sigreturn       173
-#define __NR_rt_sigaction       174
-#define __NR_rt_sigprocmask     175
-#define __NR_rt_sigpending      176
-#define __NR_rt_sigtimedwait    177
-#define __NR_rt_sigqueueinfo    178
-#define __NR_rt_sigsuspend      179
-#define __NR_pread64            180
-#define __NR_pwrite64           181
-#define __NR_getcwd             183
-#define __NR_capget             184
-#define __NR_capset             185
-#define __NR_sigaltstack        186
-#define __NR_sendfile           187
-#define __NR_getpmsg		188
-#define __NR_putpmsg		189
-#define __NR_vfork		190
-#define __NR_pivot_root         217
-#define __NR_mincore            218
-#define __NR_madvise            219
-#define __NR_getdents64		220
-#define __NR_readahead		222
-#define __NR_setxattr		224
-#define __NR_lsetxattr		225
-#define __NR_fsetxattr		226
-#define __NR_getxattr		227
-#define __NR_lgetxattr		228
-#define __NR_fgetxattr		229
-#define __NR_listxattr		230
-#define __NR_llistxattr		231
-#define __NR_flistxattr		232
-#define __NR_removexattr	233
-#define __NR_lremovexattr	234
-#define __NR_fremovexattr	235
-#define __NR_gettid		236
-#define __NR_tkill		237
-#define __NR_futex		238
-#define __NR_sched_setaffinity	239
-#define __NR_sched_getaffinity	240
-#define __NR_tgkill		241
-/* Number 242 is reserved for tux */
-#define __NR_io_setup		243
-#define __NR_io_destroy		244
-#define __NR_io_getevents	245
-#define __NR_io_submit		246
-#define __NR_io_cancel		247
-#define __NR_exit_group		248
-#define __NR_epoll_create	249
-#define __NR_epoll_ctl		250
-#define __NR_epoll_wait		251
-#define __NR_set_tid_address	252
-#define __NR_fadvise64		253
-#define __NR_timer_create	254
-#define __NR_timer_settime	255
-#define __NR_timer_gettime	256
-#define __NR_timer_getoverrun	257
-#define __NR_timer_delete	258
-#define __NR_clock_settime	259
-#define __NR_clock_gettime	260
-#define __NR_clock_getres	261
-#define __NR_clock_nanosleep	262
-/* Number 263 is reserved for vserver */
-#define __NR_statfs64		265
-#define __NR_fstatfs64		266
-#define __NR_remap_file_pages	267
-#define __NR_mbind		268
-#define __NR_get_mempolicy	269
-#define __NR_set_mempolicy	270
-#define __NR_mq_open		271
-#define __NR_mq_unlink		272
-#define __NR_mq_timedsend	273
-#define __NR_mq_timedreceive	274
-#define __NR_mq_notify		275
-#define __NR_mq_getsetattr	276
-#define __NR_kexec_load		277
-#define __NR_add_key		278
-#define __NR_request_key	279
-#define __NR_keyctl		280
-#define __NR_waitid		281
-#define __NR_ioprio_set		282
-#define __NR_ioprio_get		283
-#define __NR_inotify_init	284
-#define __NR_inotify_add_watch	285
-#define __NR_inotify_rm_watch	286
-#define __NR_migrate_pages	287
-#define __NR_openat		288
-#define __NR_mkdirat		289
-#define __NR_mknodat		290
-#define __NR_fchownat		291
-#define __NR_futimesat		292
-#define __NR_unlinkat		294
-#define __NR_renameat		295
-#define __NR_linkat		296
-#define __NR_symlinkat		297
-#define __NR_readlinkat		298
-#define __NR_fchmodat		299
-#define __NR_faccessat		300
-#define __NR_pselect6		301
-#define __NR_ppoll		302
-#define __NR_unshare		303
-#define __NR_set_robust_list	304
-#define __NR_get_robust_list	305
-#define __NR_splice		306
-#define __NR_sync_file_range	307
-#define __NR_tee		308
-#define __NR_vmsplice		309
-#define __NR_move_pages		310
-#define __NR_getcpu		311
-#define __NR_epoll_pwait	312
-#define __NR_utimes		313
-#define __NR_fallocate		314
-#define __NR_utimensat		315
-#define __NR_signalfd		316
-#define __NR_timerfd		317
-#define __NR_eventfd		318
-#define __NR_timerfd_create	319
-#define __NR_timerfd_settime	320
-#define __NR_timerfd_gettime	321
-#define __NR_signalfd4		322
-#define __NR_eventfd2		323
-#define __NR_inotify_init1	324
-#define __NR_pipe2		325
-#define __NR_dup3		326
-#define __NR_epoll_create1	327
-#define	__NR_preadv		328
-#define	__NR_pwritev		329
-#define __NR_rt_tgsigqueueinfo	330
-#define __NR_perf_event_open	331
-#define __NR_fanotify_init	332
-#define __NR_fanotify_mark	333
-#define __NR_prlimit64		334
-#define __NR_name_to_handle_at	335
-#define __NR_open_by_handle_at	336
-#define __NR_clock_adjtime	337
-#define __NR_syncfs		338
-#define __NR_setns		339
-#define __NR_process_vm_readv	340
-#define __NR_process_vm_writev	341
-#define __NR_s390_runtime_instr 342
-#define __NR_kcmp		343
-#define __NR_finit_module	344
-#define __NR_sched_setattr	345
-#define __NR_sched_getattr	346
-#define __NR_renameat2		347
-#define __NR_seccomp		348
-#define __NR_getrandom		349
-#define __NR_memfd_create	350
-#define __NR_bpf		351
-#define __NR_s390_pci_mmio_write	352
-#define __NR_s390_pci_mmio_read		353
-#define __NR_execveat		354
-#define __NR_userfaultfd	355
-#define __NR_membarrier		356
-#define __NR_recvmmsg		357
-#define __NR_sendmmsg		358
-#define __NR_socket		359
-#define __NR_socketpair		360
-#define __NR_bind		361
-#define __NR_connect		362
-#define __NR_listen		363
-#define __NR_accept4		364
-#define __NR_getsockopt		365
-#define __NR_setsockopt		366
-#define __NR_getsockname	367
-#define __NR_getpeername	368
-#define __NR_sendto		369
-#define __NR_sendmsg		370
-#define __NR_recvfrom		371
-#define __NR_recvmsg		372
-#define __NR_shutdown		373
-#define __NR_mlock2		374
-#define __NR_copy_file_range	375
-#define __NR_preadv2		376
-#define __NR_pwritev2		377
-#define __NR_s390_guarded_storage	378
-#define __NR_statx		379
-#define __NR_s390_sthyi		380
-#define NR_syscalls 381
-
-/* 
- * There are some system calls that are not present on 64 bit, some
- * have a different name although they do the same (e.g. __NR_chown32
- * is __NR_chown on 64 bit).
- */
-#ifndef __s390x__
-
-#define __NR_time		 13
-#define __NR_lchown		 16
-#define __NR_setuid		 23
-#define __NR_getuid		 24
-#define __NR_stime		 25
-#define __NR_setgid		 46
-#define __NR_getgid		 47
-#define __NR_geteuid		 49
-#define __NR_getegid		 50
-#define __NR_setreuid		 70
-#define __NR_setregid		 71
-#define __NR_getrlimit		 76
-#define __NR_getgroups		 80
-#define __NR_setgroups		 81
-#define __NR_fchown		 95
-#define __NR_ioperm		101
-#define __NR_setfsuid		138
-#define __NR_setfsgid		139
-#define __NR__llseek		140
-#define __NR__newselect 	142
-#define __NR_setresuid		164
-#define __NR_getresuid		165
-#define __NR_setresgid		170
-#define __NR_getresgid		171
-#define __NR_chown		182
-#define __NR_ugetrlimit		191	/* SuS compliant getrlimit */
-#define __NR_mmap2		192
-#define __NR_truncate64		193
-#define __NR_ftruncate64	194
-#define __NR_stat64		195
-#define __NR_lstat64		196
-#define __NR_fstat64		197
-#define __NR_lchown32		198
-#define __NR_getuid32		199
-#define __NR_getgid32		200
-#define __NR_geteuid32		201
-#define __NR_getegid32		202
-#define __NR_setreuid32		203
-#define __NR_setregid32		204
-#define __NR_getgroups32	205
-#define __NR_setgroups32	206
-#define __NR_fchown32		207
-#define __NR_setresuid32	208
-#define __NR_getresuid32	209
-#define __NR_setresgid32	210
-#define __NR_getresgid32	211
-#define __NR_chown32		212
-#define __NR_setuid32		213
-#define __NR_setgid32		214
-#define __NR_setfsuid32		215
-#define __NR_setfsgid32		216
-#define __NR_fcntl64		221
-#define __NR_sendfile64		223
-#define __NR_fadvise64_64	264
-#define __NR_fstatat64		293
-
-#else
-
-#define __NR_select		142
-#define __NR_getrlimit		191	/* SuS compliant getrlimit */
-#define __NR_lchown  		198
-#define __NR_getuid  		199
-#define __NR_getgid  		200
-#define __NR_geteuid  		201
-#define __NR_getegid  		202
-#define __NR_setreuid  		203
-#define __NR_setregid  		204
-#define __NR_getgroups  	205
-#define __NR_setgroups  	206
-#define __NR_fchown  		207
-#define __NR_setresuid  	208
-#define __NR_getresuid  	209
-#define __NR_setresgid  	210
-#define __NR_getresgid  	211
-#define __NR_chown  		212
-#define __NR_setuid  		213
-#define __NR_setgid  		214
-#define __NR_setfsuid  		215
-#define __NR_setfsgid  		216
-#define __NR_newfstatat		293
-
-#endif
-
-#endif /* _UAPI_ASM_S390_UNISTD_H_ */
diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh
index 51abdb0a40474..790ec25919a0e 100755
--- a/tools/perf/check-headers.sh
+++ b/tools/perf/check-headers.sh
@@ -33,7 +33,6 @@ arch/s390/include/uapi/asm/kvm.h
 arch/s390/include/uapi/asm/kvm_perf.h
 arch/s390/include/uapi/asm/ptrace.h
 arch/s390/include/uapi/asm/sie.h
-arch/s390/include/uapi/asm/unistd.h
 arch/arm/include/uapi/asm/kvm.h
 arch/arm64/include/uapi/asm/kvm.h
 arch/alpha/include/uapi/asm/errno.h

From 096392e0714d3a520366ba467e215edf7280acff Mon Sep 17 00:00:00 2001
From: Minwoo Im <minwoo.im.dev@gmail.com>
Date: Thu, 15 Feb 2018 23:53:17 +0900
Subject: [PATCH 650/676] block: fix a typo in comment of
 BLK_MQ_POLL_STATS_BKTS

Update comment typo _consisitent_ to _consistent_ from following commit.
commit 0206319fdfee ("blk-mq: Fix poll_stat for new size-based bucketing.")

Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Minwoo Im <minwoo.im.dev@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4f3df807cf8f7..ed63f3b69c12b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -49,7 +49,7 @@ struct blk_stat_callback;
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
 
-/* Must be consisitent with blk_mq_poll_stats_bkt() */
+/* Must be consistent with blk_mq_poll_stats_bkt() */
 #define BLK_MQ_POLL_STATS_BKTS 16
 
 /*

From ec897569ad7dbc6d595873a487c3fac23f463f76 Mon Sep 17 00:00:00 2001
From: James Hogan <jhogan@kernel.org>
Date: Wed, 31 Jan 2018 22:24:45 +0000
Subject: [PATCH 651/676] usb: Move USB_UHCI_BIG_ENDIAN_* out of USB_SUPPORT

Move the Kconfig symbols USB_UHCI_BIG_ENDIAN_MMIO and
USB_UHCI_BIG_ENDIAN_DESC out of drivers/usb/host/Kconfig, which is
conditional upon USB && USB_SUPPORT, so that it can be freely selected
by platform Kconfig symbols in architecture code.

For example once the MIPS_GENERIC platform selects are fixed in commit
2e6522c56552 ("MIPS: Fix typo BIG_ENDIAN to CPU_BIG_ENDIAN"), the MIPS
32r6_defconfig warns like so:

warning: (MIPS_GENERIC) selects USB_UHCI_BIG_ENDIAN_MMIO which has unmet direct dependencies (USB_SUPPORT && USB)
warning: (MIPS_GENERIC) selects USB_UHCI_BIG_ENDIAN_DESC which has unmet direct dependencies (USB_SUPPORT && USB)

Fixes: 2e6522c56552 ("MIPS: Fix typo BIG_ENDIAN to CPU_BIG_ENDIAN")
Signed-off-by: James Hogan <jhogan@kernel.org>
Cc: Corentin Labbe <clabbe.montjoie@gmail.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paul Burton <paul.burton@mips.com>
Cc: linux-usb@vger.kernel.org
Cc: linux-mips@linux-mips.org
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Patchwork: https://patchwork.linux-mips.org/patch/18559/
---
 drivers/usb/Kconfig      | 8 ++++++++
 drivers/usb/host/Kconfig | 8 --------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/usb/Kconfig b/drivers/usb/Kconfig
index f699abab17875..65812a2f60b4e 100644
--- a/drivers/usb/Kconfig
+++ b/drivers/usb/Kconfig
@@ -19,6 +19,14 @@ config USB_EHCI_BIG_ENDIAN_MMIO
 config USB_EHCI_BIG_ENDIAN_DESC
 	bool
 
+config USB_UHCI_BIG_ENDIAN_MMIO
+	bool
+	default y if SPARC_LEON
+
+config USB_UHCI_BIG_ENDIAN_DESC
+	bool
+	default y if SPARC_LEON
+
 menuconfig USB_SUPPORT
 	bool "USB support"
 	depends on HAS_IOMEM
diff --git a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig
index 6150bed7cfa80..4fcfb3084b368 100644
--- a/drivers/usb/host/Kconfig
+++ b/drivers/usb/host/Kconfig
@@ -633,14 +633,6 @@ config USB_UHCI_ASPEED
        bool
        default y if ARCH_ASPEED
 
-config USB_UHCI_BIG_ENDIAN_MMIO
-	bool
-	default y if SPARC_LEON
-
-config USB_UHCI_BIG_ENDIAN_DESC
-	bool
-	default y if SPARC_LEON
-
 config USB_FHCI_HCD
 	tristate "Freescale QE USB Host Controller support"
 	depends on OF_GPIO && QE_GPIO && QUICC_ENGINE

From 5efad9eee33ee5fc4bf3059f74f3932a638534d1 Mon Sep 17 00:00:00 2001
From: James Hogan <jhogan@kernel.org>
Date: Wed, 31 Jan 2018 22:24:46 +0000
Subject: [PATCH 652/676] sparc,leon: Select USB_UHCI_BIG_ENDIAN_{MMIO,DESC}

Now that USB_UHCI_BIG_ENDIAN_MMIO and USB_UHCI_BIG_ENDIAN_DESC are moved
outside of the USB_SUPPORT conditional, simply select them from
SPARC_LEON rather than by the symbol's defaults in drivers/usb/Kconfig,
similar to how it is done for USB_EHCI_BIG_ENDIAN_MMIO and
USB_EHCI_BIG_ENDIAN_DESC.

Signed-off-by: James Hogan <jhogan@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Corentin Labbe <clabbe.montjoie@gmail.com>
Cc: sparclinux@vger.kernel.org
Cc: linux-usb@vger.kernel.org
Acked-by: David S. Miller <davem@davemloft.net>
Patchwork: https://patchwork.linux-mips.org/patch/18560/
---
 arch/sparc/Kconfig  | 2 ++
 drivers/usb/Kconfig | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 6bf594ace663e..8767e45f1b2b7 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -430,6 +430,8 @@ config SPARC_LEON
 	depends on SPARC32
 	select USB_EHCI_BIG_ENDIAN_MMIO
 	select USB_EHCI_BIG_ENDIAN_DESC
+	select USB_UHCI_BIG_ENDIAN_MMIO
+	select USB_UHCI_BIG_ENDIAN_DESC
 	---help---
 	  If you say Y here if you are running on a SPARC-LEON processor.
 	  The LEON processor is a synthesizable VHDL model of the
diff --git a/drivers/usb/Kconfig b/drivers/usb/Kconfig
index 65812a2f60b4e..148f3ee702868 100644
--- a/drivers/usb/Kconfig
+++ b/drivers/usb/Kconfig
@@ -21,11 +21,9 @@ config USB_EHCI_BIG_ENDIAN_DESC
 
 config USB_UHCI_BIG_ENDIAN_MMIO
 	bool
-	default y if SPARC_LEON
 
 config USB_UHCI_BIG_ENDIAN_DESC
 	bool
-	default y if SPARC_LEON
 
 menuconfig USB_SUPPORT
 	bool "USB support"

From 92256269893e96e5f9e8ac6dd882a0bef63fcea7 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Wed, 7 Feb 2018 18:40:27 +0100
Subject: [PATCH 653/676] drm/nouveau: Make clock gate support conditional

The recently introduced clock gate support breaks on Tegra chips because
no thermal support is enabled for those devices. Conditionalize the code
on the existence of thermal support to fix this.

Fixes: b138eca661cc ("drm/nouveau: Add support for basic clockgating on Kepler1")
Cc: Martin Peres <martin.peres@free.fr>
Cc: Lyude Paul <lyude@redhat.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
Reviewed-by: Lyude Paul <lyude@redhat.com>
Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
---
 drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
index bf62303571b39..3695cde669f88 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c
@@ -301,7 +301,7 @@ nvkm_therm_attr_set(struct nvkm_therm *therm,
 void
 nvkm_therm_clkgate_enable(struct nvkm_therm *therm)
 {
-	if (!therm->func->clkgate_enable || !therm->clkgating_enabled)
+	if (!therm || !therm->func->clkgate_enable || !therm->clkgating_enabled)
 		return;
 
 	nvkm_debug(&therm->subdev,
@@ -312,7 +312,7 @@ nvkm_therm_clkgate_enable(struct nvkm_therm *therm)
 void
 nvkm_therm_clkgate_fini(struct nvkm_therm *therm, bool suspend)
 {
-	if (!therm->func->clkgate_fini || !therm->clkgating_enabled)
+	if (!therm || !therm->func->clkgate_fini || !therm->clkgating_enabled)
 		return;
 
 	nvkm_debug(&therm->subdev,
@@ -395,7 +395,7 @@ void
 nvkm_therm_clkgate_init(struct nvkm_therm *therm,
 			const struct nvkm_therm_clkgate_pack *p)
 {
-	if (!therm->func->clkgate_init || !therm->clkgating_enabled)
+	if (!therm || !therm->func->clkgate_init || !therm->clkgating_enabled)
 		return;
 
 	therm->func->clkgate_init(therm, p);

From 12310e3437554328bcd75186cf331bc712cb30b2 Mon Sep 17 00:00:00 2001
From: Jessica Yu <jeyu@kernel.org>
Date: Wed, 10 Jan 2018 00:51:23 +0100
Subject: [PATCH 654/676] kprobes: Propagate error from arm_kprobe_ftrace()

Improve error handling when arming ftrace-based kprobes. Specifically, if
we fail to arm a ftrace-based kprobe, register_kprobe()/enable_kprobe()
should report an error instead of success. Previously, this has lead to
confusing situations where register_kprobe() would return 0 indicating
success, but the kprobe would not be functional if ftrace registration
during the kprobe arming process had failed. We should therefore take any
errors returned by ftrace into account and propagate this error so that we
do not register/enable kprobes that cannot be armed. This can happen if,
for example, register_ftrace_function() finds an IPMODIFY conflict (since
kprobe_ftrace_ops has this flag set) and returns an error. Such a conflict
is possible since livepatches also set the IPMODIFY flag for their ftrace_ops.

arm_all_kprobes() keeps its current behavior and attempts to arm all
kprobes. It returns the last encountered error and gives a warning if
not all probes could be armed.

This patch is based on Petr Mladek's original patchset (patches 2 and 3)
back in 2015, which improved kprobes error handling, found here:

   https://lkml.org/lkml/2015/2/26/452

However, further work on this had been paused since then and the patches
were not upstreamed.

Based-on-patches-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S . Miller <davem@davemloft.net>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Joe Lawrence <joe.lawrence@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Miroslav Benes <mbenes@suse.cz>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: live-patching@vger.kernel.org
Link: http://lkml.kernel.org/r/20180109235124.30886-2-jeyu@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/kprobes.c | 100 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 75 insertions(+), 25 deletions(-)

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index da2ccf1423581..2d988141ab859 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -978,18 +978,36 @@ static int prepare_kprobe(struct kprobe *p)
 }
 
 /* Caller must lock kprobe_mutex */
-static void arm_kprobe_ftrace(struct kprobe *p)
+static int arm_kprobe_ftrace(struct kprobe *p)
 {
-	int ret;
+	int ret = 0;
 
 	ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
 				   (unsigned long)p->addr, 0, 0);
-	WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret);
-	kprobe_ftrace_enabled++;
-	if (kprobe_ftrace_enabled == 1) {
+	if (ret) {
+		pr_debug("Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret);
+		return ret;
+	}
+
+	if (kprobe_ftrace_enabled == 0) {
 		ret = register_ftrace_function(&kprobe_ftrace_ops);
-		WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
+		if (ret) {
+			pr_debug("Failed to init kprobe-ftrace (%d)\n", ret);
+			goto err_ftrace;
+		}
 	}
+
+	kprobe_ftrace_enabled++;
+	return ret;
+
+err_ftrace:
+	/*
+	 * Note: Since kprobe_ftrace_ops has IPMODIFY set, and ftrace requires a
+	 * non-empty filter_hash for IPMODIFY ops, we're safe from an accidental
+	 * empty filter_hash which would undesirably trace all functions.
+	 */
+	ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0);
+	return ret;
 }
 
 /* Caller must lock kprobe_mutex */
@@ -1008,22 +1026,23 @@ static void disarm_kprobe_ftrace(struct kprobe *p)
 }
 #else	/* !CONFIG_KPROBES_ON_FTRACE */
 #define prepare_kprobe(p)	arch_prepare_kprobe(p)
-#define arm_kprobe_ftrace(p)	do {} while (0)
+#define arm_kprobe_ftrace(p)	(-ENODEV)
 #define disarm_kprobe_ftrace(p)	do {} while (0)
 #endif
 
 /* Arm a kprobe with text_mutex */
-static void arm_kprobe(struct kprobe *kp)
+static int arm_kprobe(struct kprobe *kp)
 {
-	if (unlikely(kprobe_ftrace(kp))) {
-		arm_kprobe_ftrace(kp);
-		return;
-	}
+	if (unlikely(kprobe_ftrace(kp)))
+		return arm_kprobe_ftrace(kp);
+
 	cpus_read_lock();
 	mutex_lock(&text_mutex);
 	__arm_kprobe(kp);
 	mutex_unlock(&text_mutex);
 	cpus_read_unlock();
+
+	return 0;
 }
 
 /* Disarm a kprobe with text_mutex */
@@ -1362,9 +1381,15 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
 
 	if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {
 		ap->flags &= ~KPROBE_FLAG_DISABLED;
-		if (!kprobes_all_disarmed)
+		if (!kprobes_all_disarmed) {
 			/* Arm the breakpoint again. */
-			arm_kprobe(ap);
+			ret = arm_kprobe(ap);
+			if (ret) {
+				ap->flags |= KPROBE_FLAG_DISABLED;
+				list_del_rcu(&p->list);
+				synchronize_sched();
+			}
+		}
 	}
 	return ret;
 }
@@ -1573,8 +1598,14 @@ int register_kprobe(struct kprobe *p)
 	hlist_add_head_rcu(&p->hlist,
 		       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
 
-	if (!kprobes_all_disarmed && !kprobe_disabled(p))
-		arm_kprobe(p);
+	if (!kprobes_all_disarmed && !kprobe_disabled(p)) {
+		ret = arm_kprobe(p);
+		if (ret) {
+			hlist_del_rcu(&p->hlist);
+			synchronize_sched();
+			goto out;
+		}
+	}
 
 	/* Try to optimize kprobe */
 	try_to_optimize_kprobe(p);
@@ -2116,7 +2147,9 @@ int enable_kprobe(struct kprobe *kp)
 
 	if (!kprobes_all_disarmed && kprobe_disabled(p)) {
 		p->flags &= ~KPROBE_FLAG_DISABLED;
-		arm_kprobe(p);
+		ret = arm_kprobe(p);
+		if (ret)
+			p->flags |= KPROBE_FLAG_DISABLED;
 	}
 out:
 	mutex_unlock(&kprobe_mutex);
@@ -2407,11 +2440,12 @@ static const struct file_operations debugfs_kprobe_blacklist_ops = {
 	.release        = seq_release,
 };
 
-static void arm_all_kprobes(void)
+static int arm_all_kprobes(void)
 {
 	struct hlist_head *head;
 	struct kprobe *p;
-	unsigned int i;
+	unsigned int i, total = 0, errors = 0;
+	int err, ret = 0;
 
 	mutex_lock(&kprobe_mutex);
 
@@ -2428,16 +2462,28 @@ static void arm_all_kprobes(void)
 	/* Arming kprobes doesn't optimize kprobe itself */
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		head = &kprobe_table[i];
-		hlist_for_each_entry_rcu(p, head, hlist)
-			if (!kprobe_disabled(p))
-				arm_kprobe(p);
+		/* Arm all kprobes on a best-effort basis */
+		hlist_for_each_entry_rcu(p, head, hlist) {
+			if (!kprobe_disabled(p)) {
+				err = arm_kprobe(p);
+				if (err)  {
+					errors++;
+					ret = err;
+				}
+				total++;
+			}
+		}
 	}
 
-	printk(KERN_INFO "Kprobes globally enabled\n");
+	if (errors)
+		pr_warn("Kprobes globally enabled, but failed to arm %d out of %d probes\n",
+			errors, total);
+	else
+		pr_info("Kprobes globally enabled\n");
 
 already_enabled:
 	mutex_unlock(&kprobe_mutex);
-	return;
+	return ret;
 }
 
 static void disarm_all_kprobes(void)
@@ -2494,6 +2540,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
 {
 	char buf[32];
 	size_t buf_size;
+	int ret = 0;
 
 	buf_size = min(count, (sizeof(buf)-1));
 	if (copy_from_user(buf, user_buf, buf_size))
@@ -2504,7 +2551,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
 	case 'y':
 	case 'Y':
 	case '1':
-		arm_all_kprobes();
+		ret = arm_all_kprobes();
 		break;
 	case 'n':
 	case 'N':
@@ -2515,6 +2562,9 @@ static ssize_t write_enabled_file_bool(struct file *file,
 		return -EINVAL;
 	}
 
+	if (ret)
+		return ret;
+
 	return count;
 }
 

From 297f9233b53a08fd457815e19f1d6f2c3389857b Mon Sep 17 00:00:00 2001
From: Jessica Yu <jeyu@kernel.org>
Date: Wed, 10 Jan 2018 00:51:24 +0100
Subject: [PATCH 655/676] kprobes: Propagate error from disarm_kprobe_ftrace()

Improve error handling when disarming ftrace-based kprobes. Like with
arm_kprobe_ftrace(), propagate any errors from disarm_kprobe_ftrace() so
that we do not disable/unregister kprobes that are still armed. In other
words, unregister_kprobe() and disable_kprobe() should not report success
if the kprobe could not be disarmed.

disarm_all_kprobes() keeps its current behavior and attempts to
disarm all kprobes. It returns the last encountered error and gives a
warning if not all probes could be disarmed.

This patch is based on Petr Mladek's original patchset (patches 2 and 3)
back in 2015, which improved kprobes error handling, found here:

   https://lkml.org/lkml/2015/2/26/452

However, further work on this had been paused since then and the patches
were not upstreamed.

Based-on-patches-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S . Miller <davem@davemloft.net>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Joe Lawrence <joe.lawrence@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Miroslav Benes <mbenes@suse.cz>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: live-patching@vger.kernel.org
Link: http://lkml.kernel.org/r/20180109235124.30886-3-jeyu@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/kprobes.c | 78 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 53 insertions(+), 25 deletions(-)

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 2d988141ab859..102160ff5c661 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1011,23 +1011,27 @@ static int arm_kprobe_ftrace(struct kprobe *p)
 }
 
 /* Caller must lock kprobe_mutex */
-static void disarm_kprobe_ftrace(struct kprobe *p)
+static int disarm_kprobe_ftrace(struct kprobe *p)
 {
-	int ret;
+	int ret = 0;
 
-	kprobe_ftrace_enabled--;
-	if (kprobe_ftrace_enabled == 0) {
+	if (kprobe_ftrace_enabled == 1) {
 		ret = unregister_ftrace_function(&kprobe_ftrace_ops);
-		WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
+		if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret))
+			return ret;
 	}
+
+	kprobe_ftrace_enabled--;
+
 	ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
 			   (unsigned long)p->addr, 1, 0);
 	WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
+	return ret;
 }
 #else	/* !CONFIG_KPROBES_ON_FTRACE */
 #define prepare_kprobe(p)	arch_prepare_kprobe(p)
 #define arm_kprobe_ftrace(p)	(-ENODEV)
-#define disarm_kprobe_ftrace(p)	do {} while (0)
+#define disarm_kprobe_ftrace(p)	(-ENODEV)
 #endif
 
 /* Arm a kprobe with text_mutex */
@@ -1046,18 +1050,18 @@ static int arm_kprobe(struct kprobe *kp)
 }
 
 /* Disarm a kprobe with text_mutex */
-static void disarm_kprobe(struct kprobe *kp, bool reopt)
+static int disarm_kprobe(struct kprobe *kp, bool reopt)
 {
-	if (unlikely(kprobe_ftrace(kp))) {
-		disarm_kprobe_ftrace(kp);
-		return;
-	}
+	if (unlikely(kprobe_ftrace(kp)))
+		return disarm_kprobe_ftrace(kp);
 
 	cpus_read_lock();
 	mutex_lock(&text_mutex);
 	__disarm_kprobe(kp, reopt);
 	mutex_unlock(&text_mutex);
 	cpus_read_unlock();
+
+	return 0;
 }
 
 /*
@@ -1639,11 +1643,12 @@ static int aggr_kprobe_disabled(struct kprobe *ap)
 static struct kprobe *__disable_kprobe(struct kprobe *p)
 {
 	struct kprobe *orig_p;
+	int ret;
 
 	/* Get an original kprobe for return */
 	orig_p = __get_valid_kprobe(p);
 	if (unlikely(orig_p == NULL))
-		return NULL;
+		return ERR_PTR(-EINVAL);
 
 	if (!kprobe_disabled(p)) {
 		/* Disable probe if it is a child probe */
@@ -1657,8 +1662,13 @@ static struct kprobe *__disable_kprobe(struct kprobe *p)
 			 * should have already been disarmed, so
 			 * skip unneed disarming process.
 			 */
-			if (!kprobes_all_disarmed)
-				disarm_kprobe(orig_p, true);
+			if (!kprobes_all_disarmed) {
+				ret = disarm_kprobe(orig_p, true);
+				if (ret) {
+					p->flags &= ~KPROBE_FLAG_DISABLED;
+					return ERR_PTR(ret);
+				}
+			}
 			orig_p->flags |= KPROBE_FLAG_DISABLED;
 		}
 	}
@@ -1675,8 +1685,8 @@ static int __unregister_kprobe_top(struct kprobe *p)
 
 	/* Disable kprobe. This will disarm it if needed. */
 	ap = __disable_kprobe(p);
-	if (ap == NULL)
-		return -EINVAL;
+	if (IS_ERR(ap))
+		return PTR_ERR(ap);
 
 	if (ap == p)
 		/*
@@ -2109,12 +2119,14 @@ static void kill_kprobe(struct kprobe *p)
 int disable_kprobe(struct kprobe *kp)
 {
 	int ret = 0;
+	struct kprobe *p;
 
 	mutex_lock(&kprobe_mutex);
 
 	/* Disable this kprobe */
-	if (__disable_kprobe(kp) == NULL)
-		ret = -EINVAL;
+	p = __disable_kprobe(kp);
+	if (IS_ERR(p))
+		ret = PTR_ERR(p);
 
 	mutex_unlock(&kprobe_mutex);
 	return ret;
@@ -2486,34 +2498,50 @@ static int arm_all_kprobes(void)
 	return ret;
 }
 
-static void disarm_all_kprobes(void)
+static int disarm_all_kprobes(void)
 {
 	struct hlist_head *head;
 	struct kprobe *p;
-	unsigned int i;
+	unsigned int i, total = 0, errors = 0;
+	int err, ret = 0;
 
 	mutex_lock(&kprobe_mutex);
 
 	/* If kprobes are already disarmed, just return */
 	if (kprobes_all_disarmed) {
 		mutex_unlock(&kprobe_mutex);
-		return;
+		return 0;
 	}
 
 	kprobes_all_disarmed = true;
-	printk(KERN_INFO "Kprobes globally disabled\n");
 
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		head = &kprobe_table[i];
+		/* Disarm all kprobes on a best-effort basis */
 		hlist_for_each_entry_rcu(p, head, hlist) {
-			if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
-				disarm_kprobe(p, false);
+			if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) {
+				err = disarm_kprobe(p, false);
+				if (err) {
+					errors++;
+					ret = err;
+				}
+				total++;
+			}
 		}
 	}
+
+	if (errors)
+		pr_warn("Kprobes globally disabled, but failed to disarm %d out of %d probes\n",
+			errors, total);
+	else
+		pr_info("Kprobes globally disabled\n");
+
 	mutex_unlock(&kprobe_mutex);
 
 	/* Wait for disarming all kprobes by optimizer */
 	wait_for_kprobe_optimizer();
+
+	return ret;
 }
 
 /*
@@ -2556,7 +2584,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
 	case 'n':
 	case 'N':
 	case '0':
-		disarm_all_kprobes();
+		ret = disarm_all_kprobes();
 		break;
 	default:
 		return -EINVAL;

From f960cfd12650fad43c1cde07a1f7642cf2c57f97 Mon Sep 17 00:00:00 2001
From: Matthew Whitehead <tedheadster@gmail.com>
Date: Thu, 15 Feb 2018 11:54:54 -0500
Subject: [PATCH 656/676] x86/Kconfig: Add missing i586-class CPUs to the
 X86_CMPXCHG64 Kconfig group

Several i586-class CPUs supporting this instruction are missing from
the X86_CMPXCHG64 config group.

Using a configuration with either M586TSC or M586MMX currently sets
X86_MINIMUM_CPU_FAMILY=4 instead of the correct value of 5.

Booting on an i486 it will fail to generate the "This kernel
requires an i586 CPU, but only detected an i486 CPU" message and
intentional halt as expected. It will instead just silently hang
when it hits i586-specific instructions.

The M586 CPU is not in this list because at least the Cyrix 5x86
lacks this instruction, and perhaps others.

Signed-off-by: Matthew Whitehead <tedheadster@gmail.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1518713696-11360-1-git-send-email-tedheadster@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig.cpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 65a9a4716e34f..ec64aa7287277 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -374,7 +374,7 @@ config X86_TSC
 
 config X86_CMPXCHG64
 	def_bool y
-	depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM
+	depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8
 
 # this should be set for all -march=.. options where the compiler
 # generates cmov.

From 69b8d3fcabdc81d9efd82b4a506c8279cbaba692 Mon Sep 17 00:00:00 2001
From: Matthew Whitehead <tedheadster@gmail.com>
Date: Thu, 15 Feb 2018 11:54:55 -0500
Subject: [PATCH 657/676] x86/Kconfig: Exclude i586-class CPUs lacking PAE
 support from the HIGHMEM64G Kconfig group

i586-class machines also lack support for Physical Address Extension (PAE),
so add them to the exclusion list.

Signed-off-by: Matthew Whitehead <tedheadster@gmail.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1518713696-11360-2-git-send-email-tedheadster@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a528c14d45a52..c1236b187824e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1404,7 +1404,7 @@ config HIGHMEM4G
 
 config HIGHMEM64G
 	bool "64GB"
-	depends on !M486
+	depends on !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !WINCHIP3D && !MK6
 	select X86_PAE
 	---help---
 	  Select this if you have a 32-bit processor and more than 4

From 25d76ac888216c369dea91768764728b83769799 Mon Sep 17 00:00:00 2001
From: Matthew Whitehead <tedheadster@gmail.com>
Date: Thu, 15 Feb 2018 11:54:56 -0500
Subject: [PATCH 658/676] x86/Kconfig: Explicitly enumerate i686-class CPUs in
 Kconfig

The X86_P6_NOP config class leaves out many i686-class CPUs. Instead,
explicitly enumerate all these CPUs.

Using a configuration with M686 currently sets X86_MINIMUM_CPU_FAMILY=5
instead of the correct value of 6.

Booting on an i586 it will fail to generate the "This kernel
requires an i686 CPU, but only detected an i586 CPU" message and
intentional halt as expected. It will instead just silently hang
when it hits i686-specific instructions.

Signed-off-by: Matthew Whitehead <tedheadster@gmail.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1518713696-11360-3-git-send-email-tedheadster@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig.cpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index ec64aa7287277..8b8d2297d4867 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -385,7 +385,7 @@ config X86_CMOV
 config X86_MINIMUM_CPU_FAMILY
 	int
 	default "64" if X86_64
-	default "6" if X86_32 && X86_P6_NOP
+	default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8)
 	default "5" if X86_32 && X86_CMPXCHG64
 	default "4"
 

From d207af2eab3f8668b95ad02b21930481c42806fd Mon Sep 17 00:00:00 2001
From: Michael Kelley <mhkelley@outlook.com>
Date: Wed, 14 Feb 2018 02:54:03 +0000
Subject: [PATCH 659/676] cpumask: Make for_each_cpu_wrap() available on UP as
 well

for_each_cpu_wrap() was originally added in the #else half of a
large "#if NR_CPUS == 1" statement, but was omitted in the #if
half.  This patch adds the missing #if half to prevent compile
errors when NR_CPUS is 1.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Michael Kelley <mhkelley@outlook.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kys@microsoft.com
Cc: martin.petersen@oracle.com
Cc: mikelley@microsoft.com
Fixes: c743f0a5c50f ("sched/fair, cpumask: Export for_each_cpu_wrap()")
Link: http://lkml.kernel.org/r/SN6PR1901MB2045F087F59450507D4FCC17CBF50@SN6PR1901MB2045.namprd19.prod.outlook.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/cpumask.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index d4a2a7dcd72d9..bf53d893ad02b 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -170,6 +170,8 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node)
 	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
 #define for_each_cpu_not(cpu, mask)		\
 	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
+#define for_each_cpu_wrap(cpu, mask, start)	\
+	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start))
 #define for_each_cpu_and(cpu, mask, and)	\
 	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)and)
 #else

From 2c10636a0b9c689450e85f9945583920f50337c9 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Thu, 15 Feb 2018 21:27:41 -0600
Subject: [PATCH 660/676] powerpc/pseries: Check for zero filled
 ibm,dynamic-memory property

Some versions of QEMU will produce an ibm,dynamic-reconfiguration-memory
node with a ibm,dynamic-memory property that is zero-filled. This
causes the drmem code to oops trying to parse this property.

The fix for this is to validate that the property does contain LMB
entries before trying to parse it and bail if the count is zero.

  Oops: Kernel access of bad area, sig: 11 [#1]
  DAR: 0000000000000010
  NIP read_drconf_v1_cell+0x54/0x9c
  LR  read_drconf_v1_cell+0x48/0x9c
  Call Trace:
    __param_initcall_debug+0x0/0x28 (unreliable)
    drmem_init+0x144/0x2f8
    do_one_initcall+0x64/0x1d0
    kernel_init_freeable+0x298/0x38c
    kernel_init+0x24/0x160
    ret_from_kernel_thread+0x5c/0xb4

The ibm,dynamic-reconfiguration-memory device tree property generated
that causes this:

  ibm,dynamic-reconfiguration-memory {
          ibm,lmb-size = <0x0 0x10000000>;
          ibm,memory-flags-mask = <0xff>;
          ibm,dynamic-memory = <0x0 0x0 0x0 0x0 0x0 0x0>;
          linux,phandle = <0x7e57eed8>;
          ibm,associativity-lookup-arrays = <0x1 0x4 0x0 0x0 0x0 0x0>;
          ibm,memory-preservation-time = <0x0>;
  };

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Reviewed-by: Cyril Bur <cyrilbur@gmail.com>
Tested-by: Daniel Black <daniel@linux.vnet.ibm.com>
[mpe: Trim oops report]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/drmem.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index 1604110c42386..916844f99c64e 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -216,6 +216,8 @@ static void __init __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
 	u32 i, n_lmbs;
 
 	n_lmbs = of_read_number(prop++, 1);
+	if (n_lmbs == 0)
+		return;
 
 	for (i = 0; i < n_lmbs; i++) {
 		read_drconf_v1_cell(&lmb, &prop);
@@ -245,6 +247,8 @@ static void __init __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
 	u32 i, j, lmb_sets;
 
 	lmb_sets = of_read_number(prop++, 1);
+	if (lmb_sets == 0)
+		return;
 
 	for (i = 0; i < lmb_sets; i++) {
 		read_drconf_v2_cell(&dr_cell, &prop);
@@ -354,6 +358,8 @@ static void __init init_drmem_v1_lmbs(const __be32 *prop)
 	struct drmem_lmb *lmb;
 
 	drmem_info->n_lmbs = of_read_number(prop++, 1);
+	if (drmem_info->n_lmbs == 0)
+		return;
 
 	drmem_info->lmbs = kcalloc(drmem_info->n_lmbs, sizeof(*lmb),
 				   GFP_KERNEL);
@@ -373,6 +379,8 @@ static void __init init_drmem_v2_lmbs(const __be32 *prop)
 	int lmb_index;
 
 	lmb_sets = of_read_number(prop++, 1);
+	if (lmb_sets == 0)
+		return;
 
 	/* first pass, calculate the number of LMBs */
 	p = prop;

From 285cb4f62319737e6538252cf1a67ce9da5cf3d5 Mon Sep 17 00:00:00 2001
From: Matt Redfearn <matt.redfearn@mips.com>
Date: Mon, 5 Feb 2018 16:45:36 +0000
Subject: [PATCH 661/676] irqchip/mips-gic: Avoid spuriously handling masked
 interrupts

Commit 7778c4b27cbe ("irqchip: mips-gic: Use pcpu_masks to avoid reading
GIC_SH_MASK*") removed the read of the hardware mask register when
handling shared interrupts, instead using the driver's shadow pcpu_masks
entry as the effective mask. Unfortunately this did not take account of
the write to pcpu_masks during gic_shared_irq_domain_map, which
effectively unmasks the interrupt early. If an interrupt is asserted,
gic_handle_shared_int decodes and processes the interrupt even though it
has not yet been unmasked via gic_unmask_irq, which also sets the
appropriate bit in pcpu_masks.

On the MIPS Boston board, when a console command line of
"console=ttyS0,115200n8r" is passed, the modem status IRQ is enabled in
the UART, which is immediately raised to the GIC. The interrupt has been
mapped, but no handler has yet been registered, nor is it expected to be
unmasked. However, the write to pcpu_masks in gic_shared_irq_domain_map
has effectively unmasked it, resulting in endless reports of:

[    5.058454] irq 13, desc: ffffffff80a7ad80, depth: 1, count: 0, unhandled: 0
[    5.062057] ->handle_irq():  ffffffff801b1838,
[    5.062175] handle_bad_irq+0x0/0x2c0

Where IRQ 13 is the UART interrupt.

To fix this, just remove the write to pcpu_masks in
gic_shared_irq_domain_map. The existing write in gic_unmask_irq is the
correct place for what is now the effective unmasking.

Cc: stable@vger.kernel.org
Fixes: 7778c4b27cbe ("irqchip: mips-gic: Use pcpu_masks to avoid reading GIC_SH_MASK*")
Signed-off-by: Matt Redfearn <matt.redfearn@mips.com>
Reviewed-by: Paul Burton <paul.burton@mips.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 drivers/irqchip/irq-mips-gic.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c
index ef92a4d2038ee..d32268cc1174c 100644
--- a/drivers/irqchip/irq-mips-gic.c
+++ b/drivers/irqchip/irq-mips-gic.c
@@ -424,8 +424,6 @@ static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq,
 	spin_lock_irqsave(&gic_lock, flags);
 	write_gic_map_pin(intr, GIC_MAP_PIN_MAP_TO_PIN | gic_cpu_pin);
 	write_gic_map_vp(intr, BIT(mips_cm_vp_id(cpu)));
-	gic_clear_pcpu_masks(intr);
-	set_bit(intr, per_cpu_ptr(pcpu_masks, cpu));
 	irq_data_update_effective_affinity(data, cpumask_of(cpu));
 	spin_unlock_irqrestore(&gic_lock, flags);
 

From b6dd4d83dc2f78cebc9a7e6e7e4bc2be4d29b94d Mon Sep 17 00:00:00 2001
From: Mark Salter <msalter@redhat.com>
Date: Fri, 2 Feb 2018 09:20:29 -0500
Subject: [PATCH 662/676] irqchip/gic-v3: Change pr_debug message to pr_devel

The pr_debug() in gic-v3 gic_send_sgi() can trigger a circular locking
warning:

 GICv3: CPU10: ICC_SGI1R_EL1 5000400
 ======================================================
 WARNING: possible circular locking dependency detected
 4.15.0+ #1 Tainted: G        W
 ------------------------------------------------------
 dynamic_debug01/1873 is trying to acquire lock:
  ((console_sem).lock){-...}, at: [<0000000099c891ec>] down_trylock+0x20/0x4c

 but task is already holding lock:
  (&rq->lock){-.-.}, at: [<00000000842e1587>] __task_rq_lock+0x54/0xdc

 which lock already depends on the new lock.

 the existing dependency chain (in reverse order) is:

 -> #2 (&rq->lock){-.-.}:
        __lock_acquire+0x3b4/0x6e0
        lock_acquire+0xf4/0x2a8
        _raw_spin_lock+0x4c/0x60
        task_fork_fair+0x3c/0x148
        sched_fork+0x10c/0x214
        copy_process.isra.32.part.33+0x4e8/0x14f0
        _do_fork+0xe8/0x78c
        kernel_thread+0x48/0x54
        rest_init+0x34/0x2a4
        start_kernel+0x45c/0x488

 -> #1 (&p->pi_lock){-.-.}:
        __lock_acquire+0x3b4/0x6e0
        lock_acquire+0xf4/0x2a8
        _raw_spin_lock_irqsave+0x58/0x70
        try_to_wake_up+0x48/0x600
        wake_up_process+0x28/0x34
        __up.isra.0+0x60/0x6c
        up+0x60/0x68
        __up_console_sem+0x4c/0x7c
        console_unlock+0x328/0x634
        vprintk_emit+0x25c/0x390
        dev_vprintk_emit+0xc4/0x1fc
        dev_printk_emit+0x88/0xa8
        __dev_printk+0x58/0x9c
        _dev_info+0x84/0xa8
        usb_new_device+0x100/0x474
        hub_port_connect+0x280/0x92c
        hub_event+0x740/0xa84
        process_one_work+0x240/0x70c
        worker_thread+0x60/0x400
        kthread+0x110/0x13c
        ret_from_fork+0x10/0x18

 -> #0 ((console_sem).lock){-...}:
        validate_chain.isra.34+0x6e4/0xa20
        __lock_acquire+0x3b4/0x6e0
        lock_acquire+0xf4/0x2a8
        _raw_spin_lock_irqsave+0x58/0x70
        down_trylock+0x20/0x4c
        __down_trylock_console_sem+0x3c/0x9c
        console_trylock+0x20/0xb0
        vprintk_emit+0x254/0x390
        vprintk_default+0x58/0x90
        vprintk_func+0xbc/0x164
        printk+0x80/0xa0
        __dynamic_pr_debug+0x84/0xac
        gic_raise_softirq+0x184/0x18c
        smp_cross_call+0xac/0x218
        smp_send_reschedule+0x3c/0x48
        resched_curr+0x60/0x9c
        check_preempt_curr+0x70/0xdc
        wake_up_new_task+0x310/0x470
        _do_fork+0x188/0x78c
        SyS_clone+0x44/0x50
        __sys_trace_return+0x0/0x4

 other info that might help us debug this:

 Chain exists of:
   (console_sem).lock --> &p->pi_lock --> &rq->lock

  Possible unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock(&rq->lock);
                                lock(&p->pi_lock);
                                lock(&rq->lock);
   lock((console_sem).lock);

  *** DEADLOCK ***

 2 locks held by dynamic_debug01/1873:
  #0:  (&p->pi_lock){-.-.}, at: [<000000001366df53>] wake_up_new_task+0x40/0x470
  #1:  (&rq->lock){-.-.}, at: [<00000000842e1587>] __task_rq_lock+0x54/0xdc

 stack backtrace:
 CPU: 10 PID: 1873 Comm: dynamic_debug01 Tainted: G        W        4.15.0+ #1
 Hardware name: GIGABYTE R120-T34-00/MT30-GS2-00, BIOS T48 10/02/2017
 Call trace:
  dump_backtrace+0x0/0x188
  show_stack+0x24/0x2c
  dump_stack+0xa4/0xe0
  print_circular_bug.isra.31+0x29c/0x2b8
  check_prev_add.constprop.39+0x6c8/0x6dc
  validate_chain.isra.34+0x6e4/0xa20
  __lock_acquire+0x3b4/0x6e0
  lock_acquire+0xf4/0x2a8
  _raw_spin_lock_irqsave+0x58/0x70
  down_trylock+0x20/0x4c
  __down_trylock_console_sem+0x3c/0x9c
  console_trylock+0x20/0xb0
  vprintk_emit+0x254/0x390
  vprintk_default+0x58/0x90
  vprintk_func+0xbc/0x164
  printk+0x80/0xa0
  __dynamic_pr_debug+0x84/0xac
  gic_raise_softirq+0x184/0x18c
  smp_cross_call+0xac/0x218
  smp_send_reschedule+0x3c/0x48
  resched_curr+0x60/0x9c
  check_preempt_curr+0x70/0xdc
  wake_up_new_task+0x310/0x470
  _do_fork+0x188/0x78c
  SyS_clone+0x44/0x50
  __sys_trace_return+0x0/0x4
 GICv3: CPU0: ICC_SGI1R_EL1 12000

This could be fixed with printk_deferred() but that might lessen its
usefulness for debugging. So change it to pr_devel to keep it out of
production kernels. Developers working on gic-v3 can enable it as
needed in their kernels.

Signed-off-by: Mark Salter <msalter@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 drivers/irqchip/irq-gic-v3.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index a57c0fbbd34a4..d71be9a1f9d28 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -673,7 +673,7 @@ static void gic_send_sgi(u64 cluster_id, u16 tlist, unsigned int irq)
 	       MPIDR_TO_SGI_RS(cluster_id)		|
 	       tlist << ICC_SGI1R_TARGET_LIST_SHIFT);
 
-	pr_debug("CPU%d: ICC_SGI1R_EL1 %llx\n", smp_processor_id(), val);
+	pr_devel("CPU%d: ICC_SGI1R_EL1 %llx\n", smp_processor_id(), val);
 	gic_write_sgi1r(val);
 }
 

From 21ec30c0ef5234fb1039cc7c7737d885bf875a9e Mon Sep 17 00:00:00 2001
From: Shanker Donthineni <shankerd@codeaurora.org>
Date: Wed, 31 Jan 2018 18:03:42 -0600
Subject: [PATCH 663/676] irqchip/gic-v3: Use wmb() instead of smb_wmb() in
 gic_raise_softirq()

A DMB instruction can be used to ensure the relative order of only
memory accesses before and after the barrier. Since writes to system
registers are not memory operations, barrier DMB is not sufficient
for observability of memory accesses that occur before ICC_SGI1R_EL1
writes.

A DSB instruction ensures that no instructions that appear in program
order after the DSB instruction, can execute until the DSB instruction
has completed.

Cc: stable@vger.kernel.org
Acked-by: Will Deacon <will.deacon@arm.com>,
Signed-off-by: Shanker Donthineni <shankerd@codeaurora.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 drivers/irqchip/irq-gic-v3.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index d71be9a1f9d28..d99cc07903ec4 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -688,7 +688,7 @@ static void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
 	 * Ensure that stores to Normal memory are visible to the
 	 * other CPUs before issuing the IPI.
 	 */
-	smp_wmb();
+	wmb();
 
 	for_each_cpu(cpu, mask) {
 		u64 cluster_id = MPIDR_TO_SGI_CLUSTER_ID(cpu_logical_map(cpu));

From 95a2562590c2f64a0398183f978d5cf3db6d0284 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Thu, 1 Feb 2018 09:03:29 -0800
Subject: [PATCH 664/676] irqchip/gic-v3: Ignore disabled ITS nodes

On some platforms there's an ITS available but it's not enabled
because reading or writing the registers is denied by the
firmware. In fact, reading or writing them will cause the system
to reset. We could remove the node from DT in such a case, but
it's better to skip nodes that are marked as "disabled" in DT so
that we can describe the hardware that exists and use the status
property to indicate how the firmware has configured things.

Cc: Stuart Yoder <stuyoder@gmail.com>
Cc: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Rajendra Nayak <rnayak@codeaurora.org>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 drivers/irqchip/irq-gic-v3-its-pci-msi.c               | 2 ++
 drivers/irqchip/irq-gic-v3-its-platform-msi.c          | 2 ++
 drivers/irqchip/irq-gic-v3-its.c                       | 2 ++
 drivers/staging/fsl-mc/bus/irq-gic-v3-its-fsl-mc-msi.c | 2 ++
 4 files changed, 8 insertions(+)

diff --git a/drivers/irqchip/irq-gic-v3-its-pci-msi.c b/drivers/irqchip/irq-gic-v3-its-pci-msi.c
index 14a8c0a7e095e..25a98de5cfb28 100644
--- a/drivers/irqchip/irq-gic-v3-its-pci-msi.c
+++ b/drivers/irqchip/irq-gic-v3-its-pci-msi.c
@@ -132,6 +132,8 @@ static int __init its_pci_of_msi_init(void)
 
 	for (np = of_find_matching_node(NULL, its_device_id); np;
 	     np = of_find_matching_node(np, its_device_id)) {
+		if (!of_device_is_available(np))
+			continue;
 		if (!of_property_read_bool(np, "msi-controller"))
 			continue;
 
diff --git a/drivers/irqchip/irq-gic-v3-its-platform-msi.c b/drivers/irqchip/irq-gic-v3-its-platform-msi.c
index 833a90fe33aed..8881a053c173e 100644
--- a/drivers/irqchip/irq-gic-v3-its-platform-msi.c
+++ b/drivers/irqchip/irq-gic-v3-its-platform-msi.c
@@ -154,6 +154,8 @@ static void __init its_pmsi_of_init(void)
 
 	for (np = of_find_matching_node(NULL, its_device_id); np;
 	     np = of_find_matching_node(np, its_device_id)) {
+		if (!of_device_is_available(np))
+			continue;
 		if (!of_property_read_bool(np, "msi-controller"))
 			continue;
 
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 06f025fd5726f..1d3056f537472 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -3314,6 +3314,8 @@ static int __init its_of_probe(struct device_node *node)
 
 	for (np = of_find_matching_node(node, its_device_id); np;
 	     np = of_find_matching_node(np, its_device_id)) {
+		if (!of_device_is_available(np))
+			continue;
 		if (!of_property_read_bool(np, "msi-controller")) {
 			pr_warn("%pOF: no msi-controller property, ITS ignored\n",
 				np);
diff --git a/drivers/staging/fsl-mc/bus/irq-gic-v3-its-fsl-mc-msi.c b/drivers/staging/fsl-mc/bus/irq-gic-v3-its-fsl-mc-msi.c
index 5064d5ddf581c..fc2013aade51b 100644
--- a/drivers/staging/fsl-mc/bus/irq-gic-v3-its-fsl-mc-msi.c
+++ b/drivers/staging/fsl-mc/bus/irq-gic-v3-its-fsl-mc-msi.c
@@ -73,6 +73,8 @@ static int __init its_fsl_mc_msi_init(void)
 
 	for (np = of_find_matching_node(NULL, its_device_id); np;
 	     np = of_find_matching_node(np, its_device_id)) {
+		if (!of_device_is_available(np))
+			continue;
 		if (!of_property_read_bool(np, "msi-controller"))
 			continue;
 

From de337ee301422756dff43d6c60fbb0400c1235e9 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 6 Feb 2018 18:55:33 +0000
Subject: [PATCH 665/676] irqchip/gic-v2m: Add PCI Multi-MSI support

We'd never implemented Multi-MSI support with GICv2m, because
it is weird and clunky, and you'd think people would rather use
MSI-X.

Turns out there is still plenty of devices out there that rely
on Multi-MSI. Oh well, let's teach that trick to the v2m widget,
it is not a big deal anyway.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 drivers/irqchip/irq-gic-v2m.c | 46 +++++++++++++++++------------------
 1 file changed, 22 insertions(+), 24 deletions(-)

diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c
index 993a8426a4538..1ff38aff9f29f 100644
--- a/drivers/irqchip/irq-gic-v2m.c
+++ b/drivers/irqchip/irq-gic-v2m.c
@@ -94,7 +94,7 @@ static struct irq_chip gicv2m_msi_irq_chip = {
 
 static struct msi_domain_info gicv2m_msi_domain_info = {
 	.flags	= (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
-		   MSI_FLAG_PCI_MSIX),
+		   MSI_FLAG_PCI_MSIX | MSI_FLAG_MULTI_PCI_MSI),
 	.chip	= &gicv2m_msi_irq_chip,
 };
 
@@ -155,18 +155,12 @@ static int gicv2m_irq_gic_domain_alloc(struct irq_domain *domain,
 	return 0;
 }
 
-static void gicv2m_unalloc_msi(struct v2m_data *v2m, unsigned int hwirq)
+static void gicv2m_unalloc_msi(struct v2m_data *v2m, unsigned int hwirq,
+			       int nr_irqs)
 {
-	int pos;
-
-	pos = hwirq - v2m->spi_start;
-	if (pos < 0 || pos >= v2m->nr_spis) {
-		pr_err("Failed to teardown msi. Invalid hwirq %d\n", hwirq);
-		return;
-	}
-
 	spin_lock(&v2m_lock);
-	__clear_bit(pos, v2m->bm);
+	bitmap_release_region(v2m->bm, hwirq - v2m->spi_start,
+			      get_count_order(nr_irqs));
 	spin_unlock(&v2m_lock);
 }
 
@@ -174,13 +168,13 @@ static int gicv2m_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
 				   unsigned int nr_irqs, void *args)
 {
 	struct v2m_data *v2m = NULL, *tmp;
-	int hwirq, offset, err = 0;
+	int hwirq, offset, i, err = 0;
 
 	spin_lock(&v2m_lock);
 	list_for_each_entry(tmp, &v2m_nodes, entry) {
-		offset = find_first_zero_bit(tmp->bm, tmp->nr_spis);
-		if (offset < tmp->nr_spis) {
-			__set_bit(offset, tmp->bm);
+		offset = bitmap_find_free_region(tmp->bm, tmp->nr_spis,
+						 get_count_order(nr_irqs));
+		if (offset >= 0) {
 			v2m = tmp;
 			break;
 		}
@@ -192,16 +186,21 @@ static int gicv2m_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
 
 	hwirq = v2m->spi_start + offset;
 
-	err = gicv2m_irq_gic_domain_alloc(domain, virq, hwirq);
-	if (err) {
-		gicv2m_unalloc_msi(v2m, hwirq);
-		return err;
-	}
+	for (i = 0; i < nr_irqs; i++) {
+		err = gicv2m_irq_gic_domain_alloc(domain, virq + i, hwirq + i);
+		if (err)
+			goto fail;
 
-	irq_domain_set_hwirq_and_chip(domain, virq, hwirq,
-				      &gicv2m_irq_chip, v2m);
+		irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
+					      &gicv2m_irq_chip, v2m);
+	}
 
 	return 0;
+
+fail:
+	irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+	gicv2m_unalloc_msi(v2m, hwirq, get_count_order(nr_irqs));
+	return err;
 }
 
 static void gicv2m_irq_domain_free(struct irq_domain *domain,
@@ -210,8 +209,7 @@ static void gicv2m_irq_domain_free(struct irq_domain *domain,
 	struct irq_data *d = irq_domain_get_irq_data(domain, virq);
 	struct v2m_data *v2m = irq_data_get_irq_chip_data(d);
 
-	BUG_ON(nr_irqs != 1);
-	gicv2m_unalloc_msi(v2m, d->hwirq);
+	gicv2m_unalloc_msi(v2m, d->hwirq, nr_irqs);
 	irq_domain_free_irqs_parent(domain, virq, nr_irqs);
 }
 

From 2d02424e89eca71b3fa5e832e6fbe467a413e3d5 Mon Sep 17 00:00:00 2001
From: Jaedon Shin <jaedon.shin@gmail.com>
Date: Mon, 12 Feb 2018 11:18:12 +0900
Subject: [PATCH 666/676] irqchip/bcm: Remove hashed address printing

Since commit ad67b74d2469 ("printk: hash addresses printed with %p")
pointers are being hashed when printed. Displaying the virtual memory at
bootup time is not helpful. so delete the prints.

Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jaedon Shin <jaedon.shin@gmail.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 drivers/irqchip/irq-bcm7038-l1.c | 3 ---
 drivers/irqchip/irq-bcm7120-l2.c | 3 ---
 drivers/irqchip/irq-brcmstb-l2.c | 3 ---
 3 files changed, 9 deletions(-)

diff --git a/drivers/irqchip/irq-bcm7038-l1.c b/drivers/irqchip/irq-bcm7038-l1.c
index 55cfb986225be..faf734ff4cf3b 100644
--- a/drivers/irqchip/irq-bcm7038-l1.c
+++ b/drivers/irqchip/irq-bcm7038-l1.c
@@ -339,9 +339,6 @@ int __init bcm7038_l1_of_init(struct device_node *dn,
 		goto out_unmap;
 	}
 
-	pr_info("registered BCM7038 L1 intc (mem: 0x%p, IRQs: %d)\n",
-		intc->cpus[0]->map_base, IRQS_PER_WORD * intc->n_words);
-
 	return 0;
 
 out_unmap:
diff --git a/drivers/irqchip/irq-bcm7120-l2.c b/drivers/irqchip/irq-bcm7120-l2.c
index 983640eba418e..8968e5e93fcb8 100644
--- a/drivers/irqchip/irq-bcm7120-l2.c
+++ b/drivers/irqchip/irq-bcm7120-l2.c
@@ -318,9 +318,6 @@ static int __init bcm7120_l2_intc_probe(struct device_node *dn,
 		}
 	}
 
-	pr_info("registered %s intc (mem: 0x%p, parent IRQ(s): %d)\n",
-			intc_name, data->map_base[0], data->num_parent_irqs);
-
 	return 0;
 
 out_free_domain:
diff --git a/drivers/irqchip/irq-brcmstb-l2.c b/drivers/irqchip/irq-brcmstb-l2.c
index 691d20eb0bec1..0e65f609352ec 100644
--- a/drivers/irqchip/irq-brcmstb-l2.c
+++ b/drivers/irqchip/irq-brcmstb-l2.c
@@ -262,9 +262,6 @@ static int __init brcmstb_l2_intc_of_init(struct device_node *np,
 		ct->chip.irq_set_wake = irq_gc_set_wake;
 	}
 
-	pr_info("registered L2 intc (mem: 0x%p, parent irq: %d)\n",
-			base, parent_irq);
-
 	return 0;
 
 out_free_domain:

From 0b24a0bbe2147815d982d9335c41bb10c04f40bc Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 14 Feb 2018 17:47:35 +0200
Subject: [PATCH 667/676] irqdomain: Re-use DEFINE_SHOW_ATTRIBUTE() macro

...instead of open coding file operations followed by custom ->open()
callbacks per each attribute.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 kernel/irq/irqdomain.c | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index e6a9c36470ee9..82b8b18ee1ebc 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1726,25 +1726,14 @@ static int irq_domain_debug_show(struct seq_file *m, void *p)
 	irq_domain_debug_show_one(m, d, 0);
 	return 0;
 }
-
-static int irq_domain_debug_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, irq_domain_debug_show, inode->i_private);
-}
-
-static const struct file_operations dfs_domain_ops = {
-	.open		= irq_domain_debug_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(irq_domain_debug);
 
 static void debugfs_add_domain_dir(struct irq_domain *d)
 {
 	if (!d->name || !domain_dir || d->debugfs_file)
 		return;
 	d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d,
-					      &dfs_domain_ops);
+					      &irq_domain_debug_fops);
 }
 
 static void debugfs_remove_domain_dir(struct irq_domain *d)
@@ -1760,7 +1749,8 @@ void __init irq_domain_debugfs_init(struct dentry *root)
 	if (!domain_dir)
 		return;
 
-	debugfs_create_file("default", 0444, domain_dir, NULL, &dfs_domain_ops);
+	debugfs_create_file("default", 0444, domain_dir, NULL,
+			    &irq_domain_debug_fops);
 	mutex_lock(&irq_domain_mutex);
 	list_for_each_entry(d, &irq_domain_list, link)
 		debugfs_add_domain_dir(d);

From 8dd601fa8317243be887458c49f6c29c2f3d719f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Thu, 15 Feb 2018 20:00:15 +1100
Subject: [PATCH 668/676] dm: correctly handle chained bios in dec_pending()

dec_pending() is given an error status (possibly 0) to be recorded
against a bio.  It can be called several times on the one 'struct
dm_io', and it is careful to only assign a non-zero error to
io->status.  However when it then assigned io->status to bio->bi_status,
it is not careful and could overwrite a genuine error status with 0.

This can happen when chained bios are in use.  If a bio is chained
beneath the bio that this dm_io is handling, the child bio might
complete and set bio->bi_status before the dm_io completes.

This has been possible since chained bios were introduced in 3.14, and
has become a lot easier to trigger with commit 18a25da84354 ("dm: ensure
bio submission follows a depth-first tree walk") as that commit caused
dm to start using chained bios itself.

A particular failure mode is that if a bio spans an 'error' target and a
working target, the 'error' fragment will complete instantly and set the
->bi_status, and the other fragment will normally complete a little
later, and will clear ->bi_status.

The fix is simply to only assign io_error to bio->bi_status when
io_error is not zero.

Reported-and-tested-by: Milan Broz <gmazyland@gmail.com>
Cc: stable@vger.kernel.org (v3.14+)
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index d6de00f367efd..68136806d3658 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -903,7 +903,8 @@ static void dec_pending(struct dm_io *io, blk_status_t error)
 			queue_io(md, bio);
 		} else {
 			/* done with normal IO or empty flush */
-			bio->bi_status = io_error;
+			if (io_error)
+				bio->bi_status = io_error;
 			bio_endio(bio);
 		}
 	}

From af27d9403f5b80685b79c88425086edccecaf711 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 16 Feb 2018 16:25:53 +0100
Subject: [PATCH 669/676] mm: hide a #warning for COMPILE_TEST

We get a warning about some slow configurations in randconfig kernels:

  mm/memory.c:83:2: error: #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. [-Werror=cpp]

The warning is reasonable by itself, but gets in the way of randconfig
build testing, so I'm hiding it whenever CONFIG_COMPILE_TEST is set.

The warning was added in 2013 in commit 75980e97dacc ("mm: fold
page->_last_nid into page->flags where possible").

Cc: stable@vger.kernel.org
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memory.c b/mm/memory.c
index dd8de96f55475..5fcfc24904d19 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -80,7 +80,7 @@
 
 #include "internal.h"
 
-#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
 #endif
 

From 20a004e7b017cce282a46ac5d02c2b9c6b9bb1fa Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Thu, 15 Feb 2018 11:14:56 +0000
Subject: [PATCH 670/676] arm64: mm: Use READ_ONCE/WRITE_ONCE when accessing
 page tables

In many cases, page tables can be accessed concurrently by either another
CPU (due to things like fast gup) or by the hardware page table walker
itself, which may set access/dirty bits. In such cases, it is important
to use READ_ONCE/WRITE_ONCE when accessing page table entries so that
entries cannot be torn, merged or subject to apparent loss of coherence
due to compiler transformations.

Whilst there are some scenarios where this cannot happen (e.g. pinned
kernel mappings for the linear region), the overhead of using READ_ONCE
/WRITE_ONCE everywhere is minimal and makes the code an awful lot easier
to reason about. This patch consistently uses these macros in the arch
code, as well as explicitly namespacing pointers to page table entries
from the entries themselves by using adopting a 'p' suffix for the former
(as is sometimes used elsewhere in the kernel source).

Tested-by: Yury Norov <ynorov@caviumnetworks.com>
Tested-by: Richard Ruigrok <rruigrok@codeaurora.org>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/hugetlb.h     |   2 +-
 arch/arm64/include/asm/kvm_mmu.h     |  26 +--
 arch/arm64/include/asm/mmu_context.h |   4 +-
 arch/arm64/include/asm/pgalloc.h     |  44 ++---
 arch/arm64/include/asm/pgtable.h     |  23 ++-
 arch/arm64/kernel/efi.c              |   2 +-
 arch/arm64/kernel/hibernate.c        | 148 +++++++-------
 arch/arm64/mm/dump.c                 |  54 ++---
 arch/arm64/mm/fault.c                |  44 +++--
 arch/arm64/mm/hugetlbpage.c          |  94 ++++-----
 arch/arm64/mm/kasan_init.c           |  70 +++----
 arch/arm64/mm/mmu.c                  | 282 ++++++++++++++-------------
 arch/arm64/mm/pageattr.c             |  32 +--
 13 files changed, 426 insertions(+), 399 deletions(-)

diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 1dca41bea16ad..e73f685696246 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -22,7 +22,7 @@
 
 static inline pte_t huge_ptep_get(pte_t *ptep)
 {
-	return *ptep;
+	return READ_ONCE(*ptep);
 }
 
 
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 9679067a15746..7faed6e48b462 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -185,42 +185,42 @@ static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd)
 	return pmd;
 }
 
-static inline void kvm_set_s2pte_readonly(pte_t *pte)
+static inline void kvm_set_s2pte_readonly(pte_t *ptep)
 {
 	pteval_t old_pteval, pteval;
 
-	pteval = READ_ONCE(pte_val(*pte));
+	pteval = READ_ONCE(pte_val(*ptep));
 	do {
 		old_pteval = pteval;
 		pteval &= ~PTE_S2_RDWR;
 		pteval |= PTE_S2_RDONLY;
-		pteval = cmpxchg_relaxed(&pte_val(*pte), old_pteval, pteval);
+		pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
 	} while (pteval != old_pteval);
 }
 
-static inline bool kvm_s2pte_readonly(pte_t *pte)
+static inline bool kvm_s2pte_readonly(pte_t *ptep)
 {
-	return (pte_val(*pte) & PTE_S2_RDWR) == PTE_S2_RDONLY;
+	return (READ_ONCE(pte_val(*ptep)) & PTE_S2_RDWR) == PTE_S2_RDONLY;
 }
 
-static inline bool kvm_s2pte_exec(pte_t *pte)
+static inline bool kvm_s2pte_exec(pte_t *ptep)
 {
-	return !(pte_val(*pte) & PTE_S2_XN);
+	return !(READ_ONCE(pte_val(*ptep)) & PTE_S2_XN);
 }
 
-static inline void kvm_set_s2pmd_readonly(pmd_t *pmd)
+static inline void kvm_set_s2pmd_readonly(pmd_t *pmdp)
 {
-	kvm_set_s2pte_readonly((pte_t *)pmd);
+	kvm_set_s2pte_readonly((pte_t *)pmdp);
 }
 
-static inline bool kvm_s2pmd_readonly(pmd_t *pmd)
+static inline bool kvm_s2pmd_readonly(pmd_t *pmdp)
 {
-	return kvm_s2pte_readonly((pte_t *)pmd);
+	return kvm_s2pte_readonly((pte_t *)pmdp);
 }
 
-static inline bool kvm_s2pmd_exec(pmd_t *pmd)
+static inline bool kvm_s2pmd_exec(pmd_t *pmdp)
 {
-	return !(pmd_val(*pmd) & PMD_S2_XN);
+	return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN);
 }
 
 static inline bool kvm_page_empty(void *ptr)
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 8d3331985d2e3..39ec0b8a689ee 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -141,13 +141,13 @@ static inline void cpu_install_idmap(void)
  * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD,
  * avoiding the possibility of conflicting TLB entries being allocated.
  */
-static inline void cpu_replace_ttbr1(pgd_t *pgd)
+static inline void cpu_replace_ttbr1(pgd_t *pgdp)
 {
 	typedef void (ttbr_replace_func)(phys_addr_t);
 	extern ttbr_replace_func idmap_cpu_replace_ttbr1;
 	ttbr_replace_func *replace_phys;
 
-	phys_addr_t pgd_phys = virt_to_phys(pgd);
+	phys_addr_t pgd_phys = virt_to_phys(pgdp);
 
 	replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1);
 
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index e9d9f1b006efe..2e05bcd944c83 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -36,23 +36,23 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 	return (pmd_t *)__get_free_page(PGALLOC_GFP);
 }
 
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmdp)
 {
-	BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
-	free_page((unsigned long)pmd);
+	BUG_ON((unsigned long)pmdp & (PAGE_SIZE-1));
+	free_page((unsigned long)pmdp);
 }
 
-static inline void __pud_populate(pud_t *pud, phys_addr_t pmd, pudval_t prot)
+static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot)
 {
-	set_pud(pud, __pud(__phys_to_pud_val(pmd) | prot));
+	set_pud(pudp, __pud(__phys_to_pud_val(pmdp) | prot));
 }
 
-static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmdp)
 {
-	__pud_populate(pud, __pa(pmd), PMD_TYPE_TABLE);
+	__pud_populate(pudp, __pa(pmdp), PMD_TYPE_TABLE);
 }
 #else
-static inline void __pud_populate(pud_t *pud, phys_addr_t pmd, pudval_t prot)
+static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot)
 {
 	BUILD_BUG();
 }
@@ -65,30 +65,30 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 	return (pud_t *)__get_free_page(PGALLOC_GFP);
 }
 
-static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+static inline void pud_free(struct mm_struct *mm, pud_t *pudp)
 {
-	BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
-	free_page((unsigned long)pud);
+	BUG_ON((unsigned long)pudp & (PAGE_SIZE-1));
+	free_page((unsigned long)pudp);
 }
 
-static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pud, pgdval_t prot)
+static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pudp, pgdval_t prot)
 {
-	set_pgd(pgdp, __pgd(__phys_to_pgd_val(pud) | prot));
+	set_pgd(pgdp, __pgd(__phys_to_pgd_val(pudp) | prot));
 }
 
-static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, pud_t *pudp)
 {
-	__pgd_populate(pgd, __pa(pud), PUD_TYPE_TABLE);
+	__pgd_populate(pgdp, __pa(pudp), PUD_TYPE_TABLE);
 }
 #else
-static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pud, pgdval_t prot)
+static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pudp, pgdval_t prot)
 {
 	BUILD_BUG();
 }
 #endif	/* CONFIG_PGTABLE_LEVELS > 3 */
 
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
-extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
+extern void pgd_free(struct mm_struct *mm, pgd_t *pgdp);
 
 static inline pte_t *
 pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr)
@@ -114,10 +114,10 @@ pte_alloc_one(struct mm_struct *mm, unsigned long addr)
 /*
  * Free a PTE table.
  */
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *ptep)
 {
-	if (pte)
-		free_page((unsigned long)pte);
+	if (ptep)
+		free_page((unsigned long)ptep);
 }
 
 static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
@@ -126,10 +126,10 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
 	__free_page(pte);
 }
 
-static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t pte,
+static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t ptep,
 				  pmdval_t prot)
 {
-	set_pmd(pmdp, __pmd(__phys_to_pmd_val(pte) | prot));
+	set_pmd(pmdp, __pmd(__phys_to_pmd_val(ptep) | prot));
 }
 
 /*
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 094374c82db08..7e2c27e63cd89 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -218,7 +218,7 @@ static inline pmd_t pmd_mkcont(pmd_t pmd)
 
 static inline void set_pte(pte_t *ptep, pte_t pte)
 {
-	*ptep = pte;
+	WRITE_ONCE(*ptep, pte);
 
 	/*
 	 * Only if the new pte is valid and kernel, otherwise TLB maintenance
@@ -250,6 +250,8 @@ extern void __sync_icache_dcache(pte_t pteval, unsigned long addr);
 static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep, pte_t pte)
 {
+	pte_t old_pte;
+
 	if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte))
 		__sync_icache_dcache(pte, addr);
 
@@ -258,14 +260,15 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
 	 * hardware updates of the pte (ptep_set_access_flags safely changes
 	 * valid ptes without going through an invalid entry).
 	 */
-	if (IS_ENABLED(CONFIG_DEBUG_VM) && pte_valid(*ptep) && pte_valid(pte) &&
+	old_pte = READ_ONCE(*ptep);
+	if (IS_ENABLED(CONFIG_DEBUG_VM) && pte_valid(old_pte) && pte_valid(pte) &&
 	   (mm == current->active_mm || atomic_read(&mm->mm_users) > 1)) {
 		VM_WARN_ONCE(!pte_young(pte),
 			     "%s: racy access flag clearing: 0x%016llx -> 0x%016llx",
-			     __func__, pte_val(*ptep), pte_val(pte));
-		VM_WARN_ONCE(pte_write(*ptep) && !pte_dirty(pte),
+			     __func__, pte_val(old_pte), pte_val(pte));
+		VM_WARN_ONCE(pte_write(old_pte) && !pte_dirty(pte),
 			     "%s: racy dirty state clearing: 0x%016llx -> 0x%016llx",
-			     __func__, pte_val(*ptep), pte_val(pte));
+			     __func__, pte_val(old_pte), pte_val(pte));
 	}
 
 	set_pte(ptep, pte);
@@ -431,7 +434,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 
 static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 {
-	*pmdp = pmd;
+	WRITE_ONCE(*pmdp, pmd);
 	dsb(ishst);
 	isb();
 }
@@ -482,7 +485,7 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd)
 
 static inline void set_pud(pud_t *pudp, pud_t pud)
 {
-	*pudp = pud;
+	WRITE_ONCE(*pudp, pud);
 	dsb(ishst);
 	isb();
 }
@@ -500,7 +503,7 @@ static inline phys_addr_t pud_page_paddr(pud_t pud)
 /* Find an entry in the second-level page table. */
 #define pmd_index(addr)		(((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
 
-#define pmd_offset_phys(dir, addr)	(pud_page_paddr(*(dir)) + pmd_index(addr) * sizeof(pmd_t))
+#define pmd_offset_phys(dir, addr)	(pud_page_paddr(READ_ONCE(*(dir))) + pmd_index(addr) * sizeof(pmd_t))
 #define pmd_offset(dir, addr)		((pmd_t *)__va(pmd_offset_phys((dir), (addr))))
 
 #define pmd_set_fixmap(addr)		((pmd_t *)set_fixmap_offset(FIX_PMD, addr))
@@ -535,7 +538,7 @@ static inline phys_addr_t pud_page_paddr(pud_t pud)
 
 static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
-	*pgdp = pgd;
+	WRITE_ONCE(*pgdp, pgd);
 	dsb(ishst);
 }
 
@@ -552,7 +555,7 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
 /* Find an entry in the frst-level page table. */
 #define pud_index(addr)		(((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
 
-#define pud_offset_phys(dir, addr)	(pgd_page_paddr(*(dir)) + pud_index(addr) * sizeof(pud_t))
+#define pud_offset_phys(dir, addr)	(pgd_page_paddr(READ_ONCE(*(dir))) + pud_index(addr) * sizeof(pud_t))
 #define pud_offset(dir, addr)		((pud_t *)__va(pud_offset_phys((dir), (addr))))
 
 #define pud_set_fixmap(addr)		((pud_t *)set_fixmap_offset(FIX_PUD, addr))
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index f85ac58d08a35..a8bf1c892b906 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -90,7 +90,7 @@ static int __init set_permissions(pte_t *ptep, pgtable_t token,
 				  unsigned long addr, void *data)
 {
 	efi_memory_desc_t *md = data;
-	pte_t pte = *ptep;
+	pte_t pte = READ_ONCE(*ptep);
 
 	if (md->attribute & EFI_MEMORY_RO)
 		pte = set_pte_bit(pte, __pgprot(PTE_RDONLY));
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index f20cf7e992495..1ec5f28c39fc5 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -202,10 +202,10 @@ static int create_safe_exec_page(void *src_start, size_t length,
 				 gfp_t mask)
 {
 	int rc = 0;
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
+	pgd_t *pgdp;
+	pud_t *pudp;
+	pmd_t *pmdp;
+	pte_t *ptep;
 	unsigned long dst = (unsigned long)allocator(mask);
 
 	if (!dst) {
@@ -216,38 +216,38 @@ static int create_safe_exec_page(void *src_start, size_t length,
 	memcpy((void *)dst, src_start, length);
 	flush_icache_range(dst, dst + length);
 
-	pgd = pgd_offset_raw(allocator(mask), dst_addr);
-	if (pgd_none(*pgd)) {
-		pud = allocator(mask);
-		if (!pud) {
+	pgdp = pgd_offset_raw(allocator(mask), dst_addr);
+	if (pgd_none(READ_ONCE(*pgdp))) {
+		pudp = allocator(mask);
+		if (!pudp) {
 			rc = -ENOMEM;
 			goto out;
 		}
-		pgd_populate(&init_mm, pgd, pud);
+		pgd_populate(&init_mm, pgdp, pudp);
 	}
 
-	pud = pud_offset(pgd, dst_addr);
-	if (pud_none(*pud)) {
-		pmd = allocator(mask);
-		if (!pmd) {
+	pudp = pud_offset(pgdp, dst_addr);
+	if (pud_none(READ_ONCE(*pudp))) {
+		pmdp = allocator(mask);
+		if (!pmdp) {
 			rc = -ENOMEM;
 			goto out;
 		}
-		pud_populate(&init_mm, pud, pmd);
+		pud_populate(&init_mm, pudp, pmdp);
 	}
 
-	pmd = pmd_offset(pud, dst_addr);
-	if (pmd_none(*pmd)) {
-		pte = allocator(mask);
-		if (!pte) {
+	pmdp = pmd_offset(pudp, dst_addr);
+	if (pmd_none(READ_ONCE(*pmdp))) {
+		ptep = allocator(mask);
+		if (!ptep) {
 			rc = -ENOMEM;
 			goto out;
 		}
-		pmd_populate_kernel(&init_mm, pmd, pte);
+		pmd_populate_kernel(&init_mm, pmdp, ptep);
 	}
 
-	pte = pte_offset_kernel(pmd, dst_addr);
-	set_pte(pte, pfn_pte(virt_to_pfn(dst), PAGE_KERNEL_EXEC));
+	ptep = pte_offset_kernel(pmdp, dst_addr);
+	set_pte(ptep, pfn_pte(virt_to_pfn(dst), PAGE_KERNEL_EXEC));
 
 	/*
 	 * Load our new page tables. A strict BBM approach requires that we
@@ -263,7 +263,7 @@ static int create_safe_exec_page(void *src_start, size_t length,
 	 */
 	cpu_set_reserved_ttbr0();
 	local_flush_tlb_all();
-	write_sysreg(phys_to_ttbr(virt_to_phys(pgd)), ttbr0_el1);
+	write_sysreg(phys_to_ttbr(virt_to_phys(pgdp)), ttbr0_el1);
 	isb();
 
 	*phys_dst_addr = virt_to_phys((void *)dst);
@@ -320,9 +320,9 @@ int swsusp_arch_suspend(void)
 	return ret;
 }
 
-static void _copy_pte(pte_t *dst_pte, pte_t *src_pte, unsigned long addr)
+static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
 {
-	pte_t pte = *src_pte;
+	pte_t pte = READ_ONCE(*src_ptep);
 
 	if (pte_valid(pte)) {
 		/*
@@ -330,7 +330,7 @@ static void _copy_pte(pte_t *dst_pte, pte_t *src_pte, unsigned long addr)
 		 * read only (code, rodata). Clear the RDONLY bit from
 		 * the temporary mappings we use during restore.
 		 */
-		set_pte(dst_pte, pte_mkwrite(pte));
+		set_pte(dst_ptep, pte_mkwrite(pte));
 	} else if (debug_pagealloc_enabled() && !pte_none(pte)) {
 		/*
 		 * debug_pagealloc will removed the PTE_VALID bit if
@@ -343,112 +343,116 @@ static void _copy_pte(pte_t *dst_pte, pte_t *src_pte, unsigned long addr)
 		 */
 		BUG_ON(!pfn_valid(pte_pfn(pte)));
 
-		set_pte(dst_pte, pte_mkpresent(pte_mkwrite(pte)));
+		set_pte(dst_ptep, pte_mkpresent(pte_mkwrite(pte)));
 	}
 }
 
-static int copy_pte(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long start,
+static int copy_pte(pmd_t *dst_pmdp, pmd_t *src_pmdp, unsigned long start,
 		    unsigned long end)
 {
-	pte_t *src_pte;
-	pte_t *dst_pte;
+	pte_t *src_ptep;
+	pte_t *dst_ptep;
 	unsigned long addr = start;
 
-	dst_pte = (pte_t *)get_safe_page(GFP_ATOMIC);
-	if (!dst_pte)
+	dst_ptep = (pte_t *)get_safe_page(GFP_ATOMIC);
+	if (!dst_ptep)
 		return -ENOMEM;
-	pmd_populate_kernel(&init_mm, dst_pmd, dst_pte);
-	dst_pte = pte_offset_kernel(dst_pmd, start);
+	pmd_populate_kernel(&init_mm, dst_pmdp, dst_ptep);
+	dst_ptep = pte_offset_kernel(dst_pmdp, start);
 
-	src_pte = pte_offset_kernel(src_pmd, start);
+	src_ptep = pte_offset_kernel(src_pmdp, start);
 	do {
-		_copy_pte(dst_pte, src_pte, addr);
-	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+		_copy_pte(dst_ptep, src_ptep, addr);
+	} while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end);
 
 	return 0;
 }
 
-static int copy_pmd(pud_t *dst_pud, pud_t *src_pud, unsigned long start,
+static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start,
 		    unsigned long end)
 {
-	pmd_t *src_pmd;
-	pmd_t *dst_pmd;
+	pmd_t *src_pmdp;
+	pmd_t *dst_pmdp;
 	unsigned long next;
 	unsigned long addr = start;
 
-	if (pud_none(*dst_pud)) {
-		dst_pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
-		if (!dst_pmd)
+	if (pud_none(READ_ONCE(*dst_pudp))) {
+		dst_pmdp = (pmd_t *)get_safe_page(GFP_ATOMIC);
+		if (!dst_pmdp)
 			return -ENOMEM;
-		pud_populate(&init_mm, dst_pud, dst_pmd);
+		pud_populate(&init_mm, dst_pudp, dst_pmdp);
 	}
-	dst_pmd = pmd_offset(dst_pud, start);
+	dst_pmdp = pmd_offset(dst_pudp, start);
 
-	src_pmd = pmd_offset(src_pud, start);
+	src_pmdp = pmd_offset(src_pudp, start);
 	do {
+		pmd_t pmd = READ_ONCE(*src_pmdp);
+
 		next = pmd_addr_end(addr, end);
-		if (pmd_none(*src_pmd))
+		if (pmd_none(pmd))
 			continue;
-		if (pmd_table(*src_pmd)) {
-			if (copy_pte(dst_pmd, src_pmd, addr, next))
+		if (pmd_table(pmd)) {
+			if (copy_pte(dst_pmdp, src_pmdp, addr, next))
 				return -ENOMEM;
 		} else {
-			set_pmd(dst_pmd,
-				__pmd(pmd_val(*src_pmd) & ~PMD_SECT_RDONLY));
+			set_pmd(dst_pmdp,
+				__pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY));
 		}
-	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
+	} while (dst_pmdp++, src_pmdp++, addr = next, addr != end);
 
 	return 0;
 }
 
-static int copy_pud(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long start,
+static int copy_pud(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start,
 		    unsigned long end)
 {
-	pud_t *dst_pud;
-	pud_t *src_pud;
+	pud_t *dst_pudp;
+	pud_t *src_pudp;
 	unsigned long next;
 	unsigned long addr = start;
 
-	if (pgd_none(*dst_pgd)) {
-		dst_pud = (pud_t *)get_safe_page(GFP_ATOMIC);
-		if (!dst_pud)
+	if (pgd_none(READ_ONCE(*dst_pgdp))) {
+		dst_pudp = (pud_t *)get_safe_page(GFP_ATOMIC);
+		if (!dst_pudp)
 			return -ENOMEM;
-		pgd_populate(&init_mm, dst_pgd, dst_pud);
+		pgd_populate(&init_mm, dst_pgdp, dst_pudp);
 	}
-	dst_pud = pud_offset(dst_pgd, start);
+	dst_pudp = pud_offset(dst_pgdp, start);
 
-	src_pud = pud_offset(src_pgd, start);
+	src_pudp = pud_offset(src_pgdp, start);
 	do {
+		pud_t pud = READ_ONCE(*src_pudp);
+
 		next = pud_addr_end(addr, end);
-		if (pud_none(*src_pud))
+		if (pud_none(pud))
 			continue;
-		if (pud_table(*(src_pud))) {
-			if (copy_pmd(dst_pud, src_pud, addr, next))
+		if (pud_table(pud)) {
+			if (copy_pmd(dst_pudp, src_pudp, addr, next))
 				return -ENOMEM;
 		} else {
-			set_pud(dst_pud,
-				__pud(pud_val(*src_pud) & ~PMD_SECT_RDONLY));
+			set_pud(dst_pudp,
+				__pud(pud_val(pud) & ~PMD_SECT_RDONLY));
 		}
-	} while (dst_pud++, src_pud++, addr = next, addr != end);
+	} while (dst_pudp++, src_pudp++, addr = next, addr != end);
 
 	return 0;
 }
 
-static int copy_page_tables(pgd_t *dst_pgd, unsigned long start,
+static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start,
 			    unsigned long end)
 {
 	unsigned long next;
 	unsigned long addr = start;
-	pgd_t *src_pgd = pgd_offset_k(start);
+	pgd_t *src_pgdp = pgd_offset_k(start);
 
-	dst_pgd = pgd_offset_raw(dst_pgd, start);
+	dst_pgdp = pgd_offset_raw(dst_pgdp, start);
 	do {
 		next = pgd_addr_end(addr, end);
-		if (pgd_none(*src_pgd))
+		if (pgd_none(READ_ONCE(*src_pgdp)))
 			continue;
-		if (copy_pud(dst_pgd, src_pgd, addr, next))
+		if (copy_pud(dst_pgdp, src_pgdp, addr, next))
 			return -ENOMEM;
-	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
+	} while (dst_pgdp++, src_pgdp++, addr = next, addr != end);
 
 	return 0;
 }
diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c
index 7b60d62ac5939..65dfc8571bf83 100644
--- a/arch/arm64/mm/dump.c
+++ b/arch/arm64/mm/dump.c
@@ -286,48 +286,52 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level,
 
 }
 
-static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
+static void walk_pte(struct pg_state *st, pmd_t *pmdp, unsigned long start)
 {
-	pte_t *pte = pte_offset_kernel(pmd, 0UL);
+	pte_t *ptep = pte_offset_kernel(pmdp, 0UL);
 	unsigned long addr;
 	unsigned i;
 
-	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
+	for (i = 0; i < PTRS_PER_PTE; i++, ptep++) {
 		addr = start + i * PAGE_SIZE;
-		note_page(st, addr, 4, pte_val(*pte));
+		note_page(st, addr, 4, READ_ONCE(pte_val(*ptep)));
 	}
 }
 
-static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
+static void walk_pmd(struct pg_state *st, pud_t *pudp, unsigned long start)
 {
-	pmd_t *pmd = pmd_offset(pud, 0UL);
+	pmd_t *pmdp = pmd_offset(pudp, 0UL);
 	unsigned long addr;
 	unsigned i;
 
-	for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
+	for (i = 0; i < PTRS_PER_PMD; i++, pmdp++) {
+		pmd_t pmd = READ_ONCE(*pmdp);
+
 		addr = start + i * PMD_SIZE;
-		if (pmd_none(*pmd) || pmd_sect(*pmd)) {
-			note_page(st, addr, 3, pmd_val(*pmd));
+		if (pmd_none(pmd) || pmd_sect(pmd)) {
+			note_page(st, addr, 3, pmd_val(pmd));
 		} else {
-			BUG_ON(pmd_bad(*pmd));
-			walk_pte(st, pmd, addr);
+			BUG_ON(pmd_bad(pmd));
+			walk_pte(st, pmdp, addr);
 		}
 	}
 }
 
-static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
+static void walk_pud(struct pg_state *st, pgd_t *pgdp, unsigned long start)
 {
-	pud_t *pud = pud_offset(pgd, 0UL);
+	pud_t *pudp = pud_offset(pgdp, 0UL);
 	unsigned long addr;
 	unsigned i;
 
-	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
+	for (i = 0; i < PTRS_PER_PUD; i++, pudp++) {
+		pud_t pud = READ_ONCE(*pudp);
+
 		addr = start + i * PUD_SIZE;
-		if (pud_none(*pud) || pud_sect(*pud)) {
-			note_page(st, addr, 2, pud_val(*pud));
+		if (pud_none(pud) || pud_sect(pud)) {
+			note_page(st, addr, 2, pud_val(pud));
 		} else {
-			BUG_ON(pud_bad(*pud));
-			walk_pmd(st, pud, addr);
+			BUG_ON(pud_bad(pud));
+			walk_pmd(st, pudp, addr);
 		}
 	}
 }
@@ -335,17 +339,19 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
 static void walk_pgd(struct pg_state *st, struct mm_struct *mm,
 		     unsigned long start)
 {
-	pgd_t *pgd = pgd_offset(mm, 0UL);
+	pgd_t *pgdp = pgd_offset(mm, 0UL);
 	unsigned i;
 	unsigned long addr;
 
-	for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
+	for (i = 0; i < PTRS_PER_PGD; i++, pgdp++) {
+		pgd_t pgd = READ_ONCE(*pgdp);
+
 		addr = start + i * PGDIR_SIZE;
-		if (pgd_none(*pgd)) {
-			note_page(st, addr, 1, pgd_val(*pgd));
+		if (pgd_none(pgd)) {
+			note_page(st, addr, 1, pgd_val(pgd));
 		} else {
-			BUG_ON(pgd_bad(*pgd));
-			walk_pud(st, pgd, addr);
+			BUG_ON(pgd_bad(pgd));
+			walk_pud(st, pgdp, addr);
 		}
 	}
 }
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index f76bb2c3c9434..bff11553eb050 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -130,7 +130,8 @@ static void mem_abort_decode(unsigned int esr)
 void show_pte(unsigned long addr)
 {
 	struct mm_struct *mm;
-	pgd_t *pgd;
+	pgd_t *pgdp;
+	pgd_t pgd;
 
 	if (addr < TASK_SIZE) {
 		/* TTBR0 */
@@ -149,33 +150,37 @@ void show_pte(unsigned long addr)
 		return;
 	}
 
-	pr_alert("%s pgtable: %luk pages, %u-bit VAs, pgd = %p\n",
+	pr_alert("%s pgtable: %luk pages, %u-bit VAs, pgdp = %p\n",
 		 mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K,
 		 VA_BITS, mm->pgd);
-	pgd = pgd_offset(mm, addr);
-	pr_alert("[%016lx] *pgd=%016llx", addr, pgd_val(*pgd));
+	pgdp = pgd_offset(mm, addr);
+	pgd = READ_ONCE(*pgdp);
+	pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd));
 
 	do {
-		pud_t *pud;
-		pmd_t *pmd;
-		pte_t *pte;
+		pud_t *pudp, pud;
+		pmd_t *pmdp, pmd;
+		pte_t *ptep, pte;
 
-		if (pgd_none(*pgd) || pgd_bad(*pgd))
+		if (pgd_none(pgd) || pgd_bad(pgd))
 			break;
 
-		pud = pud_offset(pgd, addr);
-		pr_cont(", *pud=%016llx", pud_val(*pud));
-		if (pud_none(*pud) || pud_bad(*pud))
+		pudp = pud_offset(pgdp, addr);
+		pud = READ_ONCE(*pudp);
+		pr_cont(", pud=%016llx", pud_val(pud));
+		if (pud_none(pud) || pud_bad(pud))
 			break;
 
-		pmd = pmd_offset(pud, addr);
-		pr_cont(", *pmd=%016llx", pmd_val(*pmd));
-		if (pmd_none(*pmd) || pmd_bad(*pmd))
+		pmdp = pmd_offset(pudp, addr);
+		pmd = READ_ONCE(*pmdp);
+		pr_cont(", pmd=%016llx", pmd_val(pmd));
+		if (pmd_none(pmd) || pmd_bad(pmd))
 			break;
 
-		pte = pte_offset_map(pmd, addr);
-		pr_cont(", *pte=%016llx", pte_val(*pte));
-		pte_unmap(pte);
+		ptep = pte_offset_map(pmdp, addr);
+		pte = READ_ONCE(*ptep);
+		pr_cont(", pte=%016llx", pte_val(pte));
+		pte_unmap(ptep);
 	} while(0);
 
 	pr_cont("\n");
@@ -196,8 +201,9 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
 			  pte_t entry, int dirty)
 {
 	pteval_t old_pteval, pteval;
+	pte_t pte = READ_ONCE(*ptep);
 
-	if (pte_same(*ptep, entry))
+	if (pte_same(pte, entry))
 		return 0;
 
 	/* only preserve the access flags and write permission */
@@ -210,7 +216,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
 	 * (calculated as: a & b == ~(~a | ~b)).
 	 */
 	pte_val(entry) ^= PTE_RDONLY;
-	pteval = READ_ONCE(pte_val(*ptep));
+	pteval = pte_val(pte);
 	do {
 		old_pteval = pteval;
 		pteval ^= PTE_RDONLY;
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 6cb0fa92a6516..ecc6818191df9 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -54,14 +54,14 @@ static inline pgprot_t pte_pgprot(pte_t pte)
 static int find_num_contig(struct mm_struct *mm, unsigned long addr,
 			   pte_t *ptep, size_t *pgsize)
 {
-	pgd_t *pgd = pgd_offset(mm, addr);
-	pud_t *pud;
-	pmd_t *pmd;
+	pgd_t *pgdp = pgd_offset(mm, addr);
+	pud_t *pudp;
+	pmd_t *pmdp;
 
 	*pgsize = PAGE_SIZE;
-	pud = pud_offset(pgd, addr);
-	pmd = pmd_offset(pud, addr);
-	if ((pte_t *)pmd == ptep) {
+	pudp = pud_offset(pgdp, addr);
+	pmdp = pmd_offset(pudp, addr);
+	if ((pte_t *)pmdp == ptep) {
 		*pgsize = PMD_SIZE;
 		return CONT_PMDS;
 	}
@@ -181,11 +181,8 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 
 	clear_flush(mm, addr, ptep, pgsize, ncontig);
 
-	for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) {
-		pr_debug("%s: set pte %p to 0x%llx\n", __func__, ptep,
-			 pte_val(pfn_pte(pfn, hugeprot)));
+	for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
 		set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot));
-	}
 }
 
 void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -203,20 +200,20 @@ void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
 pte_t *huge_pte_alloc(struct mm_struct *mm,
 		      unsigned long addr, unsigned long sz)
 {
-	pgd_t *pgd;
-	pud_t *pud;
-	pte_t *pte = NULL;
-
-	pr_debug("%s: addr:0x%lx sz:0x%lx\n", __func__, addr, sz);
-	pgd = pgd_offset(mm, addr);
-	pud = pud_alloc(mm, pgd, addr);
-	if (!pud)
+	pgd_t *pgdp;
+	pud_t *pudp;
+	pmd_t *pmdp;
+	pte_t *ptep = NULL;
+
+	pgdp = pgd_offset(mm, addr);
+	pudp = pud_alloc(mm, pgdp, addr);
+	if (!pudp)
 		return NULL;
 
 	if (sz == PUD_SIZE) {
-		pte = (pte_t *)pud;
+		ptep = (pte_t *)pudp;
 	} else if (sz == (PAGE_SIZE * CONT_PTES)) {
-		pmd_t *pmd = pmd_alloc(mm, pud, addr);
+		pmdp = pmd_alloc(mm, pudp, addr);
 
 		WARN_ON(addr & (sz - 1));
 		/*
@@ -226,60 +223,55 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
 		 * will be no pte_unmap() to correspond with this
 		 * pte_alloc_map().
 		 */
-		pte = pte_alloc_map(mm, pmd, addr);
+		ptep = pte_alloc_map(mm, pmdp, addr);
 	} else if (sz == PMD_SIZE) {
 		if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
-		    pud_none(*pud))
-			pte = huge_pmd_share(mm, addr, pud);
+		    pud_none(READ_ONCE(*pudp)))
+			ptep = huge_pmd_share(mm, addr, pudp);
 		else
-			pte = (pte_t *)pmd_alloc(mm, pud, addr);
+			ptep = (pte_t *)pmd_alloc(mm, pudp, addr);
 	} else if (sz == (PMD_SIZE * CONT_PMDS)) {
-		pmd_t *pmd;
-
-		pmd = pmd_alloc(mm, pud, addr);
+		pmdp = pmd_alloc(mm, pudp, addr);
 		WARN_ON(addr & (sz - 1));
-		return (pte_t *)pmd;
+		return (pte_t *)pmdp;
 	}
 
-	pr_debug("%s: addr:0x%lx sz:0x%lx ret pte=%p/0x%llx\n", __func__, addr,
-	       sz, pte, pte_val(*pte));
-	return pte;
+	return ptep;
 }
 
 pte_t *huge_pte_offset(struct mm_struct *mm,
 		       unsigned long addr, unsigned long sz)
 {
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
+	pgd_t *pgdp;
+	pud_t *pudp, pud;
+	pmd_t *pmdp, pmd;
 
-	pgd = pgd_offset(mm, addr);
-	pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd);
-	if (!pgd_present(*pgd))
+	pgdp = pgd_offset(mm, addr);
+	if (!pgd_present(READ_ONCE(*pgdp)))
 		return NULL;
 
-	pud = pud_offset(pgd, addr);
-	if (sz != PUD_SIZE && pud_none(*pud))
+	pudp = pud_offset(pgdp, addr);
+	pud = READ_ONCE(*pudp);
+	if (sz != PUD_SIZE && pud_none(pud))
 		return NULL;
 	/* hugepage or swap? */
-	if (pud_huge(*pud) || !pud_present(*pud))
-		return (pte_t *)pud;
+	if (pud_huge(pud) || !pud_present(pud))
+		return (pte_t *)pudp;
 	/* table; check the next level */
 
 	if (sz == CONT_PMD_SIZE)
 		addr &= CONT_PMD_MASK;
 
-	pmd = pmd_offset(pud, addr);
+	pmdp = pmd_offset(pudp, addr);
+	pmd = READ_ONCE(*pmdp);
 	if (!(sz == PMD_SIZE || sz == CONT_PMD_SIZE) &&
-	    pmd_none(*pmd))
+	    pmd_none(pmd))
 		return NULL;
-	if (pmd_huge(*pmd) || !pmd_present(*pmd))
-		return (pte_t *)pmd;
+	if (pmd_huge(pmd) || !pmd_present(pmd))
+		return (pte_t *)pmdp;
 
-	if (sz == CONT_PTE_SIZE) {
-		pte_t *pte = pte_offset_kernel(pmd, (addr & CONT_PTE_MASK));
-		return pte;
-	}
+	if (sz == CONT_PTE_SIZE)
+		return pte_offset_kernel(pmdp, (addr & CONT_PTE_MASK));
 
 	return NULL;
 }
@@ -367,7 +359,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
 	size_t pgsize;
 	pte_t pte;
 
-	if (!pte_cont(*ptep)) {
+	if (!pte_cont(READ_ONCE(*ptep))) {
 		ptep_set_wrprotect(mm, addr, ptep);
 		return;
 	}
@@ -391,7 +383,7 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma,
 	size_t pgsize;
 	int ncontig;
 
-	if (!pte_cont(*ptep)) {
+	if (!pte_cont(READ_ONCE(*ptep))) {
 		ptep_clear_flush(vma, addr, ptep);
 		return;
 	}
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index 6e02e6fb4c7b9..dabfc1ecda3d3 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -44,92 +44,92 @@ static phys_addr_t __init kasan_alloc_zeroed_page(int node)
 	return __pa(p);
 }
 
-static pte_t *__init kasan_pte_offset(pmd_t *pmd, unsigned long addr, int node,
+static pte_t *__init kasan_pte_offset(pmd_t *pmdp, unsigned long addr, int node,
 				      bool early)
 {
-	if (pmd_none(*pmd)) {
+	if (pmd_none(READ_ONCE(*pmdp))) {
 		phys_addr_t pte_phys = early ? __pa_symbol(kasan_zero_pte)
 					     : kasan_alloc_zeroed_page(node);
-		__pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE);
+		__pmd_populate(pmdp, pte_phys, PMD_TYPE_TABLE);
 	}
 
-	return early ? pte_offset_kimg(pmd, addr)
-		     : pte_offset_kernel(pmd, addr);
+	return early ? pte_offset_kimg(pmdp, addr)
+		     : pte_offset_kernel(pmdp, addr);
 }
 
-static pmd_t *__init kasan_pmd_offset(pud_t *pud, unsigned long addr, int node,
+static pmd_t *__init kasan_pmd_offset(pud_t *pudp, unsigned long addr, int node,
 				      bool early)
 {
-	if (pud_none(*pud)) {
+	if (pud_none(READ_ONCE(*pudp))) {
 		phys_addr_t pmd_phys = early ? __pa_symbol(kasan_zero_pmd)
 					     : kasan_alloc_zeroed_page(node);
-		__pud_populate(pud, pmd_phys, PMD_TYPE_TABLE);
+		__pud_populate(pudp, pmd_phys, PMD_TYPE_TABLE);
 	}
 
-	return early ? pmd_offset_kimg(pud, addr) : pmd_offset(pud, addr);
+	return early ? pmd_offset_kimg(pudp, addr) : pmd_offset(pudp, addr);
 }
 
-static pud_t *__init kasan_pud_offset(pgd_t *pgd, unsigned long addr, int node,
+static pud_t *__init kasan_pud_offset(pgd_t *pgdp, unsigned long addr, int node,
 				      bool early)
 {
-	if (pgd_none(*pgd)) {
+	if (pgd_none(READ_ONCE(*pgdp))) {
 		phys_addr_t pud_phys = early ? __pa_symbol(kasan_zero_pud)
 					     : kasan_alloc_zeroed_page(node);
-		__pgd_populate(pgd, pud_phys, PMD_TYPE_TABLE);
+		__pgd_populate(pgdp, pud_phys, PMD_TYPE_TABLE);
 	}
 
-	return early ? pud_offset_kimg(pgd, addr) : pud_offset(pgd, addr);
+	return early ? pud_offset_kimg(pgdp, addr) : pud_offset(pgdp, addr);
 }
 
-static void __init kasan_pte_populate(pmd_t *pmd, unsigned long addr,
+static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr,
 				      unsigned long end, int node, bool early)
 {
 	unsigned long next;
-	pte_t *pte = kasan_pte_offset(pmd, addr, node, early);
+	pte_t *ptep = kasan_pte_offset(pmdp, addr, node, early);
 
 	do {
 		phys_addr_t page_phys = early ? __pa_symbol(kasan_zero_page)
 					      : kasan_alloc_zeroed_page(node);
 		next = addr + PAGE_SIZE;
-		set_pte(pte, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL));
-	} while (pte++, addr = next, addr != end && pte_none(*pte));
+		set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL));
+	} while (ptep++, addr = next, addr != end && pte_none(READ_ONCE(*ptep)));
 }
 
-static void __init kasan_pmd_populate(pud_t *pud, unsigned long addr,
+static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr,
 				      unsigned long end, int node, bool early)
 {
 	unsigned long next;
-	pmd_t *pmd = kasan_pmd_offset(pud, addr, node, early);
+	pmd_t *pmdp = kasan_pmd_offset(pudp, addr, node, early);
 
 	do {
 		next = pmd_addr_end(addr, end);
-		kasan_pte_populate(pmd, addr, next, node, early);
-	} while (pmd++, addr = next, addr != end && pmd_none(*pmd));
+		kasan_pte_populate(pmdp, addr, next, node, early);
+	} while (pmdp++, addr = next, addr != end && pmd_none(READ_ONCE(*pmdp)));
 }
 
-static void __init kasan_pud_populate(pgd_t *pgd, unsigned long addr,
+static void __init kasan_pud_populate(pgd_t *pgdp, unsigned long addr,
 				      unsigned long end, int node, bool early)
 {
 	unsigned long next;
-	pud_t *pud = kasan_pud_offset(pgd, addr, node, early);
+	pud_t *pudp = kasan_pud_offset(pgdp, addr, node, early);
 
 	do {
 		next = pud_addr_end(addr, end);
-		kasan_pmd_populate(pud, addr, next, node, early);
-	} while (pud++, addr = next, addr != end && pud_none(*pud));
+		kasan_pmd_populate(pudp, addr, next, node, early);
+	} while (pudp++, addr = next, addr != end && pud_none(READ_ONCE(*pudp)));
 }
 
 static void __init kasan_pgd_populate(unsigned long addr, unsigned long end,
 				      int node, bool early)
 {
 	unsigned long next;
-	pgd_t *pgd;
+	pgd_t *pgdp;
 
-	pgd = pgd_offset_k(addr);
+	pgdp = pgd_offset_k(addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		kasan_pud_populate(pgd, addr, next, node, early);
-	} while (pgd++, addr = next, addr != end);
+		kasan_pud_populate(pgdp, addr, next, node, early);
+	} while (pgdp++, addr = next, addr != end);
 }
 
 /* The early shadow maps everything to a single page of zeroes */
@@ -155,14 +155,14 @@ static void __init kasan_map_populate(unsigned long start, unsigned long end,
  */
 void __init kasan_copy_shadow(pgd_t *pgdir)
 {
-	pgd_t *pgd, *pgd_new, *pgd_end;
+	pgd_t *pgdp, *pgdp_new, *pgdp_end;
 
-	pgd = pgd_offset_k(KASAN_SHADOW_START);
-	pgd_end = pgd_offset_k(KASAN_SHADOW_END);
-	pgd_new = pgd_offset_raw(pgdir, KASAN_SHADOW_START);
+	pgdp = pgd_offset_k(KASAN_SHADOW_START);
+	pgdp_end = pgd_offset_k(KASAN_SHADOW_END);
+	pgdp_new = pgd_offset_raw(pgdir, KASAN_SHADOW_START);
 	do {
-		set_pgd(pgd_new, *pgd);
-	} while (pgd++, pgd_new++, pgd != pgd_end);
+		set_pgd(pgdp_new, READ_ONCE(*pgdp));
+	} while (pgdp++, pgdp_new++, pgdp != pgdp_end);
 }
 
 static void __init clear_pgds(unsigned long start,
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 4694cda823c95..3161b853f29e1 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -125,45 +125,48 @@ static bool pgattr_change_is_safe(u64 old, u64 new)
 	return ((old ^ new) & ~mask) == 0;
 }
 
-static void init_pte(pmd_t *pmd, unsigned long addr, unsigned long end,
+static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
 		     phys_addr_t phys, pgprot_t prot)
 {
-	pte_t *pte;
+	pte_t *ptep;
 
-	pte = pte_set_fixmap_offset(pmd, addr);
+	ptep = pte_set_fixmap_offset(pmdp, addr);
 	do {
-		pte_t old_pte = *pte;
+		pte_t old_pte = READ_ONCE(*ptep);
 
-		set_pte(pte, pfn_pte(__phys_to_pfn(phys), prot));
+		set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot));
 
 		/*
 		 * After the PTE entry has been populated once, we
 		 * only allow updates to the permission attributes.
 		 */
-		BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), pte_val(*pte)));
+		BUG_ON(!pgattr_change_is_safe(pte_val(old_pte),
+					      READ_ONCE(pte_val(*ptep))));
 
 		phys += PAGE_SIZE;
-	} while (pte++, addr += PAGE_SIZE, addr != end);
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
 
 	pte_clear_fixmap();
 }
 
-static void alloc_init_cont_pte(pmd_t *pmd, unsigned long addr,
+static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
 				unsigned long end, phys_addr_t phys,
 				pgprot_t prot,
 				phys_addr_t (*pgtable_alloc)(void),
 				int flags)
 {
 	unsigned long next;
+	pmd_t pmd = READ_ONCE(*pmdp);
 
-	BUG_ON(pmd_sect(*pmd));
-	if (pmd_none(*pmd)) {
+	BUG_ON(pmd_sect(pmd));
+	if (pmd_none(pmd)) {
 		phys_addr_t pte_phys;
 		BUG_ON(!pgtable_alloc);
 		pte_phys = pgtable_alloc();
-		__pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE);
+		__pmd_populate(pmdp, pte_phys, PMD_TYPE_TABLE);
+		pmd = READ_ONCE(*pmdp);
 	}
-	BUG_ON(pmd_bad(*pmd));
+	BUG_ON(pmd_bad(pmd));
 
 	do {
 		pgprot_t __prot = prot;
@@ -175,67 +178,69 @@ static void alloc_init_cont_pte(pmd_t *pmd, unsigned long addr,
 		    (flags & NO_CONT_MAPPINGS) == 0)
 			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
 
-		init_pte(pmd, addr, next, phys, __prot);
+		init_pte(pmdp, addr, next, phys, __prot);
 
 		phys += next - addr;
 	} while (addr = next, addr != end);
 }
 
-static void init_pmd(pud_t *pud, unsigned long addr, unsigned long end,
+static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end,
 		     phys_addr_t phys, pgprot_t prot,
 		     phys_addr_t (*pgtable_alloc)(void), int flags)
 {
 	unsigned long next;
-	pmd_t *pmd;
+	pmd_t *pmdp;
 
-	pmd = pmd_set_fixmap_offset(pud, addr);
+	pmdp = pmd_set_fixmap_offset(pudp, addr);
 	do {
-		pmd_t old_pmd = *pmd;
+		pmd_t old_pmd = READ_ONCE(*pmdp);
 
 		next = pmd_addr_end(addr, end);
 
 		/* try section mapping first */
 		if (((addr | next | phys) & ~SECTION_MASK) == 0 &&
 		    (flags & NO_BLOCK_MAPPINGS) == 0) {
-			pmd_set_huge(pmd, phys, prot);
+			pmd_set_huge(pmdp, phys, prot);
 
 			/*
 			 * After the PMD entry has been populated once, we
 			 * only allow updates to the permission attributes.
 			 */
 			BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
-						      pmd_val(*pmd)));
+						      READ_ONCE(pmd_val(*pmdp))));
 		} else {
-			alloc_init_cont_pte(pmd, addr, next, phys, prot,
+			alloc_init_cont_pte(pmdp, addr, next, phys, prot,
 					    pgtable_alloc, flags);
 
 			BUG_ON(pmd_val(old_pmd) != 0 &&
-			       pmd_val(old_pmd) != pmd_val(*pmd));
+			       pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
 		}
 		phys += next - addr;
-	} while (pmd++, addr = next, addr != end);
+	} while (pmdp++, addr = next, addr != end);
 
 	pmd_clear_fixmap();
 }
 
-static void alloc_init_cont_pmd(pud_t *pud, unsigned long addr,
+static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
 				unsigned long end, phys_addr_t phys,
 				pgprot_t prot,
 				phys_addr_t (*pgtable_alloc)(void), int flags)
 {
 	unsigned long next;
+	pud_t pud = READ_ONCE(*pudp);
 
 	/*
 	 * Check for initial section mappings in the pgd/pud.
 	 */
-	BUG_ON(pud_sect(*pud));
-	if (pud_none(*pud)) {
+	BUG_ON(pud_sect(pud));
+	if (pud_none(pud)) {
 		phys_addr_t pmd_phys;
 		BUG_ON(!pgtable_alloc);
 		pmd_phys = pgtable_alloc();
-		__pud_populate(pud, pmd_phys, PUD_TYPE_TABLE);
+		__pud_populate(pudp, pmd_phys, PUD_TYPE_TABLE);
+		pud = READ_ONCE(*pudp);
 	}
-	BUG_ON(pud_bad(*pud));
+	BUG_ON(pud_bad(pud));
 
 	do {
 		pgprot_t __prot = prot;
@@ -247,7 +252,7 @@ static void alloc_init_cont_pmd(pud_t *pud, unsigned long addr,
 		    (flags & NO_CONT_MAPPINGS) == 0)
 			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
 
-		init_pmd(pud, addr, next, phys, __prot, pgtable_alloc, flags);
+		init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags);
 
 		phys += next - addr;
 	} while (addr = next, addr != end);
@@ -265,25 +270,27 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next,
 	return true;
 }
 
-static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end,
-				  phys_addr_t phys, pgprot_t prot,
-				  phys_addr_t (*pgtable_alloc)(void),
-				  int flags)
+static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
+			   phys_addr_t phys, pgprot_t prot,
+			   phys_addr_t (*pgtable_alloc)(void),
+			   int flags)
 {
-	pud_t *pud;
 	unsigned long next;
+	pud_t *pudp;
+	pgd_t pgd = READ_ONCE(*pgdp);
 
-	if (pgd_none(*pgd)) {
+	if (pgd_none(pgd)) {
 		phys_addr_t pud_phys;
 		BUG_ON(!pgtable_alloc);
 		pud_phys = pgtable_alloc();
-		__pgd_populate(pgd, pud_phys, PUD_TYPE_TABLE);
+		__pgd_populate(pgdp, pud_phys, PUD_TYPE_TABLE);
+		pgd = READ_ONCE(*pgdp);
 	}
-	BUG_ON(pgd_bad(*pgd));
+	BUG_ON(pgd_bad(pgd));
 
-	pud = pud_set_fixmap_offset(pgd, addr);
+	pudp = pud_set_fixmap_offset(pgdp, addr);
 	do {
-		pud_t old_pud = *pud;
+		pud_t old_pud = READ_ONCE(*pudp);
 
 		next = pud_addr_end(addr, end);
 
@@ -292,23 +299,23 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end,
 		 */
 		if (use_1G_block(addr, next, phys) &&
 		    (flags & NO_BLOCK_MAPPINGS) == 0) {
-			pud_set_huge(pud, phys, prot);
+			pud_set_huge(pudp, phys, prot);
 
 			/*
 			 * After the PUD entry has been populated once, we
 			 * only allow updates to the permission attributes.
 			 */
 			BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
-						      pud_val(*pud)));
+						      READ_ONCE(pud_val(*pudp))));
 		} else {
-			alloc_init_cont_pmd(pud, addr, next, phys, prot,
+			alloc_init_cont_pmd(pudp, addr, next, phys, prot,
 					    pgtable_alloc, flags);
 
 			BUG_ON(pud_val(old_pud) != 0 &&
-			       pud_val(old_pud) != pud_val(*pud));
+			       pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
 		}
 		phys += next - addr;
-	} while (pud++, addr = next, addr != end);
+	} while (pudp++, addr = next, addr != end);
 
 	pud_clear_fixmap();
 }
@@ -320,7 +327,7 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
 				 int flags)
 {
 	unsigned long addr, length, end, next;
-	pgd_t *pgd = pgd_offset_raw(pgdir, virt);
+	pgd_t *pgdp = pgd_offset_raw(pgdir, virt);
 
 	/*
 	 * If the virtual and physical address don't have the same offset
@@ -336,10 +343,10 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
 	end = addr + length;
 	do {
 		next = pgd_addr_end(addr, end);
-		alloc_init_pud(pgd, addr, next, phys, prot, pgtable_alloc,
+		alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc,
 			       flags);
 		phys += next - addr;
-	} while (pgd++, addr = next, addr != end);
+	} while (pgdp++, addr = next, addr != end);
 }
 
 static phys_addr_t pgd_pgtable_alloc(void)
@@ -401,10 +408,10 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
 	flush_tlb_kernel_range(virt, virt + size);
 }
 
-static void __init __map_memblock(pgd_t *pgd, phys_addr_t start,
+static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
 				  phys_addr_t end, pgprot_t prot, int flags)
 {
-	__create_pgd_mapping(pgd, start, __phys_to_virt(start), end - start,
+	__create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start,
 			     prot, early_pgtable_alloc, flags);
 }
 
@@ -418,7 +425,7 @@ void __init mark_linear_text_alias_ro(void)
 			    PAGE_KERNEL_RO);
 }
 
-static void __init map_mem(pgd_t *pgd)
+static void __init map_mem(pgd_t *pgdp)
 {
 	phys_addr_t kernel_start = __pa_symbol(_text);
 	phys_addr_t kernel_end = __pa_symbol(__init_begin);
@@ -451,7 +458,7 @@ static void __init map_mem(pgd_t *pgd)
 		if (memblock_is_nomap(reg))
 			continue;
 
-		__map_memblock(pgd, start, end, PAGE_KERNEL, flags);
+		__map_memblock(pgdp, start, end, PAGE_KERNEL, flags);
 	}
 
 	/*
@@ -464,7 +471,7 @@ static void __init map_mem(pgd_t *pgd)
 	 * Note that contiguous mappings cannot be remapped in this way,
 	 * so we should avoid them here.
 	 */
-	__map_memblock(pgd, kernel_start, kernel_end,
+	__map_memblock(pgdp, kernel_start, kernel_end,
 		       PAGE_KERNEL, NO_CONT_MAPPINGS);
 	memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
 
@@ -475,7 +482,7 @@ static void __init map_mem(pgd_t *pgd)
 	 * through /sys/kernel/kexec_crash_size interface.
 	 */
 	if (crashk_res.end) {
-		__map_memblock(pgd, crashk_res.start, crashk_res.end + 1,
+		__map_memblock(pgdp, crashk_res.start, crashk_res.end + 1,
 			       PAGE_KERNEL,
 			       NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
 		memblock_clear_nomap(crashk_res.start,
@@ -499,7 +506,7 @@ void mark_rodata_ro(void)
 	debug_checkwx();
 }
 
-static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end,
+static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end,
 				      pgprot_t prot, struct vm_struct *vma,
 				      int flags, unsigned long vm_flags)
 {
@@ -509,7 +516,7 @@ static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end,
 	BUG_ON(!PAGE_ALIGNED(pa_start));
 	BUG_ON(!PAGE_ALIGNED(size));
 
-	__create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot,
+	__create_pgd_mapping(pgdp, pa_start, (unsigned long)va_start, size, prot,
 			     early_pgtable_alloc, flags);
 
 	if (!(vm_flags & VM_NO_GUARD))
@@ -562,7 +569,7 @@ core_initcall(map_entry_trampoline);
 /*
  * Create fine-grained mappings for the kernel.
  */
-static void __init map_kernel(pgd_t *pgd)
+static void __init map_kernel(pgd_t *pgdp)
 {
 	static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_inittext,
 				vmlinux_initdata, vmlinux_data;
@@ -578,24 +585,24 @@ static void __init map_kernel(pgd_t *pgd)
 	 * Only rodata will be remapped with different permissions later on,
 	 * all other segments are allowed to use contiguous mappings.
 	 */
-	map_kernel_segment(pgd, _text, _etext, text_prot, &vmlinux_text, 0,
+	map_kernel_segment(pgdp, _text, _etext, text_prot, &vmlinux_text, 0,
 			   VM_NO_GUARD);
-	map_kernel_segment(pgd, __start_rodata, __inittext_begin, PAGE_KERNEL,
+	map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL,
 			   &vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD);
-	map_kernel_segment(pgd, __inittext_begin, __inittext_end, text_prot,
+	map_kernel_segment(pgdp, __inittext_begin, __inittext_end, text_prot,
 			   &vmlinux_inittext, 0, VM_NO_GUARD);
-	map_kernel_segment(pgd, __initdata_begin, __initdata_end, PAGE_KERNEL,
+	map_kernel_segment(pgdp, __initdata_begin, __initdata_end, PAGE_KERNEL,
 			   &vmlinux_initdata, 0, VM_NO_GUARD);
-	map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data, 0, 0);
+	map_kernel_segment(pgdp, _data, _end, PAGE_KERNEL, &vmlinux_data, 0, 0);
 
-	if (!pgd_val(*pgd_offset_raw(pgd, FIXADDR_START))) {
+	if (!READ_ONCE(pgd_val(*pgd_offset_raw(pgdp, FIXADDR_START)))) {
 		/*
 		 * The fixmap falls in a separate pgd to the kernel, and doesn't
 		 * live in the carveout for the swapper_pg_dir. We can simply
 		 * re-use the existing dir for the fixmap.
 		 */
-		set_pgd(pgd_offset_raw(pgd, FIXADDR_START),
-			*pgd_offset_k(FIXADDR_START));
+		set_pgd(pgd_offset_raw(pgdp, FIXADDR_START),
+			READ_ONCE(*pgd_offset_k(FIXADDR_START)));
 	} else if (CONFIG_PGTABLE_LEVELS > 3) {
 		/*
 		 * The fixmap shares its top level pgd entry with the kernel
@@ -604,14 +611,15 @@ static void __init map_kernel(pgd_t *pgd)
 		 * entry instead.
 		 */
 		BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
-		pud_populate(&init_mm, pud_set_fixmap_offset(pgd, FIXADDR_START),
+		pud_populate(&init_mm,
+			     pud_set_fixmap_offset(pgdp, FIXADDR_START),
 			     lm_alias(bm_pmd));
 		pud_clear_fixmap();
 	} else {
 		BUG();
 	}
 
-	kasan_copy_shadow(pgd);
+	kasan_copy_shadow(pgdp);
 }
 
 /*
@@ -621,10 +629,10 @@ static void __init map_kernel(pgd_t *pgd)
 void __init paging_init(void)
 {
 	phys_addr_t pgd_phys = early_pgtable_alloc();
-	pgd_t *pgd = pgd_set_fixmap(pgd_phys);
+	pgd_t *pgdp = pgd_set_fixmap(pgd_phys);
 
-	map_kernel(pgd);
-	map_mem(pgd);
+	map_kernel(pgdp);
+	map_mem(pgdp);
 
 	/*
 	 * We want to reuse the original swapper_pg_dir so we don't have to
@@ -635,7 +643,7 @@ void __init paging_init(void)
 	 * To do this we need to go via a temporary pgd.
 	 */
 	cpu_replace_ttbr1(__va(pgd_phys));
-	memcpy(swapper_pg_dir, pgd, PGD_SIZE);
+	memcpy(swapper_pg_dir, pgdp, PGD_SIZE);
 	cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
 
 	pgd_clear_fixmap();
@@ -655,37 +663,40 @@ void __init paging_init(void)
  */
 int kern_addr_valid(unsigned long addr)
 {
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
+	pgd_t *pgdp;
+	pud_t *pudp, pud;
+	pmd_t *pmdp, pmd;
+	pte_t *ptep, pte;
 
 	if ((((long)addr) >> VA_BITS) != -1UL)
 		return 0;
 
-	pgd = pgd_offset_k(addr);
-	if (pgd_none(*pgd))
+	pgdp = pgd_offset_k(addr);
+	if (pgd_none(READ_ONCE(*pgdp)))
 		return 0;
 
-	pud = pud_offset(pgd, addr);
-	if (pud_none(*pud))
+	pudp = pud_offset(pgdp, addr);
+	pud = READ_ONCE(*pudp);
+	if (pud_none(pud))
 		return 0;
 
-	if (pud_sect(*pud))
-		return pfn_valid(pud_pfn(*pud));
+	if (pud_sect(pud))
+		return pfn_valid(pud_pfn(pud));
 
-	pmd = pmd_offset(pud, addr);
-	if (pmd_none(*pmd))
+	pmdp = pmd_offset(pudp, addr);
+	pmd = READ_ONCE(*pmdp);
+	if (pmd_none(pmd))
 		return 0;
 
-	if (pmd_sect(*pmd))
-		return pfn_valid(pmd_pfn(*pmd));
+	if (pmd_sect(pmd))
+		return pfn_valid(pmd_pfn(pmd));
 
-	pte = pte_offset_kernel(pmd, addr);
-	if (pte_none(*pte))
+	ptep = pte_offset_kernel(pmdp, addr);
+	pte = READ_ONCE(*ptep);
+	if (pte_none(pte))
 		return 0;
 
-	return pfn_valid(pte_pfn(*pte));
+	return pfn_valid(pte_pfn(pte));
 }
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 #if !ARM64_SWAPPER_USES_SECTION_MAPS
@@ -700,32 +711,32 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 {
 	unsigned long addr = start;
 	unsigned long next;
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
+	pgd_t *pgdp;
+	pud_t *pudp;
+	pmd_t *pmdp;
 
 	do {
 		next = pmd_addr_end(addr, end);
 
-		pgd = vmemmap_pgd_populate(addr, node);
-		if (!pgd)
+		pgdp = vmemmap_pgd_populate(addr, node);
+		if (!pgdp)
 			return -ENOMEM;
 
-		pud = vmemmap_pud_populate(pgd, addr, node);
-		if (!pud)
+		pudp = vmemmap_pud_populate(pgdp, addr, node);
+		if (!pudp)
 			return -ENOMEM;
 
-		pmd = pmd_offset(pud, addr);
-		if (pmd_none(*pmd)) {
+		pmdp = pmd_offset(pudp, addr);
+		if (pmd_none(READ_ONCE(*pmdp))) {
 			void *p = NULL;
 
 			p = vmemmap_alloc_block_buf(PMD_SIZE, node);
 			if (!p)
 				return -ENOMEM;
 
-			pmd_set_huge(pmd, __pa(p), __pgprot(PROT_SECT_NORMAL));
+			pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
 		} else
-			vmemmap_verify((pte_t *)pmd, node, addr, next);
+			vmemmap_verify((pte_t *)pmdp, node, addr, next);
 	} while (addr = next, addr != end);
 
 	return 0;
@@ -739,20 +750,22 @@ void vmemmap_free(unsigned long start, unsigned long end,
 
 static inline pud_t * fixmap_pud(unsigned long addr)
 {
-	pgd_t *pgd = pgd_offset_k(addr);
+	pgd_t *pgdp = pgd_offset_k(addr);
+	pgd_t pgd = READ_ONCE(*pgdp);
 
-	BUG_ON(pgd_none(*pgd) || pgd_bad(*pgd));
+	BUG_ON(pgd_none(pgd) || pgd_bad(pgd));
 
-	return pud_offset_kimg(pgd, addr);
+	return pud_offset_kimg(pgdp, addr);
 }
 
 static inline pmd_t * fixmap_pmd(unsigned long addr)
 {
-	pud_t *pud = fixmap_pud(addr);
+	pud_t *pudp = fixmap_pud(addr);
+	pud_t pud = READ_ONCE(*pudp);
 
-	BUG_ON(pud_none(*pud) || pud_bad(*pud));
+	BUG_ON(pud_none(pud) || pud_bad(pud));
 
-	return pmd_offset_kimg(pud, addr);
+	return pmd_offset_kimg(pudp, addr);
 }
 
 static inline pte_t * fixmap_pte(unsigned long addr)
@@ -768,30 +781,31 @@ static inline pte_t * fixmap_pte(unsigned long addr)
  */
 void __init early_fixmap_init(void)
 {
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
+	pgd_t *pgdp, pgd;
+	pud_t *pudp;
+	pmd_t *pmdp;
 	unsigned long addr = FIXADDR_START;
 
-	pgd = pgd_offset_k(addr);
+	pgdp = pgd_offset_k(addr);
+	pgd = READ_ONCE(*pgdp);
 	if (CONFIG_PGTABLE_LEVELS > 3 &&
-	    !(pgd_none(*pgd) || pgd_page_paddr(*pgd) == __pa_symbol(bm_pud))) {
+	    !(pgd_none(pgd) || pgd_page_paddr(pgd) == __pa_symbol(bm_pud))) {
 		/*
 		 * We only end up here if the kernel mapping and the fixmap
 		 * share the top level pgd entry, which should only happen on
 		 * 16k/4 levels configurations.
 		 */
 		BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
-		pud = pud_offset_kimg(pgd, addr);
+		pudp = pud_offset_kimg(pgdp, addr);
 	} else {
-		if (pgd_none(*pgd))
-			__pgd_populate(pgd, __pa_symbol(bm_pud), PUD_TYPE_TABLE);
-		pud = fixmap_pud(addr);
+		if (pgd_none(pgd))
+			__pgd_populate(pgdp, __pa_symbol(bm_pud), PUD_TYPE_TABLE);
+		pudp = fixmap_pud(addr);
 	}
-	if (pud_none(*pud))
-		__pud_populate(pud, __pa_symbol(bm_pmd), PMD_TYPE_TABLE);
-	pmd = fixmap_pmd(addr);
-	__pmd_populate(pmd, __pa_symbol(bm_pte), PMD_TYPE_TABLE);
+	if (pud_none(READ_ONCE(*pudp)))
+		__pud_populate(pudp, __pa_symbol(bm_pmd), PMD_TYPE_TABLE);
+	pmdp = fixmap_pmd(addr);
+	__pmd_populate(pmdp, __pa_symbol(bm_pte), PMD_TYPE_TABLE);
 
 	/*
 	 * The boot-ioremap range spans multiple pmds, for which
@@ -800,11 +814,11 @@ void __init early_fixmap_init(void)
 	BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
 		     != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
 
-	if ((pmd != fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)))
-	     || pmd != fixmap_pmd(fix_to_virt(FIX_BTMAP_END))) {
+	if ((pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)))
+	     || pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_END))) {
 		WARN_ON(1);
-		pr_warn("pmd %p != %p, %p\n",
-			pmd, fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)),
+		pr_warn("pmdp %p != %p, %p\n",
+			pmdp, fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)),
 			fixmap_pmd(fix_to_virt(FIX_BTMAP_END)));
 		pr_warn("fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
 			fix_to_virt(FIX_BTMAP_BEGIN));
@@ -824,16 +838,16 @@ void __set_fixmap(enum fixed_addresses idx,
 			       phys_addr_t phys, pgprot_t flags)
 {
 	unsigned long addr = __fix_to_virt(idx);
-	pte_t *pte;
+	pte_t *ptep;
 
 	BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses);
 
-	pte = fixmap_pte(addr);
+	ptep = fixmap_pte(addr);
 
 	if (pgprot_val(flags)) {
-		set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
+		set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags));
 	} else {
-		pte_clear(&init_mm, addr, pte);
+		pte_clear(&init_mm, addr, ptep);
 		flush_tlb_kernel_range(addr, addr+PAGE_SIZE);
 	}
 }
@@ -915,36 +929,36 @@ int __init arch_ioremap_pmd_supported(void)
 	return 1;
 }
 
-int pud_set_huge(pud_t *pud, phys_addr_t phys, pgprot_t prot)
+int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
 {
 	pgprot_t sect_prot = __pgprot(PUD_TYPE_SECT |
 					pgprot_val(mk_sect_prot(prot)));
 	BUG_ON(phys & ~PUD_MASK);
-	set_pud(pud, pfn_pud(__phys_to_pfn(phys), sect_prot));
+	set_pud(pudp, pfn_pud(__phys_to_pfn(phys), sect_prot));
 	return 1;
 }
 
-int pmd_set_huge(pmd_t *pmd, phys_addr_t phys, pgprot_t prot)
+int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
 {
 	pgprot_t sect_prot = __pgprot(PMD_TYPE_SECT |
 					pgprot_val(mk_sect_prot(prot)));
 	BUG_ON(phys & ~PMD_MASK);
-	set_pmd(pmd, pfn_pmd(__phys_to_pfn(phys), sect_prot));
+	set_pmd(pmdp, pfn_pmd(__phys_to_pfn(phys), sect_prot));
 	return 1;
 }
 
-int pud_clear_huge(pud_t *pud)
+int pud_clear_huge(pud_t *pudp)
 {
-	if (!pud_sect(*pud))
+	if (!pud_sect(READ_ONCE(*pudp)))
 		return 0;
-	pud_clear(pud);
+	pud_clear(pudp);
 	return 1;
 }
 
-int pmd_clear_huge(pmd_t *pmd)
+int pmd_clear_huge(pmd_t *pmdp)
 {
-	if (!pmd_sect(*pmd))
+	if (!pmd_sect(READ_ONCE(*pmdp)))
 		return 0;
-	pmd_clear(pmd);
+	pmd_clear(pmdp);
 	return 1;
 }
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index a682a0a2a0fa4..a56359373d8b3 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -29,7 +29,7 @@ static int change_page_range(pte_t *ptep, pgtable_t token, unsigned long addr,
 			void *data)
 {
 	struct page_change_data *cdata = data;
-	pte_t pte = *ptep;
+	pte_t pte = READ_ONCE(*ptep);
 
 	pte = clear_pte_bit(pte, cdata->clear_mask);
 	pte = set_pte_bit(pte, cdata->set_mask);
@@ -156,30 +156,32 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
  */
 bool kernel_page_present(struct page *page)
 {
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
+	pgd_t *pgdp;
+	pud_t *pudp, pud;
+	pmd_t *pmdp, pmd;
+	pte_t *ptep;
 	unsigned long addr = (unsigned long)page_address(page);
 
-	pgd = pgd_offset_k(addr);
-	if (pgd_none(*pgd))
+	pgdp = pgd_offset_k(addr);
+	if (pgd_none(READ_ONCE(*pgdp)))
 		return false;
 
-	pud = pud_offset(pgd, addr);
-	if (pud_none(*pud))
+	pudp = pud_offset(pgdp, addr);
+	pud = READ_ONCE(*pudp);
+	if (pud_none(pud))
 		return false;
-	if (pud_sect(*pud))
+	if (pud_sect(pud))
 		return true;
 
-	pmd = pmd_offset(pud, addr);
-	if (pmd_none(*pmd))
+	pmdp = pmd_offset(pudp, addr);
+	pmd = READ_ONCE(*pmdp);
+	if (pmd_none(pmd))
 		return false;
-	if (pmd_sect(*pmd))
+	if (pmd_sect(pmd))
 		return true;
 
-	pte = pte_offset_kernel(pmd, addr);
-	return pte_valid(*pte);
+	ptep = pte_offset_kernel(pmdp, addr);
+	return pte_valid(READ_ONCE(*ptep));
 }
 #endif /* CONFIG_HIBERNATION */
 #endif /* CONFIG_DEBUG_PAGEALLOC */

From e1a50de37860b3a93a9d643b09638db5aff47650 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 16 Feb 2018 17:04:23 +0000
Subject: [PATCH 671/676] arm64: cputype: Silence Sparse warnings

Sparse makes a fair bit of noise about our MPIDR mask being implicitly
long - let's explicitly describe it as such rather than just relying on
the value forcing automatic promotion.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cputype.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index be7bd19c87ec2..eda8c5f629fc8 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -20,7 +20,7 @@
 
 #define MPIDR_UP_BITMASK	(0x1 << 30)
 #define MPIDR_MT_BITMASK	(0x1 << 24)
-#define MPIDR_HWID_BITMASK	0xff00ffffff
+#define MPIDR_HWID_BITMASK	0xff00ffffffUL
 
 #define MPIDR_LEVEL_BITS_SHIFT	3
 #define MPIDR_LEVEL_BITS	(1 << MPIDR_LEVEL_BITS_SHIFT)

From 29fee6eed2811ff1089b30fc579a2d19d78016ab Mon Sep 17 00:00:00 2001
From: Joao Martins <joao.m.martins@oracle.com>
Date: Fri, 2 Feb 2018 17:42:33 +0000
Subject: [PATCH 672/676] xenbus: track caller request id

Commit fd8aa9095a95 ("xen: optimize xenbus driver for multiple concurrent
xenstore accesses") optimized xenbus concurrent accesses but in doing so
broke UABI of /dev/xen/xenbus. Through /dev/xen/xenbus applications are in
charge of xenbus message exchange with the correct header and body. Now,
after the mentioned commit the replies received by application will no
longer have the header req_id echoed back as it was on request (see
specification below for reference), because that particular field is being
overwritten by kernel.

struct xsd_sockmsg
{
  uint32_t type;  /* XS_??? */
  uint32_t req_id;/* Request identifier, echoed in daemon's response.  */
  uint32_t tx_id; /* Transaction id (0 if not related to a transaction). */
  uint32_t len;   /* Length of data following this. */

  /* Generally followed by nul-terminated string(s). */
};

Before there was only one request at a time so req_id could simply be
forwarded back and forth. To allow simultaneous requests we need a
different req_id for each message thus kernel keeps a monotonic increasing
counter for this field and is written on every request irrespective of
userspace value.

Forwarding again the req_id on userspace requests is not a solution because
we would open the possibility of userspace-generated req_id colliding with
kernel ones. So this patch instead takes another route which is to
artificially keep user req_id while keeping the xenbus logic as is. We do
that by saving the original req_id before xs_send(), use the private kernel
counter as req_id and then once reply comes and was validated, we restore
back the original req_id.

Cc: <stable@vger.kernel.org> # 4.11
Fixes: fd8aa9095a ("xen: optimize xenbus driver for multiple concurrent xenstore accesses")
Reported-by: Bhavesh Davda <bhavesh.davda@oracle.com>
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 drivers/xen/xenbus/xenbus.h       | 1 +
 drivers/xen/xenbus/xenbus_comms.c | 1 +
 drivers/xen/xenbus/xenbus_xs.c    | 3 +++
 3 files changed, 5 insertions(+)

diff --git a/drivers/xen/xenbus/xenbus.h b/drivers/xen/xenbus/xenbus.h
index 149c5e7efc89e..092981171df17 100644
--- a/drivers/xen/xenbus/xenbus.h
+++ b/drivers/xen/xenbus/xenbus.h
@@ -76,6 +76,7 @@ struct xb_req_data {
 	struct list_head list;
 	wait_queue_head_t wq;
 	struct xsd_sockmsg msg;
+	uint32_t caller_req_id;
 	enum xsd_sockmsg_type type;
 	char *body;
 	const struct kvec *vec;
diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c
index 5b081a01779de..d239fc3c5e3de 100644
--- a/drivers/xen/xenbus/xenbus_comms.c
+++ b/drivers/xen/xenbus/xenbus_comms.c
@@ -309,6 +309,7 @@ static int process_msg(void)
 			goto out;
 
 		if (req->state == xb_req_state_wait_reply) {
+			req->msg.req_id = req->caller_req_id;
 			req->msg.type = state.msg.type;
 			req->msg.len = state.msg.len;
 			req->body = state.body;
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
index 3e59590c7254d..3f3b29398ab8e 100644
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -227,6 +227,8 @@ static void xs_send(struct xb_req_data *req, struct xsd_sockmsg *msg)
 	req->state = xb_req_state_queued;
 	init_waitqueue_head(&req->wq);
 
+	/* Save the caller req_id and restore it later in the reply */
+	req->caller_req_id = req->msg.req_id;
 	req->msg.req_id = xs_request_enter(req);
 
 	mutex_lock(&xb_write_mutex);
@@ -310,6 +312,7 @@ static void *xs_talkv(struct xenbus_transaction t,
 	req->num_vecs = num_vecs;
 	req->cb = xs_wake_up;
 
+	msg.req_id = 0;
 	msg.tx_id = t.id;
 	msg.type = type;
 	msg.len = 0;

From 63e708f826bb21470155d37b103a75d8a9e25b18 Mon Sep 17 00:00:00 2001
From: Prarit Bhargava <prarit@redhat.com>
Date: Wed, 7 Feb 2018 18:49:23 -0500
Subject: [PATCH 673/676] x86/xen: Calculate __max_logical_packages on PV
 domains

The kernel panics on PV domains because native_smp_cpus_done() is
only called for HVM domains.

Calculate __max_logical_packages for PV domains.

Fixes: b4c0a7326f5d ("x86/smpboot: Fix __max_logical_packages estimate")
Signed-off-by: Prarit Bhargava <prarit@redhat.com>
Tested-and-reported-by: Simon Gaiser <simon@invisiblethingslab.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Dou Liyang <douly.fnst@cn.fujitsu.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: xen-devel@lists.xenproject.org
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 arch/x86/include/asm/smp.h |  1 +
 arch/x86/kernel/smpboot.c  | 10 ++++++++--
 arch/x86/xen/smp.c         |  2 ++
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 461f53d27708a..a4189762b2667 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -129,6 +129,7 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 void cpu_disable_common(void);
 void native_smp_prepare_boot_cpu(void);
 void native_smp_prepare_cpus(unsigned int max_cpus);
+void calculate_max_logical_packages(void);
 void native_smp_cpus_done(unsigned int max_cpus);
 void common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
 int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index cfc61e1d45e2d..9eee25d07586c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1281,11 +1281,10 @@ void __init native_smp_prepare_boot_cpu(void)
 	cpu_set_state_online(me);
 }
 
-void __init native_smp_cpus_done(unsigned int max_cpus)
+void __init calculate_max_logical_packages(void)
 {
 	int ncpus;
 
-	pr_debug("Boot done\n");
 	/*
 	 * Today neither Intel nor AMD support heterogenous systems so
 	 * extrapolate the boot cpu's data to all packages.
@@ -1293,6 +1292,13 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 	ncpus = cpu_data(0).booted_cores * topology_max_smt_threads();
 	__max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus);
 	pr_info("Max logical packages: %u\n", __max_logical_packages);
+}
+
+void __init native_smp_cpus_done(unsigned int max_cpus)
+{
+	pr_debug("Boot done\n");
+
+	calculate_max_logical_packages();
 
 	if (x86_has_numa_in_package)
 		set_sched_topology(x86_numa_in_package_topology);
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 77c959cf81e7c..7a43b2ae19f12 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -122,6 +122,8 @@ void __init xen_smp_cpus_done(unsigned int max_cpus)
 
 	if (xen_hvm_domain())
 		native_smp_cpus_done(max_cpus);
+	else
+		calculate_max_logical_packages();
 
 	if (xen_have_vcpu_info_placement)
 		return;

From 64d6871827b1e2ac8c9daf49f2c883378c7d50cd Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <sstabellini@kernel.org>
Date: Wed, 14 Feb 2018 10:28:23 -0800
Subject: [PATCH 674/676] pvcalls-front: introduce a per sock_mapping refcount

Introduce a per sock_mapping refcount, in addition to the existing
global refcount. Thanks to the sock_mapping refcount, we can safely wait
for it to be 1 in pvcalls_front_release before freeing an active socket,
instead of waiting for the global refcount to be 1.

Signed-off-by: Stefano Stabellini <stefano@aporeto.com>
Acked-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 drivers/xen/pvcalls-front.c | 191 +++++++++++++++---------------------
 1 file changed, 79 insertions(+), 112 deletions(-)

diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
index 753d9cb437d02..11ce470b41a5b 100644
--- a/drivers/xen/pvcalls-front.c
+++ b/drivers/xen/pvcalls-front.c
@@ -60,6 +60,7 @@ struct sock_mapping {
 	bool active_socket;
 	struct list_head list;
 	struct socket *sock;
+	atomic_t refcount;
 	union {
 		struct {
 			int irq;
@@ -93,6 +94,32 @@ struct sock_mapping {
 	};
 };
 
+static inline struct sock_mapping *pvcalls_enter_sock(struct socket *sock)
+{
+	struct sock_mapping *map;
+
+	if (!pvcalls_front_dev ||
+		dev_get_drvdata(&pvcalls_front_dev->dev) == NULL)
+		return ERR_PTR(-ENOTCONN);
+
+	map = (struct sock_mapping *)sock->sk->sk_send_head;
+	if (map == NULL)
+		return ERR_PTR(-ENOTSOCK);
+
+	pvcalls_enter();
+	atomic_inc(&map->refcount);
+	return map;
+}
+
+static inline void pvcalls_exit_sock(struct socket *sock)
+{
+	struct sock_mapping *map;
+
+	map = (struct sock_mapping *)sock->sk->sk_send_head;
+	atomic_dec(&map->refcount);
+	pvcalls_exit();
+}
+
 static inline int get_request(struct pvcalls_bedata *bedata, int *req_id)
 {
 	*req_id = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1);
@@ -369,31 +396,23 @@ int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr,
 	if (addr->sa_family != AF_INET || sock->type != SOCK_STREAM)
 		return -EOPNOTSUPP;
 
-	pvcalls_enter();
-	if (!pvcalls_front_dev) {
-		pvcalls_exit();
-		return -ENOTCONN;
-	}
+	map = pvcalls_enter_sock(sock);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
 
 	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
 
-	map = (struct sock_mapping *)sock->sk->sk_send_head;
-	if (!map) {
-		pvcalls_exit();
-		return -ENOTSOCK;
-	}
-
 	spin_lock(&bedata->socket_lock);
 	ret = get_request(bedata, &req_id);
 	if (ret < 0) {
 		spin_unlock(&bedata->socket_lock);
-		pvcalls_exit();
+		pvcalls_exit_sock(sock);
 		return ret;
 	}
 	ret = create_active(map, &evtchn);
 	if (ret < 0) {
 		spin_unlock(&bedata->socket_lock);
-		pvcalls_exit();
+		pvcalls_exit_sock(sock);
 		return ret;
 	}
 
@@ -423,7 +442,7 @@ int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr,
 	smp_rmb();
 	ret = bedata->rsp[req_id].ret;
 	bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID;
-	pvcalls_exit();
+	pvcalls_exit_sock(sock);
 	return ret;
 }
 
@@ -488,23 +507,15 @@ int pvcalls_front_sendmsg(struct socket *sock, struct msghdr *msg,
 	if (flags & (MSG_CONFIRM|MSG_DONTROUTE|MSG_EOR|MSG_OOB))
 		return -EOPNOTSUPP;
 
-	pvcalls_enter();
-	if (!pvcalls_front_dev) {
-		pvcalls_exit();
-		return -ENOTCONN;
-	}
+	map = pvcalls_enter_sock(sock);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
 	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
 
-	map = (struct sock_mapping *) sock->sk->sk_send_head;
-	if (!map) {
-		pvcalls_exit();
-		return -ENOTSOCK;
-	}
-
 	mutex_lock(&map->active.out_mutex);
 	if ((flags & MSG_DONTWAIT) && !pvcalls_front_write_todo(map)) {
 		mutex_unlock(&map->active.out_mutex);
-		pvcalls_exit();
+		pvcalls_exit_sock(sock);
 		return -EAGAIN;
 	}
 	if (len > INT_MAX)
@@ -526,7 +537,7 @@ int pvcalls_front_sendmsg(struct socket *sock, struct msghdr *msg,
 		tot_sent = sent;
 
 	mutex_unlock(&map->active.out_mutex);
-	pvcalls_exit();
+	pvcalls_exit_sock(sock);
 	return tot_sent;
 }
 
@@ -591,19 +602,11 @@ int pvcalls_front_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 	if (flags & (MSG_CMSG_CLOEXEC|MSG_ERRQUEUE|MSG_OOB|MSG_TRUNC))
 		return -EOPNOTSUPP;
 
-	pvcalls_enter();
-	if (!pvcalls_front_dev) {
-		pvcalls_exit();
-		return -ENOTCONN;
-	}
+	map = pvcalls_enter_sock(sock);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
 	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
 
-	map = (struct sock_mapping *) sock->sk->sk_send_head;
-	if (!map) {
-		pvcalls_exit();
-		return -ENOTSOCK;
-	}
-
 	mutex_lock(&map->active.in_mutex);
 	if (len > XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER))
 		len = XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER);
@@ -623,7 +626,7 @@ int pvcalls_front_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 		ret = 0;
 
 	mutex_unlock(&map->active.in_mutex);
-	pvcalls_exit();
+	pvcalls_exit_sock(sock);
 	return ret;
 }
 
@@ -637,24 +640,16 @@ int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	if (addr->sa_family != AF_INET || sock->type != SOCK_STREAM)
 		return -EOPNOTSUPP;
 
-	pvcalls_enter();
-	if (!pvcalls_front_dev) {
-		pvcalls_exit();
-		return -ENOTCONN;
-	}
+	map = pvcalls_enter_sock(sock);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
 	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
 
-	map = (struct sock_mapping *) sock->sk->sk_send_head;
-	if (map == NULL) {
-		pvcalls_exit();
-		return -ENOTSOCK;
-	}
-
 	spin_lock(&bedata->socket_lock);
 	ret = get_request(bedata, &req_id);
 	if (ret < 0) {
 		spin_unlock(&bedata->socket_lock);
-		pvcalls_exit();
+		pvcalls_exit_sock(sock);
 		return ret;
 	}
 	req = RING_GET_REQUEST(&bedata->ring, req_id);
@@ -684,7 +679,7 @@ int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID;
 
 	map->passive.status = PVCALLS_STATUS_BIND;
-	pvcalls_exit();
+	pvcalls_exit_sock(sock);
 	return 0;
 }
 
@@ -695,21 +690,13 @@ int pvcalls_front_listen(struct socket *sock, int backlog)
 	struct xen_pvcalls_request *req;
 	int notify, req_id, ret;
 
-	pvcalls_enter();
-	if (!pvcalls_front_dev) {
-		pvcalls_exit();
-		return -ENOTCONN;
-	}
+	map = pvcalls_enter_sock(sock);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
 	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
 
-	map = (struct sock_mapping *) sock->sk->sk_send_head;
-	if (!map) {
-		pvcalls_exit();
-		return -ENOTSOCK;
-	}
-
 	if (map->passive.status != PVCALLS_STATUS_BIND) {
-		pvcalls_exit();
+		pvcalls_exit_sock(sock);
 		return -EOPNOTSUPP;
 	}
 
@@ -717,7 +704,7 @@ int pvcalls_front_listen(struct socket *sock, int backlog)
 	ret = get_request(bedata, &req_id);
 	if (ret < 0) {
 		spin_unlock(&bedata->socket_lock);
-		pvcalls_exit();
+		pvcalls_exit_sock(sock);
 		return ret;
 	}
 	req = RING_GET_REQUEST(&bedata->ring, req_id);
@@ -741,7 +728,7 @@ int pvcalls_front_listen(struct socket *sock, int backlog)
 	bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID;
 
 	map->passive.status = PVCALLS_STATUS_LISTEN;
-	pvcalls_exit();
+	pvcalls_exit_sock(sock);
 	return ret;
 }
 
@@ -753,21 +740,13 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags)
 	struct xen_pvcalls_request *req;
 	int notify, req_id, ret, evtchn, nonblock;
 
-	pvcalls_enter();
-	if (!pvcalls_front_dev) {
-		pvcalls_exit();
-		return -ENOTCONN;
-	}
+	map = pvcalls_enter_sock(sock);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
 	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
 
-	map = (struct sock_mapping *) sock->sk->sk_send_head;
-	if (!map) {
-		pvcalls_exit();
-		return -ENOTSOCK;
-	}
-
 	if (map->passive.status != PVCALLS_STATUS_LISTEN) {
-		pvcalls_exit();
+		pvcalls_exit_sock(sock);
 		return -EINVAL;
 	}
 
@@ -785,13 +764,13 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags)
 			goto received;
 		}
 		if (nonblock) {
-			pvcalls_exit();
+			pvcalls_exit_sock(sock);
 			return -EAGAIN;
 		}
 		if (wait_event_interruptible(map->passive.inflight_accept_req,
 			!test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
 					  (void *)&map->passive.flags))) {
-			pvcalls_exit();
+			pvcalls_exit_sock(sock);
 			return -EINTR;
 		}
 	}
@@ -802,7 +781,7 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags)
 		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
 			  (void *)&map->passive.flags);
 		spin_unlock(&bedata->socket_lock);
-		pvcalls_exit();
+		pvcalls_exit_sock(sock);
 		return ret;
 	}
 	map2 = kzalloc(sizeof(*map2), GFP_ATOMIC);
@@ -810,7 +789,7 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags)
 		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
 			  (void *)&map->passive.flags);
 		spin_unlock(&bedata->socket_lock);
-		pvcalls_exit();
+		pvcalls_exit_sock(sock);
 		return -ENOMEM;
 	}
 	ret = create_active(map2, &evtchn);
@@ -819,7 +798,7 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags)
 		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
 			  (void *)&map->passive.flags);
 		spin_unlock(&bedata->socket_lock);
-		pvcalls_exit();
+		pvcalls_exit_sock(sock);
 		return ret;
 	}
 	list_add_tail(&map2->list, &bedata->socket_mappings);
@@ -841,13 +820,13 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags)
 	/* We could check if we have received a response before returning. */
 	if (nonblock) {
 		WRITE_ONCE(map->passive.inflight_req_id, req_id);
-		pvcalls_exit();
+		pvcalls_exit_sock(sock);
 		return -EAGAIN;
 	}
 
 	if (wait_event_interruptible(bedata->inflight_req,
 		READ_ONCE(bedata->rsp[req_id].req_id) == req_id)) {
-		pvcalls_exit();
+		pvcalls_exit_sock(sock);
 		return -EINTR;
 	}
 	/* read req_id, then the content */
@@ -862,7 +841,7 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags)
 		clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT,
 			  (void *)&map->passive.flags);
 		pvcalls_front_free_map(bedata, map2);
-		pvcalls_exit();
+		pvcalls_exit_sock(sock);
 		return -ENOMEM;
 	}
 	newsock->sk->sk_send_head = (void *)map2;
@@ -874,7 +853,7 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags)
 	clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags);
 	wake_up(&map->passive.inflight_accept_req);
 
-	pvcalls_exit();
+	pvcalls_exit_sock(sock);
 	return ret;
 }
 
@@ -965,23 +944,16 @@ __poll_t pvcalls_front_poll(struct file *file, struct socket *sock,
 	struct sock_mapping *map;
 	__poll_t ret;
 
-	pvcalls_enter();
-	if (!pvcalls_front_dev) {
-		pvcalls_exit();
+	map = pvcalls_enter_sock(sock);
+	if (IS_ERR(map))
 		return EPOLLNVAL;
-	}
 	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
 
-	map = (struct sock_mapping *) sock->sk->sk_send_head;
-	if (!map) {
-		pvcalls_exit();
-		return EPOLLNVAL;
-	}
 	if (map->active_socket)
 		ret = pvcalls_front_poll_active(file, bedata, map, wait);
 	else
 		ret = pvcalls_front_poll_passive(file, bedata, map, wait);
-	pvcalls_exit();
+	pvcalls_exit_sock(sock);
 	return ret;
 }
 
@@ -995,25 +967,20 @@ int pvcalls_front_release(struct socket *sock)
 	if (sock->sk == NULL)
 		return 0;
 
-	pvcalls_enter();
-	if (!pvcalls_front_dev) {
-		pvcalls_exit();
-		return -EIO;
+	map = pvcalls_enter_sock(sock);
+	if (IS_ERR(map)) {
+		if (PTR_ERR(map) == -ENOTCONN)
+			return -EIO;
+		else
+			return 0;
 	}
-
 	bedata = dev_get_drvdata(&pvcalls_front_dev->dev);
 
-	map = (struct sock_mapping *) sock->sk->sk_send_head;
-	if (map == NULL) {
-		pvcalls_exit();
-		return 0;
-	}
-
 	spin_lock(&bedata->socket_lock);
 	ret = get_request(bedata, &req_id);
 	if (ret < 0) {
 		spin_unlock(&bedata->socket_lock);
-		pvcalls_exit();
+		pvcalls_exit_sock(sock);
 		return ret;
 	}
 	sock->sk->sk_send_head = NULL;
@@ -1043,10 +1010,10 @@ int pvcalls_front_release(struct socket *sock)
 		/*
 		 * We need to make sure that sendmsg/recvmsg on this socket have
 		 * not started before we've cleared sk_send_head here. The
-		 * easiest (though not optimal) way to guarantee this is to see
-		 * that no pvcall (other than us) is in progress.
+		 * easiest way to guarantee this is to see that no pvcalls
+		 * (other than us) is in progress on this socket.
 		 */
-		while (atomic_read(&pvcalls_refcount) > 1)
+		while (atomic_read(&map->refcount) > 1)
 			cpu_relax();
 
 		pvcalls_front_free_map(bedata, map);

From d1a75e0896f5e9f5cb6a979caaea39f1f4b9feb1 Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <sstabellini@kernel.org>
Date: Wed, 14 Feb 2018 10:28:24 -0800
Subject: [PATCH 675/676] pvcalls-front: wait for other operations to return
 when release passive sockets

Passive sockets can have ongoing operations on them, specifically, we
have two wait_event_interruptable calls in pvcalls_front_accept.

Add two wake_up calls in pvcalls_front_release, then wait for the
potential waiters to return and release the sock_mapping refcount.

Signed-off-by: Stefano Stabellini <stefano@aporeto.com>
Acked-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
 drivers/xen/pvcalls-front.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
index 11ce470b41a5b..aedbee3b28386 100644
--- a/drivers/xen/pvcalls-front.c
+++ b/drivers/xen/pvcalls-front.c
@@ -1018,6 +1018,12 @@ int pvcalls_front_release(struct socket *sock)
 
 		pvcalls_front_free_map(bedata, map);
 	} else {
+		wake_up(&bedata->inflight_req);
+		wake_up(&map->passive.inflight_accept_req);
+
+		while (atomic_read(&map->refcount) > 1)
+			cpu_relax();
+
 		spin_lock(&bedata->socket_lock);
 		list_del(&map->list);
 		spin_unlock(&bedata->socket_lock);

From 91ab883eb21325ad80f3473633f794c78ac87f51 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 18 Feb 2018 17:29:42 -0800
Subject: [PATCH 676/676] Linux 4.16-rc2

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 79ad2bfa24b68..d9cf3a40eda9d 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 VERSION = 4
 PATCHLEVEL = 16
 SUBLEVEL = 0
-EXTRAVERSION = -rc1
+EXTRAVERSION = -rc2
 NAME = Fearless Coyote
 
 # *DOCUMENTATION*