From 817a824b75b1475f1b067c8cee318c7b4d66fcde Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 26 Feb 2010 17:16:00 +0000
Subject: [PATCH 1/5] x86, xen: Disable highmem PTE allocation even when
 CONFIG_HIGHPTE=y
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There's a path in the page fault code where the kernel deliberately
breaks its own locking rules by kmapping a highmem PTE page without
holding the page table lock (in at least page_check_address). This
breaks Xen's ability to track the pinned/unpinned state of the
page. There does not appear to be a viable workaround for this
behaviour, so simply disable HIGHPTE for all Xen guests.
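
For reference, user PTE pages are allocated through the
__userpte_alloc_gfp mask introduced by 14315592 ("Allow highmem user
page tables to be disabled at boot time"), so clearing __GFP_HIGHMEM
once at start-of-day is sufficient. A sketch of that allocation path
(after arch/x86/mm/pgtable.c, trimmed for illustration):

    pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
    {
            struct page *pte;

            /* Under CONFIG_HIGHPTE the mask normally includes
             * __GFP_HIGHMEM; Xen clears that bit in xen_start_kernel(),
             * so PTE pages always keep a permanent lowmem mapping. */
            pte = alloc_pages(__userpte_alloc_gfp, 0);
            if (pte)
                    pgtable_page_ctor(pte);
            return pte;
    }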

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
LKML-Reference: <1267204562-11844-1-git-send-email-ian.campbell@citrix.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Pasi Kärkkäinen <pasik@iki.fi>
Cc: <stable@kernel.org> # .32.x: 14315592: Allow highmem user page tables to be disabled at boot time
Cc: <stable@kernel.org> # .32.x
Cc: <xen-devel@lists.xensource.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/xen/enlighten.c |  7 +++++++
 arch/x86/xen/mmu.c       | 11 ++++++-----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 36daccb686420..b607239c1ba83 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -50,6 +50,7 @@
 #include <asm/traps.h>
 #include <asm/setup.h>
 #include <asm/desc.h>
+#include <asm/pgalloc.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/reboot.h>
@@ -1094,6 +1095,12 @@ asmlinkage void __init xen_start_kernel(void)
 
 	__supported_pte_mask |= _PAGE_IOMAP;
 
+	/*
+	 * Prevent page tables from being allocated in highmem, even
+	 * if CONFIG_HIGHPTE is enabled.
+	 */
+	__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
+
 	/* Work out if we support NX */
 	x86_configure_nx();
 
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index bf4cd6bfe959f..350a3deedf254 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1432,14 +1432,15 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
 {
 	pgprot_t prot = PAGE_KERNEL;
 
+	/*
+	 * We disable highmem allocations for page tables so we should never
+	 * see any calls to kmap_atomic_pte on a highmem page.
+	 */
+	BUG_ON(PageHighMem(page));
+
 	if (PagePinned(page))
 		prot = PAGE_KERNEL_RO;
 
-	if (0 && PageHighMem(page))
-		printk("mapping highpte %lx type %d prot %s\n",
-		       page_to_pfn(page), type,
-		       (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
-
 	return kmap_atomic_prot(page, type, prot);
 }
 #endif

From 3249b7e1df6380e9d7bb3238f64f445bf614f787 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 26 Feb 2010 17:16:01 +0000
Subject: [PATCH 2/5] x86, vmi: Disable highmem PTE allocation even when
 CONFIG_HIGHPTE=y

Preventing HIGHPTE allocations under VMI will allow us to remove the
kmap_atomic_pte paravirt op.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
LKML-Reference: <1267204562-11844-2-git-send-email-ian.campbell@citrix.com>
Acked-by: Alok Kataria <akataria@vmware.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/vmi_32.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index d430e4c301930..58aca86193e58 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -33,6 +33,7 @@
 #include <asm/fixmap.h>
 #include <asm/apicdef.h>
 #include <asm/apic.h>
+#include <asm/pgalloc.h>
 #include <asm/processor.h>
 #include <asm/timer.h>
 #include <asm/vmi_time.h>
@@ -272,19 +273,11 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
 	void *va = kmap_atomic(page, type);
 
 	/*
-	 * Internally, the VMI ROM must map virtual addresses to physical
-	 * addresses for processing MMU updates.  By the time MMU updates
-	 * are issued, this information is typically already lost.
-	 * Fortunately, the VMI provides a cache of mapping slots for active
-	 * page tables.
-	 *
-	 * We use slot zero for the linear mapping of physical memory, and
-	 * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1.
-	 *
-	 *  args:                 SLOT                 VA    COUNT PFN
+	 * We disable highmem allocations for page tables so we should never
+	 * see any calls to kmap_atomic_pte on a highmem page.
 	 */
-	BUG_ON(type != KM_PTE0 && type != KM_PTE1);
-	vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
+
+	BUG_ON(PageHighMem(page));
 
 	return va;
 }
@@ -640,6 +633,12 @@ static inline int __init activate_vmi(void)
 	u64 reloc;
 	const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
 
+	/*
+	 * Prevent page tables from being allocated in highmem, even if
+	 * CONFIG_HIGHPTE is enabled.
+	 */
+	__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
+
 	if (call_vrom_func(vmi_rom, vmi_init) != 0) {
 		printk(KERN_ERR "VMI ROM failed to initialize!");
 		return 0;

From dad52fc01161afcb8798c609e009aed4d104927f Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 26 Feb 2010 17:16:02 +0000
Subject: [PATCH 3/5] x86, paravirt: Remove kmap_atomic_pte paravirt op.

Now that both Xen and VMI disable allocations of PTE pages from high
memory, this paravirt op serves no further purpose.

This effectively reverts ce6234b5 "add kmap_atomic_pte for mapping
highpte pages".
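
With PTE pages confined to lowmem, the generic kmap_atomic() that
replaces this op reduces to the linear mapping, so there is nothing
hypervisor-specific left to hook. A sketch of the relevant check
(after arch/x86/mm/highmem_32.c, trimmed for illustration):

    void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
    {
            pagefault_disable();

            /* Always taken for PTE pages now that they can no longer
             * be allocated from highmem. */
            if (!PageHighMem(page))
                    return page_address(page);

            /* ... fixmap-based mapping for genuine highmem pages ... */
    }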

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
LKML-Reference: <1267204562-11844-3-git-send-email-ian.campbell@citrix.com>
Acked-by: Alok Kataria <akataria@vmware.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/highmem.h        |  4 ----
 arch/x86/include/asm/paravirt.h       |  9 ---------
 arch/x86/include/asm/paravirt_types.h |  4 ----
 arch/x86/include/asm/pgtable_32.h     |  4 ++--
 arch/x86/kernel/paravirt.c            |  4 ----
 arch/x86/kernel/vmi_32.c              | 20 --------------------
 arch/x86/xen/mmu.c                    | 22 ----------------------
 7 files changed, 2 insertions(+), 65 deletions(-)

diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 014c2b85ae45e..a726650fc80fb 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -66,10 +66,6 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
 void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
 struct page *kmap_atomic_to_page(void *ptr);
 
-#ifndef CONFIG_PARAVIRT
-#define kmap_atomic_pte(page, type)	kmap_atomic(page, type)
-#endif
-
 #define flush_cache_kmaps()	do { } while (0)
 
 extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index dd59a85a918fc..5653f43d90e53 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -435,15 +435,6 @@ static inline void paravirt_release_pud(unsigned long pfn)
 	PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
 }
 
-#ifdef CONFIG_HIGHPTE
-static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
-{
-	unsigned long ret;
-	ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type);
-	return (void *)ret;
-}
-#endif
-
 static inline void pte_update(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep)
 {
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b1e70d51e40c8..db9ef55323417 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -304,10 +304,6 @@ struct pv_mmu_ops {
 #endif	/* PAGETABLE_LEVELS == 4 */
 #endif	/* PAGETABLE_LEVELS >= 3 */
 
-#ifdef CONFIG_HIGHPTE
-	void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
-#endif
-
 	struct pv_lazy_ops lazy_mode;
 
 	/* dom0 ops */
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 01fd9461d323b..b422d2201af3c 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -54,10 +54,10 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
 	 in_irq() ? KM_IRQ_PTE :	\
 	 KM_PTE0)
 #define pte_offset_map(dir, address)					\
-	((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) +		\
+	((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) +		\
 	 pte_index((address)))
 #define pte_offset_map_nested(dir, address)				\
-	((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) +		\
+	((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) +		\
 	 pte_index((address)))
 #define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE)
 #define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b1739d163102..1db183ed7c01c 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -428,10 +428,6 @@ struct pv_mmu_ops pv_mmu_ops = {
 	.ptep_modify_prot_start = __ptep_modify_prot_start,
 	.ptep_modify_prot_commit = __ptep_modify_prot_commit,
 
-#ifdef CONFIG_HIGHPTE
-	.kmap_atomic_pte = kmap_atomic,
-#endif
-
 #if PAGETABLE_LEVELS >= 3
 #ifdef CONFIG_X86_PAE
 	.set_pte_atomic = native_set_pte_atomic,
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 58aca86193e58..7dd599deca4a7 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -267,22 +267,6 @@ static void vmi_nop(void)
 {
 }
 
-#ifdef CONFIG_HIGHPTE
-static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
-{
-	void *va = kmap_atomic(page, type);
-
-	/*
-	 * We disable highmem allocations for page tables so we should never
-	 * see any calls to kmap_atomic_pte on a highmem page.
-	 */
-
-	BUG_ON(PageHighMem(page));
-
-	return va;
-}
-#endif
-
 static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
 {
 	vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
@@ -777,10 +761,6 @@ static inline int __init activate_vmi(void)
 
 	/* Set linear is needed in all cases */
 	vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
-#ifdef CONFIG_HIGHPTE
-	if (vmi_ops.set_linear_mapping)
-		pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
-#endif
 
 	/*
 	 * These MUST always be patched.  Don't support indirect jumps
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 350a3deedf254..f9eb7de74f429 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1427,24 +1427,6 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
 #endif
 }
 
-#ifdef CONFIG_HIGHPTE
-static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
-{
-	pgprot_t prot = PAGE_KERNEL;
-
-	/*
-	 * We disable highmem allocations for page tables so we should never
-	 * see any calls to kmap_atomic_pte on a highmem page.
-	 */
-	BUG_ON(PageHighMem(page));
-
-	if (PagePinned(page))
-		prot = PAGE_KERNEL_RO;
-
-	return kmap_atomic_prot(page, type, prot);
-}
-#endif
-
 #ifdef CONFIG_X86_32
 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
 {
@@ -1903,10 +1885,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
 	.alloc_pmd_clone = paravirt_nop,
 	.release_pmd = xen_release_pmd_init,
 
-#ifdef CONFIG_HIGHPTE
-	.kmap_atomic_pte = xen_kmap_atomic_pte,
-#endif
-
 #ifdef CONFIG_X86_64
 	.set_pte = xen_set_pte,
 #else

From 37b99dd5372cff42f83210c280f314f10f99138e Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Mon, 1 Mar 2010 21:55:51 +0800
Subject: [PATCH 4/5] resource: Fix generic page_is_ram() for partial RAM pages

The System RAM walk should skip partial RAM pages and avoid calling
func() on them, so that page_is_ram() returns 0 for a partial RAM page.

In particular, it must never call func() with len=0. This fixes a
boot-time bug reported by Sachin and root-caused by Thomas:

> >>> WARNING: at arch/x86/mm/ioremap.c:111 __ioremap_caller+0x169/0x2f1()
> >>> Hardware name: BladeCenter LS21 -[79716AA]-
> >>> Modules linked in:
> >>> Pid: 0, comm: swapper Not tainted 2.6.33-git6-autotest #1
> >>> Call Trace:
> >>> [<ffffffff81047cff>] ? __ioremap_caller+0x169/0x2f1
> >>> [<ffffffff81063b7d>] warn_slowpath_common+0x77/0xa4
> >>> [<ffffffff81063bb9>] warn_slowpath_null+0xf/0x11
> >>> [<ffffffff81047cff>] __ioremap_caller+0x169/0x2f1
> >>> [<ffffffff813747a3>] ? acpi_os_map_memory+0x12/0x1b
> >>> [<ffffffff81047f10>] ioremap_nocache+0x12/0x14
> >>> [<ffffffff813747a3>] acpi_os_map_memory+0x12/0x1b
> >>> [<ffffffff81282fa0>] acpi_tb_verify_table+0x29/0x5b
> >>> [<ffffffff812827f0>] acpi_load_tables+0x39/0x15a
> >>> [<ffffffff8191c8f8>] acpi_early_init+0x60/0xf5
> >>> [<ffffffff818f2cad>] start_kernel+0x397/0x3a7
> >>> [<ffffffff818f2295>] x86_64_start_reservations+0xa5/0xa9
> >>> [<ffffffff818f237a>] x86_64_start_kernel+0xe1/0xe8
> >>> ---[ end trace 4eaa2a86a8e2da22 ]---
> >>> ioremap reserve_memtype failed -22

The return code is -EINVAL, so it failed in the is_ram check, which
is not too surprising:

> BIOS-provided physical RAM map:
>  BIOS-e820: 0000000000000000 - 000000000009c000 (usable)
>  BIOS-e820: 000000000009c000 - 00000000000a0000 (reserved)
>  BIOS-e820: 00000000000e0000 - 0000000000100000 (reserved)
>  BIOS-e820: 0000000000100000 - 00000000cffa3900 (usable)
>  BIOS-e820: 00000000cffa3900 - 00000000cffa7400 (ACPI data)

The ACPI data does not start on a page boundary, and neither does the
usable RAM area end on one. Very useful!

> ACPI: DSDT 00000000cffa3900 036CE (v01 IBM    SERLEWIS 00001000 INTL 20060912)

ACPI is trying to map the DSDT at cffa3900, which results in a check
against cffa3000, the relevant page boundary. The generic is_ram
check correctly identifies that page as RAM because it lies in the
usable resource area. The old e820-based is_ram check does not take
overlapping resource areas into account, which is why it happens to
work.
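
For illustration, tracing page_is_ram() for the partial page (pfn
0xcffa3, assuming 4k pages and the e820 map above):

    clipped resource:  res.start = 0xcffa3000, res.end = 0xcffa38ff

    old code: pfn = 0xcffa3, len = (0xcffa38ff + 1 - 0xcffa3000) >> 12 = 0
              func() is called with len=0 and reports the partial page
              as RAM, so the ioremap() of the ACPI table fails
    new code: pfn = 0xcffa3, end_pfn = (0xcffa38ff + 1) >> 12 = 0xcffa3
              end_pfn == pfn, so func() is never called, the partial
              page is not treated as RAM, and the ioremap() succeeds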

CC: Sachin Sant <sachinp@in.ibm.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
LKML-Reference: <20100301135551.GA9998@localhost>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 kernel/resource.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/kernel/resource.c b/kernel/resource.c
index 03c897f7935e1..8f0e3d0f4bff2 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -274,7 +274,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
 		void *arg, int (*func)(unsigned long, unsigned long, void *))
 {
 	struct resource res;
-	unsigned long pfn, len;
+	unsigned long pfn, end_pfn;
 	u64 orig_end;
 	int ret = -1;
 
@@ -284,9 +284,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
 	orig_end = res.end;
 	while ((res.start < res.end) &&
 		(find_next_system_ram(&res, "System RAM") >= 0)) {
-		pfn = (unsigned long)(res.start >> PAGE_SHIFT);
-		len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT);
-		ret = (*func)(pfn, len, arg);
+		pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		end_pfn = (res.end + 1) >> PAGE_SHIFT;
+		if (end_pfn > pfn)
+		    ret = (*func)(pfn, end_pfn - pfn, arg);
 		if (ret)
 			break;
 		res.start = res.end + 1;

From f41496607e03ab99f263b8e26689ad0fc853007f Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 2 Mar 2010 11:21:09 -0800
Subject: [PATCH 5/5] resource: Fix broken indentation

Fix the broken indentation introduced by commit
37b99dd5372cff42f83210c280f314f10f99138e.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Wu Fengguang <fengguang.wu@intel.com>
LKML-Reference: <20100301135551.GA9998@localhost>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 kernel/resource.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/resource.c b/kernel/resource.c
index 8f0e3d0f4bff2..91f430fd467ec 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -287,7 +287,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
 		pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
 		end_pfn = (res.end + 1) >> PAGE_SHIFT;
 		if (end_pfn > pfn)
-		    ret = (*func)(pfn, end_pfn - pfn, arg);
+			ret = (*func)(pfn, end_pfn - pfn, arg);
 		if (ret)
 			break;
 		res.start = res.end + 1;