From 3b07cabd0e3d214f9008c8623d86ab647516b40d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 30 Jan 2009 16:32:22 +0900 Subject: [PATCH] --- yaml --- r: 136706 b: refs/heads/master c: 3ac6cffea4aa18007a454a7442da2855882f403d h: refs/heads/master v: v3 --- [refs] | 2 +- trunk/arch/x86/include/asm/page.h | 3 +- trunk/arch/x86/include/asm/paravirt.h | 453 +++++-------- trunk/arch/x86/include/asm/pgtable.h | 38 +- trunk/arch/x86/kernel/entry_64.S | 2 +- trunk/arch/x86/kernel/paravirt.c | 55 +- trunk/arch/x86/kernel/paravirt_patch_32.c | 12 - trunk/arch/x86/kernel/paravirt_patch_64.c | 15 - trunk/arch/x86/kernel/vmi_32.c | 9 +- trunk/arch/x86/kernel/vsmp_64.c | 12 +- trunk/arch/x86/lguest/boot.c | 13 +- trunk/arch/x86/xen/enlighten.c | 740 ++++++++++++++++++++- trunk/arch/x86/xen/irq.c | 14 +- trunk/arch/x86/xen/mmu.c | 745 ---------------------- trunk/arch/x86/xen/mmu.h | 3 - trunk/arch/x86/xen/xen-ops.h | 8 - trunk/include/asm-generic/vmlinux.lds.h | 22 +- 17 files changed, 950 insertions(+), 1196 deletions(-) diff --git a/[refs] b/[refs] index fe8b3696161b..ff955c5775f5 100644 --- a/[refs] +++ b/[refs] @@ -1,2 +1,2 @@ --- -refs/heads/master: 1f4f931501e9270c156d05ee76b7b872de486304 +refs/heads/master: 3ac6cffea4aa18007a454a7442da2855882f403d diff --git a/trunk/arch/x86/include/asm/page.h b/trunk/arch/x86/include/asm/page.h index 6b9810859daf..e9873a2e8695 100644 --- a/trunk/arch/x86/include/asm/page.h +++ b/trunk/arch/x86/include/asm/page.h @@ -147,7 +147,7 @@ static inline pteval_t native_pte_val(pte_t pte) return pte.pte; } -static inline pteval_t pte_flags(pte_t pte) +static inline pteval_t native_pte_flags(pte_t pte) { return native_pte_val(pte) & PTE_FLAGS_MASK; } @@ -173,6 +173,7 @@ static inline pteval_t pte_flags(pte_t pte) #endif #define pte_val(x) native_pte_val(x) +#define pte_flags(x) native_pte_flags(x) #define __pte(x) native_make_pte(x) #endif /* CONFIG_PARAVIRT */ diff --git a/trunk/arch/x86/include/asm/paravirt.h b/trunk/arch/x86/include/asm/paravirt.h index c85e7475e171..c26c6bf4da00 100644 --- a/trunk/arch/x86/include/asm/paravirt.h +++ b/trunk/arch/x86/include/asm/paravirt.h @@ -12,38 +12,21 @@ #define CLBR_EAX (1 << 0) #define CLBR_ECX (1 << 1) #define CLBR_EDX (1 << 2) -#define CLBR_EDI (1 << 3) -#ifdef CONFIG_X86_32 -/* CLBR_ANY should match all regs platform has. For i386, that's just it */ -#define CLBR_ANY ((1 << 4) - 1) - -#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) -#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) -#define CLBR_SCRATCH (0) -#else -#define CLBR_RAX CLBR_EAX -#define CLBR_RCX CLBR_ECX -#define CLBR_RDX CLBR_EDX -#define CLBR_RDI CLBR_EDI -#define CLBR_RSI (1 << 4) +#ifdef CONFIG_X86_64 +#define CLBR_RSI (1 << 3) +#define CLBR_RDI (1 << 4) #define CLBR_R8 (1 << 5) #define CLBR_R9 (1 << 6) #define CLBR_R10 (1 << 7) #define CLBR_R11 (1 << 8) - #define CLBR_ANY ((1 << 9) - 1) - -#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ - CLBR_RCX | CLBR_R8 | CLBR_R9) -#define CLBR_RET_REG (CLBR_RAX) -#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11) - #include +#else +/* CLBR_ANY should match all regs platform has. For i386, that's just it */ +#define CLBR_ANY ((1 << 3) - 1) #endif /* X86_64 */ -#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG) - #ifndef __ASSEMBLY__ #include #include @@ -57,14 +40,6 @@ struct tss_struct; struct mm_struct; struct desc_struct; -/* - * Wrapper type for pointers to code which uses the non-standard - * calling convention. See PV_CALL_SAVE_REGS_THUNK below. - */ -struct paravirt_callee_save { - void *func; -}; - /* general info */ struct pv_info { unsigned int kernel_rpl; @@ -214,15 +189,11 @@ struct pv_irq_ops { * expected to use X86_EFLAGS_IF; all other bits * returned from save_fl are undefined, and may be ignored by * restore_fl. - * - * NOTE: These functions callers expect the callee to preserve - * more registers than the standard C calling convention. */ - struct paravirt_callee_save save_fl; - struct paravirt_callee_save restore_fl; - struct paravirt_callee_save irq_disable; - struct paravirt_callee_save irq_enable; - + unsigned long (*save_fl)(void); + void (*restore_fl)(unsigned long); + void (*irq_disable)(void); + void (*irq_enable)(void); void (*safe_halt)(void); void (*halt)(void); @@ -308,11 +279,12 @@ struct pv_mmu_ops { void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); - struct paravirt_callee_save pte_val; - struct paravirt_callee_save make_pte; + pteval_t (*pte_val)(pte_t); + pteval_t (*pte_flags)(pte_t); + pte_t (*make_pte)(pteval_t pte); - struct paravirt_callee_save pgd_val; - struct paravirt_callee_save make_pgd; + pgdval_t (*pgd_val)(pgd_t); + pgd_t (*make_pgd)(pgdval_t pgd); #if PAGETABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE @@ -327,12 +299,12 @@ struct pv_mmu_ops { void (*set_pud)(pud_t *pudp, pud_t pudval); - struct paravirt_callee_save pmd_val; - struct paravirt_callee_save make_pmd; + pmdval_t (*pmd_val)(pmd_t); + pmd_t (*make_pmd)(pmdval_t pmd); #if PAGETABLE_LEVELS == 4 - struct paravirt_callee_save pud_val; - struct paravirt_callee_save make_pud; + pudval_t (*pud_val)(pud_t); + pud_t (*make_pud)(pudval_t pud); void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); #endif /* PAGETABLE_LEVELS == 4 */ @@ -417,8 +389,6 @@ extern struct pv_lock_ops pv_lock_ops; asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") unsigned paravirt_patch_nop(void); -unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); -unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); unsigned paravirt_patch_ignore(unsigned len); unsigned paravirt_patch_call(void *insnbuf, const void *target, u16 tgt_clobbers, @@ -510,45 +480,25 @@ int paravirt_disable_iospace(void); * makes sure the incoming and outgoing types are always correct. */ #ifdef CONFIG_X86_32 -#define PVOP_VCALL_ARGS \ - unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx +#define PVOP_VCALL_ARGS unsigned long __eax, __edx, __ecx #define PVOP_CALL_ARGS PVOP_VCALL_ARGS - -#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) -#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x)) -#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x)) - #define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ "=c" (__ecx) #define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS - -#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx) -#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS - #define EXTRA_CLOBBERS #define VEXTRA_CLOBBERS -#else /* CONFIG_X86_64 */ -#define PVOP_VCALL_ARGS \ - unsigned long __edi = __edi, __esi = __esi, \ - __edx = __edx, __ecx = __ecx +#else +#define PVOP_VCALL_ARGS unsigned long __edi, __esi, __edx, __ecx #define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax - -#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) -#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) -#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x)) -#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x)) - #define PVOP_VCALL_CLOBBERS "=D" (__edi), \ "=S" (__esi), "=d" (__edx), \ "=c" (__ecx) -#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) -#define PVOP_VCALLEE_CLOBBERS "=a" (__eax) -#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS +#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) #define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" #define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" -#endif /* CONFIG_X86_32 */ +#endif #ifdef CONFIG_PARAVIRT_DEBUG #define PVOP_TEST_NULL(op) BUG_ON(op == NULL) @@ -556,11 +506,10 @@ int paravirt_disable_iospace(void); #define PVOP_TEST_NULL(op) ((void)op) #endif -#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ - pre, post, ...) \ +#define __PVOP_CALL(rettype, op, pre, post, ...) \ ({ \ rettype __ret; \ - PVOP_CALL_ARGS; \ + PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ /* This is 32-bit specific, but is okay in 64-bit */ \ /* since this condition will never hold */ \ @@ -568,113 +517,70 @@ int paravirt_disable_iospace(void); asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr \ + : PVOP_CALL_CLOBBERS \ : paravirt_type(op), \ - paravirt_clobber(clbr), \ + paravirt_clobber(CLBR_ANY), \ ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ + : "memory", "cc" EXTRA_CLOBBERS); \ __ret = (rettype)((((u64)__edx) << 32) | __eax); \ } else { \ asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr \ + : PVOP_CALL_CLOBBERS \ : paravirt_type(op), \ - paravirt_clobber(clbr), \ + paravirt_clobber(CLBR_ANY), \ ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ + : "memory", "cc" EXTRA_CLOBBERS); \ __ret = (rettype)__eax; \ } \ __ret; \ }) - -#define __PVOP_CALL(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \ - EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__) - -#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ - PVOP_CALLEE_CLOBBERS, , \ - pre, post, ##__VA_ARGS__) - - -#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \ +#define __PVOP_VCALL(op, pre, post, ...) \ ({ \ PVOP_VCALL_ARGS; \ PVOP_TEST_NULL(op); \ asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr \ + : PVOP_VCALL_CLOBBERS \ : paravirt_type(op), \ - paravirt_clobber(clbr), \ + paravirt_clobber(CLBR_ANY), \ ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ + : "memory", "cc" VEXTRA_CLOBBERS); \ }) -#define __PVOP_VCALL(op, pre, post, ...) \ - ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ - VEXTRA_CLOBBERS, \ - pre, post, ##__VA_ARGS__) - -#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ - PVOP_VCALLEE_CLOBBERS, , \ - pre, post, ##__VA_ARGS__) - - - #define PVOP_CALL0(rettype, op) \ __PVOP_CALL(rettype, op, "", "") #define PVOP_VCALL0(op) \ __PVOP_VCALL(op, "", "") -#define PVOP_CALLEE0(rettype, op) \ - __PVOP_CALLEESAVE(rettype, op, "", "") -#define PVOP_VCALLEE0(op) \ - __PVOP_VCALLEESAVE(op, "", "") - - #define PVOP_CALL1(rettype, op, arg1) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) + __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1))) #define PVOP_VCALL1(op, arg1) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1)) - -#define PVOP_CALLEE1(rettype, op, arg1) \ - __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) -#define PVOP_VCALLEE1(op, arg1) \ - __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1)) - + __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1))) #define PVOP_CALL2(rettype, op, arg1, arg2) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) + __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ + "1" ((unsigned long)(arg2))) #define PVOP_VCALL2(op, arg1, arg2) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) - -#define PVOP_CALLEE2(rettype, op, arg1, arg2) \ - __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) -#define PVOP_VCALLEE2(op, arg1, arg2) \ - __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) - + __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ + "1" ((unsigned long)(arg2))) #define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) + __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ + "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3))) #define PVOP_VCALL3(op, arg1, arg2, arg3) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) + __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ + "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3))) /* This is the only difference in x86_64. We can make it much simpler */ #ifdef CONFIG_X86_32 #define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ __PVOP_CALL(rettype, op, \ "push %[_arg4];", "lea 4(%%esp),%%esp;", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4))) + "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ + "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) #define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ __PVOP_VCALL(op, \ "push %[_arg4];", "lea 4(%%esp),%%esp;", \ @@ -682,13 +588,13 @@ int paravirt_disable_iospace(void); "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) #else #define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ - __PVOP_CALL(rettype, op, "", "", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) + __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \ + "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \ + "3"((unsigned long)(arg4))) #define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ - __PVOP_VCALL(op, "", "", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) + __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \ + "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \ + "3"((unsigned long)(arg4))) #endif static inline int paravirt_enabled(void) @@ -1155,13 +1061,13 @@ static inline pte_t __pte(pteval_t val) pteval_t ret; if (sizeof(pteval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pteval_t, - pv_mmu_ops.make_pte, - val, (u64)val >> 32); + ret = PVOP_CALL2(pteval_t, + pv_mmu_ops.make_pte, + val, (u64)val >> 32); else - ret = PVOP_CALLEE1(pteval_t, - pv_mmu_ops.make_pte, - val); + ret = PVOP_CALL1(pteval_t, + pv_mmu_ops.make_pte, + val); return (pte_t) { .pte = ret }; } @@ -1171,12 +1077,29 @@ static inline pteval_t pte_val(pte_t pte) pteval_t ret; if (sizeof(pteval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pteval_t, pv_mmu_ops.pte_val, - pte.pte, (u64)pte.pte >> 32); + ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_val, + pte.pte, (u64)pte.pte >> 32); + else + ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_val, + pte.pte); + + return ret; +} + +static inline pteval_t pte_flags(pte_t pte) +{ + pteval_t ret; + + if (sizeof(pteval_t) > sizeof(long)) + ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_flags, + pte.pte, (u64)pte.pte >> 32); else - ret = PVOP_CALLEE1(pteval_t, pv_mmu_ops.pte_val, - pte.pte); + ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_flags, + pte.pte); +#ifdef CONFIG_PARAVIRT_DEBUG + BUG_ON(ret & PTE_PFN_MASK); +#endif return ret; } @@ -1185,11 +1108,11 @@ static inline pgd_t __pgd(pgdval_t val) pgdval_t ret; if (sizeof(pgdval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.make_pgd, - val, (u64)val >> 32); + ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.make_pgd, + val, (u64)val >> 32); else - ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.make_pgd, - val); + ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.make_pgd, + val); return (pgd_t) { ret }; } @@ -1199,11 +1122,11 @@ static inline pgdval_t pgd_val(pgd_t pgd) pgdval_t ret; if (sizeof(pgdval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.pgd_val, - pgd.pgd, (u64)pgd.pgd >> 32); + ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.pgd_val, + pgd.pgd, (u64)pgd.pgd >> 32); else - ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.pgd_val, - pgd.pgd); + ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.pgd_val, + pgd.pgd); return ret; } @@ -1267,11 +1190,11 @@ static inline pmd_t __pmd(pmdval_t val) pmdval_t ret; if (sizeof(pmdval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.make_pmd, - val, (u64)val >> 32); + ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.make_pmd, + val, (u64)val >> 32); else - ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.make_pmd, - val); + ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.make_pmd, + val); return (pmd_t) { ret }; } @@ -1281,11 +1204,11 @@ static inline pmdval_t pmd_val(pmd_t pmd) pmdval_t ret; if (sizeof(pmdval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.pmd_val, - pmd.pmd, (u64)pmd.pmd >> 32); + ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.pmd_val, + pmd.pmd, (u64)pmd.pmd >> 32); else - ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.pmd_val, - pmd.pmd); + ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.pmd_val, + pmd.pmd); return ret; } @@ -1307,11 +1230,11 @@ static inline pud_t __pud(pudval_t val) pudval_t ret; if (sizeof(pudval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.make_pud, - val, (u64)val >> 32); + ret = PVOP_CALL2(pudval_t, pv_mmu_ops.make_pud, + val, (u64)val >> 32); else - ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.make_pud, - val); + ret = PVOP_CALL1(pudval_t, pv_mmu_ops.make_pud, + val); return (pud_t) { ret }; } @@ -1321,11 +1244,11 @@ static inline pudval_t pud_val(pud_t pud) pudval_t ret; if (sizeof(pudval_t) > sizeof(long)) - ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.pud_val, - pud.pud, (u64)pud.pud >> 32); + ret = PVOP_CALL2(pudval_t, pv_mmu_ops.pud_val, + pud.pud, (u64)pud.pud >> 32); else - ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.pud_val, - pud.pud); + ret = PVOP_CALL1(pudval_t, pv_mmu_ops.pud_val, + pud.pud); return ret; } @@ -1466,9 +1389,6 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, } void _paravirt_nop(void); -u32 _paravirt_ident_32(u32); -u64 _paravirt_ident_64(u64); - #define paravirt_nop ((void *)_paravirt_nop) void paravirt_use_bytelocks(void); @@ -1520,37 +1440,12 @@ extern struct paravirt_patch_site __parainstructions[], __parainstructions_end[]; #ifdef CONFIG_X86_32 -#define PV_SAVE_REGS "pushl %ecx; pushl %edx;" -#define PV_RESTORE_REGS "popl %edx; popl %ecx;" - -/* save and restore all caller-save registers, except return value */ -#define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;" -#define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;" - +#define PV_SAVE_REGS "pushl %%ecx; pushl %%edx;" +#define PV_RESTORE_REGS "popl %%edx; popl %%ecx" #define PV_FLAGS_ARG "0" #define PV_EXTRA_CLOBBERS #define PV_VEXTRA_CLOBBERS #else -/* save and restore all caller-save registers, except return value */ -#define PV_SAVE_ALL_CALLER_REGS \ - "push %rcx;" \ - "push %rdx;" \ - "push %rsi;" \ - "push %rdi;" \ - "push %r8;" \ - "push %r9;" \ - "push %r10;" \ - "push %r11;" -#define PV_RESTORE_ALL_CALLER_REGS \ - "pop %r11;" \ - "pop %r10;" \ - "pop %r9;" \ - "pop %r8;" \ - "pop %rdi;" \ - "pop %rsi;" \ - "pop %rdx;" \ - "pop %rcx;" - /* We save some registers, but all of them, that's too much. We clobber all * caller saved registers but the argument parameter */ #define PV_SAVE_REGS "pushq %%rdi;" @@ -1560,76 +1455,52 @@ extern struct paravirt_patch_site __parainstructions[], #define PV_FLAGS_ARG "D" #endif -/* - * Generate a thunk around a function which saves all caller-save - * registers except for the return value. This allows C functions to - * be called from assembler code where fewer than normal registers are - * available. It may also help code generation around calls from C - * code if the common case doesn't use many registers. - * - * When a callee is wrapped in a thunk, the caller can assume that all - * arg regs and all scratch registers are preserved across the - * call. The return value in rax/eax will not be saved, even for void - * functions. - */ -#define PV_CALLEE_SAVE_REGS_THUNK(func) \ - extern typeof(func) __raw_callee_save_##func; \ - static void *__##func##__ __used = func; \ - \ - asm(".pushsection .text;" \ - "__raw_callee_save_" #func ": " \ - PV_SAVE_ALL_CALLER_REGS \ - "call " #func ";" \ - PV_RESTORE_ALL_CALLER_REGS \ - "ret;" \ - ".popsection") - -/* Get a reference to a callee-save function */ -#define PV_CALLEE_SAVE(func) \ - ((struct paravirt_callee_save) { __raw_callee_save_##func }) - -/* Promise that "func" already uses the right calling convention */ -#define __PV_IS_CALLEE_SAVE(func) \ - ((struct paravirt_callee_save) { func }) - static inline unsigned long __raw_local_save_flags(void) { unsigned long f; - asm volatile(paravirt_alt(PARAVIRT_CALL) + asm volatile(paravirt_alt(PV_SAVE_REGS + PARAVIRT_CALL + PV_RESTORE_REGS) : "=a"(f) : paravirt_type(pv_irq_ops.save_fl), paravirt_clobber(CLBR_EAX) - : "memory", "cc"); + : "memory", "cc" PV_VEXTRA_CLOBBERS); return f; } static inline void raw_local_irq_restore(unsigned long f) { - asm volatile(paravirt_alt(PARAVIRT_CALL) + asm volatile(paravirt_alt(PV_SAVE_REGS + PARAVIRT_CALL + PV_RESTORE_REGS) : "=a"(f) : PV_FLAGS_ARG(f), paravirt_type(pv_irq_ops.restore_fl), paravirt_clobber(CLBR_EAX) - : "memory", "cc"); + : "memory", "cc" PV_EXTRA_CLOBBERS); } static inline void raw_local_irq_disable(void) { - asm volatile(paravirt_alt(PARAVIRT_CALL) + asm volatile(paravirt_alt(PV_SAVE_REGS + PARAVIRT_CALL + PV_RESTORE_REGS) : : paravirt_type(pv_irq_ops.irq_disable), paravirt_clobber(CLBR_EAX) - : "memory", "eax", "cc"); + : "memory", "eax", "cc" PV_EXTRA_CLOBBERS); } static inline void raw_local_irq_enable(void) { - asm volatile(paravirt_alt(PARAVIRT_CALL) + asm volatile(paravirt_alt(PV_SAVE_REGS + PARAVIRT_CALL + PV_RESTORE_REGS) : : paravirt_type(pv_irq_ops.irq_enable), paravirt_clobber(CLBR_EAX) - : "memory", "eax", "cc"); + : "memory", "eax", "cc" PV_EXTRA_CLOBBERS); } static inline unsigned long __raw_local_irq_save(void) @@ -1672,49 +1543,33 @@ static inline unsigned long __raw_local_irq_save(void) .popsection -#define COND_PUSH(set, mask, reg) \ - .if ((~(set)) & mask); push %reg; .endif -#define COND_POP(set, mask, reg) \ - .if ((~(set)) & mask); pop %reg; .endif - #ifdef CONFIG_X86_64 - -#define PV_SAVE_REGS(set) \ - COND_PUSH(set, CLBR_RAX, rax); \ - COND_PUSH(set, CLBR_RCX, rcx); \ - COND_PUSH(set, CLBR_RDX, rdx); \ - COND_PUSH(set, CLBR_RSI, rsi); \ - COND_PUSH(set, CLBR_RDI, rdi); \ - COND_PUSH(set, CLBR_R8, r8); \ - COND_PUSH(set, CLBR_R9, r9); \ - COND_PUSH(set, CLBR_R10, r10); \ - COND_PUSH(set, CLBR_R11, r11) -#define PV_RESTORE_REGS(set) \ - COND_POP(set, CLBR_R11, r11); \ - COND_POP(set, CLBR_R10, r10); \ - COND_POP(set, CLBR_R9, r9); \ - COND_POP(set, CLBR_R8, r8); \ - COND_POP(set, CLBR_RDI, rdi); \ - COND_POP(set, CLBR_RSI, rsi); \ - COND_POP(set, CLBR_RDX, rdx); \ - COND_POP(set, CLBR_RCX, rcx); \ - COND_POP(set, CLBR_RAX, rax) - +#define PV_SAVE_REGS \ + push %rax; \ + push %rcx; \ + push %rdx; \ + push %rsi; \ + push %rdi; \ + push %r8; \ + push %r9; \ + push %r10; \ + push %r11 +#define PV_RESTORE_REGS \ + pop %r11; \ + pop %r10; \ + pop %r9; \ + pop %r8; \ + pop %rdi; \ + pop %rsi; \ + pop %rdx; \ + pop %rcx; \ + pop %rax #define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8) #define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8) #define PARA_INDIRECT(addr) *addr(%rip) #else -#define PV_SAVE_REGS(set) \ - COND_PUSH(set, CLBR_EAX, eax); \ - COND_PUSH(set, CLBR_EDI, edi); \ - COND_PUSH(set, CLBR_ECX, ecx); \ - COND_PUSH(set, CLBR_EDX, edx) -#define PV_RESTORE_REGS(set) \ - COND_POP(set, CLBR_EDX, edx); \ - COND_POP(set, CLBR_ECX, ecx); \ - COND_POP(set, CLBR_EDI, edi); \ - COND_POP(set, CLBR_EAX, eax) - +#define PV_SAVE_REGS pushl %eax; pushl %edi; pushl %ecx; pushl %edx +#define PV_RESTORE_REGS popl %edx; popl %ecx; popl %edi; popl %eax #define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4) #define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4) #define PARA_INDIRECT(addr) *%cs:addr @@ -1726,15 +1581,15 @@ static inline unsigned long __raw_local_irq_save(void) #define DISABLE_INTERRUPTS(clobbers) \ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \ - PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ + PV_SAVE_REGS; \ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \ - PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) + PV_RESTORE_REGS;) \ #define ENABLE_INTERRUPTS(clobbers) \ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \ - PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ + PV_SAVE_REGS; \ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \ - PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) + PV_RESTORE_REGS;) #define USERGS_SYSRET32 \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \ @@ -1764,15 +1619,11 @@ static inline unsigned long __raw_local_irq_save(void) PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ swapgs) -/* - * Note: swapgs is very special, and in practise is either going to be - * implemented with a single "swapgs" instruction or something very - * special. Either way, we don't need to save any registers for - * it. - */ #define SWAPGS \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ - call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs) \ + PV_SAVE_REGS; \ + call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs); \ + PV_RESTORE_REGS \ ) #define GET_CR2_INTO_RCX \ diff --git a/trunk/arch/x86/include/asm/pgtable.h b/trunk/arch/x86/include/asm/pgtable.h index 6ceaef08486f..06bbcbd66e9c 100644 --- a/trunk/arch/x86/include/asm/pgtable.h +++ b/trunk/arch/x86/include/asm/pgtable.h @@ -240,78 +240,64 @@ static inline int pmd_large(pmd_t pte) (_PAGE_PSE | _PAGE_PRESENT); } -static inline pte_t pte_set_flags(pte_t pte, pteval_t set) -{ - pteval_t v = native_pte_val(pte); - - return native_make_pte(v | set); -} - -static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) -{ - pteval_t v = native_pte_val(pte); - - return native_make_pte(v & ~clear); -} - static inline pte_t pte_mkclean(pte_t pte) { - return pte_clear_flags(pte, _PAGE_DIRTY); + return __pte(pte_val(pte) & ~_PAGE_DIRTY); } static inline pte_t pte_mkold(pte_t pte) { - return pte_clear_flags(pte, _PAGE_ACCESSED); + return __pte(pte_val(pte) & ~_PAGE_ACCESSED); } static inline pte_t pte_wrprotect(pte_t pte) { - return pte_clear_flags(pte, _PAGE_RW); + return __pte(pte_val(pte) & ~_PAGE_RW); } static inline pte_t pte_mkexec(pte_t pte) { - return pte_clear_flags(pte, _PAGE_NX); + return __pte(pte_val(pte) & ~_PAGE_NX); } static inline pte_t pte_mkdirty(pte_t pte) { - return pte_set_flags(pte, _PAGE_DIRTY); + return __pte(pte_val(pte) | _PAGE_DIRTY); } static inline pte_t pte_mkyoung(pte_t pte) { - return pte_set_flags(pte, _PAGE_ACCESSED); + return __pte(pte_val(pte) | _PAGE_ACCESSED); } static inline pte_t pte_mkwrite(pte_t pte) { - return pte_set_flags(pte, _PAGE_RW); + return __pte(pte_val(pte) | _PAGE_RW); } static inline pte_t pte_mkhuge(pte_t pte) { - return pte_set_flags(pte, _PAGE_PSE); + return __pte(pte_val(pte) | _PAGE_PSE); } static inline pte_t pte_clrhuge(pte_t pte) { - return pte_clear_flags(pte, _PAGE_PSE); + return __pte(pte_val(pte) & ~_PAGE_PSE); } static inline pte_t pte_mkglobal(pte_t pte) { - return pte_set_flags(pte, _PAGE_GLOBAL); + return __pte(pte_val(pte) | _PAGE_GLOBAL); } static inline pte_t pte_clrglobal(pte_t pte) { - return pte_clear_flags(pte, _PAGE_GLOBAL); + return __pte(pte_val(pte) & ~_PAGE_GLOBAL); } static inline pte_t pte_mkspecial(pte_t pte) { - return pte_set_flags(pte, _PAGE_SPECIAL); + return __pte(pte_val(pte) | _PAGE_SPECIAL); } extern pteval_t __supported_pte_mask; diff --git a/trunk/arch/x86/kernel/entry_64.S b/trunk/arch/x86/kernel/entry_64.S index e4c9710cae52..a52703864a16 100644 --- a/trunk/arch/x86/kernel/entry_64.S +++ b/trunk/arch/x86/kernel/entry_64.S @@ -1140,7 +1140,7 @@ ENTRY(native_load_gs_index) CFI_STARTPROC pushf CFI_ADJUST_CFA_OFFSET 8 - DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) + DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) SWAPGS gs_change: movl %edi,%gs diff --git a/trunk/arch/x86/kernel/paravirt.c b/trunk/arch/x86/kernel/paravirt.c index cea11c8e3049..e4c8fb608873 100644 --- a/trunk/arch/x86/kernel/paravirt.c +++ b/trunk/arch/x86/kernel/paravirt.c @@ -44,17 +44,6 @@ void _paravirt_nop(void) { } -/* identity function, which can be inlined */ -u32 _paravirt_ident_32(u32 x) -{ - return x; -} - -u64 _paravirt_ident_64(u64 x) -{ - return x; -} - static void __init default_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", @@ -149,16 +138,9 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, if (opfunc == NULL) /* If there's no function, patch it with a ud2a (BUG) */ ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a)); - else if (opfunc == _paravirt_nop) + else if (opfunc == paravirt_nop) /* If the operation is a nop, then nop the callsite */ ret = paravirt_patch_nop(); - - /* identity functions just return their single argument */ - else if (opfunc == _paravirt_ident_32) - ret = paravirt_patch_ident_32(insnbuf, len); - else if (opfunc == _paravirt_ident_64) - ret = paravirt_patch_ident_64(insnbuf, len); - else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || @@ -310,10 +292,10 @@ struct pv_time_ops pv_time_ops = { struct pv_irq_ops pv_irq_ops = { .init_IRQ = native_init_IRQ, - .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), - .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), - .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), - .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), + .save_fl = native_save_fl, + .restore_fl = native_restore_fl, + .irq_disable = native_irq_disable, + .irq_enable = native_irq_enable, .safe_halt = native_safe_halt, .halt = native_halt, #ifdef CONFIG_X86_64 @@ -391,14 +373,6 @@ struct pv_apic_ops pv_apic_ops = { #endif }; -#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE) -/* 32-bit pagetable entries */ -#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32) -#else -/* 64-bit pagetable entries */ -#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) -#endif - struct pv_mmu_ops pv_mmu_ops = { #ifndef CONFIG_X86_64 .pagetable_setup_start = native_pagetable_setup_start, @@ -450,23 +424,22 @@ struct pv_mmu_ops pv_mmu_ops = { .pmd_clear = native_pmd_clear, #endif .set_pud = native_set_pud, - - .pmd_val = PTE_IDENT, - .make_pmd = PTE_IDENT, + .pmd_val = native_pmd_val, + .make_pmd = native_make_pmd, #if PAGETABLE_LEVELS == 4 - .pud_val = PTE_IDENT, - .make_pud = PTE_IDENT, - + .pud_val = native_pud_val, + .make_pud = native_make_pud, .set_pgd = native_set_pgd, #endif #endif /* PAGETABLE_LEVELS >= 3 */ - .pte_val = PTE_IDENT, - .pgd_val = PTE_IDENT, + .pte_val = native_pte_val, + .pte_flags = native_pte_flags, + .pgd_val = native_pgd_val, - .make_pte = PTE_IDENT, - .make_pgd = PTE_IDENT, + .make_pte = native_make_pte, + .make_pgd = native_make_pgd, .dup_mmap = paravirt_nop, .exit_mmap = paravirt_nop, diff --git a/trunk/arch/x86/kernel/paravirt_patch_32.c b/trunk/arch/x86/kernel/paravirt_patch_32.c index d9f32e6d6ab6..9fe644f4861d 100644 --- a/trunk/arch/x86/kernel/paravirt_patch_32.c +++ b/trunk/arch/x86/kernel/paravirt_patch_32.c @@ -12,18 +12,6 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); -unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) -{ - /* arg in %eax, return in %eax */ - return 0; -} - -unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) -{ - /* arg in %edx:%eax, return in %edx:%eax */ - return 0; -} - unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { diff --git a/trunk/arch/x86/kernel/paravirt_patch_64.c b/trunk/arch/x86/kernel/paravirt_patch_64.c index 3f08f34f93eb..061d01df9ae6 100644 --- a/trunk/arch/x86/kernel/paravirt_patch_64.c +++ b/trunk/arch/x86/kernel/paravirt_patch_64.c @@ -19,21 +19,6 @@ DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl"); DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); -DEF_NATIVE(, mov32, "mov %edi, %eax"); -DEF_NATIVE(, mov64, "mov %rdi, %rax"); - -unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) -{ - return paravirt_patch_insns(insnbuf, len, - start__mov32, end__mov32); -} - -unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) -{ - return paravirt_patch_insns(insnbuf, len, - start__mov64, end__mov64); -} - unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { diff --git a/trunk/arch/x86/kernel/vmi_32.c b/trunk/arch/x86/kernel/vmi_32.c index eb9e7347928e..1d3302cc2ddf 100644 --- a/trunk/arch/x86/kernel/vmi_32.c +++ b/trunk/arch/x86/kernel/vmi_32.c @@ -670,11 +670,10 @@ static inline int __init activate_vmi(void) para_fill(pv_mmu_ops.write_cr2, SetCR2); para_fill(pv_mmu_ops.write_cr3, SetCR3); para_fill(pv_cpu_ops.write_cr4, SetCR4); - - para_fill(pv_irq_ops.save_fl.func, GetInterruptMask); - para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask); - para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts); - para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts); + para_fill(pv_irq_ops.save_fl, GetInterruptMask); + para_fill(pv_irq_ops.restore_fl, SetInterruptMask); + para_fill(pv_irq_ops.irq_disable, DisableInterrupts); + para_fill(pv_irq_ops.irq_enable, EnableInterrupts); para_fill(pv_cpu_ops.wbinvd, WBINVD); para_fill(pv_cpu_ops.read_tsc, RDTSC); diff --git a/trunk/arch/x86/kernel/vsmp_64.c b/trunk/arch/x86/kernel/vsmp_64.c index c609205df594..a688f3bfaec2 100644 --- a/trunk/arch/x86/kernel/vsmp_64.c +++ b/trunk/arch/x86/kernel/vsmp_64.c @@ -37,7 +37,6 @@ static unsigned long vsmp_save_fl(void) flags &= ~X86_EFLAGS_IF; return flags; } -PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl); static void vsmp_restore_fl(unsigned long flags) { @@ -47,7 +46,6 @@ static void vsmp_restore_fl(unsigned long flags) flags |= X86_EFLAGS_AC; native_restore_fl(flags); } -PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl); static void vsmp_irq_disable(void) { @@ -55,7 +53,6 @@ static void vsmp_irq_disable(void) native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC); } -PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable); static void vsmp_irq_enable(void) { @@ -63,7 +60,6 @@ static void vsmp_irq_enable(void) native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); } -PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable); static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) @@ -94,10 +90,10 @@ static void __init set_vsmp_pv_ops(void) cap, ctl); if (cap & ctl & (1 << 4)) { /* Setup irq ops and turn on vSMP IRQ fastpath handling */ - pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable); - pv_irq_ops.irq_enable = PV_CALLEE_SAVE(vsmp_irq_enable); - pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl); - pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl); + pv_irq_ops.irq_disable = vsmp_irq_disable; + pv_irq_ops.irq_enable = vsmp_irq_enable; + pv_irq_ops.save_fl = vsmp_save_fl; + pv_irq_ops.restore_fl = vsmp_restore_fl; pv_init_ops.patch = vsmp_patch; ctl &= ~(1 << 4); diff --git a/trunk/arch/x86/lguest/boot.c b/trunk/arch/x86/lguest/boot.c index 19e33b6cd593..92f1c6f3e19d 100644 --- a/trunk/arch/x86/lguest/boot.c +++ b/trunk/arch/x86/lguest/boot.c @@ -173,29 +173,24 @@ static unsigned long save_fl(void) { return lguest_data.irq_enabled; } -PV_CALLEE_SAVE_REGS_THUNK(save_fl); /* restore_flags() just sets the flags back to the value given. */ static void restore_fl(unsigned long flags) { lguest_data.irq_enabled = flags; } -PV_CALLEE_SAVE_REGS_THUNK(restore_fl); /* Interrupts go off... */ static void irq_disable(void) { lguest_data.irq_enabled = 0; } -PV_CALLEE_SAVE_REGS_THUNK(irq_disable); /* Interrupts go on... */ static void irq_enable(void) { lguest_data.irq_enabled = X86_EFLAGS_IF; } -PV_CALLEE_SAVE_REGS_THUNK(irq_enable); - /*:*/ /*M:003 Note that we don't check for outstanding interrupts when we re-enable * them (or when we unmask an interrupt). This seems to work for the moment, @@ -989,10 +984,10 @@ __init void lguest_init(void) /* interrupt-related operations */ pv_irq_ops.init_IRQ = lguest_init_IRQ; - pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); - pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl); - pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); - pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable); + pv_irq_ops.save_fl = save_fl; + pv_irq_ops.restore_fl = restore_fl; + pv_irq_ops.irq_disable = irq_disable; + pv_irq_ops.irq_enable = irq_enable; pv_irq_ops.safe_halt = lguest_safe_halt; /* init-time operations */ diff --git a/trunk/arch/x86/xen/enlighten.c b/trunk/arch/x86/xen/enlighten.c index ff6d530ccc77..bef941f61451 100644 --- a/trunk/arch/x86/xen/enlighten.c +++ b/trunk/arch/x86/xen/enlighten.c @@ -61,6 +61,35 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); enum xen_domain_type xen_domain_type = XEN_NATIVE; EXPORT_SYMBOL_GPL(xen_domain_type); +/* + * Identity map, in addition to plain kernel map. This needs to be + * large enough to allocate page table pages to allocate the rest. + * Each page can map 2MB. + */ +static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; + +#ifdef CONFIG_X86_64 +/* l3 pud for userspace vsyscall mapping */ +static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; +#endif /* CONFIG_X86_64 */ + +/* + * Note about cr3 (pagetable base) values: + * + * xen_cr3 contains the current logical cr3 value; it contains the + * last set cr3. This may not be the current effective cr3, because + * its update may be being lazily deferred. However, a vcpu looking + * at its own cr3 can use this value knowing that it everything will + * be self-consistent. + * + * xen_current_cr3 contains the actual vcpu cr3; it is set once the + * hypercall to set the vcpu cr3 is complete (so it may be a little + * out of date, but it will never be set early). If one vcpu is + * looking at another vcpu's cr3 value, it should use this variable. + */ +DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ +DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ + struct start_info *xen_start_info; EXPORT_SYMBOL_GPL(xen_start_info); @@ -208,7 +237,7 @@ static unsigned long xen_get_debugreg(int reg) return HYPERVISOR_get_debugreg(reg); } -void xen_leave_lazy(void) +static void xen_leave_lazy(void) { paravirt_leave_lazy(paravirt_get_lazy_mode()); xen_mc_flush(); @@ -569,6 +598,76 @@ static struct apic_ops xen_basic_apic_ops = { #endif +static void xen_flush_tlb(void) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_TLB_FLUSH_LOCAL; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +static void xen_flush_tlb_single(unsigned long addr) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*op)); + op = mcs.args; + op->cmd = MMUEXT_INVLPG_LOCAL; + op->arg1.linear_addr = addr & PAGE_MASK; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +static void xen_flush_tlb_others(const struct cpumask *cpus, + struct mm_struct *mm, unsigned long va) +{ + struct { + struct mmuext_op op; + DECLARE_BITMAP(mask, NR_CPUS); + } *args; + struct multicall_space mcs; + + BUG_ON(cpumask_empty(cpus)); + BUG_ON(!mm); + + mcs = xen_mc_entry(sizeof(*args)); + args = mcs.args; + args->op.arg2.vcpumask = to_cpumask(args->mask); + + /* Remove us, and any offline CPUS. */ + cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); + if (unlikely(cpumask_empty(to_cpumask(args->mask)))) + goto issue; + + if (va == TLB_FLUSH_ALL) { + args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; + } else { + args->op.cmd = MMUEXT_INVLPG_MULTI; + args->op.arg1.linear_addr = va; + } + + MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); + +issue: + xen_mc_issue(PARAVIRT_LAZY_MMU); +} static void xen_clts(void) { @@ -594,6 +693,21 @@ static void xen_write_cr0(unsigned long cr0) xen_mc_issue(PARAVIRT_LAZY_CPU); } +static void xen_write_cr2(unsigned long cr2) +{ + percpu_read(xen_vcpu)->arch.cr2 = cr2; +} + +static unsigned long xen_read_cr2(void) +{ + return percpu_read(xen_vcpu)->arch.cr2; +} + +static unsigned long xen_read_cr2_direct(void) +{ + return percpu_read(xen_vcpu_info.arch.cr2); +} + static void xen_write_cr4(unsigned long cr4) { cr4 &= ~X86_CR4_PGE; @@ -602,6 +716,71 @@ static void xen_write_cr4(unsigned long cr4) native_write_cr4(cr4); } +static unsigned long xen_read_cr3(void) +{ + return percpu_read(xen_cr3); +} + +static void set_current_cr3(void *v) +{ + percpu_write(xen_current_cr3, (unsigned long)v); +} + +static void __xen_write_cr3(bool kernel, unsigned long cr3) +{ + struct mmuext_op *op; + struct multicall_space mcs; + unsigned long mfn; + + if (cr3) + mfn = pfn_to_mfn(PFN_DOWN(cr3)); + else + mfn = 0; + + WARN_ON(mfn == 0 && kernel); + + mcs = __xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; + op->arg1.mfn = mfn; + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + if (kernel) { + percpu_write(xen_cr3, cr3); + + /* Update xen_current_cr3 once the batch has actually + been submitted. */ + xen_mc_callback(set_current_cr3, (void *)cr3); + } +} + +static void xen_write_cr3(unsigned long cr3) +{ + BUG_ON(preemptible()); + + xen_mc_batch(); /* disables interrupts */ + + /* Update while interrupts are disabled, so its atomic with + respect to ipis */ + percpu_write(xen_cr3, cr3); + + __xen_write_cr3(true, cr3); + +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); + if (user_pgd) + __xen_write_cr3(false, __pa(user_pgd)); + else + __xen_write_cr3(false, 0); + } +#endif + + xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ +} + static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) { int ret; @@ -643,6 +822,185 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) return ret; } +/* Early in boot, while setting up the initial pagetable, assume + everything is pinned. */ +static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) +{ +#ifdef CONFIG_FLATMEM + BUG_ON(mem_map); /* should only be used early */ +#endif + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); +} + +/* Early release_pte assumes that all pts are pinned, since there's + only init_mm and anything attached to that is pinned. */ +static void xen_release_pte_init(unsigned long pfn) +{ + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); +} + +static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +{ + struct mmuext_op op; + op.cmd = cmd; + op.arg1.mfn = pfn_to_mfn(pfn); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + +/* This needs to make sure the new pte page is pinned iff its being + attached to a pinned pagetable. */ +static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) +{ + struct page *page = pfn_to_page(pfn); + + if (PagePinned(virt_to_page(mm->pgd))) { + SetPagePinned(page); + + vm_unmap_aliases(); + if (!PageHighMem(page)) { + make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); + if (level == PT_PTE && USE_SPLIT_PTLOCKS) + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); + } else { + /* make sure there are no stray mappings of + this page */ + kmap_flush_unused(); + } + } +} + +static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PTE); +} + +static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PMD); +} + +static int xen_pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = mm->pgd; + int ret = 0; + + BUG_ON(PagePinned(virt_to_page(pgd))); + +#ifdef CONFIG_X86_64 + { + struct page *page = virt_to_page(pgd); + pgd_t *user_pgd; + + BUG_ON(page->private != 0); + + ret = -ENOMEM; + + user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + page->private = (unsigned long)user_pgd; + + if (user_pgd != NULL) { + user_pgd[pgd_index(VSYSCALL_START)] = + __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); + ret = 0; + } + + BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); + } +#endif + + return ret; +} + +static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ +#ifdef CONFIG_X86_64 + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + if (user_pgd) + free_page((unsigned long)user_pgd); +#endif +} + +/* This should never happen until we're OK to use struct page */ +static void xen_release_ptpage(unsigned long pfn, unsigned level) +{ + struct page *page = pfn_to_page(pfn); + + if (PagePinned(page)) { + if (!PageHighMem(page)) { + if (level == PT_PTE && USE_SPLIT_PTLOCKS) + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + } + ClearPagePinned(page); + } +} + +static void xen_release_pte(unsigned long pfn) +{ + xen_release_ptpage(pfn, PT_PTE); +} + +static void xen_release_pmd(unsigned long pfn) +{ + xen_release_ptpage(pfn, PT_PMD); +} + +#if PAGETABLE_LEVELS == 4 +static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PUD); +} + +static void xen_release_pud(unsigned long pfn) +{ + xen_release_ptpage(pfn, PT_PUD); +} +#endif + +#ifdef CONFIG_HIGHPTE +static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) +{ + pgprot_t prot = PAGE_KERNEL; + + if (PagePinned(page)) + prot = PAGE_KERNEL_RO; + + if (0 && PageHighMem(page)) + printk("mapping highpte %lx type %d prot %s\n", + page_to_pfn(page), type, + (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ"); + + return kmap_atomic_prot(page, type, prot); +} +#endif + +#ifdef CONFIG_X86_32 +static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) +{ + /* If there's an existing pte, then don't allow _PAGE_RW to be set */ + if (pte_val_ma(*ptep) & _PAGE_PRESENT) + pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & + pte_val_ma(pte)); + + return pte; +} + +/* Init-time set_pte while constructing initial pagetables, which + doesn't allow RO pagetable pages to be remapped RW */ +static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) +{ + pte = mask_rw_pte(ptep, pte); + + xen_set_pte(ptep, pte); +} +#endif + +static __init void xen_pagetable_setup_start(pgd_t *base) +{ +} + void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { @@ -663,6 +1021,37 @@ void xen_setup_shared_info(void) xen_setup_mfn_list_list(); } +static __init void xen_pagetable_setup_done(pgd_t *base) +{ + xen_setup_shared_info(); +} + +static __init void xen_post_allocator_init(void) +{ + pv_mmu_ops.set_pte = xen_set_pte; + pv_mmu_ops.set_pmd = xen_set_pmd; + pv_mmu_ops.set_pud = xen_set_pud; +#if PAGETABLE_LEVELS == 4 + pv_mmu_ops.set_pgd = xen_set_pgd; +#endif + + /* This will work as long as patching hasn't happened yet + (which it hasn't) */ + pv_mmu_ops.alloc_pte = xen_alloc_pte; + pv_mmu_ops.alloc_pmd = xen_alloc_pmd; + pv_mmu_ops.release_pte = xen_release_pte; + pv_mmu_ops.release_pmd = xen_release_pmd; +#if PAGETABLE_LEVELS == 4 + pv_mmu_ops.alloc_pud = xen_alloc_pud; + pv_mmu_ops.release_pud = xen_release_pud; +#endif + +#ifdef CONFIG_X86_64 + SetPagePinned(virt_to_page(level3_user_vsyscall)); +#endif + xen_mark_init_mm_pinned(); +} + /* This is called once we have the cpu_possible_map */ void xen_setup_vcpu_info_placement(void) { @@ -676,10 +1065,10 @@ void xen_setup_vcpu_info_placement(void) if (have_vcpu_info_placement) { printk(KERN_INFO "Xen: using vcpu_info placement\n"); - pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); - pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); - pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); - pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct); + pv_irq_ops.save_fl = xen_save_fl_direct; + pv_irq_ops.restore_fl = xen_restore_fl_direct; + pv_irq_ops.irq_disable = xen_irq_disable_direct; + pv_irq_ops.irq_enable = xen_irq_enable_direct; pv_mmu_ops.read_cr2 = xen_read_cr2_direct; } } @@ -737,6 +1126,49 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, return ret; } +static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) +{ + pte_t pte; + + phys >>= PAGE_SHIFT; + + switch (idx) { + case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: +#ifdef CONFIG_X86_F00F_BUG + case FIX_F00F_IDT: +#endif +#ifdef CONFIG_X86_32 + case FIX_WP_TEST: + case FIX_VDSO: +# ifdef CONFIG_HIGHMEM + case FIX_KMAP_BEGIN ... FIX_KMAP_END: +# endif +#else + case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: +#endif +#ifdef CONFIG_X86_LOCAL_APIC + case FIX_APIC_BASE: /* maps dummy local APIC */ +#endif + pte = pfn_pte(phys, prot); + break; + + default: + pte = mfn_pte(phys, prot); + break; + } + + __native_set_fixmap(idx, pte); + +#ifdef CONFIG_X86_64 + /* Replicate changes to map the vsyscall page into the user + pagetable vsyscall mapping. */ + if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) { + unsigned long vaddr = __fix_to_virt(idx); + set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); + } +#endif +} + static const struct pv_info xen_info __initdata = { .paravirt_enabled = 1, .shared_kernel_pmd = 0, @@ -832,6 +1264,87 @@ static const struct pv_apic_ops xen_apic_ops __initdata = { #endif }; +static const struct pv_mmu_ops xen_mmu_ops __initdata = { + .pagetable_setup_start = xen_pagetable_setup_start, + .pagetable_setup_done = xen_pagetable_setup_done, + + .read_cr2 = xen_read_cr2, + .write_cr2 = xen_write_cr2, + + .read_cr3 = xen_read_cr3, + .write_cr3 = xen_write_cr3, + + .flush_tlb_user = xen_flush_tlb, + .flush_tlb_kernel = xen_flush_tlb, + .flush_tlb_single = xen_flush_tlb_single, + .flush_tlb_others = xen_flush_tlb_others, + + .pte_update = paravirt_nop, + .pte_update_defer = paravirt_nop, + + .pgd_alloc = xen_pgd_alloc, + .pgd_free = xen_pgd_free, + + .alloc_pte = xen_alloc_pte_init, + .release_pte = xen_release_pte_init, + .alloc_pmd = xen_alloc_pte_init, + .alloc_pmd_clone = paravirt_nop, + .release_pmd = xen_release_pte_init, + +#ifdef CONFIG_HIGHPTE + .kmap_atomic_pte = xen_kmap_atomic_pte, +#endif + +#ifdef CONFIG_X86_64 + .set_pte = xen_set_pte, +#else + .set_pte = xen_set_pte_init, +#endif + .set_pte_at = xen_set_pte_at, + .set_pmd = xen_set_pmd_hyper, + + .ptep_modify_prot_start = __ptep_modify_prot_start, + .ptep_modify_prot_commit = __ptep_modify_prot_commit, + + .pte_val = xen_pte_val, + .pte_flags = native_pte_flags, + .pgd_val = xen_pgd_val, + + .make_pte = xen_make_pte, + .make_pgd = xen_make_pgd, + +#ifdef CONFIG_X86_PAE + .set_pte_atomic = xen_set_pte_atomic, + .set_pte_present = xen_set_pte_at, + .pte_clear = xen_pte_clear, + .pmd_clear = xen_pmd_clear, +#endif /* CONFIG_X86_PAE */ + .set_pud = xen_set_pud_hyper, + + .make_pmd = xen_make_pmd, + .pmd_val = xen_pmd_val, + +#if PAGETABLE_LEVELS == 4 + .pud_val = xen_pud_val, + .make_pud = xen_make_pud, + .set_pgd = xen_set_pgd_hyper, + + .alloc_pud = xen_alloc_pte_init, + .release_pud = xen_release_pte_init, +#endif /* PAGETABLE_LEVELS == 4 */ + + .activate_mm = xen_activate_mm, + .dup_mmap = xen_dup_mmap, + .exit_mmap = xen_exit_mmap, + + .lazy_mode = { + .enter = paravirt_enter_lazy_mmu, + .leave = xen_leave_lazy, + }, + + .set_fixmap = xen_set_fixmap, +}; + static void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; @@ -874,6 +1387,223 @@ static const struct machine_ops __initdata xen_machine_ops = { }; +static void __init xen_reserve_top(void) +{ +#ifdef CONFIG_X86_32 + unsigned long top = HYPERVISOR_VIRT_START; + struct xen_platform_parameters pp; + + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) + top = pp.virt_start; + + reserve_top_address(-top); +#endif /* CONFIG_X86_32 */ +} + +/* + * Like __va(), but returns address in the kernel mapping (which is + * all we have until the physical memory mapping has been set up. + */ +static void *__ka(phys_addr_t paddr) +{ +#ifdef CONFIG_X86_64 + return (void *)(paddr + __START_KERNEL_map); +#else + return __va(paddr); +#endif +} + +/* Convert a machine address to physical address */ +static unsigned long m2p(phys_addr_t maddr) +{ + phys_addr_t paddr; + + maddr &= PTE_PFN_MASK; + paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; + + return paddr; +} + +/* Convert a machine address to kernel virtual */ +static void *m2v(phys_addr_t maddr) +{ + return __ka(m2p(maddr)); +} + +static void set_page_prot(void *addr, pgprot_t prot) +{ + unsigned long pfn = __pa(addr) >> PAGE_SHIFT; + pte_t pte = pfn_pte(pfn, prot); + + if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) + BUG(); +} + +static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) +{ + unsigned pmdidx, pteidx; + unsigned ident_pte; + unsigned long pfn; + + ident_pte = 0; + pfn = 0; + for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { + pte_t *pte_page; + + /* Reuse or allocate a page of ptes */ + if (pmd_present(pmd[pmdidx])) + pte_page = m2v(pmd[pmdidx].pmd); + else { + /* Check for free pte pages */ + if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) + break; + + pte_page = &level1_ident_pgt[ident_pte]; + ident_pte += PTRS_PER_PTE; + + pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); + } + + /* Install mappings */ + for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { + pte_t pte; + + if (pfn > max_pfn_mapped) + max_pfn_mapped = pfn; + + if (!pte_none(pte_page[pteidx])) + continue; + + pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); + pte_page[pteidx] = pte; + } + } + + for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) + set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); + + set_page_prot(pmd, PAGE_KERNEL_RO); +} + +#ifdef CONFIG_X86_64 +static void convert_pfn_mfn(void *v) +{ + pte_t *pte = v; + int i; + + /* All levels are converted the same way, so just treat them + as ptes. */ + for (i = 0; i < PTRS_PER_PTE; i++) + pte[i] = xen_make_pte(pte[i].pte); +} + +/* + * Set up the inital kernel pagetable. + * + * We can construct this by grafting the Xen provided pagetable into + * head_64.S's preconstructed pagetables. We copy the Xen L2's into + * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This + * means that only the kernel has a physical mapping to start with - + * but that's enough to get __va working. We need to fill in the rest + * of the physical mapping once some sort of allocator has been set + * up. + */ +static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, + unsigned long max_pfn) +{ + pud_t *l3; + pmd_t *l2; + + /* Zap identity mapping */ + init_level4_pgt[0] = __pgd(0); + + /* Pre-constructed entries are in pfn, so convert to mfn */ + convert_pfn_mfn(init_level4_pgt); + convert_pfn_mfn(level3_ident_pgt); + convert_pfn_mfn(level3_kernel_pgt); + + l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); + l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); + + memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + + l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); + l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); + memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + + /* Set up identity map */ + xen_map_identity_early(level2_ident_pgt, max_pfn); + + /* Make pagetable pieces RO */ + set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); + + /* Pin down new L4 */ + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa_symbol(init_level4_pgt))); + + /* Unpin Xen-provided one */ + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + + /* Switch over */ + pgd = init_level4_pgt; + + /* + * At this stage there can be no user pgd, and no page + * structure to attach it to, so make sure we just set kernel + * pgd. + */ + xen_mc_batch(); + __xen_write_cr3(true, __pa(pgd)); + xen_mc_issue(PARAVIRT_LAZY_CPU); + + reserve_early(__pa(xen_start_info->pt_base), + __pa(xen_start_info->pt_base + + xen_start_info->nr_pt_frames * PAGE_SIZE), + "XEN PAGETABLES"); + + return pgd; +} +#else /* !CONFIG_X86_64 */ +static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; + +static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, + unsigned long max_pfn) +{ + pmd_t *kernel_pmd; + + init_pg_tables_start = __pa(pgd); + init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; + max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024); + + kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); + memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); + + xen_map_identity_early(level2_kernel_pgt, max_pfn); + + memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); + set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], + __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); + + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); + set_page_prot(empty_zero_page, PAGE_KERNEL_RO); + + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + + xen_write_cr3(__pa(swapper_pg_dir)); + + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); + + return swapper_pg_dir; +} +#endif /* CONFIG_X86_64 */ + /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { diff --git a/trunk/arch/x86/xen/irq.c b/trunk/arch/x86/xen/irq.c index 5a070900ad35..2e8271431e1a 100644 --- a/trunk/arch/x86/xen/irq.c +++ b/trunk/arch/x86/xen/irq.c @@ -50,7 +50,6 @@ static unsigned long xen_save_fl(void) */ return (-flags) & X86_EFLAGS_IF; } -PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl); static void xen_restore_fl(unsigned long flags) { @@ -77,7 +76,6 @@ static void xen_restore_fl(unsigned long flags) xen_force_evtchn_callback(); } } -PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); static void xen_irq_disable(void) { @@ -88,7 +86,6 @@ static void xen_irq_disable(void) percpu_read(xen_vcpu)->evtchn_upcall_mask = 1; preempt_enable_no_resched(); } -PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable); static void xen_irq_enable(void) { @@ -109,7 +106,6 @@ static void xen_irq_enable(void) if (unlikely(vcpu->evtchn_upcall_pending)) xen_force_evtchn_callback(); } -PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable); static void xen_safe_halt(void) { @@ -128,12 +124,10 @@ static void xen_halt(void) static const struct pv_irq_ops xen_irq_ops __initdata = { .init_IRQ = __xen_init_IRQ, - - .save_fl = PV_CALLEE_SAVE(xen_save_fl), - .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), - .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), - .irq_enable = PV_CALLEE_SAVE(xen_irq_enable), - + .save_fl = xen_save_fl, + .restore_fl = xen_restore_fl, + .irq_disable = xen_irq_disable, + .irq_enable = xen_irq_enable, .safe_halt = xen_safe_halt, .halt = xen_halt, #ifdef CONFIG_X86_64 diff --git a/trunk/arch/x86/xen/mmu.c b/trunk/arch/x86/xen/mmu.c index d2e8ed1aff3d..98cb9869eb24 100644 --- a/trunk/arch/x86/xen/mmu.c +++ b/trunk/arch/x86/xen/mmu.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include @@ -56,8 +55,6 @@ #include #include -#include -#include #include "multicalls.h" #include "mmu.h" @@ -117,37 +114,6 @@ static inline void check_zero(void) #endif /* CONFIG_XEN_DEBUG_FS */ - -/* - * Identity map, in addition to plain kernel map. This needs to be - * large enough to allocate page table pages to allocate the rest. - * Each page can map 2MB. - */ -static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; - -#ifdef CONFIG_X86_64 -/* l3 pud for userspace vsyscall mapping */ -static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; -#endif /* CONFIG_X86_64 */ - -/* - * Note about cr3 (pagetable base) values: - * - * xen_cr3 contains the current logical cr3 value; it contains the - * last set cr3. This may not be the current effective cr3, because - * its update may be being lazily deferred. However, a vcpu looking - * at its own cr3 can use this value knowing that it everything will - * be self-consistent. - * - * xen_current_cr3 contains the actual vcpu cr3; it is set once the - * hypercall to set the vcpu cr3 is complete (so it may be a little - * out of date, but it will never be set early). If one vcpu is - * looking at another vcpu's cr3 value, it should use this variable. - */ -DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ -DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ - - /* * Just beyond the highest usermode address. STACK_TOP_MAX has a * redzone above it, so round it up to a PGD boundary. @@ -492,33 +458,28 @@ pteval_t xen_pte_val(pte_t pte) { return pte_mfn_to_pfn(pte.pte); } -PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); pgdval_t xen_pgd_val(pgd_t pgd) { return pte_mfn_to_pfn(pgd.pgd); } -PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); pte_t xen_make_pte(pteval_t pte) { pte = pte_pfn_to_mfn(pte); return native_make_pte(pte); } -PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); pgd_t xen_make_pgd(pgdval_t pgd) { pgd = pte_pfn_to_mfn(pgd); return native_make_pgd(pgd); } -PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); pmdval_t xen_pmd_val(pmd_t pmd) { return pte_mfn_to_pfn(pmd.pmd); } -PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); void xen_set_pud_hyper(pud_t *ptr, pud_t val) { @@ -595,14 +556,12 @@ pmd_t xen_make_pmd(pmdval_t pmd) pmd = pte_pfn_to_mfn(pmd); return native_make_pmd(pmd); } -PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); #if PAGETABLE_LEVELS == 4 pudval_t xen_pud_val(pud_t pud) { return pte_mfn_to_pfn(pud.pud); } -PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); pud_t xen_make_pud(pudval_t pud) { @@ -610,7 +569,6 @@ pud_t xen_make_pud(pudval_t pud) return native_make_pud(pud); } -PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); pgd_t *xen_get_user_pgd(pgd_t *pgd) { @@ -1194,709 +1152,6 @@ void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } -static __init void xen_pagetable_setup_start(pgd_t *base) -{ -} - -static __init void xen_pagetable_setup_done(pgd_t *base) -{ - xen_setup_shared_info(); -} - -static void xen_write_cr2(unsigned long cr2) -{ - percpu_read(xen_vcpu)->arch.cr2 = cr2; -} - -static unsigned long xen_read_cr2(void) -{ - return percpu_read(xen_vcpu)->arch.cr2; -} - -unsigned long xen_read_cr2_direct(void) -{ - return percpu_read(xen_vcpu_info.arch.cr2); -} - -static void xen_flush_tlb(void) -{ - struct mmuext_op *op; - struct multicall_space mcs; - - preempt_disable(); - - mcs = xen_mc_entry(sizeof(*op)); - - op = mcs.args; - op->cmd = MMUEXT_TLB_FLUSH_LOCAL; - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - preempt_enable(); -} - -static void xen_flush_tlb_single(unsigned long addr) -{ - struct mmuext_op *op; - struct multicall_space mcs; - - preempt_disable(); - - mcs = xen_mc_entry(sizeof(*op)); - op = mcs.args; - op->cmd = MMUEXT_INVLPG_LOCAL; - op->arg1.linear_addr = addr & PAGE_MASK; - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - preempt_enable(); -} - -static void xen_flush_tlb_others(const struct cpumask *cpus, - struct mm_struct *mm, unsigned long va) -{ - struct { - struct mmuext_op op; - DECLARE_BITMAP(mask, NR_CPUS); - } *args; - struct multicall_space mcs; - - BUG_ON(cpumask_empty(cpus)); - BUG_ON(!mm); - - mcs = xen_mc_entry(sizeof(*args)); - args = mcs.args; - args->op.arg2.vcpumask = to_cpumask(args->mask); - - /* Remove us, and any offline CPUS. */ - cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); - if (unlikely(cpumask_empty(to_cpumask(args->mask)))) - goto issue; - - if (va == TLB_FLUSH_ALL) { - args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; - } else { - args->op.cmd = MMUEXT_INVLPG_MULTI; - args->op.arg1.linear_addr = va; - } - - MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); - -issue: - xen_mc_issue(PARAVIRT_LAZY_MMU); -} - -static unsigned long xen_read_cr3(void) -{ - return percpu_read(xen_cr3); -} - -static void set_current_cr3(void *v) -{ - percpu_write(xen_current_cr3, (unsigned long)v); -} - -static void __xen_write_cr3(bool kernel, unsigned long cr3) -{ - struct mmuext_op *op; - struct multicall_space mcs; - unsigned long mfn; - - if (cr3) - mfn = pfn_to_mfn(PFN_DOWN(cr3)); - else - mfn = 0; - - WARN_ON(mfn == 0 && kernel); - - mcs = __xen_mc_entry(sizeof(*op)); - - op = mcs.args; - op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; - op->arg1.mfn = mfn; - - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - if (kernel) { - percpu_write(xen_cr3, cr3); - - /* Update xen_current_cr3 once the batch has actually - been submitted. */ - xen_mc_callback(set_current_cr3, (void *)cr3); - } -} - -static void xen_write_cr3(unsigned long cr3) -{ - BUG_ON(preemptible()); - - xen_mc_batch(); /* disables interrupts */ - - /* Update while interrupts are disabled, so its atomic with - respect to ipis */ - percpu_write(xen_cr3, cr3); - - __xen_write_cr3(true, cr3); - -#ifdef CONFIG_X86_64 - { - pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); - if (user_pgd) - __xen_write_cr3(false, __pa(user_pgd)); - else - __xen_write_cr3(false, 0); - } -#endif - - xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ -} - -static int xen_pgd_alloc(struct mm_struct *mm) -{ - pgd_t *pgd = mm->pgd; - int ret = 0; - - BUG_ON(PagePinned(virt_to_page(pgd))); - -#ifdef CONFIG_X86_64 - { - struct page *page = virt_to_page(pgd); - pgd_t *user_pgd; - - BUG_ON(page->private != 0); - - ret = -ENOMEM; - - user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - page->private = (unsigned long)user_pgd; - - if (user_pgd != NULL) { - user_pgd[pgd_index(VSYSCALL_START)] = - __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); - ret = 0; - } - - BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); - } -#endif - - return ret; -} - -static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ -#ifdef CONFIG_X86_64 - pgd_t *user_pgd = xen_get_user_pgd(pgd); - - if (user_pgd) - free_page((unsigned long)user_pgd); -#endif -} - -#ifdef CONFIG_HIGHPTE -static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) -{ - pgprot_t prot = PAGE_KERNEL; - - if (PagePinned(page)) - prot = PAGE_KERNEL_RO; - - if (0 && PageHighMem(page)) - printk("mapping highpte %lx type %d prot %s\n", - page_to_pfn(page), type, - (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ"); - - return kmap_atomic_prot(page, type, prot); -} -#endif - -#ifdef CONFIG_X86_32 -static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) -{ - /* If there's an existing pte, then don't allow _PAGE_RW to be set */ - if (pte_val_ma(*ptep) & _PAGE_PRESENT) - pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & - pte_val_ma(pte)); - - return pte; -} - -/* Init-time set_pte while constructing initial pagetables, which - doesn't allow RO pagetable pages to be remapped RW */ -static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) -{ - pte = mask_rw_pte(ptep, pte); - - xen_set_pte(ptep, pte); -} -#endif - -/* Early in boot, while setting up the initial pagetable, assume - everything is pinned. */ -static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) -{ -#ifdef CONFIG_FLATMEM - BUG_ON(mem_map); /* should only be used early */ -#endif - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); -} - -/* Early release_pte assumes that all pts are pinned, since there's - only init_mm and anything attached to that is pinned. */ -static void xen_release_pte_init(unsigned long pfn) -{ - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); -} - -static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) -{ - struct mmuext_op op; - op.cmd = cmd; - op.arg1.mfn = pfn_to_mfn(pfn); - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); -} - -/* This needs to make sure the new pte page is pinned iff its being - attached to a pinned pagetable. */ -static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) -{ - struct page *page = pfn_to_page(pfn); - - if (PagePinned(virt_to_page(mm->pgd))) { - SetPagePinned(page); - - vm_unmap_aliases(); - if (!PageHighMem(page)) { - make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); - if (level == PT_PTE && USE_SPLIT_PTLOCKS) - pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); - } else { - /* make sure there are no stray mappings of - this page */ - kmap_flush_unused(); - } - } -} - -static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) -{ - xen_alloc_ptpage(mm, pfn, PT_PTE); -} - -static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) -{ - xen_alloc_ptpage(mm, pfn, PT_PMD); -} - -/* This should never happen until we're OK to use struct page */ -static void xen_release_ptpage(unsigned long pfn, unsigned level) -{ - struct page *page = pfn_to_page(pfn); - - if (PagePinned(page)) { - if (!PageHighMem(page)) { - if (level == PT_PTE && USE_SPLIT_PTLOCKS) - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); - } - ClearPagePinned(page); - } -} - -static void xen_release_pte(unsigned long pfn) -{ - xen_release_ptpage(pfn, PT_PTE); -} - -static void xen_release_pmd(unsigned long pfn) -{ - xen_release_ptpage(pfn, PT_PMD); -} - -#if PAGETABLE_LEVELS == 4 -static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) -{ - xen_alloc_ptpage(mm, pfn, PT_PUD); -} - -static void xen_release_pud(unsigned long pfn) -{ - xen_release_ptpage(pfn, PT_PUD); -} -#endif - -void __init xen_reserve_top(void) -{ -#ifdef CONFIG_X86_32 - unsigned long top = HYPERVISOR_VIRT_START; - struct xen_platform_parameters pp; - - if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) - top = pp.virt_start; - - reserve_top_address(-top); -#endif /* CONFIG_X86_32 */ -} - -/* - * Like __va(), but returns address in the kernel mapping (which is - * all we have until the physical memory mapping has been set up. - */ -static void *__ka(phys_addr_t paddr) -{ -#ifdef CONFIG_X86_64 - return (void *)(paddr + __START_KERNEL_map); -#else - return __va(paddr); -#endif -} - -/* Convert a machine address to physical address */ -static unsigned long m2p(phys_addr_t maddr) -{ - phys_addr_t paddr; - - maddr &= PTE_PFN_MASK; - paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; - - return paddr; -} - -/* Convert a machine address to kernel virtual */ -static void *m2v(phys_addr_t maddr) -{ - return __ka(m2p(maddr)); -} - -static void set_page_prot(void *addr, pgprot_t prot) -{ - unsigned long pfn = __pa(addr) >> PAGE_SHIFT; - pte_t pte = pfn_pte(pfn, prot); - - if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) - BUG(); -} - -static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) -{ - unsigned pmdidx, pteidx; - unsigned ident_pte; - unsigned long pfn; - - ident_pte = 0; - pfn = 0; - for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { - pte_t *pte_page; - - /* Reuse or allocate a page of ptes */ - if (pmd_present(pmd[pmdidx])) - pte_page = m2v(pmd[pmdidx].pmd); - else { - /* Check for free pte pages */ - if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) - break; - - pte_page = &level1_ident_pgt[ident_pte]; - ident_pte += PTRS_PER_PTE; - - pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); - } - - /* Install mappings */ - for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { - pte_t pte; - - if (pfn > max_pfn_mapped) - max_pfn_mapped = pfn; - - if (!pte_none(pte_page[pteidx])) - continue; - - pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); - pte_page[pteidx] = pte; - } - } - - for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) - set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); - - set_page_prot(pmd, PAGE_KERNEL_RO); -} - -#ifdef CONFIG_X86_64 -static void convert_pfn_mfn(void *v) -{ - pte_t *pte = v; - int i; - - /* All levels are converted the same way, so just treat them - as ptes. */ - for (i = 0; i < PTRS_PER_PTE; i++) - pte[i] = xen_make_pte(pte[i].pte); -} - -/* - * Set up the inital kernel pagetable. - * - * We can construct this by grafting the Xen provided pagetable into - * head_64.S's preconstructed pagetables. We copy the Xen L2's into - * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This - * means that only the kernel has a physical mapping to start with - - * but that's enough to get __va working. We need to fill in the rest - * of the physical mapping once some sort of allocator has been set - * up. - */ -__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, - unsigned long max_pfn) -{ - pud_t *l3; - pmd_t *l2; - - /* Zap identity mapping */ - init_level4_pgt[0] = __pgd(0); - - /* Pre-constructed entries are in pfn, so convert to mfn */ - convert_pfn_mfn(init_level4_pgt); - convert_pfn_mfn(level3_ident_pgt); - convert_pfn_mfn(level3_kernel_pgt); - - l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); - l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); - - memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); - memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); - - l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); - l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); - memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); - - /* Set up identity map */ - xen_map_identity_early(level2_ident_pgt, max_pfn); - - /* Make pagetable pieces RO */ - set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); - set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); - - /* Pin down new L4 */ - pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, - PFN_DOWN(__pa_symbol(init_level4_pgt))); - - /* Unpin Xen-provided one */ - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - - /* Switch over */ - pgd = init_level4_pgt; - - /* - * At this stage there can be no user pgd, and no page - * structure to attach it to, so make sure we just set kernel - * pgd. - */ - xen_mc_batch(); - __xen_write_cr3(true, __pa(pgd)); - xen_mc_issue(PARAVIRT_LAZY_CPU); - - reserve_early(__pa(xen_start_info->pt_base), - __pa(xen_start_info->pt_base + - xen_start_info->nr_pt_frames * PAGE_SIZE), - "XEN PAGETABLES"); - - return pgd; -} -#else /* !CONFIG_X86_64 */ -static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; - -__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, - unsigned long max_pfn) -{ - pmd_t *kernel_pmd; - - init_pg_tables_start = __pa(pgd); - init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; - max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024); - - kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); - memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); - - xen_map_identity_early(level2_kernel_pgt, max_pfn); - - memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); - set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], - __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); - - set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); - set_page_prot(empty_zero_page, PAGE_KERNEL_RO); - - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - - xen_write_cr3(__pa(swapper_pg_dir)); - - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); - - return swapper_pg_dir; -} -#endif /* CONFIG_X86_64 */ - -static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) -{ - pte_t pte; - - phys >>= PAGE_SHIFT; - - switch (idx) { - case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: -#ifdef CONFIG_X86_F00F_BUG - case FIX_F00F_IDT: -#endif -#ifdef CONFIG_X86_32 - case FIX_WP_TEST: - case FIX_VDSO: -# ifdef CONFIG_HIGHMEM - case FIX_KMAP_BEGIN ... FIX_KMAP_END: -# endif -#else - case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: -#endif -#ifdef CONFIG_X86_LOCAL_APIC - case FIX_APIC_BASE: /* maps dummy local APIC */ -#endif - pte = pfn_pte(phys, prot); - break; - - default: - pte = mfn_pte(phys, prot); - break; - } - - __native_set_fixmap(idx, pte); - -#ifdef CONFIG_X86_64 - /* Replicate changes to map the vsyscall page into the user - pagetable vsyscall mapping. */ - if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) { - unsigned long vaddr = __fix_to_virt(idx); - set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); - } -#endif -} - -__init void xen_post_allocator_init(void) -{ - pv_mmu_ops.set_pte = xen_set_pte; - pv_mmu_ops.set_pmd = xen_set_pmd; - pv_mmu_ops.set_pud = xen_set_pud; -#if PAGETABLE_LEVELS == 4 - pv_mmu_ops.set_pgd = xen_set_pgd; -#endif - - /* This will work as long as patching hasn't happened yet - (which it hasn't) */ - pv_mmu_ops.alloc_pte = xen_alloc_pte; - pv_mmu_ops.alloc_pmd = xen_alloc_pmd; - pv_mmu_ops.release_pte = xen_release_pte; - pv_mmu_ops.release_pmd = xen_release_pmd; -#if PAGETABLE_LEVELS == 4 - pv_mmu_ops.alloc_pud = xen_alloc_pud; - pv_mmu_ops.release_pud = xen_release_pud; -#endif - -#ifdef CONFIG_X86_64 - SetPagePinned(virt_to_page(level3_user_vsyscall)); -#endif - xen_mark_init_mm_pinned(); -} - - -const struct pv_mmu_ops xen_mmu_ops __initdata = { - .pagetable_setup_start = xen_pagetable_setup_start, - .pagetable_setup_done = xen_pagetable_setup_done, - - .read_cr2 = xen_read_cr2, - .write_cr2 = xen_write_cr2, - - .read_cr3 = xen_read_cr3, - .write_cr3 = xen_write_cr3, - - .flush_tlb_user = xen_flush_tlb, - .flush_tlb_kernel = xen_flush_tlb, - .flush_tlb_single = xen_flush_tlb_single, - .flush_tlb_others = xen_flush_tlb_others, - - .pte_update = paravirt_nop, - .pte_update_defer = paravirt_nop, - - .pgd_alloc = xen_pgd_alloc, - .pgd_free = xen_pgd_free, - - .alloc_pte = xen_alloc_pte_init, - .release_pte = xen_release_pte_init, - .alloc_pmd = xen_alloc_pte_init, - .alloc_pmd_clone = paravirt_nop, - .release_pmd = xen_release_pte_init, - -#ifdef CONFIG_HIGHPTE - .kmap_atomic_pte = xen_kmap_atomic_pte, -#endif - -#ifdef CONFIG_X86_64 - .set_pte = xen_set_pte, -#else - .set_pte = xen_set_pte_init, -#endif - .set_pte_at = xen_set_pte_at, - .set_pmd = xen_set_pmd_hyper, - - .ptep_modify_prot_start = __ptep_modify_prot_start, - .ptep_modify_prot_commit = __ptep_modify_prot_commit, - - .pte_val = PV_CALLEE_SAVE(xen_pte_val), - .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), - - .make_pte = PV_CALLEE_SAVE(xen_make_pte), - .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), - -#ifdef CONFIG_X86_PAE - .set_pte_atomic = xen_set_pte_atomic, - .set_pte_present = xen_set_pte_at, - .pte_clear = xen_pte_clear, - .pmd_clear = xen_pmd_clear, -#endif /* CONFIG_X86_PAE */ - .set_pud = xen_set_pud_hyper, - - .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), - .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), - -#if PAGETABLE_LEVELS == 4 - .pud_val = PV_CALLEE_SAVE(xen_pud_val), - .make_pud = PV_CALLEE_SAVE(xen_make_pud), - .set_pgd = xen_set_pgd_hyper, - - .alloc_pud = xen_alloc_pte_init, - .release_pud = xen_release_pte_init, -#endif /* PAGETABLE_LEVELS == 4 */ - - .activate_mm = xen_activate_mm, - .dup_mmap = xen_dup_mmap, - .exit_mmap = xen_exit_mmap, - - .lazy_mode = { - .enter = paravirt_enter_lazy_mmu, - .leave = xen_leave_lazy, - }, - - .set_fixmap = xen_set_fixmap, -}; - - #ifdef CONFIG_XEN_DEBUG_FS static struct dentry *d_mmu_debug; diff --git a/trunk/arch/x86/xen/mmu.h b/trunk/arch/x86/xen/mmu.h index 24d1b44a337d..98d71659da5a 100644 --- a/trunk/arch/x86/xen/mmu.h +++ b/trunk/arch/x86/xen/mmu.h @@ -54,7 +54,4 @@ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); -unsigned long xen_read_cr2_direct(void); - -extern const struct pv_mmu_ops xen_mmu_ops; #endif /* _XEN_MMU_H */ diff --git a/trunk/arch/x86/xen/xen-ops.h b/trunk/arch/x86/xen/xen-ops.h index 11913fc94c14..c1f8faf0a2c5 100644 --- a/trunk/arch/x86/xen/xen-ops.h +++ b/trunk/arch/x86/xen/xen-ops.h @@ -13,7 +13,6 @@ extern const char xen_failsafe_callback[]; struct trap_info; void xen_copy_trap_info(struct trap_info *traps); -DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info); DECLARE_PER_CPU(unsigned long, xen_cr3); DECLARE_PER_CPU(unsigned long, xen_current_cr3); @@ -23,13 +22,6 @@ extern struct shared_info *HYPERVISOR_shared_info; void xen_setup_mfn_list_list(void); void xen_setup_shared_info(void); -void xen_setup_machphys_mapping(void); -pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); -void xen_ident_map_ISA(void); -void xen_reserve_top(void); - -void xen_leave_lazy(void); -void xen_post_allocator_init(void); char * __init xen_memory_setup(void); void __init xen_arch_setup(void); diff --git a/trunk/include/asm-generic/vmlinux.lds.h b/trunk/include/asm-generic/vmlinux.lds.h index 53e21f36a802..5406e70aba86 100644 --- a/trunk/include/asm-generic/vmlinux.lds.h +++ b/trunk/include/asm-generic/vmlinux.lds.h @@ -445,10 +445,9 @@ * section in the linker script will go there too. @phdr should have * a leading colon. * - * This macro defines three symbols, __per_cpu_load, __per_cpu_start - * and __per_cpu_end. The first one is the vaddr of loaded percpu - * init data. __per_cpu_start equals @vaddr and __per_cpu_end is the - * end offset. + * Note that this macros defines __per_cpu_load as an absolute symbol. + * If there is no need to put the percpu section at a predetermined + * address, use PERCPU(). */ #define PERCPU_VADDR(vaddr, phdr) \ VMLINUX_SYMBOL(__per_cpu_load) = .; \ @@ -470,7 +469,20 @@ * Align to @align and outputs output section for percpu area. This * macro doesn't maniuplate @vaddr or @phdr and __per_cpu_load and * __per_cpu_start will be identical. + * + * This macro is equivalent to ALIGN(align); PERCPU_VADDR( , ) except + * that __per_cpu_load is defined as a relative symbol against + * .data.percpu which is required for relocatable x86_32 + * configuration. */ #define PERCPU(align) \ . = ALIGN(align); \ - PERCPU_VADDR( , ) + .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { \ + VMLINUX_SYMBOL(__per_cpu_load) = .; \ + VMLINUX_SYMBOL(__per_cpu_start) = .; \ + *(.data.percpu.first) \ + *(.data.percpu.page_aligned) \ + *(.data.percpu) \ + *(.data.percpu.shared_aligned) \ + VMLINUX_SYMBOL(__per_cpu_end) = .; \ + }