From 88ec6b93c8e7d6d4ffaf6ad6395ceb3bf552de15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Wed, 10 Apr 2019 19:04:33 +0200 Subject: [PATCH 001/205] powerpc/xive: add OPAL extensions for the XIVE native exploitation support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The support for XIVE native exploitation mode in Linux/KVM needs a couple more OPAL calls to get and set the state of the XIVE internal structures being used by a sPAPR guest. Signed-off-by: Cédric Le Goater Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/opal-api.h | 7 +- arch/powerpc/include/asm/opal.h | 7 ++ arch/powerpc/include/asm/xive.h | 14 +++ arch/powerpc/platforms/powernv/opal-call.c | 3 + arch/powerpc/sysdev/xive/native.c | 99 ++++++++++++++++++++++ 5 files changed, 127 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 870fb7b239ea4..e1d118ac61dc8 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -186,8 +186,8 @@ #define OPAL_XIVE_FREE_IRQ 140 #define OPAL_XIVE_SYNC 141 #define OPAL_XIVE_DUMP 142 -#define OPAL_XIVE_RESERVED3 143 -#define OPAL_XIVE_RESERVED4 144 +#define OPAL_XIVE_GET_QUEUE_STATE 143 +#define OPAL_XIVE_SET_QUEUE_STATE 144 #define OPAL_SIGNAL_SYSTEM_RESET 145 #define OPAL_NPU_INIT_CONTEXT 146 #define OPAL_NPU_DESTROY_CONTEXT 147 @@ -210,7 +210,8 @@ #define OPAL_PCI_GET_PBCQ_TUNNEL_BAR 164 #define OPAL_PCI_SET_PBCQ_TUNNEL_BAR 165 #define OPAL_NX_COPROC_INIT 167 -#define OPAL_LAST 167 +#define OPAL_XIVE_GET_VP_STATE 170 +#define OPAL_LAST 170 #define QUIESCE_HOLD 1 /* Spin all calls at entry */ #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index a55b01c90bb13..4e978d4dea5ce 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -279,6 +279,13 @@ int64_t opal_xive_allocate_irq(uint32_t chip_id); int64_t opal_xive_free_irq(uint32_t girq); int64_t opal_xive_sync(uint32_t type, uint32_t id); int64_t opal_xive_dump(uint32_t type, uint32_t id); +int64_t opal_xive_get_queue_state(uint64_t vp, uint32_t prio, + __be32 *out_qtoggle, + __be32 *out_qindex); +int64_t opal_xive_set_queue_state(uint64_t vp, uint32_t prio, + uint32_t qtoggle, + uint32_t qindex); +int64_t opal_xive_get_vp_state(uint64_t vp, __be64 *out_w01); int64_t opal_pci_set_p2p(uint64_t phb_init, uint64_t phb_target, uint64_t desc, uint16_t pe_number); diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index 3c704f5dd3ae3..b579a943407bb 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -109,12 +109,26 @@ extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio); extern void xive_native_sync_source(u32 hw_irq); +extern void xive_native_sync_queue(u32 hw_irq); extern bool is_xive_irq(struct irq_chip *chip); extern int xive_native_enable_vp(u32 vp_id, bool single_escalation); extern int xive_native_disable_vp(u32 vp_id); extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id); extern bool xive_native_has_single_escalation(void); +extern int xive_native_get_queue_info(u32 vp_id, uint32_t prio, + u64 *out_qpage, + u64 *out_qsize, + u64 *out_qeoi_page, + u32 *out_escalate_irq, + u64 *out_qflags); + +extern int 
xive_native_get_queue_state(u32 vp_id, uint32_t prio, u32 *qtoggle, + u32 *qindex); +extern int xive_native_set_queue_state(u32 vp_id, uint32_t prio, u32 qtoggle, + u32 qindex); +extern int xive_native_get_vp_state(u32 vp_id, u64 *out_state); + #else static inline bool xive_enabled(void) { return false; } diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c index daad8c45c8e72..7472244e7f301 100644 --- a/arch/powerpc/platforms/powernv/opal-call.c +++ b/arch/powerpc/platforms/powernv/opal-call.c @@ -260,6 +260,9 @@ OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); +OPAL_CALL(opal_xive_get_queue_state, OPAL_XIVE_GET_QUEUE_STATE); +OPAL_CALL(opal_xive_set_queue_state, OPAL_XIVE_SET_QUEUE_STATE); +OPAL_CALL(opal_xive_get_vp_state, OPAL_XIVE_GET_VP_STATE); OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET); OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 1ca127d052a67..0c037e933e559 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -437,6 +437,12 @@ void xive_native_sync_source(u32 hw_irq) } EXPORT_SYMBOL_GPL(xive_native_sync_source); +void xive_native_sync_queue(u32 hw_irq) +{ + opal_xive_sync(XIVE_SYNC_QUEUE, hw_irq); +} +EXPORT_SYMBOL_GPL(xive_native_sync_queue); + static const struct xive_ops xive_native_ops = { .populate_irq_data = xive_native_populate_irq_data, .configure_irq = xive_native_configure_irq, @@ -711,3 +717,96 @@ bool xive_native_has_single_escalation(void) return xive_has_single_esc; } EXPORT_SYMBOL_GPL(xive_native_has_single_escalation); + +int xive_native_get_queue_info(u32 vp_id, u32 prio, + u64 *out_qpage, + u64 *out_qsize, + u64 *out_qeoi_page, + u32 *out_escalate_irq, + u64 *out_qflags) +{ + __be64 qpage; + __be64 qsize; + __be64 qeoi_page; + __be32 escalate_irq; + __be64 qflags; + s64 rc; + + rc = opal_xive_get_queue_info(vp_id, prio, &qpage, &qsize, + &qeoi_page, &escalate_irq, &qflags); + if (rc) { + pr_err("OPAL failed to get queue info for VCPU %d/%d : %lld\n", + vp_id, prio, rc); + return -EIO; + } + + if (out_qpage) + *out_qpage = be64_to_cpu(qpage); + if (out_qsize) + *out_qsize = be32_to_cpu(qsize); + if (out_qeoi_page) + *out_qeoi_page = be64_to_cpu(qeoi_page); + if (out_escalate_irq) + *out_escalate_irq = be32_to_cpu(escalate_irq); + if (out_qflags) + *out_qflags = be64_to_cpu(qflags); + + return 0; +} +EXPORT_SYMBOL_GPL(xive_native_get_queue_info); + +int xive_native_get_queue_state(u32 vp_id, u32 prio, u32 *qtoggle, u32 *qindex) +{ + __be32 opal_qtoggle; + __be32 opal_qindex; + s64 rc; + + rc = opal_xive_get_queue_state(vp_id, prio, &opal_qtoggle, + &opal_qindex); + if (rc) { + pr_err("OPAL failed to get queue state for VCPU %d/%d : %lld\n", + vp_id, prio, rc); + return -EIO; + } + + if (qtoggle) + *qtoggle = be32_to_cpu(opal_qtoggle); + if (qindex) + *qindex = be32_to_cpu(opal_qindex); + + return 0; +} +EXPORT_SYMBOL_GPL(xive_native_get_queue_state); + +int xive_native_set_queue_state(u32 vp_id, u32 prio, u32 qtoggle, u32 qindex) +{ + s64 rc; + + rc = opal_xive_set_queue_state(vp_id, prio, qtoggle, qindex); + if (rc) { + pr_err("OPAL failed to set queue state for VCPU %d/%d : %lld\n", + vp_id, prio, rc); + return -EIO; + } + + return 0; +} 
+EXPORT_SYMBOL_GPL(xive_native_set_queue_state); + +int xive_native_get_vp_state(u32 vp_id, u64 *out_state) +{ + __be64 state; + s64 rc; + + rc = opal_xive_get_vp_state(vp_id, &state); + if (rc) { + pr_err("OPAL failed to get vp state for VCPU %d : %lld\n", + vp_id, rc); + return -EIO; + } + + if (out_state) + *out_state = be64_to_cpu(state); + return 0; +} +EXPORT_SYMBOL_GPL(xive_native_get_vp_state); From a273fa386a947612a23b0d56dcfb8823662b8606 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 7 Feb 2019 16:16:51 +1100 Subject: [PATCH 002/205] powerpc/32: Add ppc_defconfig Add a generic 32-bit defconfig called ppc_defconfig. This means we'll have a defconfig matching "uname -m" for all cases. This config is mostly intended for build testing but if someone wants to tweak it to get it booting on something that would be fine too. Signed-off-by: Michael Ellerman Tested-by: Mathieu Malaterre Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 7de49889bd5dc..a1b17bcd0b620 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -367,6 +367,10 @@ ppc32_allmodconfig: $(Q)$(MAKE) KCONFIG_ALLCONFIG=$(srctree)/arch/powerpc/configs/book3s_32.config \ -f $(srctree)/Makefile allmodconfig +PHONY += ppc_defconfig +ppc_defconfig: + $(call merge_into_defconfig,book3s_32.config,) + PHONY += ppc64le_allmodconfig ppc64le_allmodconfig: $(Q)$(MAKE) KCONFIG_ALLCONFIG=$(srctree)/arch/powerpc/configs/le.config \ From af5cd05de5dd38cf25d14ea4d30ae9b791d2420b Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 7 Feb 2019 16:16:52 +1100 Subject: [PATCH 003/205] powerpc: Fix defconfig choice logic when cross compiling Our logic for choosing defconfig doesn't work well in some situations. For example if you're on a ppc64le machine but you specify a non-empty CROSS_COMPILE, in order to use a non-default toolchain, then defconfig will give you ppc64_defconfig (big endian): $ make CROSS_COMPILE=~/toolchains/gcc-8/bin/powerpc-linux- defconfig *** Default configuration is based on 'ppc64_defconfig' This is because we assume that CROSS_COMPILE being set means we can't be on a ppc machine and rather than checking we just default to ppc64_defconfig. We should just ignore CROSS_COMPILE, instead check the machine with uname and if it's one of ppc, ppc64 or ppc64le then use that defconfig. If it's none of those then we fall back to ppc64_defconfig. Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index a1b17bcd0b620..64b8a5ae3b75f 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -34,11 +34,10 @@ ifdef CONFIG_PPC_BOOK3S_32 KBUILD_CFLAGS += -mcpu=powerpc endif -ifeq ($(CROSS_COMPILE),) -KBUILD_DEFCONFIG := $(shell uname -m)_defconfig -else -KBUILD_DEFCONFIG := ppc64_defconfig -endif +# If we're on a ppc/ppc64/ppc64le machine use that defconfig, otherwise just use +# ppc64_defconfig because we have nothing better to go on. 
+uname := $(shell uname -m)
+KBUILD_DEFCONFIG := $(if $(filter ppc%,$(uname)),$(uname),ppc64)_defconfig

 ifdef CONFIG_PPC64
 new_nm := $(shell if $(NM) --help 2>&1 | grep -- '--synthetic' > /dev/null; then echo y; else echo n; fi)

From 6c84f8c5cbfb4bf728f88296bc035c4a401c3423 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Thu, 7 Mar 2019 09:47:50 +0000
Subject: [PATCH 004/205] powerpc/highmem: Change BUG_ON() to WARN_ON()

In arch/powerpc/mm/highmem.c, BUG_ON() is called only when
CONFIG_DEBUG_HIGHMEM is selected; this means the BUG_ON() is not vital
and can be replaced by a WARN_ON().

At the same time, use IS_ENABLED() instead of #ifdef to clean up a bit.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/highmem.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c
index 82a0e37557a5a..320c1672b2ae0 100644
--- a/arch/powerpc/mm/highmem.c
+++ b/arch/powerpc/mm/highmem.c
@@ -43,9 +43,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 	type = kmap_atomic_idx_push();
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-#ifdef CONFIG_DEBUG_HIGHMEM
-	BUG_ON(!pte_none(*(kmap_pte-idx)));
-#endif
+	WARN_ON(IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !pte_none(*(kmap_pte - idx)));
 	__set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot), 1);
 	local_flush_tlb_page(NULL, vaddr);
 
@@ -56,7 +54,6 @@ EXPORT_SYMBOL(kmap_atomic_prot);
 void __kunmap_atomic(void *kvaddr)
 {
 	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
-	int type __maybe_unused;
 
 	if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
 		pagefault_enable();
@@ -64,14 +61,12 @@ void __kunmap_atomic(void *kvaddr)
 		return;
 	}
 
-	type = kmap_atomic_idx();
-
-#ifdef CONFIG_DEBUG_HIGHMEM
-	{
+	if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM)) {
+		int type = kmap_atomic_idx();
 		unsigned int idx;
 
 		idx = type + KM_TYPE_NR * smp_processor_id();
-		BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
+		WARN_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
 
 		/*
 		 * force other mappings to Oops if they'll try to access
@@ -80,7 +75,6 @@ void __kunmap_atomic(void *kvaddr)
 		 */
 		pte_clear(&init_mm, vaddr, kmap_pte-idx);
 		local_flush_tlb_page(NULL, vaddr);
 	}
-#endif
 
 	kmap_atomic_idx_pop();
 	pagefault_enable();

From eea86aa4171d4960f0fcdc99dab358c224d53ffe Mon Sep 17 00:00:00 2001
From: Michael Ellerman
Date: Thu, 14 Mar 2019 23:54:53 +1100
Subject: [PATCH 005/205] powerpc/mm/64: Document the sizes of/sizes mapped by Pxx_INDEX_SIZE

Add comments describing the size in bytes of the various levels of the
page table tree, and the size of the virtual address space mapped by
each level, to make it clear what the sizes are without having to also
look up other definitions.

The code that calculates the sizes actually uses sizeof(pgd_t) etc., so
in theory these comments could skew vs the code, but the size of pgd_t
etc. is unlikely to change very often.
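As a stand-alone illustration of how the numbers in the new comments
are derived (this is not kernel code; it simply replays the arithmetic
for the 4K hash geometry, assuming the 8-byte entries the comments use):

	#include <stdio.h>

	int main(void)
	{
		const char *level[] = { "PTE", "PMD", "PUD", "PGD" };
		const unsigned int shift[] = { 9, 7, 9, 9 };	/* 4K hash index sizes */
		unsigned long long entry = 8;		/* sizeof(pte_t) etc: 8 bytes */
		unsigned long long mapped = 4096;	/* base page size: 4KB */

		for (int i = 0; i < 4; i++) {
			mapped <<= shift[i];	/* each level multiplies coverage */
			printf("%s: table %llu bytes, maps %llu bytes\n",
			       level[i], entry << shift[i], mapped);
		}
		return 0;
	}

Running it reproduces the comment values: a 4KB PTE page mapping 2MB, a
1KB PMD page mapping 256MB, a 4KB PUD page mapping 128GB, and a 4KB PGD
page mapping 64TB.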
Signed-off-by: Michael Ellerman Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hash-4k.h | 8 ++++---- arch/powerpc/include/asm/book3s/64/hash-64k.h | 9 +++++---- arch/powerpc/include/asm/book3s/64/radix-4k.h | 9 +++++---- arch/powerpc/include/asm/book3s/64/radix-64k.h | 8 ++++---- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index cf5ba52542992..54fab723a8c74 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -2,10 +2,10 @@ #ifndef _ASM_POWERPC_BOOK3S_64_HASH_4K_H #define _ASM_POWERPC_BOOK3S_64_HASH_4K_H -#define H_PTE_INDEX_SIZE 9 -#define H_PMD_INDEX_SIZE 7 -#define H_PUD_INDEX_SIZE 9 -#define H_PGD_INDEX_SIZE 9 +#define H_PTE_INDEX_SIZE 9 // size: 8B << 9 = 4KB, maps: 2^9 x 4KB = 2MB +#define H_PMD_INDEX_SIZE 7 // size: 8B << 7 = 1KB, maps: 2^7 x 2MB = 256MB +#define H_PUD_INDEX_SIZE 9 // size: 8B << 9 = 4KB, maps: 2^9 x 256MB = 128GB +#define H_PGD_INDEX_SIZE 9 // size: 8B << 9 = 4KB, maps: 2^9 x 128GB = 64TB /* * Each context is 512TB. But on 4k we restrict our max TASK size to 64TB diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index f82ee8a3b5617..81f4eb6e7da4c 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -2,10 +2,11 @@ #ifndef _ASM_POWERPC_BOOK3S_64_HASH_64K_H #define _ASM_POWERPC_BOOK3S_64_HASH_64K_H -#define H_PTE_INDEX_SIZE 8 -#define H_PMD_INDEX_SIZE 10 -#define H_PUD_INDEX_SIZE 10 -#define H_PGD_INDEX_SIZE 8 +#define H_PTE_INDEX_SIZE 8 // size: 8B << 8 = 2KB, maps 2^8 x 64KB = 16MB +#define H_PMD_INDEX_SIZE 10 // size: 8B << 10 = 8KB, maps 2^10 x 16MB = 16GB +#define H_PUD_INDEX_SIZE 10 // size: 8B << 10 = 8KB, maps 2^10 x 16GB = 16TB +#define H_PGD_INDEX_SIZE 8 // size: 8B << 8 = 2KB, maps 2^8 x 16TB = 4PB + /* * Each context is 512TB size. 
SLB miss for first context/default context diff --git a/arch/powerpc/include/asm/book3s/64/radix-4k.h b/arch/powerpc/include/asm/book3s/64/radix-4k.h index 863c3e8286fbf..d5f5ab73dc7ff 100644 --- a/arch/powerpc/include/asm/book3s/64/radix-4k.h +++ b/arch/powerpc/include/asm/book3s/64/radix-4k.h @@ -5,10 +5,11 @@ /* * For 4K page size supported index is 13/9/9/9 */ -#define RADIX_PTE_INDEX_SIZE 9 /* 2MB huge page */ -#define RADIX_PMD_INDEX_SIZE 9 /* 1G huge page */ -#define RADIX_PUD_INDEX_SIZE 9 -#define RADIX_PGD_INDEX_SIZE 13 +#define RADIX_PTE_INDEX_SIZE 9 // size: 8B << 9 = 4KB, maps 2^9 x 4K = 2MB +#define RADIX_PMD_INDEX_SIZE 9 // size: 8B << 9 = 4KB, maps 2^9 x 2MB = 1GB +#define RADIX_PUD_INDEX_SIZE 9 // size: 8B << 9 = 4KB, maps 2^9 x 1GB = 512GB +#define RADIX_PGD_INDEX_SIZE 13 // size: 8B << 13 = 64KB, maps 2^13 x 512GB = 4PB + /* * One fragment per per page */ diff --git a/arch/powerpc/include/asm/book3s/64/radix-64k.h b/arch/powerpc/include/asm/book3s/64/radix-64k.h index ccb78ca9d0c54..54e33828b0fb8 100644 --- a/arch/powerpc/include/asm/book3s/64/radix-64k.h +++ b/arch/powerpc/include/asm/book3s/64/radix-64k.h @@ -5,10 +5,10 @@ /* * For 64K page size supported index is 13/9/9/5 */ -#define RADIX_PTE_INDEX_SIZE 5 /* 2MB huge page */ -#define RADIX_PMD_INDEX_SIZE 9 /* 1G huge page */ -#define RADIX_PUD_INDEX_SIZE 9 -#define RADIX_PGD_INDEX_SIZE 13 +#define RADIX_PTE_INDEX_SIZE 5 // size: 8B << 5 = 256B, maps 2^5 x 64K = 2MB +#define RADIX_PMD_INDEX_SIZE 9 // size: 8B << 9 = 4KB, maps 2^9 x 2MB = 1GB +#define RADIX_PUD_INDEX_SIZE 9 // size: 8B << 9 = 4KB, maps 2^9 x 1GB = 512GB +#define RADIX_PGD_INDEX_SIZE 13 // size: 8B << 13 = 64KB, maps 2^13 x 512GB = 4PB /* * We use a 256 byte PTE page fragment in radix From ff6d27823f619892ab96f7461764840e0d786b15 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 22 Mar 2019 04:24:37 +0000 Subject: [PATCH 006/205] powerpc: vdso: Make vdso32 installation conditional in vdso_install The 32-bit vDSO is not needed and not normally built for 64-bit little-endian configurations. However, the vdso_install target still builds and installs it. Add the same config condition as is normally used for the build. Fixes: e0d005916994 ("powerpc/vdso: Disable building the 32-bit VDSO ...") Signed-off-by: Ben Hutchings Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 64b8a5ae3b75f..258ea6b2f2e75 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -409,7 +409,9 @@ vdso_install: ifdef CONFIG_PPC64 $(Q)$(MAKE) $(build)=arch/$(ARCH)/kernel/vdso64 $@ endif +ifdef CONFIG_VDSO32 $(Q)$(MAKE) $(build)=arch/$(ARCH)/kernel/vdso32 $@ +endif archclean: $(Q)$(MAKE) $(clean)=$(boot) From 308be6c7817c850b55c3b6ff4bf53c2427e274bc Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Tue, 26 Mar 2019 21:47:18 +0100 Subject: [PATCH 007/205] powerpc/embedded6xx: Make some functions static In commit cb9e4d10c448 ("[POWERPC] Add support for 750CL Holly board") new functions were added. Since most of these functions can be made static, make it so. 
Both holly_power_off and holly_halt functions were not changed since they are unused, making them static would have triggered the following warning (treated as error): arch/powerpc/platforms/embedded6xx/holly.c:244:13: error: 'holly_halt' defined but not used Silence the following warnings triggered using W=1: arch/powerpc/platforms/embedded6xx/holly.c:47:5: error: no previous prototype for 'holly_exclude_device' arch/powerpc/platforms/embedded6xx/holly.c:190:6: error: no previous prototype for 'holly_show_cpuinfo' arch/powerpc/platforms/embedded6xx/holly.c:196:17: error: no previous prototype for 'holly_restart' Signed-off-by: Mathieu Malaterre Reviewed-by: Mukesh Ojha Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/embedded6xx/holly.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/embedded6xx/holly.c b/arch/powerpc/platforms/embedded6xx/holly.c index 0409714e8070e..9d2eefef7b7b6 100644 --- a/arch/powerpc/platforms/embedded6xx/holly.c +++ b/arch/powerpc/platforms/embedded6xx/holly.c @@ -44,7 +44,8 @@ #define HOLLY_PCI_CFG_PHYS 0x7c000000 -int holly_exclude_device(struct pci_controller *hose, u_char bus, u_char devfn) +static int holly_exclude_device(struct pci_controller *hose, u_char bus, + u_char devfn) { if (bus == 0 && PCI_SLOT(devfn) == 0) return PCIBIOS_DEVICE_NOT_FOUND; @@ -187,13 +188,13 @@ static void __init holly_init_IRQ(void) tsi108_write_reg(TSI108_MPIC_OFFSET + 0x30c, 0); } -void holly_show_cpuinfo(struct seq_file *m) +static void holly_show_cpuinfo(struct seq_file *m) { seq_printf(m, "vendor\t\t: IBM\n"); seq_printf(m, "machine\t\t: PPC750 GX/CL\n"); } -void __noreturn holly_restart(char *cmd) +static void __noreturn holly_restart(char *cmd) { __be32 __iomem *ocn_bar1 = NULL; unsigned long bar; From 62611c1e241878c538211d80cefef9de72030b56 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Tue, 26 Mar 2019 21:47:19 +0100 Subject: [PATCH 008/205] powerpc/embedded6xx: Remove unused functions holly_power_off and holly_halt Silence the following warnings triggered using W=1: arch/powerpc/platforms/embedded6xx/holly.c:236:6: error: no previous prototype for 'holly_power_off' arch/powerpc/platforms/embedded6xx/holly.c:243:6: error: no previous prototype for 'holly_halt' Signed-off-by: Mathieu Malaterre Reviewed-by: Mukesh Ojha Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/embedded6xx/holly.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/powerpc/platforms/embedded6xx/holly.c b/arch/powerpc/platforms/embedded6xx/holly.c index 9d2eefef7b7b6..829bf3697dc96 100644 --- a/arch/powerpc/platforms/embedded6xx/holly.c +++ b/arch/powerpc/platforms/embedded6xx/holly.c @@ -234,18 +234,6 @@ static void __noreturn holly_restart(char *cmd) for (;;) ; } -void holly_power_off(void) -{ - local_irq_disable(); - /* No way to shut power off with software */ - for (;;) ; -} - -void holly_halt(void) -{ - holly_power_off(); -} - /* * Called very early, device-tree isn't unflattened */ From 56c46bba9bbfe229b4472a5be313c44c5b714a39 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Wed, 27 Mar 2019 14:35:54 +1100 Subject: [PATCH 009/205] powerpc/64: Fix booting large kernels with STRICT_KERNEL_RWX With STRICT_KERNEL_RWX enabled anything marked __init is placed at a 16M boundary. This is necessary so that it can be repurposed later with different permissions. However, in kernels with text larger than 16M, this pushes early_setup past 32M, incapable of being reached by the branch instruction. 
Fix this by setting the CTR and branching there instead.

Fixes: 1e0fc9d1eb2b ("powerpc/Kconfig: Enable STRICT_KERNEL_RWX for some configs")
Signed-off-by: Russell Currey
[mpe: Fix it to work on BE by using DOTSYM()]
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/head_64.S | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 3fad8d499767c..5321a11c28358 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -968,7 +968,9 @@ start_here_multiplatform:
 	/* Restore parameters passed from prom_init/kexec */
 	mr	r3,r31
-	bl	early_setup		/* also sets r13 and SPRG_PACA */
+	LOAD_REG_ADDR(r12, DOTSYM(early_setup))
+	mtctr	r12
+	bctrl				/* also sets r13 and SPRG_PACA */
 
 	LOAD_REG_ADDR(r3, start_here_common)
 	ld	r4,PACAKMSR(r13)

From c9d8dda42372dce00ac3a1c653bef7b8d2dbe3ce Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar
Date: Thu, 28 Mar 2019 17:10:33 +0530
Subject: [PATCH 010/205] powerpc/pseries/mce: Improve array initialization.

This is a follow-up to the patch that fixed the misleading print for
TLB multihit caused by a wrongly populated mc_err_types[] array.
Convert all the static array initialization to '[x] = val' style for
better readability of array indexing and to avoid any further
confusion.

Suggested-by: Michael Ellerman
Signed-off-by: Mahesh Salgaonkar
Signed-off-by: Michael Ellerman
---
 arch/powerpc/platforms/pseries/ras.c | 52 ++++++++++++++--------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 452dcfd7e5dd1..a25c2ac0c9c08 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -539,44 +539,44 @@ static void pseries_print_mce_info(struct pt_regs *regs,
 	int disposition = rtas_error_disposition(errp);
 
 	static const char * const initiators[] = {
-		"Unknown",
-		"CPU",
-		"PCI",
-		"ISA",
-		"Memory",
-		"Power Mgmt",
+		[0] = "Unknown",
+		[1] = "CPU",
+		[2] = "PCI",
+		[3] = "ISA",
+		[4] = "Memory",
+		[5] = "Power Mgmt",
 	};
 	static const char * const mc_err_types[] = {
-		"UE",
-		"SLB",
-		"ERAT",
-		"Unknown",
-		"TLB",
-		"D-Cache",
-		"Unknown",
-		"I-Cache",
+		[0] = "UE",
+		[1] = "SLB",
+		[2] = "ERAT",
+		[3] = "Unknown",
+		[4] = "TLB",
+		[5] = "D-Cache",
+		[6] = "Unknown",
+		[7] = "I-Cache",
 	};
 	static const char * const mc_ue_types[] = {
-		"Indeterminate",
-		"Instruction fetch",
-		"Page table walk ifetch",
-		"Load/Store",
-		"Page table walk Load/Store",
+		[0] = "Indeterminate",
+		[1] = "Instruction fetch",
+		[2] = "Page table walk ifetch",
+		[3] = "Load/Store",
+		[4] = "Page table walk Load/Store",
 	};
 
 	/* SLB sub errors valid values are 0x0, 0x1, 0x2 */
 	static const char * const mc_slb_types[] = {
-		"Parity",
-		"Multihit",
-		"Indeterminate",
+		[0] = "Parity",
+		[1] = "Multihit",
+		[2] = "Indeterminate",
 	};
 
 	/* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */
 	static const char * const mc_soft_types[] = {
-		"Unknown",
-		"Parity",
-		"Multihit",
-		"Indeterminate",
+		[0] = "Unknown",
+		[1] = "Parity",
+		[2] = "Multihit",
+		[3] = "Indeterminate",
 	};
 
 	if (!rtas_error_extended(errp)) {

From 24c174bb23ebcbe4c3979855b220513f2b3a730f Mon Sep 17 00:00:00 2001
From: Thomas Huth
Date: Mon, 11 Feb 2019 12:37:12 +0100
Subject: [PATCH 011/205] powerpc/configs: Enable CONFIG_USB_XHCI_HCD by default

Recent versions of QEMU provide an XHCI device by default these days
instead of an old-fashioned OHCI device:
https://git.qemu.org/?p=qemu.git;a=commitdiff;h=57040d451315320b7d27

So to get the keyboard working in the graphical console there again, we
should now include XHCI support in the kernel by default, too.

Signed-off-by: Thomas Huth
Reviewed-by: David Gibson
Acked-by: Joel Stanley
Signed-off-by: Michael Ellerman
---
 arch/powerpc/configs/pseries_defconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index ea79c519863df..62e12f61a3b24 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -217,6 +217,7 @@ CONFIG_USB_MON=m
 CONFIG_USB_EHCI_HCD=y
 # CONFIG_USB_EHCI_HCD_PPC_OF is not set
 CONFIG_USB_OHCI_HCD=y
+CONFIG_USB_XHCI_HCD=y
 CONFIG_USB_STORAGE=m
 CONFIG_NEW_LEDS=y
 CONFIG_LEDS_CLASS=m

From f89bd8ba834e392ff614a7be9ee68c5679675122 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V"
Date: Tue, 9 Apr 2019 09:33:28 +0530
Subject: [PATCH 012/205] powerpc/mm/radix: Don't do SLB preload when using the radix MMU

Add radix_enabled() check to avoid SLB preload with radix translation.

Signed-off-by: Aneesh Kumar K.V
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/process.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index dd9e0d5386ee7..f7b2e3b3db285 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1729,7 +1729,8 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp)
 	unsigned long load_addr = regs->gpr[2];	/* saved by ELF_PLAT_INIT */
 
 #ifdef CONFIG_PPC_BOOK3S_64
-	preload_new_slb_context(start, sp);
+	if (!radix_enabled())
+		preload_new_slb_context(start, sp);
 #endif
 #endif

From f172acbfae1a78b1a3c775f78e8d0dcd15b9d768 Mon Sep 17 00:00:00 2001
From: Laurent Vivier
Date: Wed, 13 Mar 2019 11:25:28 +0100
Subject: [PATCH 013/205] powerpc/mm: move warning from resize_hpt_for_hotplug()

resize_hpt_for_hotplug() reports a warning when it cannot resize the
hash page table ("Unable to resize hash page table to target order"),
but in some cases that is not a problem and can make the user think
something has not worked properly.

This patch moves the warning to arch_remove_memory() so the problem is
only reported when it actually matters.
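The shape of the change, as a minimal stand-alone sketch (the helper
names here are invented, not the kernel API): the resize path returns
its status instead of warning, and only the caller that cares turns a
specific failure into a user-visible message.

	#include <stdio.h>
	#include <errno.h>

	/* Invented stand-in for the firmware/hypervisor resize hook. */
	static int do_resize(int target_order)
	{
		return target_order > 30 ? -ENOSPC : 0;
	}

	/* The helper just reports the outcome; no warning here. */
	static int resize_for_hotplug(int target_order)
	{
		return do_resize(target_order);
	}

	int main(void)
	{
		/* Only the hot-remove path decides this is worth a warning. */
		if (resize_for_hotplug(31) == -ENOSPC)
			fprintf(stderr, "Hash collision while resizing HPT\n");
		return 0;
	}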
Reviewed-by: David Gibson Signed-off-by: Laurent Vivier Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/sparsemem.h | 4 ++-- arch/powerpc/mm/hash_utils_64.c | 19 +++++++------------ arch/powerpc/mm/mem.c | 3 ++- arch/powerpc/platforms/pseries/lpar.c | 3 ++- 4 files changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h index 68da493205922..3192d454a733a 100644 --- a/arch/powerpc/include/asm/sparsemem.h +++ b/arch/powerpc/include/asm/sparsemem.h @@ -17,9 +17,9 @@ extern int create_section_mapping(unsigned long start, unsigned long end, int ni extern int remove_section_mapping(unsigned long start, unsigned long end); #ifdef CONFIG_PPC_BOOK3S_64 -extern void resize_hpt_for_hotplug(unsigned long new_mem_size); +extern int resize_hpt_for_hotplug(unsigned long new_mem_size); #else -static inline void resize_hpt_for_hotplug(unsigned long new_mem_size) { } +static inline int resize_hpt_for_hotplug(unsigned long new_mem_size) { return 0; } #endif #ifdef CONFIG_NUMA diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 0a4f939a8161e..c4c9610ce6e3c 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -755,12 +755,12 @@ static unsigned long __init htab_get_table_size(void) } #ifdef CONFIG_MEMORY_HOTPLUG -void resize_hpt_for_hotplug(unsigned long new_mem_size) +int resize_hpt_for_hotplug(unsigned long new_mem_size) { unsigned target_hpt_shift; if (!mmu_hash_ops.resize_hpt) - return; + return 0; target_hpt_shift = htab_shift_for_mem_size(new_mem_size); @@ -772,16 +772,11 @@ void resize_hpt_for_hotplug(unsigned long new_mem_size) * reduce unless the target shift is at least 2 below the * current shift */ - if ((target_hpt_shift > ppc64_pft_size) - || (target_hpt_shift < (ppc64_pft_size - 1))) { - int rc; - - rc = mmu_hash_ops.resize_hpt(target_hpt_shift); - if (rc && (rc != -ENODEV)) - printk(KERN_WARNING - "Unable to resize hash page table to target order %d: %d\n", - target_hpt_shift, rc); - } + if (target_hpt_shift > ppc64_pft_size || + target_hpt_shift < ppc64_pft_size - 1) + return mmu_hash_ops.resize_hpt(target_hpt_shift); + + return 0; } int hash__create_section_mapping(unsigned long start, unsigned long end, int nid) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index f6787f90e1585..3665602a9dfa7 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -161,7 +161,8 @@ int __meminit arch_remove_memory(int nid, u64 start, u64 size, */ vm_unmap_aliases(); - resize_hpt_for_hotplug(memblock_phys_mem_size()); + if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC) + pr_warn("Hash collision while resizing HPT\n"); return ret; } diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index f2a9f0adc2d32..1034ef1fe2b45 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -901,8 +901,10 @@ static int pseries_lpar_resize_hpt(unsigned long shift) break; case H_PARAMETER: + pr_warn("Invalid argument from H_RESIZE_HPT_PREPARE\n"); return -EINVAL; case H_RESOURCE: + pr_warn("Operation not permitted from H_RESIZE_HPT_PREPARE\n"); return -EPERM; default: pr_warn("Unexpected error %d from H_RESIZE_HPT_PREPARE\n", rc); @@ -918,7 +920,6 @@ static int pseries_lpar_resize_hpt(unsigned long shift) if (rc != 0) { switch (state.commit_rc) { case H_PTEG_FULL: - pr_warn("Hash collision while resizing HPT\n"); return -ENOSPC; 
default: From bff25143da0d623a1765bf78dbc82044e46da5a4 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 7 Mar 2019 09:40:31 -0500 Subject: [PATCH 014/205] powerpc/mm: Silence unused-but-set-variable warnings pte_unmap() compiles away on some powerpc platforms, so silence the warnings below by making it a static inline function. mm/memory.c: In function 'copy_pte_range': mm/memory.c:820:24: warning: variable 'orig_dst_pte' set but not used mm/memory.c:820:9: warning: variable 'orig_src_pte' set but not used mm/madvise.c: In function 'madvise_free_pte_range': mm/madvise.c:318:9: warning: variable 'orig_pte' set but not used mm/swap_state.c: In function 'swap_ra_info': mm/swap_state.c:634:15: warning: variable 'orig_pte' set but not used Suggested-by: Christophe Leroy Signed-off-by: Qian Cai Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/pgtable.h | 3 ++- arch/powerpc/include/asm/nohash/64/pgtable.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 581f91be9dd44..e3d18b3f6e5d4 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -992,7 +992,8 @@ extern struct page *pgd_page(pgd_t pgd); (((pte_t *) pmd_page_vaddr(*(dir))) + pte_index(addr)) #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) -#define pte_unmap(pte) do { } while(0) + +static inline void pte_unmap(pte_t *pte) { } /* to find an entry in a kernel page-table-directory */ /* This now only contains the vmalloc pages */ diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index e77ed97616327..0384a3302fb60 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -205,7 +205,8 @@ static inline void pgd_set(pgd_t *pgdp, unsigned long val) (((pte_t *) pmd_page_vaddr(*(dir))) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) -#define pte_unmap(pte) do { } while(0) + +static inline void pte_unmap(pte_t *pte) { } /* to find an entry in a kernel page-table-directory */ /* This now only contains the vmalloc pages */ From c05f57fdc34a3d00c9ee28a35772e9d11b5ce100 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Sat, 6 Apr 2019 22:48:08 -0400 Subject: [PATCH 015/205] powerpc/pseries/iommu: Fix set but not used values The commit b7d6bf4fdd47 ("powerpc/pseries/pci: Remove obsolete SW invalidate") left 2 variables unused. arch/powerpc/platforms/pseries/iommu.c:108:17: warning: variable 'tces' set but not used __be64 *tcep, *tces; ^~~~ arch/powerpc/platforms/pseries/iommu.c:132:17: warning: variable 'tces' set but not used __be64 *tcep, *tces; ^~~~ Also, the commit 68c0449ea16d ("powerpc/pseries/iommu: Use memory@ nodes in max RAM address calculation") set "ranges" in ddw_memory_hotplug_max() but never use it. 
arch/powerpc/platforms/pseries/iommu.c: In function 'ddw_memory_hotplug_max': arch/powerpc/platforms/pseries/iommu.c:948:7: warning: variable 'ranges' set but not used int ranges, n_mem_addr_cells, n_mem_size_cells, len; ^~~~~~ Signed-off-by: Qian Cai Reviewed-by: Mukesh Ojha Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/iommu.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 36eb1ddbac695..03bbb299320e2 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -105,7 +105,7 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, unsigned long attrs) { u64 proto_tce; - __be64 *tcep, *tces; + __be64 *tcep; u64 rpn; proto_tce = TCE_PCI_READ; // Read allowed @@ -113,7 +113,7 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; - tces = tcep = ((__be64 *)tbl->it_base) + index; + tcep = ((__be64 *)tbl->it_base) + index; while (npages--) { /* can't move this out since we might cross MEMBLOCK boundary */ @@ -129,9 +129,9 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages) { - __be64 *tcep, *tces; + __be64 *tcep; - tces = tcep = ((__be64 *)tbl->it_base) + index; + tcep = ((__be64 *)tbl->it_base) + index; while (npages--) *(tcep++) = 0; @@ -945,7 +945,7 @@ static phys_addr_t ddw_memory_hotplug_max(void) for_each_node_by_type(memory, "memory") { unsigned long start, size; - int ranges, n_mem_addr_cells, n_mem_size_cells, len; + int n_mem_addr_cells, n_mem_size_cells, len; const __be32 *memcell_buf; memcell_buf = of_get_property(memory, "reg", &len); @@ -955,9 +955,6 @@ static phys_addr_t ddw_memory_hotplug_max(void) n_mem_addr_cells = of_n_addr_cells(memory); n_mem_size_cells = of_n_size_cells(memory); - /* ranges in cell */ - ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells); - start = of_read_number(memcell_buf, n_mem_addr_cells); memcell_buf += n_mem_addr_cells; size = of_read_number(memcell_buf, n_mem_size_cells); From e663e1e06089773cdab03023563aead65cfed042 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Sat, 6 Apr 2019 21:54:47 -0400 Subject: [PATCH 016/205] powerpc/pseries/pmem: Fix a set but not used value The commit 4c5d87db4978 ("powerpc/pseries: PAPR persistent memory support") set a local variable "count" in dlpar_hp_pmem() but never use it. 
arch/powerpc/platforms/pseries/pmem.c: In function 'dlpar_hp_pmem': arch/powerpc/platforms/pseries/pmem.c:109:6: warning: variable 'count' set but not used Signed-off-by: Qian Cai Reviewed-by: Mukesh Ojha Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/pmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/pmem.c b/arch/powerpc/platforms/pseries/pmem.c index 27f0a915c8a97..f860a897a9e05 100644 --- a/arch/powerpc/platforms/pseries/pmem.c +++ b/arch/powerpc/platforms/pseries/pmem.c @@ -106,7 +106,7 @@ static ssize_t pmem_drc_remove_node(u32 drc_index) int dlpar_hp_pmem(struct pseries_hp_errorlog *hp_elog) { - u32 count, drc_index; + u32 drc_index; int rc; /* slim chance, but we might get a hotplug event while booting */ @@ -123,7 +123,6 @@ int dlpar_hp_pmem(struct pseries_hp_errorlog *hp_elog) return -EINVAL; } - count = hp_elog->_drc_u.drc_count; drc_index = hp_elog->_drc_u.drc_index; lock_device_hotplug(); From 679d03f26a686dd609afd94fd7caad7d406eb36e Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Wed, 10 Apr 2019 18:38:46 -0700 Subject: [PATCH 017/205] MAINTAINERS: Remove non-existent VAS file The file arch/powerpc/include/uapi/asm/vas.h was considered but never merged and should be removed from the MAINTAINERS file. While here, add missing email address. Reported-by: Joe Perches Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 43b36dbed48e7..7db05f94412ec 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7385,13 +7385,12 @@ S: Supported F: drivers/net/ethernet/ibm/ibmvnic.* IBM Power Virtual Accelerator Switchboard -M: Sukadev Bhattiprolu +M: Sukadev Bhattiprolu L: linuxppc-dev@lists.ozlabs.org S: Supported F: arch/powerpc/platforms/powernv/vas* F: arch/powerpc/platforms/powernv/copy-paste.h F: arch/powerpc/include/asm/vas.h -F: arch/powerpc/include/uapi/asm/vas.h IBM Power Virtual Ethernet Device Driver M: Thomas Falcon From 0235854e1c25069d14512a35ea5cd05372e31fd6 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 11 Apr 2019 06:27:52 +0200 Subject: [PATCH 018/205] MAINTAINERS: Update remaining @linux.vnet.ibm.com addresses Paul McKenney attempted to update all email addresses @linux.vnet.ibm.com to @linux.ibm.com in commit 1dfddcdb95c4 ("MAINTAINERS: Update from @linux.vnet.ibm.com to @linux.ibm.com"), but some still remained. We update the remaining email addresses in MAINTAINERS, hopefully finally catching all cases for good. Fixes: 1dfddcdb95c4 ("MAINTAINERS: Update from @linux.vnet.ibm.com to @linux.ibm.com") Signed-off-by: Lukas Bulwahn Acked-by: Paul E. 
McKenney Signed-off-by: Michael Ellerman --- MAINTAINERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7db05f94412ec..e9e3783fd3236 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7437,14 +7437,14 @@ F: drivers/crypto/vmx/ghash* F: drivers/crypto/vmx/ppc-xlate.pl IBM Power PCI Hotplug Driver for RPA-compliant PPC64 platform -M: Tyrel Datwyler +M: Tyrel Datwyler L: linux-pci@vger.kernel.org L: linuxppc-dev@lists.ozlabs.org S: Supported F: drivers/pci/hotplug/rpaphp* IBM Power IO DLPAR Driver for RPA-compliant PPC64 platform -M: Tyrel Datwyler +M: Tyrel Datwyler L: linux-pci@vger.kernel.org L: linuxppc-dev@lists.ozlabs.org S: Supported @@ -10386,7 +10386,7 @@ F: arch/arm/mach-mmp/ MMU GATHER AND TLB INVALIDATION M: Will Deacon -M: "Aneesh Kumar K.V" +M: "Aneesh Kumar K.V" M: Andrew Morton M: Nick Piggin M: Peter Zijlstra From 4df2cb633b5b22ba152511f1a55e718efca6c0d9 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 23 Feb 2019 14:20:34 +0100 Subject: [PATCH 019/205] powerpc/83xx: Add missing of_node_put() after of_device_is_available() Add an of_node_put() when a tested device node is not available. Fixes: c026c98739c7e ("powerpc/83xx: Do not configure or probe disabled FSL DR USB controllers") Signed-off-by: Julia Lawall Reviewed-by: Mukesh Ojha Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/83xx/usb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/83xx/usb.c b/arch/powerpc/platforms/83xx/usb.c index 5c31d8292d3bb..e7c2c3fb011a6 100644 --- a/arch/powerpc/platforms/83xx/usb.c +++ b/arch/powerpc/platforms/83xx/usb.c @@ -221,8 +221,10 @@ int mpc837x_usb_cfg(void) int ret = 0; np = of_find_compatible_node(NULL, NULL, "fsl-usb2-dr"); - if (!np || !of_device_is_available(np)) + if (!np || !of_device_is_available(np)) { + of_node_put(np); return -ENODEV; + } prop = of_get_property(np, "phy_type", NULL); if (!prop || (strcmp(prop, "ulpi") && strcmp(prop, "serial"))) { From 7f177f9810ada8ec2e8b378eddbe2d91fda79c9b Mon Sep 17 00:00:00 2001 From: Ganesh Goudar Date: Mon, 15 Apr 2019 15:35:44 +0530 Subject: [PATCH 020/205] powerpc/pseries: hwpoison the pages upon hitting UE Add support to hwpoison the pages upon hitting machine check exception. This patch queues the address where UE is hit to percpu array and schedules work to plumb it into memory poison infrastructure. 
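The core pattern, reduced to a self-contained sketch (simplified: no
per-CPU state, and memory_failure() is stubbed out; only the
record-then-drain structure mirrors the patch): the machine check path
only queues the address, and the deferred worker does the poisoning.

	#include <stdio.h>

	#define MAX_MC_EVT 8	/* bounded queue, like the per-CPU array */

	static unsigned long ue_paddr[MAX_MC_EVT];
	static int ue_count;

	/* Called from the MCE path: record only, defer the real work. */
	static void queue_ue_paddr(unsigned long paddr)
	{
		if (ue_count >= MAX_MC_EVT)
			return;			/* overflow: drop the event */
		ue_paddr[ue_count++] = paddr;
	}

	/* Stand-in for memory_failure(pfn, 0). */
	static void poison_pfn(unsigned long pfn)
	{
		printf("hwpoison pfn 0x%lx\n", pfn);
	}

	/* In the kernel this runs later, from schedule_work(). */
	static void hwpoison_work_fn(void)
	{
		while (ue_count > 0)
			poison_pfn(ue_paddr[--ue_count] >> 12);	/* 4K pages */
	}

	int main(void)
	{
		queue_ue_paddr(0x2000);
		queue_ue_paddr(0x5000);
		hwpoison_work_fn();
		return 0;
	}

Deferring matters because the machine check handler runs in a context
where memory_failure() cannot safely be called directly.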
Reviewed-by: Mahesh Salgaonkar Signed-off-by: Ganesh Goudar [mpe: Combine #ifdefs, drop PPC_BIT8(), and empty inline stub] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mce.h | 1 + arch/powerpc/kernel/mce_power.c | 2 +- arch/powerpc/platforms/pseries/ras.c | 83 ++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 17996bc9382b4..ad47fa8653240 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -210,6 +210,7 @@ extern void release_mce_event(void); extern void machine_check_queue_event(void); extern void machine_check_print_event_info(struct machine_check_event *evt, bool user_mode, bool in_guest); +unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr); #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); #endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 6b800eec31f2f..367fbfa2e835d 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -36,7 +36,7 @@ * Convert an address related to an mm to a PFN. NOTE: we are in real * mode, we could potentially race with page table updates. */ -static unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) +unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) { pte_t *ptep; unsigned long flags; diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index a25c2ac0c9c08..c97d15352f9ff 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -707,6 +707,87 @@ static int mce_handle_error(struct rtas_error_log *errp) return disposition; } +#ifdef CONFIG_MEMORY_FAILURE + +static DEFINE_PER_CPU(int, rtas_ue_count); +static DEFINE_PER_CPU(unsigned long, rtas_ue_paddr[MAX_MC_EVT]); + +#define UE_EFFECTIVE_ADDR_PROVIDED 0x40 +#define UE_LOGICAL_ADDR_PROVIDED 0x20 + + +static void pseries_hwpoison_work_fn(struct work_struct *work) +{ + unsigned long paddr; + int index; + + while (__this_cpu_read(rtas_ue_count) > 0) { + index = __this_cpu_read(rtas_ue_count) - 1; + paddr = __this_cpu_read(rtas_ue_paddr[index]); + memory_failure(paddr >> PAGE_SHIFT, 0); + __this_cpu_dec(rtas_ue_count); + } +} + +static DECLARE_WORK(hwpoison_work, pseries_hwpoison_work_fn); + +static void queue_ue_paddr(unsigned long paddr) +{ + int index; + + index = __this_cpu_inc_return(rtas_ue_count) - 1; + if (index >= MAX_MC_EVT) { + __this_cpu_dec(rtas_ue_count); + return; + } + this_cpu_write(rtas_ue_paddr[index], paddr); + schedule_work(&hwpoison_work); +} + +static void pseries_do_memory_failure(struct pt_regs *regs, + struct pseries_mc_errorlog *mce_log) +{ + unsigned long paddr; + + if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) { + paddr = be64_to_cpu(mce_log->logical_address); + } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) { + unsigned long pfn; + + pfn = addr_to_pfn(regs, + be64_to_cpu(mce_log->effective_address)); + if (pfn == ULONG_MAX) + return; + paddr = pfn << PAGE_SHIFT; + } else { + return; + } + queue_ue_paddr(paddr); +} + +static void pseries_process_ue(struct pt_regs *regs, + struct rtas_error_log *errp) +{ + struct pseries_errorlog *pseries_log; + struct pseries_mc_errorlog *mce_log; + + if (!rtas_error_extended(errp)) + return; + + pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); + if (!pseries_log) + return; + + mce_log = (struct pseries_mc_errorlog 
*)pseries_log->data; + + if (mce_log->error_type == MC_ERROR_TYPE_UE) + pseries_do_memory_failure(regs, mce_log); +} +#else +static inline void pseries_process_ue(struct pt_regs *regs, + struct rtas_error_log *errp) { } +#endif /*CONFIG_MEMORY_FAILURE */ + /* * Process MCE rtas errlog event. */ @@ -765,6 +846,8 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) recovered = 1; } + pseries_process_ue(regs, err); + /* Queue irq work to log this rtas event later. */ irq_work_queue(&mce_errlog_process_work); From cc76404feaed597bb4f5234d34d3f49e2d1139bf Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Tue, 26 Mar 2019 18:29:51 +0800 Subject: [PATCH 021/205] powerpc/8xx: Fix possible device node reference leak The call to of_find_compatible_node() returns a node pointer with refcount incremented thus it must be explicitly decremented after the last usage. irq_domain_add_linear() also calls of_node_get() to increase refcount, so irq_domain() will not be affected when it is released. Detected by coccinelle. Fixes: a8db8cf0d894 ("irq_domain: Replace irq_alloc_host() with revmap-specific initializers") Signed-off-by: Wen Yang Suggested-by: Christophe Leroy Suggested-by: Michael Ellerman Reviewed-by: Peng Hao Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/8xx/pic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/8xx/pic.c b/arch/powerpc/platforms/8xx/pic.c index 8d5a25d43ef32..e9617d35fd1f6 100644 --- a/arch/powerpc/platforms/8xx/pic.c +++ b/arch/powerpc/platforms/8xx/pic.c @@ -153,10 +153,9 @@ int mpc8xx_pic_init(void) if (mpc8xx_pic_host == NULL) { printk(KERN_ERR "MPC8xx PIC: failed to allocate irq host!\n"); ret = -ENOMEM; - goto out; } - return 0; + ret = 0; out: of_node_put(np); return ret; From 6917735e8f905da1f62ccdf62830b185524835c7 Mon Sep 17 00:00:00 2001 From: Jagadeesh Pagadala Date: Sat, 23 Mar 2019 18:20:55 +0530 Subject: [PATCH 022/205] powerpc: Remove duplicate headers Remove duplicate headers inclusions. Signed-off-by: Jagadeesh Pagadala Reviewed-by: Mukesh Ojha Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/time.c | 1 - arch/powerpc/lib/code-patching.c | 1 - arch/powerpc/mm/numa.c | 1 - 3 files changed, 3 deletions(-) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index bc0503ef9c9ce..6ef32472ee1d9 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 506413a2c25e4..587ff9788ab05 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index f976676004ad0..f3ee0e18edd61 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include From 80d04b7fabe161a23d143b3bfcfca1b002c23da1 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Thu, 21 Mar 2019 10:42:22 +0000 Subject: [PATCH 023/205] powerpc/crypto: Use cheaper random numbers for crc-vpmsum self-test This code was filling a 64K buffer from /dev/urandom in order to compute a CRC over (on average half of) it by two different methods, comparing the CRCs, and repeating. 
This is not a remotely security-critical application, so use the far faster and cheaper prandom_u32() generator. And, while we're at it, only fill as much of the buffer as we plan to use. Signed-off-by: George Spelvin Acked-by: Daniel Axtens Signed-off-by: Michael Ellerman --- arch/powerpc/crypto/crc-vpmsum_test.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/crypto/crc-vpmsum_test.c b/arch/powerpc/crypto/crc-vpmsum_test.c index 0153a9c6f4af5..98ea4f4d3dde8 100644 --- a/arch/powerpc/crypto/crc-vpmsum_test.c +++ b/arch/powerpc/crypto/crc-vpmsum_test.c @@ -78,16 +78,12 @@ static int __init crc_test_init(void) pr_info("crc-vpmsum_test begins, %lu iterations\n", iterations); for (i=0; i Date: Thu, 14 Mar 2019 15:27:27 +1100 Subject: [PATCH 024/205] powerpc/powernv: Squash sparse warnings in opal-call.c sparse complains a lot about opal-call.c: arch/powerpc/platforms/powernv/opal-call.c:128:1: warning: symbol 'opal_invalid_call' was not declared. Should it be static? arch/powerpc/platforms/powernv/opal-call.c:129:1: warning: symbol 'opal_console_write' was not declared. Should it be static? arch/powerpc/platforms/powernv/opal-call.c:130:1: warning: symbol 'opal_console_read' was not declared. Should it be static? Those symbols are forward declared in opal.h, but we can't include that because the function signatures in opal.h are different. So instead, just add an extra forward declaration to the OPAL_CALL macro to shut sparse up. Signed-off-by: Andrew Donnellan Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-call.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c index daad8c45c8e72..c53773a149e06 100644 --- a/arch/powerpc/platforms/powernv/opal-call.c +++ b/arch/powerpc/platforms/powernv/opal-call.c @@ -120,6 +120,8 @@ static int64_t opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3, } #define OPAL_CALL(name, opcode) \ +int64_t name(int64_t a0, int64_t a1, int64_t a2, int64_t a3, \ + int64_t a4, int64_t a5, int64_t a6, int64_t a7); \ int64_t name(int64_t a0, int64_t a1, int64_t a2, int64_t a3, \ int64_t a4, int64_t a5, int64_t a6, int64_t a7) \ { \ From 2d4d9b308f8f8dec68f6dbbff18c68ec7c6bd26f Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Thu, 18 Apr 2019 13:56:57 -0500 Subject: [PATCH 025/205] powerpc/numa: improve control of topology updates When booted with "topology_updates=no", or when "off" is written to /proc/powerpc/topology_updates, NUMA reassignments are inhibited for PRRN and VPHN events. However, migration and suspend unconditionally re-enable reassignments via start_topology_update(). This is incoherent. Check the topology_updates_enabled flag in start/stop_topology_update() so that callers of those APIs need not be aware of whether reassignments are enabled. This allows the administrative decision on reassignments to remain in force across migrations and suspensions. 
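Reduced to a sketch (with invented stand-in names), the change is
simply that the administrative flag is checked inside the start/stop
API itself, so callers such as the migration path cannot accidentally
override it:

	#include <stdbool.h>
	#include <stdio.h>

	static bool topology_updates_enabled;	/* the administrative setting */
	static bool update_machinery_running;

	static int start_topology_update(void)
	{
		if (!topology_updates_enabled)
			return 0;	/* the admin decision always wins */
		update_machinery_running = true;
		return 0;
	}

	int main(void)
	{
		/* e.g. the post-migration path calls start unconditionally... */
		start_topology_update();
		/* ...but reassignments stay off unless explicitly enabled. */
		printf("running=%d\n", update_machinery_running);
		return 0;
	}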
Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman --- arch/powerpc/mm/numa.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index f3ee0e18edd61..952ada44df629 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1497,6 +1497,9 @@ int start_topology_update(void) { int rc = 0; + if (!topology_updates_enabled) + return 0; + if (firmware_has_feature(FW_FEATURE_PRRN)) { if (!prrn_enabled) { prrn_enabled = 1; @@ -1530,6 +1533,9 @@ int stop_topology_update(void) { int rc = 0; + if (!topology_updates_enabled) + return 0; + if (prrn_enabled) { prrn_enabled = 0; #ifdef CONFIG_SMP @@ -1587,11 +1593,13 @@ static ssize_t topology_write(struct file *file, const char __user *buf, kbuf[read_len] = '\0'; - if (!strncmp(kbuf, "on", 2)) + if (!strncmp(kbuf, "on", 2)) { + topology_updates_enabled = true; start_topology_update(); - else if (!strncmp(kbuf, "off", 3)) + } else if (!strncmp(kbuf, "off", 3)) { stop_topology_update(); - else + topology_updates_enabled = false; + } else return -EINVAL; return count; @@ -1606,9 +1614,7 @@ static const struct file_operations topology_ops = { static int topology_update_init(void) { - /* Do not poll for changes if disabled at boot */ - if (topology_updates_enabled) - start_topology_update(); + start_topology_update(); if (vphn_enabled) topology_schedule_update(); From 558f86493df09f68f79fe056d9028d317a3ce8ab Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Thu, 18 Apr 2019 13:56:58 -0500 Subject: [PATCH 026/205] powerpc/numa: document topology_updates_enabled, disable by default Changing the NUMA associations for CPUs and memory at runtime is basically unsupported by the core mm, scheduler etc. We see all manner of crashes, warnings and instability when the pseries code tries to do this. Disable this behavior by default, and document the switch a bit. Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman --- arch/powerpc/mm/numa.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 952ada44df629..6ef36d553cdee 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -907,16 +907,22 @@ static int __init early_numa(char *p) } early_param("numa", early_numa); -static bool topology_updates_enabled = true; +/* + * The platform can inform us through one of several mechanisms + * (post-migration device tree updates, PRRN or VPHN) that the NUMA + * assignment of a resource has changed. This controls whether we act + * on that. Disabled by default. + */ +static bool topology_updates_enabled; static int __init early_topology_updates(char *p) { if (!p) return 0; - if (!strcmp(p, "off")) { - pr_info("Disabling topology updates\n"); - topology_updates_enabled = false; + if (!strcmp(p, "on")) { + pr_warn("Caution: enabling topology updates\n"); + topology_updates_enabled = true; } return 0; From c1fe190c06723322f2dfac31d3b982c581e434ef Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Mon, 1 Apr 2019 17:03:12 +1100 Subject: [PATCH 027/205] powerpc: Add force enable of DAWR on P9 option This adds a flag so that the DAWR can be enabled on P9 via: echo Y > /sys/kernel/debug/powerpc/dawr_enable_dangerous The DAWR was previously force disabled on POWER9 in: 9654153158 powerpc: Disable DAWR in the base POWER9 CPU features Also see Documentation/powerpc/DAWR-POWER9.txt This is a dangerous setting, USE AT YOUR OWN RISK. 
Some users may not care about a bad user crashing their box (ie. single user/desktop systems) and really want the DAWR. This allows them to force enable DAWR. This flag can also be used to disable DAWR access. Once this is cleared, all DAWR access should be cleared immediately and your machine once again safe from crashing. Userspace may get confused by toggling this. If DAWR is force enabled/disabled between getting the number of breakpoints (via PTRACE_GETHWDBGINFO) and setting the breakpoint, userspace will get an inconsistent view of what's available. Similarly for guests. For the DAWR to be enabled in a KVM guest, the DAWR needs to be force enabled in the host AND the guest. For this reason, this won't work on POWERVM as it doesn't allow the HCALL to work. Writes of 'Y' to the dawr_enable_dangerous file will fail if the hypervisor doesn't support writing the DAWR. To double check the DAWR is working, run this kernel selftest: tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c Any errors/failures/skips mean something is wrong. Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- Documentation/powerpc/DAWR-POWER9.txt | 32 ++++++++++++ arch/powerpc/include/asm/hw_breakpoint.h | 8 +++ arch/powerpc/kernel/hw_breakpoint.c | 62 +++++++++++++++++++++++- arch/powerpc/kernel/process.c | 9 ++-- arch/powerpc/kernel/ptrace.c | 3 +- arch/powerpc/kvm/book3s_hv.c | 3 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 23 +++++---- 7 files changed, 123 insertions(+), 17 deletions(-) diff --git a/Documentation/powerpc/DAWR-POWER9.txt b/Documentation/powerpc/DAWR-POWER9.txt index 2feaa66196585..bdec03650941e 100644 --- a/Documentation/powerpc/DAWR-POWER9.txt +++ b/Documentation/powerpc/DAWR-POWER9.txt @@ -56,3 +56,35 @@ POWER9. Loads and stores to the watchpoint locations will not be trapped in GDB. The watchpoint is remembered, so if the guest is migrated back to the POWER8 host, it will start working again. +Force enabling the DAWR +============================= +Kernels (since ~v5.2) have an option to force enable the DAWR via: + + echo Y > /sys/kernel/debug/powerpc/dawr_enable_dangerous + +This enables the DAWR even on POWER9. + +This is a dangerous setting, USE AT YOUR OWN RISK. + +Some users may not care about a bad user crashing their box +(ie. single user/desktop systems) and really want the DAWR. This +allows them to force enable DAWR. + +This flag can also be used to disable DAWR access. Once this is +cleared, all DAWR access should be cleared immediately and your +machine once again safe from crashing. + +Userspace may get confused by toggling this. If DAWR is force +enabled/disabled between getting the number of breakpoints (via +PTRACE_GETHWDBGINFO) and setting the breakpoint, userspace will get an +inconsistent view of what's available. Similarly for guests. + +For the DAWR to be enabled in a KVM guest, the DAWR needs to be force +enabled in the host AND the guest. For this reason, this won't work on +POWERVM as it doesn't allow the HCALL to work. Writes of 'Y' to the +dawr_enable_dangerous file will fail if the hypervisor doesn't support +writing the DAWR. + +To double check the DAWR is working, run this kernel selftest: + tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c +Any errors/failures/skips mean something is wrong. 
diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h
index ece4dc89c90be..0fe8c1e46bbcb 100644
--- a/arch/powerpc/include/asm/hw_breakpoint.h
+++ b/arch/powerpc/include/asm/hw_breakpoint.h
@@ -90,10 +90,18 @@ static inline void hw_breakpoint_disable(void)
 extern void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs);
 int hw_breakpoint_handler(struct die_args *args);
+extern int set_dawr(struct arch_hw_breakpoint *brk);
+extern bool dawr_force_enable;
+static inline bool dawr_enabled(void)
+{
+	return dawr_force_enable;
+}
+
 #else	/* CONFIG_HAVE_HW_BREAKPOINT */
 static inline void hw_breakpoint_disable(void) { }
 static inline void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs) { }
+static inline bool dawr_enabled(void) { return false; }
 #endif	/* CONFIG_HAVE_HW_BREAKPOINT */
 #endif	/* __KERNEL__ */
 #endif	/* _PPC_BOOK3S_64_HW_BREAKPOINT_H */

diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index fec8a6773119f..da307dd93ee38 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -29,11 +29,15 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
 #include
+#include
+#include
 #include

 /*
@@ -174,7 +178,7 @@ int hw_breakpoint_arch_parse(struct perf_event *bp,
 	if (!ppc_breakpoint_available())
 		return -ENODEV;
 	length_max = 8; /* DABR */
-	if (cpu_has_feature(CPU_FTR_DAWR)) {
+	if (dawr_enabled()) {
 		length_max = 512; /* 64 doublewords */
 		/* DAWR region can't cross 512 boundary */
 		if ((attr->bp_addr >> 9) !=
@@ -376,3 +380,59 @@ void hw_breakpoint_pmu_read(struct perf_event *bp)
 {
 	/* TODO */
 }
+
+bool dawr_force_enable;
+EXPORT_SYMBOL_GPL(dawr_force_enable);
+
+static ssize_t dawr_write_file_bool(struct file *file,
+				    const char __user *user_buf,
+				    size_t count, loff_t *ppos)
+{
+	struct arch_hw_breakpoint null_brk = {0, 0, 0};
+	size_t rc;
+
+	/* Send error to user if the hypervisor won't allow us to write DAWR */
+	if ((!dawr_force_enable) &&
+	    (firmware_has_feature(FW_FEATURE_LPAR)) &&
+	    (set_dawr(&null_brk) != H_SUCCESS))
+		return -1;
+
+	rc = debugfs_write_file_bool(file, user_buf, count, ppos);
+	if (rc)
+		return rc;
+
+	/* If we are clearing, make sure all CPUs have the DAWR cleared */
+	if (!dawr_force_enable)
+		smp_call_function((smp_call_func_t)set_dawr, &null_brk, 0);
+
+	return rc;
+}
+
+static const struct file_operations dawr_enable_fops = {
+	.read = debugfs_read_file_bool,
+	.write = dawr_write_file_bool,
+	.open = simple_open,
+	.llseek = default_llseek,
+};
+
+static int __init dawr_force_setup(void)
+{
+	dawr_force_enable = false;
+
+	if (cpu_has_feature(CPU_FTR_DAWR)) {
+		/* Don't set up the debugfs file for user control on P8 */
+		dawr_force_enable = true;
+		return 0;
+	}
+
+	if (PVR_VER(mfspr(SPRN_PVR)) == PVR_POWER9) {
+		/* Turn DAWR off by default, but allow admin to turn it on */
+		dawr_force_enable = false;
+		debugfs_create_file_unsafe("dawr_enable_dangerous", 0600,
+					   powerpc_debugfs_root,
+					   &dawr_force_enable,
+					   &dawr_enable_fops);
+	}
+	return 0;
+}
+arch_initcall(dawr_force_setup);

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index dd9e0d5386ee7..225705aac814b 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -67,6 +67,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -784,7 +785,7 @@ static inline int set_dabr(struct arch_hw_breakpoint *brk)
 	return __set_dabr(dabr, dabrx);
 }

-static inline int set_dawr(struct arch_hw_breakpoint *brk)
+int set_dawr(struct arch_hw_breakpoint *brk)
 {
 	unsigned long dawr, dawrx, mrd;
@@ -816,7 +817,7 @@ void __set_breakpoint(struct arch_hw_breakpoint *brk)
 {
 	memcpy(this_cpu_ptr(&current_brk), brk, sizeof(*brk));

-	if (cpu_has_feature(CPU_FTR_DAWR))
+	if (dawr_enabled())
 		// Power8 or later
 		set_dawr(brk);
 	else if (!cpu_has_feature(CPU_FTR_ARCH_207S))
@@ -830,8 +831,8 @@ void __set_breakpoint(struct arch_hw_breakpoint *brk)
 /* Check if we have DAWR or DABR hardware */
 bool ppc_breakpoint_available(void)
 {
-	if (cpu_has_feature(CPU_FTR_DAWR))
-		return true; /* POWER8 DAWR */
+	if (dawr_enabled())
+		return true; /* POWER8 DAWR or POWER9 forced DAWR */
 	if (cpu_has_feature(CPU_FTR_ARCH_207S))
 		return false; /* POWER9 with DAWR disabled */
 	/* DABR: Everything but POWER8 and POWER9 */

diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index d9ac7d94656ee..684b0b315c327 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -43,6 +43,7 @@
 #include
 #include
 #include
+#include

 #define CREATE_TRACE_POINTS
 #include
@@ -3088,7 +3089,7 @@ long arch_ptrace(struct task_struct *child, long request,
 		dbginfo.sizeof_condition = 0;
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		dbginfo.features = PPC_DEBUG_FEATURE_DATA_BP_RANGE;
-		if (cpu_has_feature(CPU_FTR_DAWR))
+		if (dawr_enabled())
 			dbginfo.features |= PPC_DEBUG_FEATURE_DATA_BP_DAWR;
 #else
 		dbginfo.features = 0;

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 06964350b97a9..0fab0a2010272 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -74,6 +74,7 @@
 #include
 #include
 #include
+#include

 #include "book3s.h"
@@ -3374,7 +3375,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
 	mtspr(SPRN_PURR, vcpu->arch.purr);
 	mtspr(SPRN_SPURR, vcpu->arch.spurr);

-	if (cpu_has_feature(CPU_FTR_DAWR)) {
+	if (dawr_enabled()) {
 		mtspr(SPRN_DAWR, vcpu->arch.dawr);
 		mtspr(SPRN_DAWRX, vcpu->arch.dawrx);
 	}

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 3a5e719ef032b..139027c62dc2f 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -822,18 +822,21 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	mtspr	SPRN_IAMR, r5
 	mtspr	SPRN_PSPB, r6
 	mtspr	SPRN_FSCR, r7
-	ld	r5, VCPU_DAWR(r4)
-	ld	r6, VCPU_DAWRX(r4)
-	ld	r7, VCPU_CIABR(r4)
-	ld	r8, VCPU_TAR(r4)
 	/*
 	 * Handle broken DAWR case by not writing it. This means we
 	 * can still store the DAWR register for migration.
*/ -BEGIN_FTR_SECTION + LOAD_REG_ADDR(r5, dawr_force_enable) + lbz r5, 0(r5) + cmpdi r5, 0 + beq 1f + ld r5, VCPU_DAWR(r4) + ld r6, VCPU_DAWRX(r4) mtspr SPRN_DAWR, r5 mtspr SPRN_DAWRX, r6 -END_FTR_SECTION_IFSET(CPU_FTR_DAWR) +1: + ld r7, VCPU_CIABR(r4) + ld r8, VCPU_TAR(r4) mtspr SPRN_CIABR, r7 mtspr SPRN_TAR, r8 ld r5, VCPU_IC(r4) @@ -2513,11 +2516,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) blr 2: -BEGIN_FTR_SECTION - /* POWER9 with disabled DAWR */ + LOAD_REG_ADDR(r11, dawr_force_enable) + lbz r11, 0(r11) + cmpdi r11, 0 li r3, H_HARDWARE - blr -END_FTR_SECTION_IFCLR(CPU_FTR_DAWR) + beqlr /* Emulate H_SET_DABR/X on P8 for the sake of compat mode guests */ rlwimi r5, r4, 5, DAWRX_DR | DAWRX_DW rlwimi r5, r4, 2, DAWRX_WT From a3f3072db6cad40895c585dce65e36aab997f042 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Thu, 18 Apr 2019 16:51:16 +1000 Subject: [PATCH 028/205] powerpc/powernv/idle: Restore IAMR after idle Without restoring the IAMR after idle, execution prevention on POWER9 with Radix MMU is overwritten and the kernel can freely execute userspace without faulting. This is necessary when returning from any stop state that modifies user state, as well as hypervisor state. To test how this fails without this patch, load the lkdtm driver and do the following: $ echo EXEC_USERSPACE > /sys/kernel/debug/provoke-crash/DIRECT which won't fault, then boot the kernel with powersave=off, where it will fault. Applying this patch will fix this. Fixes: 3b10d0095a1e ("powerpc/mm/radix: Prevent kernel execution of user space") Cc: stable@vger.kernel.org # v4.10+ Signed-off-by: Russell Currey Reviewed-by: Akshay Adiga Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/idle_book3s.S | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 7f5ac2e8581be..36178000a2f25 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -170,6 +170,9 @@ core_idle_lock_held: bne- core_idle_lock_held blr +/* Reuse an unused pt_regs slot for IAMR */ +#define PNV_POWERSAVE_IAMR _DAR + /* * Pass requested state in r3: * r3 - PNV_THREAD_NAP/SLEEP/WINKLE in POWER8 @@ -200,6 +203,12 @@ pnv_powersave_common: /* Continue saving state */ SAVE_GPR(2, r1) SAVE_NVGPRS(r1) + +BEGIN_FTR_SECTION + mfspr r5, SPRN_IAMR + std r5, PNV_POWERSAVE_IAMR(r1) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mfcr r5 std r5,_CCR(r1) std r1,PACAR1(r13) @@ -924,6 +933,17 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) REST_NVGPRS(r1) REST_GPR(2, r1) + +BEGIN_FTR_SECTION + /* IAMR was saved in pnv_powersave_common() */ + ld r5, PNV_POWERSAVE_IAMR(r1) + mtspr SPRN_IAMR, r5 + /* + * We don't need an isync here because the upcoming mtmsrd is + * execution synchronizing. + */ +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + ld r4,PACAKMSR(r13) ld r5,_LINK(r1) ld r6,_CCR(r1) From 53a712bae5dd919521a58d7bad773b949358add0 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 18 Apr 2019 16:51:17 +1000 Subject: [PATCH 029/205] powerpc/powernv/idle: Restore AMR/UAMOR/AMOR after idle In order to implement KUAP (Kernel Userspace Access Protection) on Power9 we will be using the AMR, and therefore indirectly the UAMOR/AMOR. So save/restore these regs in the idle code. 
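Expressed in C, the entry-side assembly this patch adds does roughly the
following (an illustrative sketch only: powersave_regs and the helper
are hypothetical, while the SPRs and feature bits are the real ones; the
actual code stashes the values in reused pt_regs slots on the stack):

	struct powersave_regs {
		unsigned long amr, iamr, uamor, amor;
	};

	static void powersave_save_kup_regs(struct powersave_regs *r)
	{
		if (!cpu_has_feature(CPU_FTR_ARCH_207S))
			return;

		r->amr   = mfspr(SPRN_AMR);
		r->iamr  = mfspr(SPRN_IAMR);
		r->uamor = mfspr(SPRN_UAMOR);
		if (cpu_has_feature(CPU_FTR_HVMODE))
			r->amor = mfspr(SPRN_AMOR);	/* AMOR is hypervisor-only */
	}

	/* On wakeup, the mirror image restores each register with mtspr(). */
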
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/idle_book3s.S | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 36178000a2f25..4a860d3b92296 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -170,8 +170,11 @@ core_idle_lock_held:
 	bne-	core_idle_lock_held
 	blr

-/* Reuse an unused pt_regs slot for IAMR */
+/* Reuse some unused pt_regs slots for AMR/IAMR/UAMOR/AMOR */
+#define PNV_POWERSAVE_AMR	_TRAP
 #define PNV_POWERSAVE_IAMR	_DAR
+#define PNV_POWERSAVE_UAMOR	_DSISR
+#define PNV_POWERSAVE_AMOR	RESULT

 /*
  * Pass requested state in r3:
@@ -205,8 +208,16 @@ pnv_powersave_common:
 	SAVE_NVGPRS(r1)

 BEGIN_FTR_SECTION
+	mfspr	r4, SPRN_AMR
 	mfspr	r5, SPRN_IAMR
+	mfspr	r6, SPRN_UAMOR
+	std	r4, PNV_POWERSAVE_AMR(r1)
 	std	r5, PNV_POWERSAVE_IAMR(r1)
+	std	r6, PNV_POWERSAVE_UAMOR(r1)
+BEGIN_FTR_SECTION_NESTED(42)
+	mfspr	r7, SPRN_AMOR
+	std	r7, PNV_POWERSAVE_AMOR(r1)
+END_FTR_SECTION_NESTED_IFSET(CPU_FTR_HVMODE, 42)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

 	mfcr	r5
@@ -935,12 +946,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 	REST_GPR(2, r1)

 BEGIN_FTR_SECTION
-	/* IAMR was saved in pnv_powersave_common() */
+	/* These regs were saved in pnv_powersave_common() */
+	ld	r4, PNV_POWERSAVE_AMR(r1)
 	ld	r5, PNV_POWERSAVE_IAMR(r1)
+	ld	r6, PNV_POWERSAVE_UAMOR(r1)
+	mtspr	SPRN_AMR, r4
 	mtspr	SPRN_IAMR, r5
+	mtspr	SPRN_UAMOR, r6
+BEGIN_FTR_SECTION_NESTED(42)
+	ld	r7, PNV_POWERSAVE_AMOR(r1)
+	mtspr	SPRN_AMOR, r7
+END_FTR_SECTION_NESTED_IFSET(CPU_FTR_HVMODE, 42)
 	/*
-	 * We don't need an isync here because the upcoming mtmsrd is
-	 * execution synchronizing.
+	 * We don't need an isync here after restoring IAMR because the upcoming
+	 * mtmsrd is execution synchronizing.
 	 */
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

From 69795cabe4cfe5122438d50010ad5310c113a013 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Thu, 18 Apr 2019 16:51:18 +1000
Subject: [PATCH 030/205] powerpc: Add framework for Kernel Userspace
 Protection

This patch adds a skeleton for Kernel Userspace Protection
functionalities like Kernel Userspace Access Protection and Kernel
Userspace Execution Prevention.

The subsequent implementation of KUAP for radix makes use of an MMU
feature in order to patch out assembly when KUAP is disabled or
unsupported. This won't work unless there's an entry point for KUP
support before the feature magic happens, so for PPC64 setup_kup() is
called early in setup.

On PPC32, feature_fixup() is done too early to allow the same.
Suggested-by: Russell Currey Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kup.h | 11 +++++++++++ arch/powerpc/kernel/setup_64.c | 7 +++++++ arch/powerpc/mm/init-common.c | 5 +++++ arch/powerpc/mm/init_32.c | 3 +++ 4 files changed, 26 insertions(+) create mode 100644 arch/powerpc/include/asm/kup.h diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h new file mode 100644 index 0000000000000..7a88b8b9b54d4 --- /dev/null +++ b/arch/powerpc/include/asm/kup.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_KUP_H_ +#define _ASM_POWERPC_KUP_H_ + +#ifndef __ASSEMBLY__ + +void setup_kup(void); + +#endif /* !__ASSEMBLY__ */ + +#endif /* _ASM_POWERPC_KUP_H_ */ diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index ba404dd9ce1d8..6179c42003399 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -68,6 +68,7 @@ #include #include #include +#include #include "setup.h" @@ -331,6 +332,12 @@ void __init early_setup(unsigned long dt_ptr) */ configure_exceptions(); + /* + * Configure Kernel Userspace Protection. This needs to happen before + * feature fixups for platforms that implement this using features. + */ + setup_kup(); + /* Apply all the dynamic patching */ apply_feature_fixups(); setup_feature_keys(); diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index 1e6910eb70ed1..36d28e8722898 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -24,6 +24,11 @@ #include #include #include +#include + +void __init setup_kup(void) +{ +} #define CTOR(shift) static void ctor_##shift(void *addr) \ { \ diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 41a3513cadc90..80cc97cd88782 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "mmu_decl.h" @@ -178,6 +179,8 @@ void __init MMU_init(void) btext_unmap(); #endif + setup_kup(); + /* Shortly after that, the entire linear mapping will be available */ memblock_set_current_limit(lowmem_end_addr); } From 0fb1c25ab523614b056ace11be67aac8f8ccabb1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 18 Apr 2019 16:51:19 +1000 Subject: [PATCH 031/205] powerpc: Add skeleton for Kernel Userspace Execution Prevention This patch adds a skeleton for Kernel Userspace Execution Prevention. Then subarches implementing it have to define CONFIG_PPC_HAVE_KUEP and provide setup_kuep() function. Signed-off-by: Christophe Leroy [mpe: Don't split strings, use pr_crit_ratelimited()] Signed-off-by: Michael Ellerman --- Documentation/admin-guide/kernel-parameters.txt | 2 +- arch/powerpc/include/asm/kup.h | 6 ++++++ arch/powerpc/mm/fault.c | 9 ++++----- arch/powerpc/mm/init-common.c | 11 +++++++++++ arch/powerpc/platforms/Kconfig.cputype | 12 ++++++++++++ 5 files changed, 34 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2b8ee90bb6447..a53df74589e52 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2843,7 +2843,7 @@ Disable SMAP (Supervisor Mode Access Prevention) even if it is supported by processor. - nosmep [X86] + nosmep [X86,PPC] Disable SMEP (Supervisor Mode Execution Prevention) even if it is supported by processor. 
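As the commit message above says, a subarch opts in by selecting
PPC_HAVE_KUEP and providing setup_kuep(). A minimal sketch of such a
provider (hypothetical platform code; the 8xx patch later in this
series follows exactly this shape):

	/* Illustrative only - not part of this patch. */
	void __init setup_kuep(bool disabled)
	{
		if (disabled)
			return;

		pr_info("Activating Kernel Userspace Execution Prevention\n");

		/*
		 * Program the MMU so that kernel-mode instruction fetch
		 * from user addresses is denied, e.g. via a protection
		 * key or access group register.
		 */
	}
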
diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h
index 7a88b8b9b54d4..a2a959cb4e367 100644
--- a/arch/powerpc/include/asm/kup.h
+++ b/arch/powerpc/include/asm/kup.h
@@ -6,6 +6,12 @@

 void setup_kup(void);

+#ifdef CONFIG_PPC_KUEP
+void setup_kuep(bool disabled);
+#else
+static inline void setup_kuep(bool disabled) { }
+#endif /* CONFIG_PPC_KUEP */
+
 #endif /* !__ASSEMBLY__ */

 #endif /* _ASM_POWERPC_KUP_H_ */

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 887f11bcf3300..3384354abc1d2 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -229,11 +229,10 @@ static bool bad_kernel_fault(bool is_exec, unsigned long error_code,
 	/* NX faults set DSISR_PROTFAULT on the 8xx, DSISR_NOEXEC_OR_G on others */
 	if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT | DSISR_PROTFAULT))) {
-		printk_ratelimited(KERN_CRIT "kernel tried to execute"
-				   " exec-protected page (%lx) -"
-				   "exploit attempt? (uid: %d)\n",
-				   address, from_kuid(&init_user_ns,
-						      current_uid()));
+		pr_crit_ratelimited("kernel tried to execute %s page (%lx) - exploit attempt? (uid: %d)\n",
+				    address >= TASK_SIZE ? "exec-protected" : "user",
+				    address,
+				    from_kuid(&init_user_ns, current_uid()));
 	}
 	return is_exec || (address >= TASK_SIZE);
 }

diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index 36d28e8722898..83f95a5565d67 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -26,8 +26,19 @@
 #include
 #include

+static bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP);
+
+static int __init parse_nosmep(char *p)
+{
+	disable_kuep = true;
+	pr_warn("Disabling Kernel Userspace Execution Prevention\n");
+	return 0;
+}
+early_param("nosmep", parse_nosmep);
+
 void __init setup_kup(void)
 {
+	setup_kuep(disable_kuep);
 }

 #define CTOR(shift) static void ctor_##shift(void *addr) \

diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 842b2c7e156ab..7d30bbbaa3c1a 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -345,6 +345,18 @@ config PPC_RADIX_MMU_DEFAULT

 	  If you're unsure, say Y.

+config PPC_HAVE_KUEP
+	bool
+
+config PPC_KUEP
+	bool "Kernel Userspace Execution Prevention"
+	depends on PPC_HAVE_KUEP
+	default y
+	help
+	  Enable support for Kernel Userspace Execution Prevention (KUEP)
+
+	  If you're unsure, say Y.
+
 config ARCH_ENABLE_HUGEPAGE_MIGRATION
 	def_bool y
 	depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION

From de78a9c42a790011f179bc94a7da3f5d8721f4cc Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Thu, 18 Apr 2019 16:51:20 +1000
Subject: [PATCH 032/205] powerpc: Add a framework for Kernel Userspace Access
 Protection

This patch implements a framework for Kernel Userspace Access
Protection.

Then subarches will have the possibility to provide their own
implementation by providing setup_kuap() and
allow/prevent_user_access().

Some platforms will need to know the area accessed and whether it is
accessed for read, write or both. Therefore source, destination and
size are handed over to the two functions.

mpe: Rename to allow/prevent rather than unlock/lock, and add
read/write wrappers. Drop the 32-bit code for now until we have an
implementation for it. Add kuap to pt_regs for 64-bit as well as
32-bit. Don't split strings, use pr_crit_ratelimited().
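The intended usage pattern is to bracket each user access with the new
helpers, as the uaccess.h hunk below does for raw_copy_from_user(); in
sketch form:

	unsigned long ret;

	allow_read_from_user(from, n);		/* open a read window */
	ret = __copy_tofrom_user((__force void __user *)to, from, n);
	prevent_read_from_user(from, n);	/* close it again */
	return ret;

On platforms without CONFIG_PPC_KUAP the allow/prevent calls compile to
empty inlines, so this costs nothing where the feature is absent.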
Signed-off-by: Christophe Leroy Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- .../admin-guide/kernel-parameters.txt | 2 +- arch/powerpc/include/asm/futex.h | 4 ++ arch/powerpc/include/asm/kup.h | 32 ++++++++++++++++ arch/powerpc/include/asm/ptrace.h | 11 +++++- arch/powerpc/include/asm/uaccess.h | 38 +++++++++++++++---- arch/powerpc/kernel/asm-offsets.c | 4 ++ arch/powerpc/lib/checksum_wrappers.c | 4 ++ arch/powerpc/mm/fault.c | 19 ++++++++-- arch/powerpc/mm/init-common.c | 10 +++++ arch/powerpc/platforms/Kconfig.cputype | 12 ++++++ 10 files changed, 121 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a53df74589e52..c45a19d654f32 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2839,7 +2839,7 @@ noexec=on: enable non-executable mappings (default) noexec=off: disable non-executable mappings - nosmap [X86] + nosmap [X86,PPC] Disable SMAP (Supervisor Mode Access Prevention) even if it is supported by processor. diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h index 88b38b37c21b1..3a6aa57b9d901 100644 --- a/arch/powerpc/include/asm/futex.h +++ b/arch/powerpc/include/asm/futex.h @@ -35,6 +35,7 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, { int oldval = 0, ret; + allow_write_to_user(uaddr, sizeof(*uaddr)); pagefault_disable(); switch (op) { @@ -62,6 +63,7 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, if (!ret) *oval = oldval; + prevent_write_to_user(uaddr, sizeof(*uaddr)); return ret; } @@ -75,6 +77,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; + allow_write_to_user(uaddr, sizeof(*uaddr)); __asm__ __volatile__ ( PPC_ATOMIC_ENTRY_BARRIER "1: lwarx %1,0,%3 # futex_atomic_cmpxchg_inatomic\n\ @@ -95,6 +98,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, : "cc", "memory"); *uval = prev; + prevent_write_to_user(uaddr, sizeof(*uaddr)); return ret; } diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index a2a959cb4e367..4d78b9d8c99c0 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -4,6 +4,8 @@ #ifndef __ASSEMBLY__ +#include + void setup_kup(void); #ifdef CONFIG_PPC_KUEP @@ -12,6 +14,36 @@ void setup_kuep(bool disabled); static inline void setup_kuep(bool disabled) { } #endif /* CONFIG_PPC_KUEP */ +#ifdef CONFIG_PPC_KUAP +void setup_kuap(bool disabled); +#else +static inline void setup_kuap(bool disabled) { } +static inline void allow_user_access(void __user *to, const void __user *from, + unsigned long size) { } +static inline void prevent_user_access(void __user *to, const void __user *from, + unsigned long size) { } +#endif /* CONFIG_PPC_KUAP */ + +static inline void allow_read_from_user(const void __user *from, unsigned long size) +{ + allow_user_access(NULL, from, size); +} + +static inline void allow_write_to_user(void __user *to, unsigned long size) +{ + allow_user_access(to, NULL, size); +} + +static inline void prevent_read_from_user(const void __user *from, unsigned long size) +{ + prevent_user_access(NULL, from, size); +} + +static inline void prevent_write_to_user(void __user *to, unsigned long size) +{ + prevent_user_access(to, NULL, size); +} + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_POWERPC_KUP_H_ */ diff --git a/arch/powerpc/include/asm/ptrace.h 
b/arch/powerpc/include/asm/ptrace.h index 64271e562fed3..6f047730e642e 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -52,10 +52,17 @@ struct pt_regs }; }; + union { + struct { #ifdef CONFIG_PPC64 - unsigned long ppr; - unsigned long __pad; /* Maintain 16 byte interrupt stack alignment */ + unsigned long ppr; +#endif +#ifdef CONFIG_PPC_KUAP + unsigned long kuap; #endif + }; + unsigned long __pad[2]; /* Maintain 16 byte interrupt stack alignment */ + }; }; #endif diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 4d6d905e91380..76f34346b642d 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -6,6 +6,7 @@ #include #include #include +#include /* * The fs value determines whether argument validity checking should be @@ -140,6 +141,7 @@ extern long __put_user_bad(void); #define __put_user_size(x, ptr, size, retval) \ do { \ retval = 0; \ + allow_write_to_user(ptr, size); \ switch (size) { \ case 1: __put_user_asm(x, ptr, retval, "stb"); break; \ case 2: __put_user_asm(x, ptr, retval, "sth"); break; \ @@ -147,6 +149,7 @@ do { \ case 8: __put_user_asm2(x, ptr, retval); break; \ default: __put_user_bad(); \ } \ + prevent_write_to_user(ptr, size); \ } while (0) #define __put_user_nocheck(x, ptr, size) \ @@ -239,6 +242,7 @@ do { \ __chk_user_ptr(ptr); \ if (size > sizeof(x)) \ (x) = __get_user_bad(); \ + allow_read_from_user(ptr, size); \ switch (size) { \ case 1: __get_user_asm(x, ptr, retval, "lbz"); break; \ case 2: __get_user_asm(x, ptr, retval, "lhz"); break; \ @@ -246,6 +250,7 @@ do { \ case 8: __get_user_asm2(x, ptr, retval); break; \ default: (x) = __get_user_bad(); \ } \ + prevent_read_from_user(ptr, size); \ } while (0) /* @@ -305,15 +310,21 @@ extern unsigned long __copy_tofrom_user(void __user *to, static inline unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) { - return __copy_tofrom_user(to, from, n); + unsigned long ret; + + allow_user_access(to, from, n); + ret = __copy_tofrom_user(to, from, n); + prevent_user_access(to, from, n); + return ret; } #endif /* __powerpc64__ */ static inline unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) { + unsigned long ret; if (__builtin_constant_p(n) && (n <= 8)) { - unsigned long ret = 1; + ret = 1; switch (n) { case 1: @@ -338,14 +349,18 @@ static inline unsigned long raw_copy_from_user(void *to, } barrier_nospec(); - return __copy_tofrom_user((__force void __user *)to, from, n); + allow_read_from_user(from, n); + ret = __copy_tofrom_user((__force void __user *)to, from, n); + prevent_read_from_user(from, n); + return ret; } static inline unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n) { + unsigned long ret; if (__builtin_constant_p(n) && (n <= 8)) { - unsigned long ret = 1; + ret = 1; switch (n) { case 1: @@ -365,17 +380,24 @@ static inline unsigned long raw_copy_to_user(void __user *to, return 0; } - return __copy_tofrom_user(to, (__force const void __user *)from, n); + allow_write_to_user(to, n); + ret = __copy_tofrom_user(to, (__force const void __user *)from, n); + prevent_write_to_user(to, n); + return ret; } extern unsigned long __clear_user(void __user *addr, unsigned long size); static inline unsigned long clear_user(void __user *addr, unsigned long size) { + unsigned long ret = size; might_fault(); - if (likely(access_ok(addr, size))) - return __clear_user(addr, size); - return size; + if 
(likely(access_ok(addr, size))) { + allow_write_to_user(addr, size); + ret = __clear_user(addr, size); + prevent_write_to_user(addr, size); + } + return ret; } extern long strncpy_from_user(char *dst, const char __user *src, long count); diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 86a61e5f8285b..66202e02fee26 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -332,6 +332,10 @@ int main(void) STACK_PT_REGS_OFFSET(_PPR, ppr); #endif /* CONFIG_PPC64 */ +#ifdef CONFIG_PPC_KUAP + STACK_PT_REGS_OFFSET(STACK_REGS_KUAP, kuap); +#endif + #if defined(CONFIG_PPC32) #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) DEFINE(EXC_LVL_SIZE, STACK_EXC_LVL_FRAME_SIZE); diff --git a/arch/powerpc/lib/checksum_wrappers.c b/arch/powerpc/lib/checksum_wrappers.c index 890d4ddd91d61..bb9307ce2440d 100644 --- a/arch/powerpc/lib/checksum_wrappers.c +++ b/arch/powerpc/lib/checksum_wrappers.c @@ -29,6 +29,7 @@ __wsum csum_and_copy_from_user(const void __user *src, void *dst, unsigned int csum; might_sleep(); + allow_read_from_user(src, len); *err_ptr = 0; @@ -60,6 +61,7 @@ __wsum csum_and_copy_from_user(const void __user *src, void *dst, } out: + prevent_read_from_user(src, len); return (__force __wsum)csum; } EXPORT_SYMBOL(csum_and_copy_from_user); @@ -70,6 +72,7 @@ __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, unsigned int csum; might_sleep(); + allow_write_to_user(dst, len); *err_ptr = 0; @@ -97,6 +100,7 @@ __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, } out: + prevent_write_to_user(dst, len); return (__force __wsum)csum; } EXPORT_SYMBOL(csum_and_copy_to_user); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 3384354abc1d2..463d1e9d026e9 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -223,9 +223,11 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, } /* Is this a bad kernel fault ? */ -static bool bad_kernel_fault(bool is_exec, unsigned long error_code, +static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { + int is_exec = TRAP(regs) == 0x400; + /* NX faults set DSISR_PROTFAULT on the 8xx, DSISR_NOEXEC_OR_G on others */ if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT | DSISR_PROTFAULT))) { @@ -234,7 +236,15 @@ static bool bad_kernel_fault(bool is_exec, unsigned long error_code, address, from_kuid(&init_user_ns, current_uid())); } - return is_exec || (address >= TASK_SIZE); + + if (!is_exec && address < TASK_SIZE && (error_code & DSISR_PROTFAULT) && + !search_exception_tables(regs->nip)) { + pr_crit_ratelimited("Kernel attempted to access user page (%lx) - exploit attempt? (uid: %d)\n", + address, + from_kuid(&init_user_ns, current_uid())); + } + + return is_exec || (address >= TASK_SIZE) || !search_exception_tables(regs->nip); } static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address, @@ -454,9 +464,10 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address, /* * The kernel should never take an execute fault nor should it - * take a page fault to a kernel address. 
+ * take a page fault to a kernel address or a page fault to a user
+ * address outside of dedicated places
 	 */
-	if (unlikely(!is_user && bad_kernel_fault(is_exec, error_code, address)))
+	if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address)))
 		return SIGSEGV;

 	/*

diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index 83f95a5565d67..ecaedfff99921 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -27,6 +27,7 @@
 #include

 static bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP);
+static bool disable_kuap = !IS_ENABLED(CONFIG_PPC_KUAP);

 static int __init parse_nosmep(char *p)
 {
@@ -36,9 +37,18 @@
 }
 early_param("nosmep", parse_nosmep);

+static int __init parse_nosmap(char *p)
+{
+	disable_kuap = true;
+	pr_warn("Disabling Kernel Userspace Access Protection\n");
+	return 0;
+}
+early_param("nosmap", parse_nosmap);
+
 void __init setup_kup(void)
 {
 	setup_kuep(disable_kuep);
+	setup_kuap(disable_kuap);
 }

 #define CTOR(shift) static void ctor_##shift(void *addr) \

diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 7d30bbbaa3c1a..457fc3a5ed93d 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -357,6 +357,18 @@ config PPC_KUEP

 	  If you're unsure, say Y.

+config PPC_HAVE_KUAP
+	bool
+
+config PPC_KUAP
+	bool "Kernel Userspace Access Protection"
+	depends on PPC_HAVE_KUAP
+	default y
+	help
+	  Enable support for Kernel Userspace Access Protection (KUAP)
+
+	  If you're unsure, say Y.
+
 config ARCH_ENABLE_HUGEPAGE_MIGRATION
 	def_bool y
 	depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION

From b28c97505eb1a5265e367c398c3406be6ce5e313 Mon Sep 17 00:00:00 2001
From: Russell Currey
Date: Thu, 18 Apr 2019 16:51:21 +1000
Subject: [PATCH 033/205] powerpc/64: Setup KUP on secondary CPUs

Some platforms (e.g. the Radix MMU) need per-CPU initialisation for KUP.

Any platforms that only want to do KUP initialisation once globally can
just check to see if they're running on the boot CPU, or check if
whatever setup they need has already been performed.

Note that this is only for 64-bit.
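For a platform that wants one-time global setup plus a per-CPU register
write, the shape is roughly the following (a sketch; the radix KUAP
implementation later in this series does essentially this):

	void setup_kuap(bool disabled)
	{
		if (disabled)
			return;

		/* Global, one-time work: boot CPU only. */
		if (smp_processor_id() == boot_cpuid)
			pr_info("Activating Kernel Userspace Access Prevention\n");

		/* Per-CPU work: runs on boot and secondary CPUs alike. */
		mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
	}
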
Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup_64.c | 3 +++ arch/powerpc/mm/init-common.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 6179c42003399..684e34493bf5a 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -390,6 +390,9 @@ void early_setup_secondary(void) /* Initialize the hash table or TLB handling */ early_init_mmu_secondary(); + /* Perform any KUP setup that is per-cpu */ + setup_kup(); + /* * At this point, we can let interrupts switch to virtual mode * (the MMU has been setup), so adjust the MSR in the PACA to diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index ecaedfff99921..6ea5607fc5646 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -45,7 +45,7 @@ static int __init parse_nosmap(char *p) } early_param("nosmap", parse_nosmap); -void __init setup_kup(void) +void setup_kup(void) { setup_kuep(disable_kuep); setup_kuap(disable_kuap); From 1bb2bae2e6c7d94e3bc1fdce06baf31b8d811ed6 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Thu, 18 Apr 2019 16:51:22 +1000 Subject: [PATCH 034/205] powerpc/mm/radix: Use KUEP API for Radix MMU Execution protection already exists on radix, this just refactors the radix init to provide the KUEP setup function instead. Thus, the only functional change is that it can now be disabled. Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable-radix.c | 12 +++++++++--- arch/powerpc/platforms/Kconfig.cputype | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 154472a28c77b..8616b291bcecc 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -531,8 +531,15 @@ static void radix_init_amor(void) mtspr(SPRN_AMOR, (3ul << 62)); } -static void radix_init_iamr(void) +#ifdef CONFIG_PPC_KUEP +void setup_kuep(bool disabled) { + if (disabled || !early_radix_enabled()) + return; + + if (smp_processor_id() == boot_cpuid) + pr_info("Activating Kernel Userspace Execution Prevention\n"); + /* * Radix always uses key0 of the IAMR to determine if an access is * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction @@ -540,6 +547,7 @@ static void radix_init_iamr(void) */ mtspr(SPRN_IAMR, (1ul << 62)); } +#endif void __init radix__early_init_mmu(void) { @@ -601,7 +609,6 @@ void __init radix__early_init_mmu(void) memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); - radix_init_iamr(); radix_init_pgtable(); /* Switch to the guard PID before turning on MMU */ radix__switch_mmu_context(NULL, &init_mm); @@ -623,7 +630,6 @@ void radix__early_init_mmu_secondary(void) __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); radix_init_amor(); } - radix_init_iamr(); radix__switch_mmu_context(NULL, &init_mm); if (cpu_has_feature(CPU_FTR_HVMODE)) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 457fc3a5ed93d..60371784c9f15 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -326,6 +326,7 @@ config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select PPC_HAVE_KUEP default y help Enable support for the Power ISA 3.0 Radix style MMU. 
Currently this From ef296729b735e083d8919e76ac213b8ff237eb78 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Thu, 18 Apr 2019 16:51:23 +1000 Subject: [PATCH 035/205] powerpc/lib: Refactor __patch_instruction() to use __put_user_asm() __patch_instruction() is called in early boot, and uses __put_user_size(), which includes the allow/prevent calls to enforce KUAP, which could either be called too early, or in the Radix case, forced to use "early_" versions of functions just to safely handle this one case. __put_user_asm() does not do this, and thus is safe to use both in early boot, and later on since in this case it should only ever be touching kernel memory. __patch_instruction() was previously refactored to use __put_user_size() in order to be able to return -EFAULT, which would allow the kernel to patch instructions in userspace, which should never happen. This has the functional change of causing faults on userspace addresses if KUAP is turned on, which should never happen in practice. A future enhancement could be to double check the patch address is definitely allowed to be tampered with by the kernel. Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/lib/code-patching.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 587ff9788ab05..90c9d4a1e36fb 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -25,9 +25,9 @@ static int __patch_instruction(unsigned int *exec_addr, unsigned int instr, unsigned int *patch_addr) { - int err; + int err = 0; - __put_user_size(instr, patch_addr, 4, err); + __put_user_asm(instr, patch_addr, err, "stw"); if (err) return err; From 890274c2dc4c0a57ae5a12d6a76fa6d05b599d98 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 18 Apr 2019 16:51:24 +1000 Subject: [PATCH 036/205] powerpc/64s: Implement KUAP for Radix MMU Kernel Userspace Access Prevention utilises a feature of the Radix MMU which disallows read and write access to userspace addresses. By utilising this, the kernel is prevented from accessing user data from outside of trusted paths that perform proper safety checks, such as copy_{to/from}_user() and friends. Userspace access is disabled from early boot and is only enabled when performing an operation like copy_{to/from}_user(). The register that controls this (AMR) does not prevent userspace from accessing itself, so there is no need to save and restore when entering and exiting userspace. When entering the kernel from the kernel we save AMR and if it is not blocking user access (because eg. we faulted doing a user access) we reblock user access for the duration of the exception (ie. the page fault) and then restore the AMR when returning back to the kernel. This feature can be tested by using the lkdtm driver (CONFIG_LKDTM=y) and performing the following: # (echo ACCESS_USERSPACE) > [debugfs]/provoke-crash/DIRECT If enabled, this should send SIGSEGV to the thread. We also add paranoid checking of AMR in switch and syscall return under CONFIG_PPC_KUAP_DEBUG. 
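At a user access site this boils down to the following fast path (a C
sketch; kuap_get_user_sketch() is hypothetical, while set_kuap() and the
AMR masks are the ones defined in the new kup-radix.h below):

	static inline int kuap_get_user_sketch(const u32 __user *p, u32 *val)
	{
		int err;

		set_kuap(AMR_KUAP_BLOCK_WRITE);	/* permit reads from userspace */
		err = __get_user(*val, p);
		set_kuap(AMR_KUAP_BLOCKED);	/* re-lock both directions */
		return err;
	}

The isync() on each side of the mtspr() inside set_kuap() is what makes
the window boundaries context-synchronising, per ISA v3.0B.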
Co-authored-by: Michael Ellerman Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- .../powerpc/include/asm/book3s/64/kup-radix.h | 102 ++++++++++++++++++ arch/powerpc/include/asm/exception-64s.h | 2 + arch/powerpc/include/asm/feature-fixups.h | 3 + arch/powerpc/include/asm/kup.h | 4 + arch/powerpc/include/asm/mmu.h | 10 +- arch/powerpc/kernel/entry_64.S | 27 ++++- arch/powerpc/kernel/exceptions-64s.S | 3 + arch/powerpc/mm/pgtable-radix.c | 19 ++++ arch/powerpc/mm/pkeys.c | 1 + arch/powerpc/platforms/Kconfig.cputype | 8 ++ 10 files changed, 176 insertions(+), 3 deletions(-) create mode 100644 arch/powerpc/include/asm/book3s/64/kup-radix.h diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h new file mode 100644 index 0000000000000..6d6628424134b --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_BOOK3S_64_KUP_RADIX_H +#define _ASM_POWERPC_BOOK3S_64_KUP_RADIX_H + +#include + +#define AMR_KUAP_BLOCK_READ UL(0x4000000000000000) +#define AMR_KUAP_BLOCK_WRITE UL(0x8000000000000000) +#define AMR_KUAP_BLOCKED (AMR_KUAP_BLOCK_READ | AMR_KUAP_BLOCK_WRITE) +#define AMR_KUAP_SHIFT 62 + +#ifdef __ASSEMBLY__ + +.macro kuap_restore_amr gpr +#ifdef CONFIG_PPC_KUAP + BEGIN_MMU_FTR_SECTION_NESTED(67) + ld \gpr, STACK_REGS_KUAP(r1) + mtspr SPRN_AMR, \gpr + END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_RADIX_KUAP, 67) +#endif +.endm + +.macro kuap_check_amr gpr1, gpr2 +#ifdef CONFIG_PPC_KUAP_DEBUG + BEGIN_MMU_FTR_SECTION_NESTED(67) + mfspr \gpr1, SPRN_AMR + li \gpr2, (AMR_KUAP_BLOCKED >> AMR_KUAP_SHIFT) + sldi \gpr2, \gpr2, AMR_KUAP_SHIFT +999: tdne \gpr1, \gpr2 + EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE) + END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_RADIX_KUAP, 67) +#endif +.endm + +.macro kuap_save_amr_and_lock gpr1, gpr2, use_cr, msr_pr_cr +#ifdef CONFIG_PPC_KUAP + BEGIN_MMU_FTR_SECTION_NESTED(67) + .ifnb \msr_pr_cr + bne \msr_pr_cr, 99f + .endif + mfspr \gpr1, SPRN_AMR + std \gpr1, STACK_REGS_KUAP(r1) + li \gpr2, (AMR_KUAP_BLOCKED >> AMR_KUAP_SHIFT) + sldi \gpr2, \gpr2, AMR_KUAP_SHIFT + cmpd \use_cr, \gpr1, \gpr2 + beq \use_cr, 99f + // We don't isync here because we very recently entered via rfid + mtspr SPRN_AMR, \gpr2 + isync +99: + END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_RADIX_KUAP, 67) +#endif +.endm + +#else /* !__ASSEMBLY__ */ + +#ifdef CONFIG_PPC_KUAP + +#include + +/* + * We support individually allowing read or write, but we don't support nesting + * because that would require an expensive read/modify write of the AMR. + */ + +static inline void set_kuap(unsigned long value) +{ + if (!mmu_has_feature(MMU_FTR_RADIX_KUAP)) + return; + + /* + * ISA v3.0B says we need a CSI (Context Synchronising Instruction) both + * before and after the move to AMR. See table 6 on page 1134. 
+ */ + isync(); + mtspr(SPRN_AMR, value); + isync(); +} + +static inline void allow_user_access(void __user *to, const void __user *from, + unsigned long size) +{ + // This is written so we can resolve to a single case at build time + if (__builtin_constant_p(to) && to == NULL) + set_kuap(AMR_KUAP_BLOCK_WRITE); + else if (__builtin_constant_p(from) && from == NULL) + set_kuap(AMR_KUAP_BLOCK_READ); + else + set_kuap(0); +} + +static inline void prevent_user_access(void __user *to, const void __user *from, + unsigned long size) +{ + set_kuap(AMR_KUAP_BLOCKED); +} + +#endif /* CONFIG_PPC_KUAP */ + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_POWERPC_BOOK3S_64_KUP_RADIX_H */ diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 937bb630093f2..bef4e05a6823d 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -497,6 +497,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) RESTORE_CTR(r1, area); \ b bad_stack; \ 3: EXCEPTION_PROLOG_COMMON_1(); \ + kuap_save_amr_and_lock r9, r10, cr1, cr0; \ beq 4f; /* if from kernel mode */ \ ACCOUNT_CPU_USER_ENTRY(r13, r9, r10); \ SAVE_PPR(area, r9); \ @@ -691,6 +692,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CTRL) */ #define EXCEPTION_COMMON_NORET_STACK(area, trap, label, hdlr, additions) \ EXCEPTION_PROLOG_COMMON_1(); \ + kuap_save_amr_and_lock r9, r10, cr1; \ EXCEPTION_PROLOG_COMMON_2(area); \ EXCEPTION_PROLOG_COMMON_3(trap); \ /* Volatile regs are potentially clobbered here */ \ diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h index 40a6c9261a6bf..f6fc31f8baffa 100644 --- a/arch/powerpc/include/asm/feature-fixups.h +++ b/arch/powerpc/include/asm/feature-fixups.h @@ -100,6 +100,9 @@ label##5: \ #define END_MMU_FTR_SECTION(msk, val) \ END_MMU_FTR_SECTION_NESTED(msk, val, 97) +#define END_MMU_FTR_SECTION_NESTED_IFSET(msk, label) \ + END_MMU_FTR_SECTION_NESTED((msk), (msk), label) + #define END_MMU_FTR_SECTION_IFSET(msk) END_MMU_FTR_SECTION((msk), (msk)) #define END_MMU_FTR_SECTION_IFCLR(msk) END_MMU_FTR_SECTION((msk), 0) diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 4d78b9d8c99c0..d7312defbe1c2 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -2,6 +2,10 @@ #ifndef _ASM_POWERPC_KUP_H_ #define _ASM_POWERPC_KUP_H_ +#ifdef CONFIG_PPC64 +#include +#endif + #ifndef __ASSEMBLY__ #include diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 8ddd4a91bdc1e..38d21adfde400 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -107,6 +107,11 @@ */ #define MMU_FTR_1T_SEGMENT ASM_CONST(0x40000000) +/* + * Supports KUAP (key 0 controlling userspace addresses) on radix + */ +#define MMU_FTR_RADIX_KUAP ASM_CONST(0x80000000) + /* MMU feature bit sets for various CPUs */ #define MMU_FTRS_DEFAULT_HPTE_ARCH_V2 \ MMU_FTR_HPTE_TABLE | MMU_FTR_PPCAS_ARCH_V2 @@ -164,7 +169,10 @@ enum { #endif #ifdef CONFIG_PPC_RADIX_MMU MMU_FTR_TYPE_RADIX | -#endif +#ifdef CONFIG_PPC_KUAP + MMU_FTR_RADIX_KUAP | +#endif /* CONFIG_PPC_KUAP */ +#endif /* CONFIG_PPC_RADIX_MMU */ 0, }; diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 15c67d2c05343..7cc25389c6bd9 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -46,6 +46,7 @@ #include #endif #include +#include /* * System calls. 
@@ -120,6 +121,9 @@ END_BTB_FLUSH_SECTION addi r9,r1,STACK_FRAME_OVERHEAD ld r11,exception_marker@toc(r2) std r11,-16(r9) /* "regshere" marker */ + + kuap_check_amr r10, r11 + #if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR) BEGIN_FW_FTR_SECTION beq 33f @@ -275,6 +279,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) andi. r6,r8,MSR_PR ld r4,_LINK(r1) + kuap_check_amr r10, r11 + #ifdef CONFIG_PPC_BOOK3S /* * Clear MSR_RI, MSR_EE is already and remains disabled. We could do @@ -296,6 +302,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) std r8, PACATMSCRATCH(r13) #endif + /* + * We don't need to restore AMR on the way back to userspace for KUAP. + * The value of AMR only matters while we're in the kernel. + */ ld r13,GPR13(r1) /* only restore r13 if returning to usermode */ ld r2,GPR2(r1) ld r1,GPR1(r1) @@ -306,8 +316,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) RFI_TO_USER b . /* prevent speculative execution */ - /* exit to kernel */ -1: ld r2,GPR2(r1) +1: /* exit to kernel */ + kuap_restore_amr r2 + + ld r2,GPR2(r1) ld r1,GPR1(r1) mtlr r4 mtcr r5 @@ -594,6 +606,8 @@ _GLOBAL(_switch) std r23,_CCR(r1) std r1,KSP(r3) /* Set old stack pointer */ + kuap_check_amr r9, r10 + FLUSH_COUNT_CACHE /* @@ -942,6 +956,8 @@ fast_exception_return: ld r4,_XER(r1) mtspr SPRN_XER,r4 + kuap_check_amr r5, r6 + REST_8GPRS(5, r1) andi. r0,r3,MSR_RI @@ -974,6 +990,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ACCOUNT_CPU_USER_EXIT(r13, r2, r4) REST_GPR(13, r1) + /* + * We don't need to restore AMR on the way back to userspace for KUAP. + * The value of AMR only matters while we're in the kernel. + */ mtspr SPRN_SRR1,r3 ld r2,_CCR(r1) @@ -1006,6 +1026,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r0,GPR0(r1) ld r2,GPR2(r1) ld r3,GPR3(r1) + + kuap_restore_amr r4 + ld r4,GPR4(r1) ld r1,GPR1(r1) RFI_TO_KERNEL diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 9481a117e2425..bedd89438827b 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -19,6 +19,7 @@ #include #include #include +#include /* * There are a few constraints to be concerned with. @@ -309,6 +310,7 @@ TRAMP_REAL_BEGIN(machine_check_common_early) mfspr r11,SPRN_DSISR /* Save DSISR */ std r11,_DSISR(r1) std r9,_CCR(r1) /* Save CR in stackframe */ + kuap_save_amr_and_lock r9, r10, cr1 /* Save r9 through r13 from EXMC save area to stack frame. 
*/ EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) mfmsr r11 /* get MSR value */ @@ -1109,6 +1111,7 @@ TRAMP_REAL_BEGIN(hmi_exception_early) mfspr r11,SPRN_HSRR0 /* Save HSRR0 */ mfspr r12,SPRN_HSRR1 /* Save HSRR1 */ EXCEPTION_PROLOG_COMMON_1() + /* We don't touch AMR here, we never go to virtual mode */ EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN) EXCEPTION_PROLOG_COMMON_3(0xe60) addi r3,r1,STACK_FRAME_OVERHEAD diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 8616b291bcecc..45869fd698a0c 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -549,6 +550,24 @@ void setup_kuep(bool disabled) } #endif +#ifdef CONFIG_PPC_KUAP +void setup_kuap(bool disabled) +{ + if (disabled || !early_radix_enabled()) + return; + + if (smp_processor_id() == boot_cpuid) { + pr_info("Activating Kernel Userspace Access Prevention\n"); + cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP; + } + + /* Make sure userspace can't change the AMR */ + mtspr(SPRN_UAMOR, 0); + mtspr(SPRN_AMR, AMR_KUAP_BLOCKED); + isync(); +} +#endif + void __init radix__early_init_mmu(void) { unsigned long lpcr; diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c index 5878077637379..ae7fca40e5b37 100644 --- a/arch/powerpc/mm/pkeys.c +++ b/arch/powerpc/mm/pkeys.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 60371784c9f15..5e53b9fd62aa9 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -327,6 +327,7 @@ config PPC_RADIX_MMU depends on PPC_BOOK3S_64 select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select PPC_HAVE_KUEP + select PPC_HAVE_KUAP default y help Enable support for the Power ISA 3.0 Radix style MMU. Currently this @@ -370,6 +371,13 @@ config PPC_KUAP If you're unsure, say Y. +config PPC_KUAP_DEBUG + bool "Extra debugging for Kernel Userspace Access Protection" + depends on PPC_HAVE_KUAP && PPC_RADIX_MMU + help + Add extra debugging for Kernel Userspace Access Protection (KUAP) + If you're unsure, say N. + config ARCH_ENABLE_HUGEPAGE_MIGRATION def_bool y depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION From 5e5be3aed23032d40d5ab7407f344f1a74f2765b Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 18 Apr 2019 16:51:25 +1000 Subject: [PATCH 037/205] powerpc/mm: Detect bad KUAP faults When KUAP is enabled we have logic to detect page faults that occur outside of a valid user access region and are blocked by the AMR. What we don't have at the moment is logic to detect a fault *within* a valid user access region, that has been incorrectly blocked by AMR. This is not meant to ever happen, but it can if we incorrectly save/restore the AMR, or if the AMR was overwritten for some other reason. Currently if that happens we assume it's just a regular fault that will be corrected by handling the fault normally, so we just return. But there is nothing the fault handling code can do to fix it, so the fault just happens again and we spin forever, leading to soft lockups. So add some logic to detect that case and WARN() if we ever see it. Arguably it should be a BUG(), but it's more polite to fail the access and let the kernel continue, rather than taking down the box. 
There should be no data integrity issue with failing the fault rather than BUG'ing, as we're just going to disallow an access that should have been allowed. To make the code a little easier to follow, unroll the condition at the end of bad_kernel_fault() and comment each case, before adding the call to bad_kuap_fault(). Signed-off-by: Michael Ellerman --- .../powerpc/include/asm/book3s/64/kup-radix.h | 6 +++++ arch/powerpc/include/asm/kup.h | 1 + arch/powerpc/mm/fault.c | 25 ++++++++++++++++--- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h index 6d6628424134b..7679bd0c5af0d 100644 --- a/arch/powerpc/include/asm/book3s/64/kup-radix.h +++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h @@ -95,6 +95,12 @@ static inline void prevent_user_access(void __user *to, const void __user *from, set_kuap(AMR_KUAP_BLOCKED); } +static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write) +{ + return WARN(mmu_has_feature(MMU_FTR_RADIX_KUAP) && + (regs->kuap & (is_write ? AMR_KUAP_BLOCK_WRITE : AMR_KUAP_BLOCK_READ)), + "Bug: %s fault blocked by AMR!", is_write ? "Write" : "Read"); +} #endif /* CONFIG_PPC_KUAP */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index d7312defbe1c2..28ad4654eed25 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -26,6 +26,7 @@ static inline void allow_user_access(void __user *to, const void __user *from, unsigned long size) { } static inline void prevent_user_access(void __user *to, const void __user *from, unsigned long size) { } +static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write) { return false; } #endif /* CONFIG_PPC_KUAP */ static inline void allow_read_from_user(const void __user *from, unsigned long size) diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 463d1e9d026e9..b5d3578d9f658 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -44,6 +44,7 @@ #include #include #include +#include static inline bool notify_page_fault(struct pt_regs *regs) { @@ -224,7 +225,7 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, /* Is this a bad kernel fault ? */ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code, - unsigned long address) + unsigned long address, bool is_write) { int is_exec = TRAP(regs) == 0x400; @@ -235,6 +236,9 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code, address >= TASK_SIZE ? "exec-protected" : "user", address, from_kuid(&init_user_ns, current_uid())); + + // Kernel exec fault is always bad + return true; } if (!is_exec && address < TASK_SIZE && (error_code & DSISR_PROTFAULT) && @@ -244,7 +248,22 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code, from_kuid(&init_user_ns, current_uid())); } - return is_exec || (address >= TASK_SIZE) || !search_exception_tables(regs->nip); + // Kernel fault on kernel address is bad + if (address >= TASK_SIZE) + return true; + + // Fault on user outside of certain regions (eg. copy_tofrom_user()) is bad + if (!search_exception_tables(regs->nip)) + return true; + + // Read/write fault in a valid region (the exception table search passed + // above), but blocked by KUAP is bad, it can never succeed. + if (bad_kuap_fault(regs, is_write)) + return true; + + // What's left? 
Kernel fault on user in well defined regions (extable
+	// matched), and allowed by KUAP in the faulting context.
+	return false;
 }

 static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
@@ -467,7 +486,7 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
 	 * take a page fault to a kernel address or a page fault to a user
 	 * address outside of dedicated places
 	 */
-	if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address)))
+	if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address, is_write)))
 		return SIGSEGV;

 	/*

From e291b6d575bc6e4d1d36961b081be521a6c800d6 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Mon, 11 Mar 2019 08:30:30 +0000
Subject: [PATCH 038/205] powerpc/32: Remove MSR_PR test when returning from
 syscall

Syscalls are from user only, so we can account time without checking
whether we are returning to kernel or user, as it will only ever be
user.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/entry_32.S | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index b61cfd29c76f1..aaf7c5f44823f 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -422,12 +422,7 @@ BEGIN_FTR_SECTION
 	lwarx	r7,0,r1
 END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
 	stwcx.	r0,0,r1			/* to clear the reservation */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-	andi.	r4,r8,MSR_PR
-	beq	3f
 	ACCOUNT_CPU_USER_EXIT(r2, r5, r7)
-3:
-#endif
 	lwz	r4,_LINK(r1)
 	lwz	r5,_CCR(r1)
 	mtlr	r4

From e2fb9f5444312fd01627c84a3e018c1fe8ac6ebb Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Mon, 11 Mar 2019 08:30:31 +0000
Subject: [PATCH 039/205] powerpc/32: Prepare for Kernel Userspace Access
 Protection

This patch adds ASM macros for saving, restoring and checking the KUAP
state, and modifies setup_32 to call them on exceptions from kernel.

The macros are defined as empty by default for when CONFIG_PPC_KUAP is
not selected and/or for platforms which don't (yet) handle KUAP.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/kup.h | 15 ++++++++++++++-
 arch/powerpc/kernel/entry_32.S | 16 ++++++++++++----
 arch/powerpc/platforms/Kconfig.cputype | 2 +-
 3 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h
index 28ad4654eed25..7d8ad3d6729d0 100644
--- a/arch/powerpc/include/asm/kup.h
+++ b/arch/powerpc/include/asm/kup.h
@@ -6,7 +6,20 @@
 #include
 #endif

-#ifndef __ASSEMBLY__
+#ifdef __ASSEMBLY__
+#ifndef CONFIG_PPC_KUAP
+.macro kuap_save_and_lock	sp, thread, gpr1, gpr2, gpr3
+.endm
+
+.macro kuap_restore	sp, current, gpr1, gpr2, gpr3
+.endm
+
+.macro kuap_check	current, gpr
+.endm
+
+#endif
+
+#else /* !__ASSEMBLY__ */

 #include

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index aaf7c5f44823f..1182bf603d3c5 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -36,6 +36,7 @@
 #include
 #include
 #include
+#include

 /*
  * MSR_KERNEL is > 0x10000 on 4xx/Book-E since it includes MSR_CE.
@@ -150,8 +151,8 @@ transfer_to_handler:
 	stw	r12,_CTR(r11)
 	stw	r2,_XER(r11)
 	mfspr	r12,SPRN_SPRG_THREAD
-	addi	r2,r12,-THREAD
 	beq	2f			/* if from user, fix up THREAD.regs */
+	addi	r2, r12, -THREAD
 	addi	r11,r1,STACK_FRAME_OVERHEAD
 	stw	r11,PT_REGS(r12)
 #if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
@@ -186,6 +187,8 @@ transfer_to_handler:
 2:	/* if from kernel, check interrupted DOZE/NAP mode and
 	 * check for stack overflow
 	 */
+	kuap_save_and_lock r11, r12, r9, r2, r0
+	addi	r2, r12, -THREAD
 	lwz	r9,KSP_LIMIT(r12)
 	cmplw	r1,r9			/* if r1 <= ksp_limit */
 	ble-	stack_ovf		/* then the kernel stack overflowed */
@@ -272,6 +275,7 @@ reenable_mmu:				/* re-enable mmu so we can */
 	lwz	r9,_MSR(r11)		/* if sleeping, clear MSR.EE */
 	rlwinm	r9,r9,0,~MSR_EE
 	lwz	r12,_LINK(r11)		/* and return to address in LR */
+	kuap_restore r11, r2, r3, r4, r5
 	b	fast_exception_return
 #endif
@@ -423,6 +427,7 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
 	stwcx.	r0,0,r1			/* to clear the reservation */
 	ACCOUNT_CPU_USER_EXIT(r2, r5, r7)
+	kuap_check r2, r4
 	lwz	r4,_LINK(r1)
 	lwz	r5,_CCR(r1)
 	mtlr	r4
@@ -673,6 +678,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPE)
 	stw	r10,_CCR(r1)
 	stw	r1,KSP(r3)	/* Set old stack pointer */
+	kuap_check r2, r4
 #ifdef CONFIG_SMP
 	/* We need a sync somewhere here to make sure that if the
 	 * previous task gets rescheduled on another CPU, it sees all
@@ -861,12 +867,12 @@ resume_kernel:
 	/* check current_thread_info->preempt_count */
 	lwz	r0,TI_PREEMPT(r2)
 	cmpwi	0,r0,0		/* if non-zero, just restore regs and return */
-	bne	restore
+	bne	restore_kuap
 	andi.	r8,r8,_TIF_NEED_RESCHED
-	beq+	restore
+	beq+	restore_kuap
 	lwz	r3,_MSR(r1)
 	andi.	r0,r3,MSR_EE	/* interrupts off? */
-	beq	restore		/* don't schedule if so */
+	beq	restore_kuap	/* don't schedule if so */
 #ifdef CONFIG_TRACE_IRQFLAGS
 	/* Lockdep thinks irqs are enabled, we need to call
 	 * preempt_schedule_irq with IRQs off, so we inform lockdep
@@ -885,6 +891,8 @@ resume_kernel:
 	bl	trace_hardirqs_on
 #endif
 #endif /* CONFIG_PREEMPT */
+restore_kuap:
+	kuap_restore r1, r2, r9, r10, r0
 /* interrupts are hard-disabled at this point */
 restore:

diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 5e53b9fd62aa9..2e45a6e2bc99f 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -373,7 +373,7 @@

 config PPC_KUAP_DEBUG
 	bool "Extra debugging for Kernel Userspace Access Protection"
-	depends on PPC_HAVE_KUAP && PPC_RADIX_MMU
+	depends on PPC_HAVE_KUAP && (PPC_RADIX_MMU || PPC32)
 	help
 	  Add extra debugging for Kernel Userspace Access Protection (KUAP)
 	  If you're unsure, say N.

From c341a108a58100b4d0774ddb1dacbd67dfa749b3 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Mon, 11 Mar 2019 08:30:32 +0000
Subject: [PATCH 040/205] powerpc/8xx: Only define APG0 and APG1

Since the 8xx implements hardware page table walk assistance, the PGD
entries always point to a 4k aligned page, so the 2 upper bits of the
APG are not clobbered anymore and remain 0. Therefore only APG0 and
APG1 are used and need a definition. We set the other APGs to the
lowest permission level.
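The packed values below can be sanity-checked by building them by hand
from the sixteen 2-bit APG fields (group 0 in the most significant
bits). A small self-contained sketch, in userspace C for illustration
only (apg_pack() is hypothetical; 0x6fffffff is the MI_APG_KUEP value
introduced by the next patch):

	#include <stdint.h>
	#include <assert.h>

	/* Pack 16 two-bit APG fields, group 0 first, as the 8xx
	 * MI_APG/MD_APG registers expect. */
	static uint32_t apg_pack(const uint8_t apg[16])
	{
		uint32_t v = 0;

		for (int i = 0; i < 16; i++)
			v = (v << 2) | (apg[i] & 3);
		return v;
	}

	int main(void)
	{
		uint8_t g[16] = { 1, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 };

		assert(apg_pack(g) == 0x4fffffff);	/* MI_APG_INIT / MD_APG_INIT */
		g[1] = 2;				/* User group -> 10 for KUEP */
		assert(apg_pack(g) == 0x6fffffff);	/* MI_APG_KUEP */
		return 0;
	}
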
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index 0a1a3fc54e540..fc5a653d5dd21 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -35,11 +35,11 @@ * Then we use the APG to say whether accesses are according to Page rules or * "all Supervisor" rules (Access to all) * Therefore, we define 2 APG groups. lsb is _PMD_USER - * 0 => No user => 01 (all accesses performed according to page definition) + * 0 => Kernel => 01 (all accesses performed according to page definition) * 1 => User => 00 (all accesses performed as supervisor iaw page definition) - * We define all 16 groups so that all other bits of APG can take any value + * 2-16 => NA => 11 (all accesses performed as user iaw page definition) */ -#define MI_APG_INIT 0x44444444 +#define MI_APG_INIT 0x4fffffff /* The effective page number register. When read, contains the information * about the last instruction TLB miss. When MI_RPN is written, bits in @@ -108,11 +108,11 @@ * Then we use the APG to say whether accesses are according to Page rules or * "all Supervisor" rules (Access to all) * Therefore, we define 2 APG groups. lsb is _PMD_USER - * 0 => No user => 01 (all accesses performed according to page definition) + * 0 => Kernel => 01 (all accesses performed according to page definition) * 1 => User => 00 (all accesses performed as supervisor iaw page definition) - * We define all 16 groups so that all other bits of APG can take any value + * 2-16 => NA => 11 (all accesses performed as user iaw page definition) */ -#define MD_APG_INIT 0x44444444 +#define MD_APG_INIT 0x4fffffff /* The effective page number register. When read, contains the information * about the last instruction TLB miss. When MD_RPN is written, bits in From 06fbe81b5909847aa13f9c86c2b6f9bbc5c2795b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 11 Mar 2019 08:30:33 +0000 Subject: [PATCH 041/205] powerpc/8xx: Add Kernel Userspace Execution Prevention This patch adds Kernel Userspace Execution Prevention on the 8xx. When a page is Executable, it is set Executable for Key 0 and NX for Key 1. Up to now, the User group is defined with Key 0 for both User and Supervisor. By changing the group to Key 0 for User and Key 1 for Supervisor, this patch prevents the Kernel from being able to execute user code. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 7 +++++++ arch/powerpc/mm/8xx_mmu.c | 12 ++++++++++++ arch/powerpc/platforms/Kconfig.cputype | 1 + 3 files changed, 20 insertions(+) diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index fc5a653d5dd21..3cb743284e09a 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -41,6 +41,13 @@ */ #define MI_APG_INIT 0x4fffffff +/* + * 0 => Kernel => 01 (all accesses performed according to page definition) + * 1 => User => 10 (all accesses performed according to swaped page definition) + * 2-16 => NA => 11 (all accesses performed as user iaw page definition) + */ +#define MI_APG_KUEP 0x6fffffff + /* The effective page number register. When read, contains the information * about the last instruction TLB miss. 
When MI_RPN is written, bits in * this register are used to create the TLB entry. diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index fe1f6443d57f2..e257a0c9bd080 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -213,3 +213,15 @@ void flush_instruction_cache(void) mtspr(SPRN_IC_CST, IDC_INVALL); isync(); } + +#ifdef CONFIG_PPC_KUEP +void __init setup_kuep(bool disabled) +{ + if (disabled) + return; + + pr_info("Activating Kernel Userspace Execution Prevention\n"); + + mtspr(SPRN_MI_AP, MI_APG_KUEP); +} +#endif diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 2e45a6e2bc99f..00fa0d110dcb2 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -34,6 +34,7 @@ config PPC_8xx bool "Freescale 8xx" select FSL_SOC select SYS_SUPPORTS_HUGETLBFS + select PPC_HAVE_KUEP config 40x bool "AMCC 40x" From 2679f9bd0abafb3044bcbaac0600b32159ac8bf2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 11 Mar 2019 08:30:34 +0000 Subject: [PATCH 042/205] powerpc/8xx: Add Kernel Userspace Access Protection This patch adds Kernel Userspace Access Protection on the 8xx. When a page is RO or RW, it is set RO or RW for Key 0 and NA for Key 1. Up to now, the User group is defined with Key 0 for both User and Supervisor. By changing the group to Key 0 for User and Key 1 for Supervisor, this patch prevents the Kernel from being able to access user data. At exception entry, the kernel saves SPRN_MD_AP in the regs struct, and reapplies the protection. At exception exit it restores SPRN_MD_AP with the value saved on exception entry. Signed-off-by: Christophe Leroy [mpe: Drop allow_read/write_to/from_user() as they're now in kup.h] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kup.h | 3 + arch/powerpc/include/asm/nohash/32/kup-8xx.h | 58 ++++++++++++++++++++ arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 7 +++ arch/powerpc/mm/8xx_mmu.c | 12 ++++ arch/powerpc/platforms/Kconfig.cputype | 1 + 5 files changed, 81 insertions(+) create mode 100644 arch/powerpc/include/asm/nohash/32/kup-8xx.h diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 7d8ad3d6729d0..043c800ec5fbb 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -5,6 +5,9 @@ #ifdef CONFIG_PPC64 #include #endif +#ifdef CONFIG_PPC_8xx +#include +#endif #ifdef __ASSEMBLY__ #ifndef CONFIG_PPC_KUAP diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h new file mode 100644 index 0000000000000..1c3133b5f86aa --- /dev/null +++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_KUP_8XX_H_ +#define _ASM_POWERPC_KUP_8XX_H_ + +#include + +#ifdef CONFIG_PPC_KUAP + +#ifdef __ASSEMBLY__ + +.macro kuap_save_and_lock sp, thread, gpr1, gpr2, gpr3 + lis \gpr2, MD_APG_KUAP@h /* only APG0 and APG1 are used */ + mfspr \gpr1, SPRN_MD_AP + mtspr SPRN_MD_AP, \gpr2 + stw \gpr1, STACK_REGS_KUAP(\sp) +.endm + +.macro kuap_restore sp, current, gpr1, gpr2, gpr3 + lwz \gpr1, STACK_REGS_KUAP(\sp) + mtspr SPRN_MD_AP, \gpr1 +.endm + +.macro kuap_check current, gpr +#ifdef CONFIG_PPC_KUAP_DEBUG + mfspr \gpr, SPRN_MD_AP + rlwinm \gpr, \gpr, 16, 0xffff +999: twnei \gpr, MD_APG_KUAP@h + EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE) +#endif
.endm + +#else /* !__ASSEMBLY__ */ + +#include + +static inline void allow_user_access(void __user
*to, const void __user *from, + unsigned long size) +{ + mtspr(SPRN_MD_AP, MD_APG_INIT); +} + +static inline void prevent_user_access(void __user *to, const void __user *from, + unsigned long size) +{ + mtspr(SPRN_MD_AP, MD_APG_KUAP); +} + +static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write) +{ + return WARN(!((regs->kuap ^ MD_APG_KUAP) & 0xf0000000), + "Bug: fault blocked by AP register !"); +} + +#endif /* !__ASSEMBLY__ */ + +#endif /* CONFIG_PPC_KUAP */ + +#endif /* _ASM_POWERPC_KUP_8XX_H_ */ diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index 3cb743284e09a..f620adef54fcd 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -121,6 +121,13 @@ */ #define MD_APG_INIT 0x4fffffff +/* + * 0 => No user => 01 (all accesses performed according to page definition) + * 1 => User => 10 (all accesses performed according to swaped page definition) + * 2-16 => NA => 11 (all accesses performed as user iaw page definition) + */ +#define MD_APG_KUAP 0x6fffffff + /* The effective page number register. When read, contains the information * about the last instruction TLB miss. When MD_RPN is written, bits in * this register are used to create the TLB entry. diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index e257a0c9bd080..87648b58d2958 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -225,3 +225,15 @@ void __init setup_kuep(bool disabled) mtspr(SPRN_MI_AP, MI_APG_KUEP); } #endif + +#ifdef CONFIG_PPC_KUAP +void __init setup_kuap(bool disabled) +{ + pr_info("Activating Kernel Userspace Access Protection\n"); + + if (disabled) + pr_warn("KUAP cannot be disabled yet on 8xx when compiled in\n"); + + mtspr(SPRN_MD_AP, MD_APG_KUAP); +} +#endif diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 00fa0d110dcb2..ab586963893a2 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -35,6 +35,7 @@ config PPC_8xx select FSL_SOC select SYS_SUPPORTS_HUGETLBFS select PPC_HAVE_KUEP + select PPC_HAVE_KUAP config 40x bool "AMCC 40x" From 31ed2b13c48d779efc838ad54e30121e088a62af Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 11 Mar 2019 08:30:35 +0000 Subject: [PATCH 043/205] powerpc/32s: Implement Kernel Userspace Execution Prevention. To implement Kernel Userspace Execution Prevention, this patch sets NX bit on all user segments on kernel entry and clears NX bit on all user segments on kernel exit. Note that powerpc 601 doesn't have the NX bit, so KUEP will not work on it. A warning is displayed at startup. 
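The locking runs in the exception prologue/epilogue and is therefore written in assembly, but its effect can be sketched in C with the mfsrin()/mtsrin() helpers. A rough, functionally equivalent sketch of the kuep_lock/kuep_unlock macros added below (illustration only; the real macros derive each segment value from the previous one instead of re-reading every segment register):

/* Illustration only: set/clear NX on all user segments (256M each). */
static inline void kuep_lock_sketch(void)
{
	unsigned long addr;

	for (addr = 0; addr < TASK_SIZE; addr += 0x10000000)
		mtsrin(mfsrin(addr) | SR_NX, addr);
	isync();	/* context synchronisation after mtsrin */
}

static inline void kuep_unlock_sketch(void)
{
	unsigned long addr;

	for (addr = 0; addr < TASK_SIZE; addr += 0x10000000)
		mtsrin(mfsrin(addr) & ~SR_NX, addr);
	isync();
}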
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/kup.h | 42 +++++++++++++++++++ arch/powerpc/include/asm/book3s/32/mmu-hash.h | 3 ++ arch/powerpc/include/asm/kup.h | 3 ++ arch/powerpc/kernel/entry_32.S | 9 ++++ arch/powerpc/kernel/head_32.S | 15 ++++++- arch/powerpc/mm/ppc_mmu_32.c | 13 ++++++ arch/powerpc/platforms/Kconfig.cputype | 1 + 7 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/include/asm/book3s/32/kup.h diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h new file mode 100644 index 0000000000000..5f97c742ca719 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/32/kup.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_BOOK3S_32_KUP_H +#define _ASM_POWERPC_BOOK3S_32_KUP_H + +#include + +#ifdef __ASSEMBLY__ + +.macro kuep_update_sr gpr1, gpr2 /* NEVER use r0 as gpr2 due to addis */ +101: mtsrin \gpr1, \gpr2 + addi \gpr1, \gpr1, 0x111 /* next VSID */ + rlwinm \gpr1, \gpr1, 0, 0xf0ffffff /* clear VSID overflow */ + addis \gpr2, \gpr2, 0x1000 /* address of next segment */ + bdnz 101b + isync +.endm + +.macro kuep_lock gpr1, gpr2 +#ifdef CONFIG_PPC_KUEP + li \gpr1, NUM_USER_SEGMENTS + li \gpr2, 0 + mtctr \gpr1 + mfsrin \gpr1, \gpr2 + oris \gpr1, \gpr1, SR_NX@h /* set Nx */ + kuep_update_sr \gpr1, \gpr2 +#endif +.endm + +.macro kuep_unlock gpr1, gpr2 +#ifdef CONFIG_PPC_KUEP + li \gpr1, NUM_USER_SEGMENTS + li \gpr2, 0 + mtctr \gpr1 + mfsrin \gpr1, \gpr2 + rlwinm \gpr1, \gpr1, 0, ~SR_NX /* Clear Nx */ + kuep_update_sr \gpr1, \gpr2 +#endif +.endm + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_POWERPC_BOOK3S_32_KUP_H */ diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index 5cb588395fdcf..8c5727a322b13 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -63,6 +63,9 @@ typedef pte_t *pgtable_t; #define PP_RWRW 2 /* Supervisor read/write, User read/write */ #define PP_RXRX 3 /* Supervisor read, User read */ +/* Values for Segment Registers */ +#define SR_NX 0x10000000 /* No Execute */ + #ifndef __ASSEMBLY__ /* diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 043c800ec5fbb..5b5e39643a27c 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -8,6 +8,9 @@ #ifdef CONFIG_PPC_8xx #include #endif +#ifdef CONFIG_PPC_BOOK3S_32 +#include +#endif #ifdef __ASSEMBLY__ #ifndef CONFIG_PPC_KUAP diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 1182bf603d3c5..2f3d159c11d79 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -162,6 +162,9 @@ transfer_to_handler: andis. r12,r12,DBCR0_IDM@h #endif ACCOUNT_CPU_USER_ENTRY(r2, r11, r12) +#ifdef CONFIG_PPC_BOOK3S_32 + kuep_lock r11, r12 +#endif #if defined(CONFIG_40x) || defined(CONFIG_BOOKE) beq+ 3f /* From user and task is ptraced - load up global dbcr0 */ @@ -427,6 +430,9 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) stwcx. 
r0,0,r1 /* to clear the reservation */ ACCOUNT_CPU_USER_EXIT(r2, r5, r7) +#ifdef CONFIG_PPC_BOOK3S_32 + kuep_unlock r5, r7 +#endif kuap_check r2, r4 lwz r4,_LINK(r1) lwz r5,_CCR(r1) @@ -821,6 +827,9 @@ restore_user: bnel- load_dbcr0 #endif ACCOUNT_CPU_USER_EXIT(r2, r10, r11) +#ifdef CONFIG_PPC_BOOK3S_32 + kuep_unlock r10, r11 +#endif b restore diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index e25b615e9f9e6..19b46cb9f623e 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -896,14 +896,24 @@ load_up_mmu: tophys(r6,r6) lwz r6,_SDR1@l(r6) mtspr SPRN_SDR1,r6 - li r0,16 /* load up segment register values */ + li r0, NUM_USER_SEGMENTS /* load up segment register values */ mtctr r0 /* for context 0 */ lis r3,0x2000 /* Ku = 1, VSID = 0 */ +#ifdef CONFIG_PPC_KUEP + oris r3, r3, SR_NX@h /* Set Nx */ +#endif li r4,0 3: mtsrin r3,r4 addi r3,r3,0x111 /* increment VSID */ addis r4,r4,0x1000 /* address of next segment */ bdnz 3b + li r0, 16 - NUM_USER_SEGMENTS /* load up kernel segment registers */ + mtctr r0 /* for context 0 */ + rlwinm r3, r3, 0, ~SR_NX /* Nx = 0 */ +3: mtsrin r3, r4 + addi r3, r3, 0x111 /* increment VSID */ + addis r4, r4, 0x1000 /* address of next segment */ + bdnz 3b /* Load the BAT registers with the values set up by MMU_init. MMU_init takes care of whether we're on a 601 or not. */ @@ -1007,6 +1017,9 @@ _ENTRY(switch_mmu_context) mulli r3,r3,897 /* multiply context by skew factor */ rlwinm r3,r3,4,8,27 /* VSID = (context & 0xfffff) << 4 */ addis r3,r3,0x6000 /* Set Ks, Ku bits */ +#ifdef CONFIG_PPC_KUEP + oris r3, r3, SR_NX@h /* Set Nx */ +#endif li r0,NUM_USER_SEGMENTS mtctr r0 diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index f29d2f118b444..baa75673b1f52 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -394,3 +394,16 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, else /* Anything else has 256M mapped */ memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000)); } + +#ifdef CONFIG_PPC_KUEP +void __init setup_kuep(bool disabled) +{ + pr_info("Activating Kernel Userspace Execution Prevention\n"); + + if (cpu_has_feature(CPU_FTR_601)) + pr_warn("KUEP is not working on powerpc 601 (No NX bit in Seg Regs)\n"); + + if (disabled) + pr_warn("KUEP cannot be disabled yet on 6xx when compiled in\n"); +} +#endif diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index ab586963893a2..6bc0a4c08c1c4 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -25,6 +25,7 @@ config PPC_BOOK3S_32 bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx" select PPC_FPU select PPC_HAVE_PMU_SUPPORT + select PPC_HAVE_KUEP config PPC_85xx bool "Freescale 85xx" From f342adca3afc84c4ef648352440ed6331518d72d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 11 Mar 2019 08:30:36 +0000 Subject: [PATCH 044/205] powerpc/32s: Prepare Kernel Userspace Access Protection This patch prepares Kernel Userspace Access Protection for book3s/32. Due to limitations of the processor page protection capabilities, the protection is only against writing; read protection cannot be achieved using page protection.
book3s/32 provides the following values for PP bits:

PP00 provides RW for Key 0 and NA for Key 1
PP01 provides RW for Key 0 and RO for Key 1
PP10 provides RW for all
PP11 provides RO for all

Today PP10 is used for RW pages and PP11 for RO pages, and the user segment registers' Kp and Ks are set to 1. This patch modifies page protection to use PP01 for RW pages and sets user segment registers to Kp 0 and Ks 0. This will allow setting up userspace write access protection by setting Ks to 1 in the following patch. Kernel space segment registers remain unchanged. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/mmu-hash.h | 2 ++ arch/powerpc/kernel/head_32.S | 22 +++++++++---------- arch/powerpc/mm/hash_low_32.S | 6 ++--- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index 8c5727a322b13..f9eae105a9f4c 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -65,6 +65,8 @@ typedef pte_t *pgtable_t; /* Values for Segment Registers */ #define SR_NX 0x10000000 /* No Execute */ +#define SR_KP 0x20000000 /* User key */ +#define SR_KS 0x40000000 /* Supervisor key */ #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 19b46cb9f623e..69b97cc7079f6 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -522,9 +522,9 @@ InstructionTLBMiss: andc. r1,r1,r0 /* check access & ~permission */ bne- InstructionAddressInvalid /* return if access not permitted */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ - ori r1, r1, 0xe05 /* clear out reserved bits */ - andc r1, r0, r1 /* PP = user? 2 : 0 */ + rlwimi r0,r0,32-2,31,31 /* _PAGE_USER -> PP lsb */ + ori r1, r1, 0xe06 /* clear out reserved bits */ + andc r1, r0, r1 /* PP = user? 1 : 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -590,11 +590,11 @@ DataLoadTLBMiss: * we would need to update the pte atomically with lwarx/stwcx. */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwinm r1,r0,32-10,31,31 /* _PAGE_RW -> PP lsb */ + rlwinm r1,r0,32-9,30,30 /* _PAGE_RW -> PP msb */ rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ rlwimi r0,r0,32-1,31,31 /* _PAGE_USER -> PP lsb */ ori r1,r1,0xe04 /* clear out reserved bits */ - andc r1,r0,r1 /* PP = user? rw? 2: 3: 0 */ + andc r1,r0,r1 /* PP = user? rw? 1: 3: 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -670,9 +670,9 @@ DataStoreTLBMiss: * we would need to update the pte atomically with lwarx/stwcx. */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ - li r1,0xe05 /* clear out reserved bits & PP lsb */ - andc r1,r0,r1 /* PP = user? 2: 0 */ + rlwimi r0,r0,32-2,31,31 /* _PAGE_USER -> PP lsb */ + li r1,0xe06 /* clear out reserved bits & PP msb */ + andc r1,r0,r1 /* PP = user?
1: 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -896,9 +896,9 @@ load_up_mmu: tophys(r6,r6) lwz r6,_SDR1@l(r6) mtspr SPRN_SDR1,r6 - li r0, NUM_USER_SEGMENTS /* load up segment register values */ + li r0, NUM_USER_SEGMENTS /* load up user segment register values */ mtctr r0 /* for context 0 */ - lis r3,0x2000 /* Ku = 1, VSID = 0 */ + li r3, 0 /* Kp = 0, Ks = 0, VSID = 0 */ #ifdef CONFIG_PPC_KUEP oris r3, r3, SR_NX@h /* Set Nx */ #endif @@ -910,6 +910,7 @@ load_up_mmu: li r0, 16 - NUM_USER_SEGMENTS /* load up kernel segment registers */ mtctr r0 /* for context 0 */ rlwinm r3, r3, 0, ~SR_NX /* Nx = 0 */ + oris r3, r3, SR_KP@h /* Kp = 1 */ 3: mtsrin r3, r4 addi r3, r3, 0x111 /* increment VSID */ addis r4, r4, 0x1000 /* address of next segment */ @@ -1016,7 +1017,6 @@ _ENTRY(switch_mmu_context) blt- 4f mulli r3,r3,897 /* multiply context by skew factor */ rlwinm r3,r3,4,8,27 /* VSID = (context & 0xfffff) << 4 */ - addis r3,r3,0x6000 /* Set Ks, Ku bits */ #ifdef CONFIG_PPC_KUEP oris r3, r3, SR_NX@h /* Set Nx */ #endif diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index a6c491f18a04e..e27792d0b7444 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -309,13 +309,13 @@ Hash_msk = (((1 << Hash_bits) - 1) * 64) _GLOBAL(create_hpte) /* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */ - rlwinm r8,r5,32-10,31,31 /* _PAGE_RW -> PP lsb */ - rlwinm r0,r5,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */ + rlwinm r8,r5,32-9,30,30 /* _PAGE_RW -> PP msb */ + rlwinm r0,r5,32-6,30,30 /* _PAGE_DIRTY -> PP msb */ and r8,r8,r0 /* writable if _RW & _DIRTY */ rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */ rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */ ori r8,r8,0xe04 /* clear out reserved bits */ - andc r8,r5,r8 /* PP = user? (rw&dirty? 2: 3): 0 */ + andc r8,r5,r8 /* PP = user? (rw&dirty? 1: 3): 0 */ BEGIN_FTR_SECTION rlwinm r8,r8,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) From a68c31fc01ef7863acc0fc74694bf279456a58c4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 11 Mar 2019 08:30:38 +0000 Subject: [PATCH 045/205] powerpc/32s: Implement Kernel Userspace Access Protection This patch implements Kernel Userspace Access Protection for book3s/32. Due to limitations of the processor page protection capabilities, the protection is only against writing; read protection cannot be achieved using page protection. The previous patch modifies the page protection so that RW user pages are RW for Key 0 and RO for Key 1, and it sets Key 0 for both user and kernel. This patch changes the userspace segment registers so that they are set to Ku 0 and Ks 1. When the kernel needs to write to RW pages, the associated segment register is then changed to Ks 0 in order to allow write access to the kernel. In order to avoid having to read all segment registers when locking/unlocking the access, some data is kept in the thread_struct and saved on the stack on exceptions. The field identifies both the first unlocked segment and the first segment following the last unlocked one. When no segment is unlocked, it contains value 0. As the hash_page() function is not able to easily determine if a protfault is due to a bad kernel access to userspace, protfaults need to be handled by handle_page_fault when KUAP is set.
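As a worked example of the encoding implemented in allow_user_access() below: a user write window of 0x30000000-0x4fffffff gives thread.kuap = (0x30000000 & 0xf0000000) | (((0x4fffffff >> 28) + 1) & 0xf) = 0x30000005, meaning segments 3 and 4 are unlocked. A hypothetical decode helper, for illustration only:

/* Illustration only: split the two-nibble thread.kuap encoding.
 * kuap == 0 means no segment is unlocked.
 */
static inline void kuap_decode(unsigned long kuap,
			       unsigned int *first, unsigned int *next)
{
	*first = kuap >> 28;	/* first unlocked segment */
	*next = kuap & 0xf;	/* segment following the last unlocked one */
}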
Signed-off-by: Christophe Leroy [mpe: Drop allow_read/write_to/from_user() as they're now in kup.h, and adapt allow_user_access() to do nothing when to == NULL] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/kup.h | 103 +++++++++++++++++++++++ arch/powerpc/include/asm/processor.h | 3 + arch/powerpc/kernel/asm-offsets.c | 3 + arch/powerpc/kernel/head_32.S | 11 +++ arch/powerpc/mm/ppc_mmu_32.c | 10 +++ arch/powerpc/platforms/Kconfig.cputype | 1 + 6 files changed, 131 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h index 5f97c742ca719..677e9babef801 100644 --- a/arch/powerpc/include/asm/book3s/32/kup.h +++ b/arch/powerpc/include/asm/book3s/32/kup.h @@ -37,6 +37,109 @@ #endif .endm +#ifdef CONFIG_PPC_KUAP + +.macro kuap_update_sr gpr1, gpr2, gpr3 /* NEVER use r0 as gpr2 due to addis */ +101: mtsrin \gpr1, \gpr2 + addi \gpr1, \gpr1, 0x111 /* next VSID */ + rlwinm \gpr1, \gpr1, 0, 0xf0ffffff /* clear VSID overflow */ + addis \gpr2, \gpr2, 0x1000 /* address of next segment */ + cmplw \gpr2, \gpr3 + blt- 101b + isync +.endm + +.macro kuap_save_and_lock sp, thread, gpr1, gpr2, gpr3 + lwz \gpr2, KUAP(\thread) + rlwinm. \gpr3, \gpr2, 28, 0xf0000000 + stw \gpr2, STACK_REGS_KUAP(\sp) + beq+ 102f + li \gpr1, 0 + stw \gpr1, KUAP(\thread) + mfsrin \gpr1, \gpr2 + oris \gpr1, \gpr1, SR_KS@h /* set Ks */ + kuap_update_sr \gpr1, \gpr2, \gpr3 +102: +.endm + +.macro kuap_restore sp, current, gpr1, gpr2, gpr3 + lwz \gpr2, STACK_REGS_KUAP(\sp) + rlwinm. \gpr3, \gpr2, 28, 0xf0000000 + stw \gpr2, THREAD + KUAP(\current) + beq+ 102f + mfsrin \gpr1, \gpr2 + rlwinm \gpr1, \gpr1, 0, ~SR_KS /* Clear Ks */ + kuap_update_sr \gpr1, \gpr2, \gpr3 +102: +.endm + +.macro kuap_check current, gpr +#ifdef CONFIG_PPC_KUAP_DEBUG + lwz \gpr2, KUAP(thread) +999: twnei \gpr, 0 + EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE) +#endif +.endm + +#endif /* CONFIG_PPC_KUAP */ + +#else /* !__ASSEMBLY__ */ + +#ifdef CONFIG_PPC_KUAP + +#include + +static inline void kuap_update_sr(u32 sr, u32 addr, u32 end) +{ + barrier(); /* make sure thread.kuap is updated before playing with SRs */ + while (addr < end) { + mtsrin(sr, addr); + sr += 0x111; /* next VSID */ + sr &= 0xf0ffffff; /* clear VSID overflow */ + addr += 0x10000000; /* address of next segment */ + } + isync(); /* Context sync required after mtsrin() */ +} + +static inline void allow_user_access(void __user *to, const void __user *from, u32 size) +{ + u32 addr, end; + + if (__builtin_constant_p(to) && to == NULL) + return; + + addr = (__force u32)to; + + if (!addr || addr >= TASK_SIZE || !size) + return; + + end = min(addr + size, TASK_SIZE); + current->thread.kuap = (addr & 0xf0000000) | ((((end - 1) >> 28) + 1) & 0xf); + kuap_update_sr(mfsrin(addr) & ~SR_KS, addr, end); /* Clear Ks */ +} + +static inline void prevent_user_access(void __user *to, const void __user *from, u32 size) +{ + u32 addr = (__force u32)to; + u32 end = min(addr + size, TASK_SIZE); + + if (!addr || addr >= TASK_SIZE || !size) + return; + + current->thread.kuap = 0; + kuap_update_sr(mfsrin(addr) | SR_KS, addr, end); /* set Ks */ +} + +static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write) +{ + if (!is_write) + return false; + + return WARN(!regs->kuap, "Bug: write fault blocked by segment registers !"); +} + +#endif /* CONFIG_PPC_KUAP */ + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_32_KUP_H */ diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h index 3351bcf42f2db..540949b397d43 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -163,6 +163,9 @@ struct thread_struct { #ifdef CONFIG_PPC_RTAS unsigned long rtas_sp; /* stack pointer for when in RTAS */ #endif +#endif +#if defined(CONFIG_PPC_BOOK3S_32) && defined(CONFIG_PPC_KUAP) + unsigned long kuap; /* opened segments for user access */ #endif /* Debug Registers */ struct debug_reg debug; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 66202e02fee26..60b82198de7c1 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -147,6 +147,9 @@ int main(void) #if defined(CONFIG_KVM) && defined(CONFIG_BOOKE) OFFSET(THREAD_KVM_VCPU, thread_struct, kvm_vcpu); #endif +#if defined(CONFIG_PPC_BOOK3S_32) && defined(CONFIG_PPC_KUAP) + OFFSET(KUAP, thread_struct, kuap); +#endif #ifdef CONFIG_PPC_TRANSACTIONAL_MEM OFFSET(PACATMSCRATCH, paca_struct, tm_scratch); diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 69b97cc7079f6..40aec3f00a052 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -387,7 +387,11 @@ DataAccess: EXCEPTION_PROLOG mfspr r10,SPRN_DSISR stw r10,_DSISR(r11) +#ifdef CONFIG_PPC_KUAP + andis. r0,r10,(DSISR_BAD_FAULT_32S | DSISR_DABRMATCH | DSISR_PROTFAULT)@h +#else andis. r0,r10,(DSISR_BAD_FAULT_32S|DSISR_DABRMATCH)@h +#endif bne 1f /* if not, try to put a PTE */ mfspr r4,SPRN_DAR /* into the hash table */ rlwinm r3,r10,32-15,21,21 /* DSISR_STORE -> _PAGE_RW */ @@ -901,6 +905,9 @@ load_up_mmu: li r3, 0 /* Kp = 0, Ks = 0, VSID = 0 */ #ifdef CONFIG_PPC_KUEP oris r3, r3, SR_NX@h /* Set Nx */ +#endif +#ifdef CONFIG_PPC_KUAP + oris r3, r3, SR_KS@h /* Set Ks */ #endif li r4,0 3: mtsrin r3,r4 @@ -910,6 +917,7 @@ load_up_mmu: li r0, 16 - NUM_USER_SEGMENTS /* load up kernel segment registers */ mtctr r0 /* for context 0 */ rlwinm r3, r3, 0, ~SR_NX /* Nx = 0 */ + rlwinm r3, r3, 0, ~SR_KS /* Ks = 0 */ oris r3, r3, SR_KP@h /* Kp = 1 */ 3: mtsrin r3, r4 addi r3, r3, 0x111 /* increment VSID */ @@ -1019,6 +1027,9 @@ _ENTRY(switch_mmu_context) rlwinm r3,r3,4,8,27 /* VSID = (context & 0xfffff) << 4 */ #ifdef CONFIG_PPC_KUEP oris r3, r3, SR_NX@h /* Set Nx */ +#endif +#ifdef CONFIG_PPC_KUAP + oris r3, r3, SR_KS@h /* Set Ks */ #endif li r0,NUM_USER_SEGMENTS mtctr r0 diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index baa75673b1f52..bf1de3ca39bce 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -407,3 +407,13 @@ void __init setup_kuep(bool disabled) pr_warn("KUEP cannot be disabled yet on 6xx when compiled in\n"); } #endif + +#ifdef CONFIG_PPC_KUAP +void __init setup_kuap(bool disabled) +{ + pr_info("Activating Kernel Userspace Access Protection\n"); + + if (disabled) + pr_warn("KUAP cannot be disabled yet on 6xx when compiled in\n"); +} +#endif diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 6bc0a4c08c1c4..60a7c7095b057 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -26,6 +26,7 @@ config PPC_BOOK3S_32 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select PPC_HAVE_KUEP + select PPC_HAVE_KUAP config PPC_85xx bool "Freescale 85xx" From 6161a37307f3320808b5a7549593b991500f2656 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 3 Apr 2019 11:35:14 +0530 Subject: [PATCH 046/205] powerpc/mm: Fix build error with FLATMEM book3s64 
config The current value of MAX_PHYSMEM_BITS cannot work with 32 bit configs. We used to have MAX_PHYSMEM_BITS not defined without SPARSEMEM and 32 bit configs never expected a value to be set for MAX_PHYSMEM_BITS. Dependent code such as zsmalloc derived the right values based on other fields. Instead of finding a value that works with different configs, use new values only for book3s_64. For 64 bit booke, use the definition of MAX_PHYSMEM_BITS as per commit a7df61a0e2b6 ("[PATCH] ppc64: Increase sparsemem defaults") That change was done in 2005 and hopefully will work with book3e 64. Fixes: 8bc086899816 ("powerpc/mm: Only define MAX_PHYSMEM_BITS in SPARSEMEM configurations") Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu.h | 15 +++++++++++++++ arch/powerpc/include/asm/mmu.h | 15 --------------- arch/powerpc/include/asm/nohash/64/mmu.h | 2 ++ 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 1ceee000c18d7..a809bdd773227 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -35,6 +35,21 @@ typedef pte_t *pgtable_t; #endif /* __ASSEMBLY__ */ +/* + * If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS + * if we increase SECTIONS_WIDTH we will not store node details in page->flags and + * page_to_nid does a page->section->node lookup + * Hence only increase for VMEMMAP. Further depending on SPARSEMEM_EXTREME reduce + * memory requirements with large number of sections. + * 51 bits is the max physical real address on POWER9 + */ +#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_SPARSEMEM_EXTREME) && \ + defined(CONFIG_PPC_64K_PAGES) +#define MAX_PHYSMEM_BITS 51 +#else +#define MAX_PHYSMEM_BITS 46 +#endif + /* 64-bit classic hash table MMU */ #include diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 38d21adfde400..d86c5641bd971 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -349,21 +349,6 @@ static inline bool strict_kernel_rwx_enabled(void) */ #define MMU_PAGE_COUNT 16 -/* - * If we store section details in page->flags we can't increase the MAX_PHYSMEM_BITS - * if we increase SECTIONS_WIDTH we will not store node details in page->flags and - * page_to_nid does a page->section->node lookup - * Hence only increase for VMEMMAP. Further depending on SPARSEMEM_EXTREME reduce - * memory requirements with large number of sections. 
- * 51 bits is the max physical real address on POWER9 - */ -#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_SPARSEMEM_EXTREME) && \ - defined (CONFIG_PPC_64K_PAGES) -#define MAX_PHYSMEM_BITS 51 -#elif defined(CONFIG_PPC64) -#define MAX_PHYSMEM_BITS 46 -#endif - #ifdef CONFIG_PPC_BOOK3S_64 #include #else /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/include/asm/nohash/64/mmu.h b/arch/powerpc/include/asm/nohash/64/mmu.h index e6585480dfc40..81cf30c370e59 100644 --- a/arch/powerpc/include/asm/nohash/64/mmu.h +++ b/arch/powerpc/include/asm/nohash/64/mmu.h @@ -2,6 +2,8 @@ #ifndef _ASM_POWERPC_NOHASH_64_MMU_H_ #define _ASM_POWERPC_NOHASH_64_MMU_H_ +#define MAX_PHYSMEM_BITS 44 + /* Freescale Book-E software loaded TLB or Book-3e (ISA 2.06+) MMU */ #include From 4f40b15f339d896f5726714842947c9339742494 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:33:47 +0530 Subject: [PATCH 047/205] powerpc/mm: Remove PPC_MM_SLICES #ifdef for book3s64 Book3s64 always has PPC_MM_SLICES enabled, so remove the unnecessary #ifdef. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu.h | 4 ---- arch/powerpc/include/asm/book3s/64/slice.h | 13 ------------- 2 files changed, 17 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index a809bdd773227..afe10dd11c682 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -138,7 +138,6 @@ typedef struct { /* NPU NMMU context */ struct npu_context *npu_context; -#ifdef CONFIG_PPC_MM_SLICES /* SLB page size encodings*/ unsigned char low_slices_psize[BITS_PER_LONG / BITS_PER_BYTE]; unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; @@ -151,9 +150,6 @@ typedef struct { struct slice_mask mask_16m; struct slice_mask mask_16g; # endif -#else - u16 sllp; /* SLB page size encoding */ -#endif unsigned long vdso_base; #ifdef CONFIG_PPC_SUBPAGE_PROT struct subpage_prot_table spt; diff --git a/arch/powerpc/include/asm/book3s/64/slice.h b/arch/powerpc/include/asm/book3s/64/slice.h index db0dedab65eec..062e11136e9c4 100644 --- a/arch/powerpc/include/asm/book3s/64/slice.h +++ b/arch/powerpc/include/asm/book3s/64/slice.h @@ -2,8 +2,6 @@ #ifndef _ASM_POWERPC_BOOK3S_64_SLICE_H #define _ASM_POWERPC_BOOK3S_64_SLICE_H -#ifdef CONFIG_PPC_MM_SLICES - #define SLICE_LOW_SHIFT 28 #define SLICE_LOW_TOP (0x100000000ul) #define SLICE_NUM_LOW (SLICE_LOW_TOP >> SLICE_LOW_SHIFT) @@ -13,15 +11,4 @@ #define SLICE_NUM_HIGH (H_PGTABLE_RANGE >> SLICE_HIGH_SHIFT) #define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT) -#else /* CONFIG_PPC_MM_SLICES */ - -#define get_slice_psize(mm, addr) ((mm)->context.user_psize) -#define slice_set_user_psize(mm, psize) \ -do { \ - (mm)->context.user_psize = (psize); \ - (mm)->context.sllp = SLB_VSID_USER | mmu_psize_defs[(psize)].sllp; \ -} while (0) - -#endif /* CONFIG_PPC_MM_SLICES */ - #endif /* _ASM_POWERPC_BOOK3S_64_SLICE_H */ From 60458fba469a695a026334b364cf8adbcd5807e3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:33:48 +0530 Subject: [PATCH 048/205] powerpc/mm: Add helpers for accessing hash translation related variables We want to switch to allocating them at runtime, only when hash translation is enabled. Add helpers so that both book3s and nohash can easily be adapted to the upcoming change.
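As a usage sketch (illustrative only, assuming the accessors introduced below are in scope), call sites stop dereferencing the context fields directly, so a later patch can move the hash-specific fields behind a pointer without touching the callers again:

/* was: return mm->context.user_psize == MMU_PAGE_4K; */
static inline bool uses_4k_base_psize(struct mm_struct *mm)
{
	return mm_ctx_user_psize(&mm->context) == MMU_PAGE_4K;
}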
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 4 +- arch/powerpc/include/asm/book3s/64/mmu.h | 63 ++++++++++++++++++- arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 50 +++++++++++++++ arch/powerpc/kernel/paca.c | 12 ++-- arch/powerpc/mm/hash_utils_64.c | 10 +-- arch/powerpc/mm/slb.c | 2 +- arch/powerpc/mm/slice.c | 49 +++++++-------- arch/powerpc/mm/subpage-prot.c | 8 +-- 8 files changed, 154 insertions(+), 44 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index a28a28079edba..eb36fbfe4ef5d 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -657,8 +657,8 @@ extern void slb_set_size(u16 size); /* 4 bits per slice and we have one slice per 1TB */ #define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41) -#define TASK_SLICE_ARRAY_SZ(x) ((x)->context.slb_addr_limit >> 41) - +#define LOW_SLICE_ARRAY_SZ (BITS_PER_LONG / BITS_PER_BYTE) +#define TASK_SLICE_ARRAY_SZ(x) ((x)->slb_addr_limit >> 41) #ifndef __ASSEMBLY__ #ifdef CONFIG_PPC_SUBPAGE_PROT diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index afe10dd11c682..c9f3170906203 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -139,7 +139,7 @@ typedef struct { struct npu_context *npu_context; /* SLB page size encodings*/ - unsigned char low_slices_psize[BITS_PER_LONG / BITS_PER_BYTE]; + unsigned char low_slices_psize[LOW_SLICE_ARRAY_SZ]; unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; unsigned long slb_addr_limit; # ifdef CONFIG_PPC_64K_PAGES @@ -174,6 +174,67 @@ typedef struct { #endif } mm_context_t; +static inline u16 mm_ctx_user_psize(mm_context_t *ctx) +{ + return ctx->user_psize; +} + +static inline void mm_ctx_set_user_psize(mm_context_t *ctx, u16 user_psize) +{ + ctx->user_psize = user_psize; +} + +static inline unsigned char *mm_ctx_low_slices(mm_context_t *ctx) +{ + return ctx->low_slices_psize; +} + +static inline unsigned char *mm_ctx_high_slices(mm_context_t *ctx) +{ + return ctx->high_slices_psize; +} + +static inline unsigned long mm_ctx_slb_addr_limit(mm_context_t *ctx) +{ + return ctx->slb_addr_limit; +} + +static inline void mm_ctx_set_slb_addr_limit(mm_context_t *ctx, unsigned long limit) +{ + ctx->slb_addr_limit = limit; +} + +#ifdef CONFIG_PPC_64K_PAGES +static inline struct slice_mask *mm_ctx_slice_mask_64k(mm_context_t *ctx) +{ + return &ctx->mask_64k; +} +#endif + +static inline struct slice_mask *mm_ctx_slice_mask_4k(mm_context_t *ctx) +{ + return &ctx->mask_4k; +} + +#ifdef CONFIG_HUGETLB_PAGE +static inline struct slice_mask *mm_ctx_slice_mask_16m(mm_context_t *ctx) +{ + return &ctx->mask_16m; +} + +static inline struct slice_mask *mm_ctx_slice_mask_16g(mm_context_t *ctx) +{ + return &ctx->mask_16g; +} +#endif + +#ifdef CONFIG_PPC_SUBPAGE_PROT +static inline struct subpage_prot_table *mm_ctx_subpage_prot(mm_context_t *ctx) +{ + return &ctx->spt; +} +#endif + /* * The current system page and segment sizes */ diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index f620adef54fcd..c503e2f05e611 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -181,6 +181,7 @@ #ifdef CONFIG_PPC_MM_SLICES #include #define SLICE_ARRAY_SIZE (1 << (32 - SLICE_LOW_SHIFT - 1)) +#define LOW_SLICE_ARRAY_SZ SLICE_ARRAY_SIZE #endif #ifndef 
__ASSEMBLY__ @@ -207,6 +208,55 @@ typedef struct { void *pte_frag; } mm_context_t; +#ifdef CONFIG_PPC_MM_SLICES +static inline u16 mm_ctx_user_psize(mm_context_t *ctx) +{ + return ctx->user_psize; +} + +static inline void mm_ctx_set_user_psize(mm_context_t *ctx, u16 user_psize) +{ + ctx->user_psize = user_psize; +} + +static inline unsigned char *mm_ctx_low_slices(mm_context_t *ctx) +{ + return ctx->low_slices_psize; +} + +static inline unsigned char *mm_ctx_high_slices(mm_context_t *ctx) +{ + return ctx->high_slices_psize; +} + +static inline unsigned long mm_ctx_slb_addr_limit(mm_context_t *ctx) +{ + return ctx->slb_addr_limit; +} + +static inline void mm_ctx_set_slb_addr_limit(mm_context_t *ctx, unsigned long limit) +{ + ctx->slb_addr_limit = limit; +} + +static inline struct slice_mask *mm_ctx_slice_mask_base(mm_context_t *ctx) +{ + return &ctx->mask_base_psize; +} + +#ifdef CONFIG_HUGETLB_PAGE +static inline struct slice_mask *mm_ctx_slice_mask_512k(mm_context_t *ctx) +{ + return &ctx->mask_512k; +} + +static inline struct slice_mask *mm_ctx_slice_mask_8m(mm_context_t *ctx) +{ + return &ctx->mask_8m; +} +#endif +#endif /* CONFIG_PPC_MM_SLICE */ + #define PHYS_IMMR_BASE (mfspr(SPRN_IMMR) & 0xfff80000) #define VIRT_IMMR_BASE (__fix_to_virt(FIX_IMMR_BASE)) diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index e7382abee8684..9cc91d03ab621 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -267,12 +267,12 @@ void copy_mm_to_paca(struct mm_struct *mm) get_paca()->mm_ctx_id = context->id; #ifdef CONFIG_PPC_MM_SLICES - VM_BUG_ON(!mm->context.slb_addr_limit); - get_paca()->mm_ctx_slb_addr_limit = mm->context.slb_addr_limit; - memcpy(&get_paca()->mm_ctx_low_slices_psize, - &context->low_slices_psize, sizeof(context->low_slices_psize)); - memcpy(&get_paca()->mm_ctx_high_slices_psize, - &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm)); + VM_BUG_ON(!mm_ctx_slb_addr_limit(context)); + get_paca()->mm_ctx_slb_addr_limit = mm_ctx_slb_addr_limit(context); + memcpy(&get_paca()->mm_ctx_low_slices_psize, mm_ctx_low_slices(context), + LOW_SLICE_ARRAY_SZ); + memcpy(&get_paca()->mm_ctx_high_slices_psize, mm_ctx_high_slices(context), + TASK_SLICE_ARRAY_SZ(context)); #else /* CONFIG_PPC_MM_SLICES */ get_paca()->mm_ctx_user_psize = context->user_psize; get_paca()->mm_ctx_sllp = context->sllp; diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index c4c9610ce6e3c..fee0270618ac0 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1142,7 +1142,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) */ static int subpage_protection(struct mm_struct *mm, unsigned long ea) { - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); u32 spp = 0; u32 **sbpm, *sbpp; @@ -1465,7 +1465,7 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) int psize = get_slice_psize(mm, ea); /* We only prefault standard pages for now */ - if (unlikely(psize != mm->context.user_psize)) + if (unlikely(psize != mm_ctx_user_psize(&mm->context))) return false; /* @@ -1544,7 +1544,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, /* Hash it in */ #ifdef CONFIG_PPC_64K_PAGES - if (mm->context.user_psize == MMU_PAGE_64K) + if (mm_ctx_user_psize(&mm->context) == MMU_PAGE_64K) rc = __hash_page_64K(ea, access, vsid, ptep, trap, update_flags, ssize); else @@ -1557,8 +1557,8 @@ void hash_preload(struct mm_struct *mm, unsigned 
long ea, */ if (rc == -1) hash_failure_debug(ea, access, vsid, trap, ssize, - mm->context.user_psize, - mm->context.user_psize, + mm_ctx_user_psize(&mm->context), + mm_ctx_user_psize(&mm->context), pte_val(*ptep)); out_exit: local_irq_restore(flags); diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 5986df48359b0..78c0c0a0e3555 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -739,7 +739,7 @@ static long slb_allocate_user(struct mm_struct *mm, unsigned long ea) * consider this as bad access if we take a SLB miss * on an address above addr limit. */ - if (ea >= mm->context.slb_addr_limit) + if (ea >= mm_ctx_slb_addr_limit(&mm->context)) return -EFAULT; context = get_user_context(&mm->context, ea); diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index aec91dbcdc0b4..35b2780823916 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -101,7 +101,7 @@ static int slice_area_is_free(struct mm_struct *mm, unsigned long addr, { struct vm_area_struct *vma; - if ((mm->context.slb_addr_limit - len) < addr) + if ((mm_ctx_slb_addr_limit(&mm->context) - len) < addr) return 0; vma = find_vma(mm, addr); return (!vma || (addr + len) <= vm_start_gap(vma)); @@ -155,15 +155,15 @@ static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize) { #ifdef CONFIG_PPC_64K_PAGES if (psize == MMU_PAGE_64K) - return &mm->context.mask_64k; + return mm_ctx_slice_mask_64k(&mm->context); #endif if (psize == MMU_PAGE_4K) - return &mm->context.mask_4k; + return mm_ctx_slice_mask_4k(&mm->context); #ifdef CONFIG_HUGETLB_PAGE if (psize == MMU_PAGE_16M) - return &mm->context.mask_16m; + return mm_ctx_slice_mask_16m(&mm->context); if (psize == MMU_PAGE_16G) - return &mm->context.mask_16g; + return mm_ctx_slice_mask_16g(&mm->context); #endif BUG(); } @@ -253,7 +253,7 @@ static void slice_convert(struct mm_struct *mm, */ spin_lock_irqsave(&slice_convert_lock, flags); - lpsizes = mm->context.low_slices_psize; + lpsizes = mm_ctx_low_slices(&mm->context); for (i = 0; i < SLICE_NUM_LOW; i++) { if (!(mask->low_slices & (1u << i))) continue; @@ -272,8 +272,8 @@ static void slice_convert(struct mm_struct *mm, (((unsigned long)psize) << (mask_index * 4)); } - hpsizes = mm->context.high_slices_psize; - for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++) { + hpsizes = mm_ctx_high_slices(&mm->context); + for (i = 0; i < GET_HIGH_SLICE_INDEX(mm_ctx_slb_addr_limit(&mm->context)); i++) { if (!test_bit(i, mask->high_slices)) continue; @@ -292,8 +292,8 @@ static void slice_convert(struct mm_struct *mm, } slice_dbg(" lsps=%lx, hsps=%lx\n", - (unsigned long)mm->context.low_slices_psize, - (unsigned long)mm->context.high_slices_psize); + (unsigned long)mm_ctx_low_slices(&mm->context), + (unsigned long)mm_ctx_high_slices(&mm->context)); spin_unlock_irqrestore(&slice_convert_lock, flags); @@ -393,7 +393,7 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, * DEFAULT_MAP_WINDOW we should apply this. 
*/ if (high_limit > DEFAULT_MAP_WINDOW) - addr += mm->context.slb_addr_limit - DEFAULT_MAP_WINDOW; + addr += mm_ctx_slb_addr_limit(&mm->context) - DEFAULT_MAP_WINDOW; while (addr > min_addr) { info.high_limit = addr; @@ -505,20 +505,20 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, return -ENOMEM; } - if (high_limit > mm->context.slb_addr_limit) { + if (high_limit > mm_ctx_slb_addr_limit(&mm->context)) { /* * Increasing the slb_addr_limit does not require * slice mask cache to be recalculated because it should * be already initialised beyond the old address limit. */ - mm->context.slb_addr_limit = high_limit; + mm_ctx_set_slb_addr_limit(&mm->context, high_limit); on_each_cpu(slice_flush_segments, mm, 1); } /* Sanity checks */ BUG_ON(mm->task_size == 0); - BUG_ON(mm->context.slb_addr_limit == 0); + BUG_ON(mm_ctx_slb_addr_limit(&mm->context) == 0); VM_BUG_ON(radix_enabled()); slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); @@ -696,7 +696,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long flags) { return slice_get_unmapped_area(addr, len, flags, - current->mm->context.user_psize, 0); + mm_ctx_user_psize(¤t->mm->context), 0); } unsigned long arch_get_unmapped_area_topdown(struct file *filp, @@ -706,7 +706,7 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long flags) { return slice_get_unmapped_area(addr0, len, flags, - current->mm->context.user_psize, 1); + mm_ctx_user_psize(¤t->mm->context), 1); } unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) @@ -717,10 +717,10 @@ unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) VM_BUG_ON(radix_enabled()); if (slice_addr_is_low(addr)) { - psizes = mm->context.low_slices_psize; + psizes = mm_ctx_low_slices(&mm->context); index = GET_LOW_SLICE_INDEX(addr); } else { - psizes = mm->context.high_slices_psize; + psizes = mm_ctx_high_slices(&mm->context); index = GET_HIGH_SLICE_INDEX(addr); } mask_index = index & 0x1; @@ -742,20 +742,19 @@ void slice_init_new_context_exec(struct mm_struct *mm) * duplicated. */ #ifdef CONFIG_PPC64 - mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; + mm_ctx_set_slb_addr_limit(&mm->context, DEFAULT_MAP_WINDOW_USER64); #else mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW; #endif - - mm->context.user_psize = psize; + mm_ctx_set_user_psize(&mm->context, psize); /* * Set all slice psizes to the default. 
*/ - lpsizes = mm->context.low_slices_psize; + lpsizes = mm_ctx_low_slices(&mm->context); memset(lpsizes, (psize << 4) | psize, SLICE_NUM_LOW >> 1); - hpsizes = mm->context.high_slices_psize; + hpsizes = mm_ctx_high_slices(&mm->context); memset(hpsizes, (psize << 4) | psize, SLICE_NUM_HIGH >> 1); /* @@ -777,7 +776,7 @@ void slice_setup_new_exec(void) if (!is_32bit_task()) return; - mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW; + mm_ctx_set_slb_addr_limit(&mm->context, DEFAULT_MAP_WINDOW); } #endif @@ -816,7 +815,7 @@ int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { const struct slice_mask *maskp; - unsigned int psize = mm->context.user_psize; + unsigned int psize = mm_ctx_user_psize(&mm->context); VM_BUG_ON(radix_enabled()); diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index 5e4178790deef..c72252542210d 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -25,7 +25,7 @@ */ void subpage_prot_free(struct mm_struct *mm) { - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); unsigned long i, j, addr; u32 **p; @@ -52,7 +52,7 @@ void subpage_prot_free(struct mm_struct *mm) void subpage_prot_init_new_context(struct mm_struct *mm) { - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); memset(spt, 0, sizeof(*spt)); } @@ -93,7 +93,7 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, static void subpage_prot_clear(unsigned long addr, unsigned long len) { struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); u32 **spm, *spp; unsigned long i; size_t nw; @@ -189,7 +189,7 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, unsigned long, len, u32 __user *, map) { struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt = &mm->context.spt; + struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); u32 **spm, *spp; unsigned long i; size_t nw; From 67fda38f0d688580a916a8f829afd8edaffadfcf Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:33:49 +0530 Subject: [PATCH 049/205] powerpc/mm: Move slb_addr_linit to early_init_mmu Avoid #ifdef in generic code. Also enables us to do this specific to MMU translation mode on book3s64 Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 11 ----------- arch/powerpc/mm/hash_utils_64.c | 2 ++ arch/powerpc/mm/tlb_nohash.c | 6 ++++++ 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 2e5dfb6e08239..a07de8608484c 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -947,17 +947,6 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = klimit; -#ifdef CONFIG_PPC_MM_SLICES -#ifdef CONFIG_PPC64 - if (!radix_enabled()) - init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; -#elif defined(CONFIG_PPC_8xx) - init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW; -#else -#error "context.addr_limit not initialized." 
-#endif -#endif - #ifdef CONFIG_SPAPR_TCE_IOMMU mm_iommu_init(&init_mm); #endif diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index fee0270618ac0..fe2fc43a8664b 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1036,6 +1036,8 @@ void __init hash__early_init_mmu(void) */ htab_initialize(); + init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; + pr_info("Initializing hash mmu with SLB\n"); /* Initialize SLB management */ slb_initialize(); diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index ac23dc1c65352..088e0a6b5adea 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -800,5 +800,11 @@ void __init early_init_mmu(void) #ifdef CONFIG_PPC_47x early_init_mmu_47x(); #endif + +#ifdef CONFIG_PPC_MM_SLICES +#if defined(CONFIG_PPC_8xx) + init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW; +#endif +#endif } #endif /* CONFIG_PPC64 */ From 701101865f5d3e268281ce7a254eb4a97d16cbdc Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:33:50 +0530 Subject: [PATCH 050/205] powerpc/mm: Reduce memory usage for mm_context_t for radix Currently, our mm_context_t on book3s64 includes all hash-specific context details like slice mask and subpage protection details. We can skip allocating these with radix translation. This will help us to save 8K per mm_context with radix translation. With the patch applied we have:

sizeof(mm_context_t) = 136
sizeof(struct hash_mm_context) = 8288

Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 33 ++++++++++++- arch/powerpc/include/asm/book3s/64/mmu.h | 49 +++++-------------- arch/powerpc/kernel/setup-common.c | 6 +++ arch/powerpc/mm/hash_utils_64.c | 4 +- arch/powerpc/mm/mmu_context_book3s64.c | 16 +++++- 5 files changed, 68 insertions(+), 40 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index eb36fbfe4ef5d..4481bedbb5bea 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -658,7 +658,7 @@ extern void slb_set_size(u16 size); /* 4 bits per slice and we have one slice per 1TB */ #define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41) #define LOW_SLICE_ARRAY_SZ (BITS_PER_LONG / BITS_PER_BYTE) -#define TASK_SLICE_ARRAY_SZ(x) ((x)->slb_addr_limit >> 41) +#define TASK_SLICE_ARRAY_SZ(x) ((x)->hash_context->slb_addr_limit >> 41) #ifndef __ASSEMBLY__ #ifdef CONFIG_PPC_SUBPAGE_PROT @@ -693,6 +693,37 @@ static inline void subpage_prot_free(struct mm_struct *mm) {} static inline void subpage_prot_init_new_context(struct mm_struct *mm) { } #endif /* CONFIG_PPC_SUBPAGE_PROT */ +/* + * One bit per slice. We have lower slices which cover 256MB segments + * upto 4G range. That gets us 16 low slices. For the rest we track slices + * in 1TB size.
+ */ +struct slice_mask { + u64 low_slices; + DECLARE_BITMAP(high_slices, SLICE_NUM_HIGH); +}; + +struct hash_mm_context { + u16 user_psize; /* page size index */ + + /* SLB page size encodings*/ + unsigned char low_slices_psize[LOW_SLICE_ARRAY_SZ]; + unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; + unsigned long slb_addr_limit; +#ifdef CONFIG_PPC_64K_PAGES + struct slice_mask mask_64k; +#endif + struct slice_mask mask_4k; +#ifdef CONFIG_HUGETLB_PAGE + struct slice_mask mask_16m; + struct slice_mask mask_16g; +#endif + +#ifdef CONFIG_PPC_SUBPAGE_PROT + struct subpage_prot_table spt; +#endif /* CONFIG_PPC_SUBPAGE_PROT */ +}; + #if 0 /* * The code below is equivalent to this function for arguments diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index c9f3170906203..e510e46b07ced 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -104,16 +104,6 @@ struct spinlock; /* Maximum possible number of NPUs in a system. */ #define NV_MAX_NPUS 8 -/* - * One bit per slice. We have lower slices which cover 256MB segments - * upto 4G range. That gets us 16 low slices. For the rest we track slices - * in 1TB size. - */ -struct slice_mask { - u64 low_slices; - DECLARE_BITMAP(high_slices, SLICE_NUM_HIGH); -}; - typedef struct { union { /* @@ -127,7 +117,6 @@ typedef struct { mm_context_id_t id; mm_context_id_t extended_id[TASK_SIZE_USER64/TASK_CONTEXT_SIZE]; }; - u16 user_psize; /* page size index */ /* Number of bits in the mm_cpumask */ atomic_t active_cpus; @@ -137,23 +126,9 @@ typedef struct { /* NPU NMMU context */ struct npu_context *npu_context; + struct hash_mm_context *hash_context; - /* SLB page size encodings*/ - unsigned char low_slices_psize[LOW_SLICE_ARRAY_SZ]; - unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; - unsigned long slb_addr_limit; -# ifdef CONFIG_PPC_64K_PAGES - struct slice_mask mask_64k; -# endif - struct slice_mask mask_4k; -# ifdef CONFIG_HUGETLB_PAGE - struct slice_mask mask_16m; - struct slice_mask mask_16g; -# endif unsigned long vdso_base; -#ifdef CONFIG_PPC_SUBPAGE_PROT - struct subpage_prot_table spt; -#endif /* CONFIG_PPC_SUBPAGE_PROT */ /* * pagetable fragment support */ @@ -176,62 +151,62 @@ typedef struct { static inline u16 mm_ctx_user_psize(mm_context_t *ctx) { - return ctx->user_psize; + return ctx->hash_context->user_psize; } static inline void mm_ctx_set_user_psize(mm_context_t *ctx, u16 user_psize) { - ctx->user_psize = user_psize; + ctx->hash_context->user_psize = user_psize; } static inline unsigned char *mm_ctx_low_slices(mm_context_t *ctx) { - return ctx->low_slices_psize; + return ctx->hash_context->low_slices_psize; } static inline unsigned char *mm_ctx_high_slices(mm_context_t *ctx) { - return ctx->high_slices_psize; + return ctx->hash_context->high_slices_psize; } static inline unsigned long mm_ctx_slb_addr_limit(mm_context_t *ctx) { - return ctx->slb_addr_limit; + return ctx->hash_context->slb_addr_limit; } static inline void mm_ctx_set_slb_addr_limit(mm_context_t *ctx, unsigned long limit) { - ctx->slb_addr_limit = limit; + ctx->hash_context->slb_addr_limit = limit; } #ifdef CONFIG_PPC_64K_PAGES static inline struct slice_mask *mm_ctx_slice_mask_64k(mm_context_t *ctx) { - return &ctx->mask_64k; + return &ctx->hash_context->mask_64k; } #endif static inline struct slice_mask *mm_ctx_slice_mask_4k(mm_context_t *ctx) { - return &ctx->mask_4k; + return &ctx->hash_context->mask_4k; } #ifdef CONFIG_HUGETLB_PAGE static inline struct slice_mask 
*mm_ctx_slice_mask_16m(mm_context_t *ctx) { - return &ctx->mask_16m; + return &ctx->hash_context->mask_16m; } static inline struct slice_mask *mm_ctx_slice_mask_16g(mm_context_t *ctx) { - return &ctx->mask_16g; + return &ctx->hash_context->mask_16g; } #endif #ifdef CONFIG_PPC_SUBPAGE_PROT static inline struct subpage_prot_table *mm_ctx_subpage_prot(mm_context_t *ctx) { - return &ctx->spt; + return &ctx->hash_context->spt; } #endif diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index a07de8608484c..21b1ce200b22d 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -947,6 +947,12 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = klimit; +#ifdef CONFIG_PPC_MM_SLICES +#if defined(CONFIG_PPC_8xx) + init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW; +#endif +#endif + #ifdef CONFIG_SPAPR_TCE_IOMMU mm_iommu_init(&init_mm); #endif diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index fe2fc43a8664b..27239a0767735 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -963,6 +963,7 @@ void __init hash__early_init_devtree(void) htab_scan_page_sizes(); } +struct hash_mm_context init_hash_mm_context; void __init hash__early_init_mmu(void) { #ifndef CONFIG_PPC_64K_PAGES @@ -1036,7 +1037,8 @@ void __init hash__early_init_mmu(void) */ htab_initialize(); - init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; + init_mm.context.hash_context = &init_hash_mm_context; + init_mm.context.hash_context->slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; pr_info("Initializing hash mmu with SLB\n"); /* Initialize SLB management */ diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index f720c5cc0b5ec..6eef5a36b2e93 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -63,6 +63,12 @@ static int hash__init_new_context(struct mm_struct *mm) if (index < 0) return index; + mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context), GFP_KERNEL); + if (!mm->context.hash_context) { + ida_free(&mmu_context_ida, index); + return -ENOMEM; + } + /* * The old code would re-promote on fork, we don't do that when using * slices as it could cause problem promoting slices that have been @@ -77,8 +83,14 @@ static int hash__init_new_context(struct mm_struct *mm) * We should not be calling init_new_context() on init_mm. Hence a * check against 0 is OK. */ - if (mm->context.id == 0) + if (mm->context.id == 0) { + memset(mm->context.hash_context, 0, sizeof(struct hash_mm_context)); slice_init_new_context_exec(mm); + } else { + /* This is fork. 
Copy hash_context details from current->mm */ memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context)); + + } subpage_prot_init_new_context(mm); @@ -118,6 +130,7 @@ static int radix__init_new_context(struct mm_struct *mm) asm volatile("ptesync;isync" : : : "memory"); mm->context.npu_context = NULL; + mm->context.hash_context = NULL; return index; } @@ -162,6 +175,7 @@ static void destroy_contexts(mm_context_t *ctx) if (context_id) ida_free(&mmu_context_ida, context_id); } + kfree(ctx->hash_context); } static void pmd_frag_destroy(void *pmd_frag) From ef629cc5bf0543eb57d6e344ba776880ac35fd00 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:33:51 +0530 Subject: [PATCH 051/205] powerpc/mm/hash: Reduce hash_mm_context size Allocate the subpage protection related variables only if we use the feature. This reduces the hash-related mm context struct by around 4K. Before the patch sizeof(struct hash_mm_context) = 8288 After the patch sizeof(struct hash_mm_context) = 4160 Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 4 +-- arch/powerpc/include/asm/book3s/64/mmu.h | 2 +- arch/powerpc/mm/hash_utils_64.c | 3 ++ arch/powerpc/mm/mmu_context_book3s64.c | 17 +++++++++-- arch/powerpc/mm/subpage-prot.c | 28 ++++++++++++++----- 5 files changed, 40 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 4481bedbb5bea..eeb40091b46be 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -687,10 +687,8 @@ struct subpage_prot_table { #define SBP_L3_SHIFT (SBP_L2_SHIFT + SBP_L2_BITS) extern void subpage_prot_free(struct mm_struct *mm); -extern void subpage_prot_init_new_context(struct mm_struct *mm); #else static inline void subpage_prot_free(struct mm_struct *mm) {} -static inline void subpage_prot_init_new_context(struct mm_struct *mm) { } #endif /* CONFIG_PPC_SUBPAGE_PROT */ /* @@ -720,7 +718,7 @@ struct hash_mm_context { #endif #ifdef CONFIG_PPC_SUBPAGE_PROT - struct subpage_prot_table spt; + struct subpage_prot_table *spt; #endif /* CONFIG_PPC_SUBPAGE_PROT */ }; diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index e510e46b07ced..230a9dec76774 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -206,7 +206,7 @@ static inline struct slice_mask *mm_ctx_slice_mask_16g(mm_context_t *ctx) #ifdef CONFIG_PPC_SUBPAGE_PROT static inline struct subpage_prot_table *mm_ctx_subpage_prot(mm_context_t *ctx) { - return &ctx->hash_context->spt; + return ctx->hash_context->spt; } #endif diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 27239a0767735..6a2d315495a3c 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1150,6 +1150,9 @@ static int subpage_protection(struct mm_struct *mm, unsigned long ea) u32 spp = 0; u32 **sbpm, *sbpp; + if (!spt) + return 0; + if (ea >= spt->maxaddr) return 0; if (ea < 0x100000000UL) { diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index 6eef5a36b2e93..cb2b08635508b 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -63,7 +63,8 @@ static int hash__init_new_context(struct mm_struct *mm) if (index < 0) return index; - mm->context.hash_context =
kmalloc(sizeof(struct hash_mm_context), GFP_KERNEL); + mm->context.hash_context = kmalloc(sizeof(struct hash_mm_context), + GFP_KERNEL); if (!mm->context.hash_context) { ida_free(&mmu_context_ida, index); return -ENOMEM; @@ -89,11 +90,21 @@ static int hash__init_new_context(struct mm_struct *mm) } else { /* This is fork. Copy hash_context details from current->mm */ memcpy(mm->context.hash_context, current->mm->context.hash_context, sizeof(struct hash_mm_context)); +#ifdef CONFIG_PPC_SUBPAGE_PROT + /* inherit subpage prot detalis if we have one. */ + if (current->mm->context.hash_context->spt) { + mm->context.hash_context->spt = kmalloc(sizeof(struct subpage_prot_table), + GFP_KERNEL); + if (!mm->context.hash_context->spt) { + ida_free(&mmu_context_ida, index); + kfree(mm->context.hash_context); + return -ENOMEM; + } + } +#endif } - subpage_prot_init_new_context(mm); - pkey_mm_init(mm); return index; } diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index c72252542210d..c9dff4e1f295c 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -29,6 +29,9 @@ void subpage_prot_free(struct mm_struct *mm) unsigned long i, j, addr; u32 **p; + if (!spt) + return; + for (i = 0; i < 4; ++i) { if (spt->low_prot[i]) { free_page((unsigned long)spt->low_prot[i]); @@ -48,13 +51,7 @@ void subpage_prot_free(struct mm_struct *mm) free_page((unsigned long)p); } spt->maxaddr = 0; -} - -void subpage_prot_init_new_context(struct mm_struct *mm) -{ - struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); - - memset(spt, 0, sizeof(*spt)); + kfree(spt); } static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, @@ -99,6 +96,9 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len) size_t nw; unsigned long next, limit; + if (!spt) + return ; + down_write(&mm->mmap_sem); limit = addr + len; if (limit > spt->maxaddr) @@ -218,6 +218,20 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, return -EFAULT; down_write(&mm->mmap_sem); + + if (!spt) { + /* + * Allocate subpage prot table if not already done. 
+ * Do this with mmap_sem held + */ + spt = kzalloc(sizeof(struct subpage_prot_table), GFP_KERNEL); + if (!spt) { + err = -ENOMEM; + goto out; + } + mm->context.hash_context->spt = spt; + } + subpage_mark_vma_nohuge(mm, addr, len); for (limit = addr + len; addr < limit; addr = next) { next = pmd_addr_end(addr, limit); From a35a3c6f60657812366fca86a9ce71df1b8f7aff Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:29:13 +0530 Subject: [PATCH 052/205] powerpc/mm/hash64: Add a variable to track the end of IO mapping This makes it easy to update the region mapping in the later patch Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hash.h | 3 ++- arch/powerpc/include/asm/book3s/64/pgtable.h | 8 +++++--- arch/powerpc/include/asm/book3s/64/radix.h | 1 + arch/powerpc/mm/hash_utils_64.c | 1 + arch/powerpc/mm/pgtable-radix.c | 1 + arch/powerpc/mm/pgtable_64.c | 2 ++ 6 files changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 54b7af6cd27f3..8cbc4106d4493 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -69,7 +69,8 @@ #define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE - H_KERN_IO_SIZE) #define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) -#define H_KERN_IO_START H_VMALLOC_END +#define H_KERN_IO_START H_VMALLOC_END +#define H_KERN_IO_END (H_KERN_VIRT_START + H_KERN_VIRT_SIZE) /* * Region IDs diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index e3d18b3f6e5d4..f8ab18f77d1ba 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -277,9 +277,12 @@ extern unsigned long __vmalloc_end; extern unsigned long __kernel_virt_start; extern unsigned long __kernel_virt_size; extern unsigned long __kernel_io_start; +extern unsigned long __kernel_io_end; #define KERN_VIRT_START __kernel_virt_start #define KERN_VIRT_SIZE __kernel_virt_size #define KERN_IO_START __kernel_io_start +#define KERN_IO_END __kernel_io_end + extern struct page *vmemmap; extern unsigned long ioremap_bot; extern unsigned long pci_io_base; @@ -296,8 +299,7 @@ extern unsigned long pci_io_base; #include /* - * The second half of the kernel virtual space is used for IO mappings, - * it's itself carved into the PIO region (ISA and PHB IO space) and + * IO space itself carved into the PIO region (ISA and PHB IO space) and * the ioremap space * * ISA_IO_BASE = KERN_IO_START, 64K reserved area @@ -310,7 +312,7 @@ extern unsigned long pci_io_base; #define PHB_IO_BASE (ISA_IO_END) #define PHB_IO_END (KERN_IO_START + FULL_IO_SIZE) #define IOREMAP_BASE (PHB_IO_END) -#define IOREMAP_END (KERN_VIRT_START + KERN_VIRT_SIZE) +#define IOREMAP_END (KERN_IO_END) /* Advertise special mapping type for AGP */ #define HAVE_PAGE_AGP diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 5ab134eeed20c..6d760a083d621 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -111,6 +111,7 @@ #define RADIX_VMEMMAP_BASE (RADIX_VMALLOC_END) #define RADIX_KERN_IO_START (RADIX_KERN_VIRT_START + (RADIX_KERN_VIRT_SIZE >> 1)) +#define RADIX_KERN_IO_END (RADIX_KERN_VIRT_START + RADIX_KERN_VIRT_SIZE) #ifndef __ASSEMBLY__ #define RADIX_PTE_TABLE_SIZE (sizeof(pte_t) << RADIX_PTE_INDEX_SIZE) diff --git a/arch/powerpc/mm/hash_utils_64.c 
b/arch/powerpc/mm/hash_utils_64.c index 6a2d315495a3c..f6ffe8545717f 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1013,6 +1013,7 @@ void __init hash__early_init_mmu(void) __vmalloc_start = H_VMALLOC_START; __vmalloc_end = H_VMALLOC_END; __kernel_io_start = H_KERN_IO_START; + __kernel_io_end = H_KERN_IO_END; vmemmap = (struct page *)H_VMEMMAP_BASE; ioremap_bot = IOREMAP_BASE; diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 45869fd698a0c..4c1a9843d0f28 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -605,6 +605,7 @@ void __init radix__early_init_mmu(void) __vmalloc_start = RADIX_VMALLOC_START; __vmalloc_end = RADIX_VMALLOC_END; __kernel_io_start = RADIX_KERN_IO_START; + __kernel_io_end = RADIX_KERN_IO_END; vmemmap = (struct page *)RADIX_VMEMMAP_BASE; ioremap_bot = IOREMAP_BASE; diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index fb1375c07e8c7..7cea39bdf05f8 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -98,6 +98,8 @@ unsigned long __vmalloc_end; EXPORT_SYMBOL(__vmalloc_end); unsigned long __kernel_io_start; EXPORT_SYMBOL(__kernel_io_start); +unsigned long __kernel_io_end; +EXPORT_SYMBOL(__kernel_io_end); struct page *vmemmap; EXPORT_SYMBOL(vmemmap); unsigned long __pte_frag_nr; From 0034d395f89d9c092bb15adbabdca5283e258b41 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:29:14 +0530 Subject: [PATCH 053/205] powerpc/mm/hash64: Map all the kernel regions in the same 0xc range This patch maps the vmalloc, IO and vmemmap regions in the 0xc address range instead of the current 0xd and 0xf ranges. This brings the mapping closer to radix translation mode. With the hash 64K page size each of these regions is 512TB, whereas with the 4K config we are limited by the max page table range of 64TB and hence these regions are 16TB in size.
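The new region starts follow directly from H_KERN_VIRT_START plus multiples of the per-area map size. As a rough standalone sketch of the arithmetic (illustrative only, not part of the patch; the constants below are the 64K-page values from the diff, and the 4K layout uses 0xc000100000000000 with a 16TB area instead):

#include <stdio.h>

int main(void)
{
	/* 64K pages: one map area per 49-bit context, i.e. 512TB each */
	unsigned long virt_start = 0xc008000000000000UL; /* H_KERN_VIRT_START */
	unsigned long map_size = 1UL << 49;              /* H_KERN_MAP_SIZE */

	printf("kernel vmalloc start = 0x%lx\n", virt_start);
	printf("kernel IO start = 0x%lx\n", virt_start + map_size);
	printf("kernel vmemmap start = 0x%lx\n", virt_start + 2 * map_size);
	return 0;
}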
The kernel mapping is now: On 4K hash kernel_region_map_size = 16TB kernel vmalloc start = 0xc000100000000000 kernel IO start = 0xc000200000000000 kernel vmemmap start = 0xc000300000000000 64K hash, 64K radix and 4k radix: kernel_region_map_size = 512TB kernel vmalloc start = 0xc008000000000000 kernel IO start = 0xc00a000000000000 kernel vmemmap start = 0xc00c000000000000 Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hash-4k.h | 13 +++ arch/powerpc/include/asm/book3s/64/hash-64k.h | 11 +++ arch/powerpc/include/asm/book3s/64/hash.h | 95 ++++++++++++------- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 31 +++--- arch/powerpc/include/asm/book3s/64/pgtable.h | 1 - arch/powerpc/include/asm/book3s/64/radix.h | 41 ++++---- arch/powerpc/include/asm/page.h | 3 +- arch/powerpc/kvm/book3s_hv_rm_xics.c | 2 +- arch/powerpc/mm/copro_fault.c | 14 ++- arch/powerpc/mm/hash_utils_64.c | 26 ++--- arch/powerpc/mm/pgtable-radix.c | 7 +- arch/powerpc/mm/pgtable_64.c | 2 - arch/powerpc/mm/ptdump/hashpagetable.c | 2 +- arch/powerpc/mm/ptdump/ptdump.c | 3 +- arch/powerpc/mm/slb.c | 22 +++-- arch/powerpc/platforms/cell/spu_base.c | 4 +- drivers/misc/cxl/fault.c | 2 +- drivers/misc/ocxl/link.c | 2 +- 18 files changed, 172 insertions(+), 109 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 54fab723a8c74..4c9dfd6254616 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -13,6 +13,19 @@ */ #define MAX_EA_BITS_PER_CONTEXT 46 +/* + * Our page table limit us to 64TB. Hence for the kernel mapping, + * each MAP area is limited to 16 TB. + * The four map areas are: linear mapping, vmap, IO and vmemmap + */ +#define H_KERN_MAP_SIZE (ASM_CONST(1) << (MAX_EA_BITS_PER_CONTEXT - 2)) + +/* + * Define the address range of the kernel non-linear virtual area + * 16TB + */ +#define H_KERN_VIRT_START ASM_CONST(0xc000100000000000) + #ifndef __ASSEMBLY__ #define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE) #define H_PMD_TABLE_SIZE (sizeof(pmd_t) << H_PMD_INDEX_SIZE) diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index 81f4eb6e7da4c..0d0191cda0507 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -14,6 +14,17 @@ */ #define MAX_EA_BITS_PER_CONTEXT 49 +/* + * We use one context for each MAP area. + */ +#define H_KERN_MAP_SIZE (1UL << MAX_EA_BITS_PER_CONTEXT) + +/* + * Define the address range of the kernel non-linear virtual area + * 2PB + */ +#define H_KERN_VIRT_START ASM_CONST(0xc008000000000000) + /* * 64k aligned address free up few of the lower bits of RPN for us * We steal that here. For more deatils look at pte_pfn/pfn_pte() diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 8cbc4106d4493..76741a2219108 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -29,6 +29,10 @@ #define H_PGTABLE_EADDR_SIZE (H_PTE_INDEX_SIZE + H_PMD_INDEX_SIZE + \ H_PUD_INDEX_SIZE + H_PGD_INDEX_SIZE + PAGE_SHIFT) #define H_PGTABLE_RANGE (ASM_CONST(1) << H_PGTABLE_EADDR_SIZE) +/* + * Top 2 bits are ignored in page table walk. + */ +#define EA_MASK (~(0xcUL << 60)) /* * We store the slot details in the second half of page table. @@ -42,53 +46,56 @@ #endif /* - * Define the address range of the kernel non-linear virtual area. 
In contrast - * to the linear mapping, this is managed using the kernel page tables and then - * inserted into the hash page table to actually take effect, similarly to user - * mappings. + * +------------------------------+ + * | | + * | | + * | | + * +------------------------------+ Kernel virtual map end (0xc00e000000000000) + * | | + * | | + * | 512TB/16TB of vmemmap | + * | | + * | | + * +------------------------------+ Kernel vmemmap start + * | | + * | 512TB/16TB of IO map | + * | | + * +------------------------------+ Kernel IO map start + * | | + * | 512TB/16TB of vmap | + * | | + * +------------------------------+ Kernel virt start (0xc008000000000000) + * | | + * | | + * | | + * +------------------------------+ Kernel linear (0xc.....) */ -#define H_KERN_VIRT_START ASM_CONST(0xD000000000000000) -/* - * Allow virtual mapping of one context size. - * 512TB for 64K page size - * 64TB for 4K page size - */ -#define H_KERN_VIRT_SIZE (1UL << MAX_EA_BITS_PER_CONTEXT) +#define H_VMALLOC_START H_KERN_VIRT_START +#define H_VMALLOC_SIZE H_KERN_MAP_SIZE +#define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) -/* - * 8TB IO mapping size - */ -#define H_KERN_IO_SIZE ASM_CONST(0x80000000000) /* 8T */ - -/* - * The vmalloc space starts at the beginning of the kernel non-linear virtual - * region, and occupies 504T (64K) or 56T (4K) - */ -#define H_VMALLOC_START H_KERN_VIRT_START -#define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE - H_KERN_IO_SIZE) -#define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) +#define H_KERN_IO_START H_VMALLOC_END +#define H_KERN_IO_SIZE H_KERN_MAP_SIZE +#define H_KERN_IO_END (H_KERN_IO_START + H_KERN_IO_SIZE) -#define H_KERN_IO_START H_VMALLOC_END -#define H_KERN_IO_END (H_KERN_VIRT_START + H_KERN_VIRT_SIZE) +#define H_VMEMMAP_START H_KERN_IO_END +#define H_VMEMMAP_SIZE H_KERN_MAP_SIZE +#define H_VMEMMAP_END (H_VMEMMAP_START + H_VMEMMAP_SIZE) /* * Region IDs */ -#define REGION_SHIFT 60UL -#define REGION_MASK (0xfUL << REGION_SHIFT) -#define REGION_ID(ea) (((unsigned long)(ea)) >> REGION_SHIFT) - -#define VMALLOC_REGION_ID (REGION_ID(H_VMALLOC_START)) -#define KERNEL_REGION_ID (REGION_ID(PAGE_OFFSET)) -#define VMEMMAP_REGION_ID (0xfUL) /* Server only */ -#define USER_REGION_ID (0UL) +#define USER_REGION_ID 1 +#define KERNEL_REGION_ID 2 +#define VMALLOC_REGION_ID 3 +#define IO_REGION_ID 4 +#define VMEMMAP_REGION_ID 5 /* * Defines the address of the vmemap area, in its own region on * hash table CPUs. 
*/ -#define H_VMEMMAP_BASE (VMEMMAP_REGION_ID << REGION_SHIFT) #ifdef CONFIG_PPC_MM_SLICES #define HAVE_ARCH_UNMAPPED_AREA @@ -104,6 +111,26 @@ #define H_PUD_BAD_BITS (PMD_TABLE_SIZE-1) #ifndef __ASSEMBLY__ +static inline int get_region_id(unsigned long ea) +{ + int id = (ea >> 60UL); + + if (id == 0) + return USER_REGION_ID; + + VM_BUG_ON(id != 0xc); + VM_BUG_ON(ea >= H_VMEMMAP_END); + + if (ea >= H_VMEMMAP_START) + return VMEMMAP_REGION_ID; + else if (ea >= H_KERN_IO_START) + return IO_REGION_ID; + else if (ea >= H_VMALLOC_START) + return VMALLOC_REGION_ID; + + return KERNEL_REGION_ID; +} + #define hash__pmd_bad(pmd) (pmd_val(pmd) & H_PMD_BAD_BITS) #define hash__pud_bad(pud) (pud_val(pud) & H_PUD_BAD_BITS) static inline int hash__pgd_bad(pgd_t pgd) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index eeb40091b46be..8a30bf189f109 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -588,7 +588,8 @@ extern void slb_set_size(u16 size); #endif #define MAX_VMALLOC_CTX_CNT 1 -#define MAX_MEMMAP_CTX_CNT 1 +#define MAX_IO_CTX_CNT 1 +#define MAX_VMEMMAP_CTX_CNT 1 /* * 256MB segment @@ -601,13 +602,10 @@ extern void slb_set_size(u16 size); * would give a protovsid of 0x1fffffffff. That will result in a VSID 0 * because of the modulo operation in vsid scramble. * - * We add one extra context to MIN_USER_CONTEXT so that we can map kernel - * context easily. The +1 is to map the unused 0xe region mapping. */ #define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 2) #define MIN_USER_CONTEXT (MAX_KERNEL_CTX_CNT + MAX_VMALLOC_CTX_CNT + \ - MAX_MEMMAP_CTX_CNT + 2) - + MAX_IO_CTX_CNT + MAX_VMEMMAP_CTX_CNT) /* * For platforms that support on 65bit VA we limit the context bits */ @@ -776,7 +774,7 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea, /* * Bad address. We return VSID 0 for that */ - if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE) + if ((ea & EA_MASK) >= H_PGTABLE_RANGE) return 0; if (!mmu_has_feature(MMU_FTR_68_BIT_VA)) @@ -803,28 +801,29 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea, * 0x00002 - [ 0xc002000000000000 - 0xc003ffffffffffff] * 0x00003 - [ 0xc004000000000000 - 0xc005ffffffffffff] * 0x00004 - [ 0xc006000000000000 - 0xc007ffffffffffff] - - * 0x00005 - [ 0xd000000000000000 - 0xd001ffffffffffff ] - * 0x00006 - Not used - Can map 0xe000000000000000 range. - * 0x00007 - [ 0xf000000000000000 - 0xf001ffffffffffff ] * - * So we can compute the context from the region (top nibble) by - * subtracting 11, or 0xc - 1. + * vmap, IO, vmemap + * + * 0x00005 - [ 0xc008000000000000 - 0xc009ffffffffffff] + * 0x00006 - [ 0xc00a000000000000 - 0xc00bffffffffffff] + * 0x00007 - [ 0xc00c000000000000 - 0xc00dffffffffffff] + * */ static inline unsigned long get_kernel_context(unsigned long ea) { - unsigned long region_id = REGION_ID(ea); + unsigned long region_id = get_region_id(ea); unsigned long ctx; /* - * For linear mapping we do support multiple context + * Depending on Kernel config, kernel region can have one context + * or more. */ if (region_id == KERNEL_REGION_ID) { /* * We already verified ea to be not beyond the addr limit. 
*/ - ctx = 1 + ((ea & ~REGION_MASK) >> MAX_EA_BITS_PER_CONTEXT); + ctx = 1 + ((ea & EA_MASK) >> MAX_EA_BITS_PER_CONTEXT); } else - ctx = (region_id - 0xc) + MAX_KERNEL_CTX_CNT; + ctx = region_id + MAX_KERNEL_CTX_CNT - 2; return ctx; } diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index f8ab18f77d1ba..7dede2e34b708 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -279,7 +279,6 @@ extern unsigned long __kernel_virt_size; extern unsigned long __kernel_io_start; extern unsigned long __kernel_io_end; #define KERN_VIRT_START __kernel_virt_start -#define KERN_VIRT_SIZE __kernel_virt_size #define KERN_IO_START __kernel_io_start #define KERN_IO_END __kernel_io_end diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 6d760a083d621..574eca33f8930 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -72,19 +72,17 @@ * | | * | | * | | - * +------------------------------+ Kernel IO map end (0xc010000000000000) + * +------------------------------+ Kernel vmemmap end (0xc010000000000000) * | | + * | 512TB | * | | - * | 1/2 of virtual map | + * +------------------------------+ Kernel IO map end/vmemap start * | | + * | 512TB | * | | - * +------------------------------+ Kernel IO map start + * +------------------------------+ Kernel vmap end/ IO map start * | | - * | 1/4 of virtual map | - * | | - * +------------------------------+ Kernel vmemap start - * | | - * | 1/4 of virtual map | + * | 512TB | * | | * +------------------------------+ Kernel virt start (0xc008000000000000) * | | @@ -93,25 +91,24 @@ * +------------------------------+ Kernel linear (0xc.....) */ -#define RADIX_KERN_VIRT_START ASM_CONST(0xc008000000000000) -#define RADIX_KERN_VIRT_SIZE ASM_CONST(0x0008000000000000) - +#define RADIX_KERN_VIRT_START ASM_CONST(0xc008000000000000) /* - * The vmalloc space starts at the beginning of that region, and - * occupies a quarter of it on radix config. - * (we keep a quarter for the virtual memmap) + * 49 = MAX_EA_BITS_PER_CONTEXT (hash specific). To make sure we pick + * the same value as hash. */ +#define RADIX_KERN_MAP_SIZE (1UL << 49) + #define RADIX_VMALLOC_START RADIX_KERN_VIRT_START -#define RADIX_VMALLOC_SIZE (RADIX_KERN_VIRT_SIZE >> 2) +#define RADIX_VMALLOC_SIZE RADIX_KERN_MAP_SIZE #define RADIX_VMALLOC_END (RADIX_VMALLOC_START + RADIX_VMALLOC_SIZE) -/* - * Defines the address of the vmemap area, in its own region on - * hash table CPUs. 
- */ -#define RADIX_VMEMMAP_BASE (RADIX_VMALLOC_END) -#define RADIX_KERN_IO_START (RADIX_KERN_VIRT_START + (RADIX_KERN_VIRT_SIZE >> 1)) -#define RADIX_KERN_IO_END (RADIX_KERN_VIRT_START + RADIX_KERN_VIRT_SIZE) +#define RADIX_KERN_IO_START RADIX_VMALLOC_END +#define RADIX_KERN_IO_SIZE RADIX_KERN_MAP_SIZE +#define RADIX_KERN_IO_END (RADIX_KERN_IO_START + RADIX_KERN_IO_SIZE) + +#define RADIX_VMEMMAP_START RADIX_KERN_IO_END +#define RADIX_VMEMMAP_SIZE RADIX_KERN_MAP_SIZE +#define RADIX_VMEMMAP_END (RADIX_VMEMMAP_START + RADIX_VMEMMAP_SIZE) #ifndef __ASSEMBLY__ #define RADIX_PTE_TABLE_SIZE (sizeof(pte_t) << RADIX_PTE_INDEX_SIZE) diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index ed870468ef6f1..918228f2205ba 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -139,7 +139,8 @@ static inline bool pfn_valid(unsigned long pfn) * return true for some vmalloc addresses, which is incorrect. So explicitly * check that the address is in the kernel region. */ -#define virt_addr_valid(kaddr) (REGION_ID(kaddr) == KERNEL_REGION_ID && \ +/* may be can drop get_region_id */ +#define virt_addr_valid(kaddr) (get_region_id((unsigned long)kaddr) == KERNEL_REGION_ID && \ pfn_valid(virt_to_pfn(kaddr))) #else #define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr)) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 3b9662a4207e0..085509148d95f 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -822,7 +822,7 @@ static inline void this_cpu_inc_rm(unsigned int __percpu *addr) raddr = per_cpu_ptr(addr, cpu); l = (unsigned long)raddr; - if (REGION_ID(l) == VMALLOC_REGION_ID) { + if (get_region_id(l) == VMALLOC_REGION_ID) { l = vmalloc_to_phys(raddr); raddr = (unsigned int *)l; } diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index c8da352e8686c..9b0321061bc87 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -105,7 +105,7 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) u64 vsid, vsidkey; int psize, ssize; - switch (REGION_ID(ea)) { + switch (get_region_id(ea)) { case USER_REGION_ID: pr_devel("%s: 0x%llx -- USER_REGION_ID\n", __func__, ea); if (mm == NULL) @@ -117,10 +117,14 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) break; case VMALLOC_REGION_ID: pr_devel("%s: 0x%llx -- VMALLOC_REGION_ID\n", __func__, ea); - if (ea < VMALLOC_END) - psize = mmu_vmalloc_psize; - else - psize = mmu_io_psize; + psize = mmu_vmalloc_psize; + ssize = mmu_kernel_ssize; + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + vsidkey = SLB_VSID_KERNEL; + break; + case IO_REGION_ID: + pr_devel("%s: 0x%llx -- IO_REGION_ID\n", __func__, ea); + psize = mmu_io_psize; ssize = mmu_kernel_ssize; vsid = get_kernel_vsid(ea, mmu_kernel_ssize); vsidkey = SLB_VSID_KERNEL; diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index f6ffe8545717f..9c4ae4aa133e5 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1009,12 +1009,11 @@ void __init hash__early_init_mmu(void) __pgd_val_bits = HASH_PGD_VAL_BITS; __kernel_virt_start = H_KERN_VIRT_START; - __kernel_virt_size = H_KERN_VIRT_SIZE; __vmalloc_start = H_VMALLOC_START; __vmalloc_end = H_VMALLOC_END; __kernel_io_start = H_KERN_IO_START; __kernel_io_end = H_KERN_IO_END; - vmemmap = (struct page *)H_VMEMMAP_BASE; + vmemmap = (struct page *)H_VMEMMAP_START; ioremap_bot = 
IOREMAP_BASE; #ifdef CONFIG_PCI @@ -1241,7 +1240,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, trace_hash_fault(ea, access, trap); /* Get region & vsid */ - switch (REGION_ID(ea)) { + switch (get_region_id(ea)) { case USER_REGION_ID: user_region = 1; if (! mm) { @@ -1255,10 +1254,13 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, break; case VMALLOC_REGION_ID: vsid = get_kernel_vsid(ea, mmu_kernel_ssize); - if (ea < VMALLOC_END) - psize = mmu_vmalloc_psize; - else - psize = mmu_io_psize; + psize = mmu_vmalloc_psize; + ssize = mmu_kernel_ssize; + break; + + case IO_REGION_ID: + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + psize = mmu_io_psize; ssize = mmu_kernel_ssize; break; default: @@ -1424,7 +1426,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap, unsigned long flags = 0; struct mm_struct *mm = current->mm; - if (REGION_ID(ea) == VMALLOC_REGION_ID) + if ((get_region_id(ea) == VMALLOC_REGION_ID) || + (get_region_id(ea) == IO_REGION_ID)) mm = &init_mm; if (dsisr & DSISR_NOHPTE) @@ -1440,8 +1443,9 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, unsigned long access = _PAGE_PRESENT | _PAGE_READ; unsigned long flags = 0; struct mm_struct *mm = current->mm; + unsigned int region_id = get_region_id(ea); - if (REGION_ID(ea) == VMALLOC_REGION_ID) + if ((region_id == VMALLOC_REGION_ID) || (region_id == IO_REGION_ID)) mm = &init_mm; if (dsisr & DSISR_NOHPTE) @@ -1458,7 +1462,7 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, * 2) user space access kernel space. */ access |= _PAGE_PRIVILEGED; - if ((msr & MSR_PR) || (REGION_ID(ea) == USER_REGION_ID)) + if ((msr & MSR_PR) || (region_id == USER_REGION_ID)) access &= ~_PAGE_PRIVILEGED; if (trap == 0x400) @@ -1502,7 +1506,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, int rc, ssize, update_flags = 0; unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? 
_PAGE_EXEC : 0); - BUG_ON(REGION_ID(ea) != USER_REGION_ID); + BUG_ON(get_region_id(ea) != USER_REGION_ID); if (!should_hash_preload(mm, ea)) return; diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 4c1a9843d0f28..4d9fa9e900d5b 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -136,6 +136,10 @@ static int __map_kernel_page(unsigned long ea, unsigned long pa, */ BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE); +#ifdef CONFIG_PPC_64K_PAGES + BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT)); +#endif + if (unlikely(!slab_is_available())) return early_map_kernel_page(ea, pa, flags, map_page_size, nid, region_start, region_end); @@ -601,12 +605,11 @@ void __init radix__early_init_mmu(void) __pgd_val_bits = RADIX_PGD_VAL_BITS; __kernel_virt_start = RADIX_KERN_VIRT_START; - __kernel_virt_size = RADIX_KERN_VIRT_SIZE; __vmalloc_start = RADIX_VMALLOC_START; __vmalloc_end = RADIX_VMALLOC_END; __kernel_io_start = RADIX_KERN_IO_START; __kernel_io_end = RADIX_KERN_IO_END; - vmemmap = (struct page *)RADIX_VMEMMAP_BASE; + vmemmap = (struct page *)RADIX_VMEMMAP_START; ioremap_bot = IOREMAP_BASE; #ifdef CONFIG_PCI diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 7cea39bdf05f8..56068cac2a3cf 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -90,8 +90,6 @@ unsigned long __pgd_val_bits; EXPORT_SYMBOL(__pgd_val_bits); unsigned long __kernel_virt_start; EXPORT_SYMBOL(__kernel_virt_start); -unsigned long __kernel_virt_size; -EXPORT_SYMBOL(__kernel_virt_size); unsigned long __vmalloc_start; EXPORT_SYMBOL(__vmalloc_start); unsigned long __vmalloc_end; diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c index b430e4e08af69..b9bda0105841d 100644 --- a/arch/powerpc/mm/ptdump/hashpagetable.c +++ b/arch/powerpc/mm/ptdump/hashpagetable.c @@ -500,7 +500,7 @@ static void populate_markers(void) address_markers[7].start_address = IOREMAP_BASE; address_markers[8].start_address = IOREMAP_END; #ifdef CONFIG_PPC_BOOK3S_64 - address_markers[9].start_address = H_VMEMMAP_BASE; + address_markers[9].start_address = H_VMEMMAP_START; #else address_markers[9].start_address = VMEMMAP_BASE; #endif diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 37138428ab558..63fc56feea15b 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -303,8 +303,9 @@ static void populate_markers(void) address_markers[i++].start_address = PHB_IO_END; address_markers[i++].start_address = IOREMAP_BASE; address_markers[i++].start_address = IOREMAP_END; + /* What is the ifdef about? 
*/ #ifdef CONFIG_PPC_BOOK3S_64 - address_markers[i++].start_address = H_VMEMMAP_BASE; + address_markers[i++].start_address = H_VMEMMAP_START; #else address_markers[i++].start_address = VMEMMAP_BASE; #endif diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 78c0c0a0e3555..721cb09c9044b 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -694,7 +694,7 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) if (id == KERNEL_REGION_ID) { /* We only support upto MAX_PHYSMEM_BITS */ - if ((ea & ~REGION_MASK) > (1UL << MAX_PHYSMEM_BITS)) + if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS)) return -EFAULT; flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_linear_psize].sllp; @@ -702,20 +702,25 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) #ifdef CONFIG_SPARSEMEM_VMEMMAP } else if (id == VMEMMAP_REGION_ID) { - if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT)) + if (ea >= H_VMEMMAP_END) return -EFAULT; flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmemmap_psize].sllp; #endif } else if (id == VMALLOC_REGION_ID) { - if ((ea & ~REGION_MASK) >= (1ULL << MAX_EA_BITS_PER_CONTEXT)) + if (ea >= H_VMALLOC_END) return -EFAULT; - if (ea < H_VMALLOC_END) - flags = local_paca->vmalloc_sllp; - else - flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp; + flags = local_paca->vmalloc_sllp; + + } else if (id == IO_REGION_ID) { + + if (ea >= H_KERN_IO_END) + return -EFAULT; + + flags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_io_psize].sllp; + } else { return -EFAULT; } @@ -725,6 +730,7 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) ssize = MMU_SEGSIZE_256M; context = get_kernel_context(ea); + return slb_insert_entry(ea, context, flags, ssize, true); } @@ -761,7 +767,7 @@ static long slb_allocate_user(struct mm_struct *mm, unsigned long ea) long do_slb_fault(struct pt_regs *regs, unsigned long ea) { - unsigned long id = REGION_ID(ea); + unsigned long id = get_region_id(ea); /* IRQs are not reconciled here, so can't check irqs_disabled */ VM_WARN_ON(mfmsr() & MSR_EE); diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index 7f12c7b78c0f6..4770cce1bfe20 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -194,7 +194,7 @@ static int __spu_trap_data_map(struct spu *spu, unsigned long ea, u64 dsisr) * faults need to be deferred to process context. 
*/ if ((dsisr & MFC_DSISR_PTE_NOT_FOUND) && - (REGION_ID(ea) != USER_REGION_ID)) { + (get_region_id(ea) != USER_REGION_ID)) { spin_unlock(&spu->register_lock); ret = hash_page(ea, @@ -224,7 +224,7 @@ static void __spu_kernel_slb(void *addr, struct copro_slb *slb) unsigned long ea = (unsigned long)addr; u64 llp; - if (REGION_ID(ea) == KERNEL_REGION_ID) + if (get_region_id(ea) == KERNEL_REGION_ID) llp = mmu_psize_defs[mmu_linear_psize].sllp; else llp = mmu_psize_defs[mmu_virtual_psize].sllp; diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index dc7b34174f857..a4d17a5a97635 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -168,7 +168,7 @@ int cxl_handle_mm_fault(struct mm_struct *mm, u64 dsisr, u64 dar) if (dsisr & CXL_PSL_DSISR_An_S) access |= _PAGE_WRITE; - if (!mm && (REGION_ID(dar) != USER_REGION_ID)) + if (!mm && (get_region_id(dar) != USER_REGION_ID)) access |= _PAGE_PRIVILEGED; if (dsisr & DSISR_NOHPTE) diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c index d50b861d7e57b..04ec3d74f828c 100644 --- a/drivers/misc/ocxl/link.c +++ b/drivers/misc/ocxl/link.c @@ -163,7 +163,7 @@ static void xsl_fault_handler_bh(struct work_struct *fault_work) if (fault->dsisr & SPA_XSL_S) access |= _PAGE_WRITE; - if (REGION_ID(fault->dar) != USER_REGION_ID) + if (get_region_id(fault->dar) != USER_REGION_ID) access |= _PAGE_PRIVILEGED; local_irq_save(flags); From e09093927e54323018dbb3bf6189c85a7f176bae Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:29:15 +0530 Subject: [PATCH 054/205] powerpc/mm: Validate address values against different region limits This adds an explicit check in various functions. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hash_utils_64.c | 18 +++++++++++++++--- arch/powerpc/mm/pgtable-hash64.c | 13 ++++++++++--- arch/powerpc/mm/pgtable-radix.c | 16 ++++++++++++++++ arch/powerpc/mm/pgtable_64.c | 5 +++++ 4 files changed, 46 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 9c4ae4aa133e5..f727197de7139 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -781,9 +781,16 @@ int resize_hpt_for_hotplug(unsigned long new_mem_size) int hash__create_section_mapping(unsigned long start, unsigned long end, int nid) { - int rc = htab_bolt_mapping(start, end, __pa(start), - pgprot_val(PAGE_KERNEL), mmu_linear_psize, - mmu_kernel_ssize); + int rc; + + if (end >= H_VMALLOC_START) { + pr_warn("Outisde the supported range\n"); + return -1; + } + + rc = htab_bolt_mapping(start, end, __pa(start), + pgprot_val(PAGE_KERNEL), mmu_linear_psize, + mmu_kernel_ssize); if (rc < 0) { int rc2 = htab_remove_mapping(start, end, mmu_linear_psize, @@ -924,6 +931,11 @@ static void __init htab_initialize(void) DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", base, size, prot); + if ((base + size) >= H_VMALLOC_START) { + pr_warn("Outisde the supported range\n"); + continue; + } + BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), prot, mmu_linear_psize, mmu_kernel_ssize)); } diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index c08d49046a968..d934de4e2b3a1 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -112,9 +112,16 @@ int __meminit hash__vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys) { - int rc = htab_bolt_mapping(start, start + page_size, phys, - pgprot_val(PAGE_KERNEL), 
- mmu_vmemmap_psize, mmu_kernel_ssize); + int rc; + + if ((start + page_size) >= H_VMEMMAP_END) { + pr_warn("Outisde the supported range\n"); + return -1; + } + + rc = htab_bolt_mapping(start, start + page_size, phys, + pgprot_val(PAGE_KERNEL), + mmu_vmemmap_psize, mmu_kernel_ssize); if (rc < 0) { int rc2 = htab_remove_mapping(start, start + page_size, mmu_vmemmap_psize, diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 4d9fa9e900d5b..e6d5065b0bc81 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -339,6 +339,12 @@ void __init radix_init_pgtable(void) * page tables will be allocated within the range. No * need or a node (which we don't have yet). */ + + if ((reg->base + reg->size) >= RADIX_VMALLOC_START) { + pr_warn("Outisde the supported range\n"); + continue; + } + WARN_ON(create_physical_mapping(reg->base, reg->base + reg->size, -1)); @@ -895,6 +901,11 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid) { + if (end >= RADIX_VMALLOC_START) { + pr_warn("Outisde the supported range\n"); + return -1; + } + return create_physical_mapping(start, end, nid); } @@ -922,6 +933,11 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, int nid = early_pfn_to_nid(phys >> PAGE_SHIFT); int ret; + if ((start + page_size) >= RADIX_VMEMMAP_END) { + pr_warn("Outisde the supported range\n"); + return -1; + } + ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid); BUG_ON(ret); diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 56068cac2a3cf..72f58c076e26c 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -121,6 +121,11 @@ void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_ if (pgprot_val(prot) & H_PAGE_4K_PFN) return NULL; + if ((ea + size) >= (void *)IOREMAP_END) { + pr_warn("Outisde the supported range\n"); + return NULL; + } + WARN_ON(pa & ~PAGE_MASK); WARN_ON(((unsigned long)ea) & ~PAGE_MASK); WARN_ON(size & ~PAGE_MASK); From 53ed7a5947de2e19c270a0bc0c29257c6d004b0f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:29:16 +0530 Subject: [PATCH 055/205] powerpc/mm: Drop the unnecessary region check All the regions are now mapped with top nibble 0xc. Hence the region id check is not needed for virt_addr_valid() Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/page.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 918228f2205ba..748f5db2e2b74 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -132,19 +132,7 @@ static inline bool pfn_valid(unsigned long pfn) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#ifdef CONFIG_PPC_BOOK3S_64 -/* - * On hash the vmalloc and other regions alias to the kernel region when passed - * through __pa(), which virt_to_pfn() uses. That means virt_addr_valid() can - * return true for some vmalloc addresses, which is incorrect. So explicitly - * check that the address is in the kernel region. 
- */ -/* may be can drop get_region_id */ -#define virt_addr_valid(kaddr) (get_region_id((unsigned long)kaddr) == KERNEL_REGION_ID && \ - pfn_valid(virt_to_pfn(kaddr))) -#else #define virt_addr_valid(kaddr) pfn_valid(virt_to_pfn(kaddr)) -#endif /* * On Book-E parts we need __va to parse the device tree and we can't From 1c946c1b7f2ba40bc9b521219ad34e5da3fc3088 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:29:17 +0530 Subject: [PATCH 056/205] powerpc/mm/hash: Simplify the region id calculation. This reduces multiple comparisons in get_region_id to a bit shift operation. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hash-4k.h | 4 ++- arch/powerpc/include/asm/book3s/64/hash-64k.h | 1 + arch/powerpc/include/asm/book3s/64/hash.h | 31 +++++++++---------- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 2 +- 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 4c9dfd6254616..8fd8599c9395c 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -13,12 +13,14 @@ */ #define MAX_EA_BITS_PER_CONTEXT 46 +#define REGION_SHIFT (MAX_EA_BITS_PER_CONTEXT - 2) + /* * Our page table limit us to 64TB. Hence for the kernel mapping, * each MAP area is limited to 16 TB. * The four map areas are: linear mapping, vmap, IO and vmemmap */ -#define H_KERN_MAP_SIZE (ASM_CONST(1) << (MAX_EA_BITS_PER_CONTEXT - 2)) +#define H_KERN_MAP_SIZE (ASM_CONST(1) << REGION_SHIFT) /* * Define the address range of the kernel non-linear virtual area diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index 0d0191cda0507..d1d9177d9ebdd 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -13,6 +13,7 @@ * is handled in the hotpath. */ #define MAX_EA_BITS_PER_CONTEXT 49 +#define REGION_SHIFT MAX_EA_BITS_PER_CONTEXT /* * We use one context for each MAP area. diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 76741a2219108..7faa3d7214c04 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -83,26 +83,26 @@ #define H_VMEMMAP_SIZE H_KERN_MAP_SIZE #define H_VMEMMAP_END (H_VMEMMAP_START + H_VMEMMAP_SIZE) +#define NON_LINEAR_REGION_ID(ea) ((((unsigned long)ea - H_KERN_VIRT_START) >> REGION_SHIFT) + 2) + /* * Region IDs */ -#define USER_REGION_ID 1 -#define KERNEL_REGION_ID 2 -#define VMALLOC_REGION_ID 3 -#define IO_REGION_ID 4 -#define VMEMMAP_REGION_ID 5 +#define USER_REGION_ID 0 +#define KERNEL_REGION_ID 1 +#define VMALLOC_REGION_ID NON_LINEAR_REGION_ID(H_VMALLOC_START) +#define IO_REGION_ID NON_LINEAR_REGION_ID(H_KERN_IO_START) +#define VMEMMAP_REGION_ID NON_LINEAR_REGION_ID(H_VMEMMAP_START) /* * Defines the address of the vmemap area, in its own region on * hash table CPUs. 
*/ - #ifdef CONFIG_PPC_MM_SLICES #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN #endif /* CONFIG_PPC_MM_SLICES */ - /* PTEIDX nibble */ #define _PTEIDX_SECONDARY 0x8 #define _PTEIDX_GROUP_IX 0x7 @@ -113,22 +113,21 @@ #ifndef __ASSEMBLY__ static inline int get_region_id(unsigned long ea) { + int region_id; int id = (ea >> 60UL); if (id == 0) return USER_REGION_ID; - VM_BUG_ON(id != 0xc); - VM_BUG_ON(ea >= H_VMEMMAP_END); + if (ea < H_KERN_VIRT_START) + return KERNEL_REGION_ID; - if (ea >= H_VMEMMAP_START) - return VMEMMAP_REGION_ID; - else if (ea >= H_KERN_IO_START) - return IO_REGION_ID; - else if (ea >= H_VMALLOC_START) - return VMALLOC_REGION_ID; + VM_BUG_ON(id != 0xc); + BUILD_BUG_ON(NON_LINEAR_REGION_ID(H_VMALLOC_START) != 2); - return KERNEL_REGION_ID; + region_id = NON_LINEAR_REGION_ID(ea); + VM_BUG_ON(region_id > VMEMMAP_REGION_ID); + return region_id; } #define hash__pmd_bad(pmd) (pmd_val(pmd) & H_PMD_BAD_BITS) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 8a30bf189f109..9a9adbeef0706 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -823,7 +823,7 @@ static inline unsigned long get_kernel_context(unsigned long ea) */ ctx = 1 + ((ea & EA_MASK) >> MAX_EA_BITS_PER_CONTEXT); } else - ctx = region_id + MAX_KERNEL_CTX_CNT - 2; + ctx = region_id + MAX_KERNEL_CTX_CNT - 1; return ctx; } From a092a03fa942b14369b8edea7690cd96206403f9 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:29:18 +0530 Subject: [PATCH 057/205] powerpc/mm: Print kernel map details to dmesg This helps with debugging. We can look at dmesg to find the different kernel mapping details. On 4K config this shows kernel vmalloc start = 0xc000100000000000 kernel IO start = 0xc000200000000000 kernel vmemmap start = 0xc000300000000000 On 64K config: kernel vmalloc start = 0xc008000000000000 kernel IO start = 0xc00a000000000000 kernel vmemmap start = 0xc00c000000000000 Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 21b1ce200b22d..1729bf409562c 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -831,6 +831,9 @@ static __init void print_system_info(void) pr_info("htab_address = 0x%p\n", htab_address); if (htab_hash_mask) pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask); + pr_info("kernel vmalloc start = 0x%lx\n", KERN_VIRT_START); + pr_info("kernel IO start = 0x%lx\n", KERN_IO_START); + pr_info("kernel vmemmap start = 0x%lx\n", (unsigned long)vmemmap); #endif #ifdef CONFIG_PPC_BOOK3S_32 if (Hash) From 5f53d28608f600d9ee07378453bd2d49e132fff4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 17 Apr 2019 18:29:19 +0530 Subject: [PATCH 058/205] powerpc/mm/hash: Rename KERNEL_REGION_ID to LINEAR_MAP_REGION_ID The region actually points to the linear map. Rename the #define to clarify that.
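For illustration, a small userspace model of the lookup after the rename (a sketch assuming the 64K-page constants from the earlier patches in this series; the model_ names are hypothetical and this is not the kernel code itself):

#include <assert.h>

/* Region ids as assigned by the earlier get_region_id() rework */
enum { USER_REGION_ID, LINEAR_MAP_REGION_ID, VMALLOC_REGION_ID,
       IO_REGION_ID, VMEMMAP_REGION_ID };

#define MODEL_KERN_VIRT_START 0xc008000000000000UL /* 64K-page value */
#define MODEL_REGION_SHIFT 49

static int model_get_region_id(unsigned long ea)
{
	if ((ea >> 60) == 0)
		return USER_REGION_ID;
	if (ea < MODEL_KERN_VIRT_START)
		return LINEAR_MAP_REGION_ID; /* the 0xc000... linear map */
	/* vmalloc, IO and vmemmap are consecutive 512TB areas */
	return (int)((ea - MODEL_KERN_VIRT_START) >> MODEL_REGION_SHIFT) + 2;
}

int main(void)
{
	assert(model_get_region_id(0x0000700000000000UL) == USER_REGION_ID);
	assert(model_get_region_id(0xc000000001000000UL) == LINEAR_MAP_REGION_ID);
	assert(model_get_region_id(0xc008000000000000UL) == VMALLOC_REGION_ID);
	assert(model_get_region_id(0xc00a000000000000UL) == IO_REGION_ID);
	assert(model_get_region_id(0xc00c000000000000UL) == VMEMMAP_REGION_ID);
	return 0;
}

The real helper additionally guards with VM_BUG_ON()/BUILD_BUG_ON() and takes its constants from the config-dependent headers shown in the diffs above.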
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hash.h | 4 ++-- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 2 +- arch/powerpc/mm/copro_fault.c | 4 ++-- arch/powerpc/mm/slb.c | 4 ++-- arch/powerpc/platforms/cell/spu_base.c | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 7faa3d7214c04..1d1183048cfd7 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -89,7 +89,7 @@ * Region IDs */ #define USER_REGION_ID 0 -#define KERNEL_REGION_ID 1 +#define LINEAR_MAP_REGION_ID 1 #define VMALLOC_REGION_ID NON_LINEAR_REGION_ID(H_VMALLOC_START) #define IO_REGION_ID NON_LINEAR_REGION_ID(H_KERN_IO_START) #define VMEMMAP_REGION_ID NON_LINEAR_REGION_ID(H_VMEMMAP_START) @@ -120,7 +120,7 @@ static inline int get_region_id(unsigned long ea) return USER_REGION_ID; if (ea < H_KERN_VIRT_START) - return KERNEL_REGION_ID; + return LINEAR_MAP_REGION_ID; VM_BUG_ON(id != 0xc); BUILD_BUG_ON(NON_LINEAR_REGION_ID(H_VMALLOC_START) != 2); diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 9a9adbeef0706..1e4705516a54f 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -817,7 +817,7 @@ static inline unsigned long get_kernel_context(unsigned long ea) * Depending on Kernel config, kernel region can have one context * or more. */ - if (region_id == KERNEL_REGION_ID) { + if (region_id == LINEAR_MAP_REGION_ID) { /* * We already verified ea to be not beyond the addr limit. */ diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index 9b0321061bc87..f137286740cb9 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -129,8 +129,8 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) vsid = get_kernel_vsid(ea, mmu_kernel_ssize); vsidkey = SLB_VSID_KERNEL; break; - case KERNEL_REGION_ID: - pr_devel("%s: 0x%llx -- KERNEL_REGION_ID\n", __func__, ea); + case LINEAR_MAP_REGION_ID: + pr_devel("%s: 0x%llx -- LINEAR_MAP_REGION_ID\n", __func__, ea); psize = mmu_linear_psize; ssize = mmu_kernel_ssize; vsid = get_kernel_vsid(ea, mmu_kernel_ssize); diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 721cb09c9044b..89e4531de64b4 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -691,7 +691,7 @@ static long slb_allocate_kernel(unsigned long ea, unsigned long id) unsigned long flags; int ssize; - if (id == KERNEL_REGION_ID) { + if (id == LINEAR_MAP_REGION_ID) { /* We only support upto MAX_PHYSMEM_BITS */ if ((ea & EA_MASK) > (1UL << MAX_PHYSMEM_BITS)) @@ -790,7 +790,7 @@ long do_slb_fault(struct pt_regs *regs, unsigned long ea) * first class kernel code. But for performance it's probably nicer * if they go via fast_exception_return too. */ - if (id >= KERNEL_REGION_ID) { + if (id >= LINEAR_MAP_REGION_ID) { long err; #ifdef CONFIG_DEBUG_VM /* Catch recursive kernel SLB faults. 
*/ diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index 4770cce1bfe20..6646f152d57bc 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -224,7 +224,7 @@ static void __spu_kernel_slb(void *addr, struct copro_slb *slb) unsigned long ea = (unsigned long)addr; u64 llp; - if (get_region_id(ea) == KERNEL_REGION_ID) + if (get_region_id(ea) == LINEAR_MAP_REGION_ID) llp = mmu_psize_defs[mmu_linear_psize].sllp; else llp = mmu_psize_defs[mmu_virtual_psize].sllp; From 26ad26718dfaa7cf49d106d212ebf2370076c253 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 30 Mar 2019 11:13:45 +0530 Subject: [PATCH 059/205] powerpc/mm: Fix section mismatch warning This patch fixes the below section mismatch warnings. WARNING: vmlinux.o(.text+0x2d1f44): Section mismatch in reference from the function devm_memremap_pages_release() to the function .meminit.text:arch_remove_memory() WARNING: vmlinux.o(.text+0x2d265c): Section mismatch in reference from the function devm_memremap_pages() to the function .meminit.text:arch_add_memory() Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/mem.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 3665602a9dfa7..e12bec98366fa 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -109,8 +109,8 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) return -ENODEV; } -int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int __ref arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, + bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -131,8 +131,8 @@ int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap * } #ifdef CONFIG_MEMORY_HOTREMOVE -int __meminit arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +int __ref arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; From f341d89790b0b7f99ca7835e0cf7de1026ceae39 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 23 Apr 2019 16:10:17 +0100 Subject: [PATCH 060/205] powerpc/mm: fix spelling mistake "Outisde" -> "Outside" There are several identical spelling mistakes in warning messages, fix these.
Signed-off-by: Colin Ian King Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hash_utils_64.c | 4 ++-- arch/powerpc/mm/pgtable-hash64.c | 2 +- arch/powerpc/mm/pgtable-radix.c | 6 +++--- arch/powerpc/mm/pgtable_64.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index f727197de7139..6eb89643ce583 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -784,7 +784,7 @@ int hash__create_section_mapping(unsigned long start, unsigned long end, int nid int rc; if (end >= H_VMALLOC_START) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); return -1; } @@ -932,7 +932,7 @@ static void __init htab_initialize(void) base, size, prot); if ((base + size) >= H_VMALLOC_START) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); continue; } diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index d934de4e2b3a1..097a3b3538b15 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -115,7 +115,7 @@ int __meminit hash__vmemmap_create_mapping(unsigned long start, int rc; if ((start + page_size) >= H_VMEMMAP_END) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); return -1; } diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index e6d5065b0bc81..fcb0169e2d324 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -341,7 +341,7 @@ void __init radix_init_pgtable(void) */ if ((reg->base + reg->size) >= RADIX_VMALLOC_START) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); continue; } @@ -902,7 +902,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid) { if (end >= RADIX_VMALLOC_START) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); return -1; } @@ -934,7 +934,7 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, int ret; if ((start + page_size) >= RADIX_VMEMMAP_END) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); return -1; } diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 72f58c076e26c..95ad2a09501c8 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -122,7 +122,7 @@ void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_ return NULL; if ((ea + size) >= (void *)IOREMAP_END) { - pr_warn("Outisde the supported range\n"); + pr_warn("Outside the supported range\n"); return NULL; } From b2d3b5ee66f2a04a918cc043cec0c9ed3de58f40 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Tue, 2 Oct 2018 10:35:59 -0500 Subject: [PATCH 061/205] powerpc/pseries: Track LMB nid instead of using device tree When removing memory we need to remove the memory from the node it was added to, instead of looking up in the device tree the node it should be in. During testing we have seen scenarios where the affinity for an LMB changes due to a partition migration or PRRN event. In these cases the node the LMB exists in may not match the node the device tree indicates it belongs in. This can lead to a system crash when trying to DLPAR remove the LMB after a migration or PRRN event.
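The fix described below boils down to recording the node id once at add time and trusting that stored value at remove time. A hypothetical simplification (the model_ names and the fake topology are illustrative, not the kernel API):

struct lmb_model {
	unsigned long base_addr;
	int nid; /* node recorded when the LMB was added, -1 when unset */
};

/* Stand-in for the node lookup performed at add time */
static int model_physaddr_to_nid(unsigned long base_addr)
{
	return (int)(base_addr >> 40) & 1; /* fake two-node topology */
}

static void model_add_lmb(struct lmb_model *lmb)
{
	/* Record the node while the affinity is known to be current */
	lmb->nid = model_physaddr_to_nid(lmb->base_addr);
}

static int model_remove_lmb(struct lmb_model *lmb)
{
	/*
	 * Use the recorded node; a migration or PRRN event may have
	 * changed what the device tree would report by now.
	 */
	int nid = lmb->nid;

	lmb->nid = -1;
	return nid;
}

int main(void)
{
	struct lmb_model lmb = { .base_addr = 1UL << 40, .nid = -1 };

	model_add_lmb(&lmb);
	return model_remove_lmb(&lmb) == 1 ? 0 : 1;
}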
The current code looks up the node in the device tree to remove the LMB from; the crash occurs when we try to offline this node and it does not have any data, i.e. node_data[nid] == NULL. 36:mon> e cpu 0x36: Vector: 300 (Data Access) at [c0000001828b7810] pc: c00000000036d08c: try_offline_node+0x2c/0x1b0 lr: c0000000003a14ec: remove_memory+0xbc/0x110 sp: c0000001828b7a90 msr: 800000000280b033 dar: 9a28 dsisr: 40000000 current = 0xc0000006329c4c80 paca = 0xc000000007a55200 softe: 0 irq_happened: 0x01 pid = 76926, comm = kworker/u320:3 36:mon> t [link register ] c0000000003a14ec remove_memory+0xbc/0x110 [c0000001828b7a90] c00000000006a1cc arch_remove_memory+0x9c/0xd0 (unreliable) [c0000001828b7ad0] c0000000003a14e0 remove_memory+0xb0/0x110 [c0000001828b7b20] c0000000000c7db4 dlpar_remove_lmb+0x94/0x160 [c0000001828b7b60] c0000000000c8ef8 dlpar_memory+0x7e8/0xd10 [c0000001828b7bf0] c0000000000bf828 handle_dlpar_errorlog+0xf8/0x160 [c0000001828b7c60] c0000000000bf8cc pseries_hp_work_fn+0x3c/0xa0 [c0000001828b7c90] c000000000128cd8 process_one_work+0x298/0x5a0 [c0000001828b7d20] c000000000129068 worker_thread+0x88/0x620 [c0000001828b7dc0] c00000000013223c kthread+0x1ac/0x1c0 [c0000001828b7e30] c00000000000b45c ret_from_kernel_thread+0x5c/0x80 To resolve this we need to track the node an LMB belongs to when it is added to the system so we can remove it from that node instead of the node that the device tree indicates it should belong to. Signed-off-by: Nathan Fontenot Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/drmem.h | 21 +++++++++++++++++++ arch/powerpc/mm/drmem.c | 6 +++++- .../platforms/pseries/hotplug-memory.c | 17 +++++++-------- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h index 7c1d8e74b25d4..7f3279b014db0 100644 --- a/arch/powerpc/include/asm/drmem.h +++ b/arch/powerpc/include/asm/drmem.h @@ -17,6 +17,9 @@ struct drmem_lmb { u32 drc_index; u32 aa_index; u32 flags; +#ifdef CONFIG_MEMORY_HOTPLUG + int nid; +#endif }; struct drmem_lmb_info { @@ -104,4 +107,22 @@ static inline void invalidate_lmb_associativity_index(struct drmem_lmb *lmb) lmb->aa_index = 0xffffffff; } +#ifdef CONFIG_MEMORY_HOTPLUG +static inline void lmb_set_nid(struct drmem_lmb *lmb) +{ + lmb->nid = memory_add_physaddr_to_nid(lmb->base_addr); +} +static inline void lmb_clear_nid(struct drmem_lmb *lmb) +{ + lmb->nid = -1; +} +#else +static inline void lmb_set_nid(struct drmem_lmb *lmb) +{ +} +static inline void lmb_clear_nid(struct drmem_lmb *lmb) +{ +} +#endif + #endif /* _ASM_POWERPC_LMB_H */ diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c index 3f1803672c9bb..641891df2046c 100644 --- a/arch/powerpc/mm/drmem.c +++ b/arch/powerpc/mm/drmem.c @@ -366,8 +366,10 @@ static void __init init_drmem_v1_lmbs(const __be32 *prop) if (!drmem_info->lmbs) return; - for_each_drmem_lmb(lmb) + for_each_drmem_lmb(lmb) { read_drconf_v1_cell(lmb, &prop); + lmb_set_nid(lmb); + } } static void __init init_drmem_v2_lmbs(const __be32 *prop) @@ -412,6 +414,8 @@ static void __init init_drmem_v2_lmbs(const __be32 *prop) lmb->aa_index = dr_cell.aa_index; lmb->flags = dr_cell.flags; + + lmb_set_nid(lmb); } } } diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index d291b618a559d..47087832f8b2e 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -379,7 +379,7 @@ static int dlpar_add_lmb(struct drmem_lmb *);
static int dlpar_remove_lmb(struct drmem_lmb *lmb) { unsigned long block_sz; - int nid, rc; + int rc; if (!lmb_is_removable(lmb)) return -EINVAL; @@ -389,14 +389,14 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb) return rc; block_sz = pseries_memory_block_size(); - nid = memory_add_physaddr_to_nid(lmb->base_addr); - __remove_memory(nid, lmb->base_addr, block_sz); + __remove_memory(lmb->nid, lmb->base_addr, block_sz); /* Update memory regions for memory remove */ memblock_remove(lmb->base_addr, block_sz); invalidate_lmb_associativity_index(lmb); + lmb_clear_nid(lmb); lmb->flags &= ~DRCONF_MEM_ASSIGNED; return 0; @@ -653,7 +653,7 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index) static int dlpar_add_lmb(struct drmem_lmb *lmb) { unsigned long block_sz; - int nid, rc; + int rc; if (lmb->flags & DRCONF_MEM_ASSIGNED) return -EINVAL; @@ -664,13 +664,11 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) return rc; } + lmb_set_nid(lmb); block_sz = memory_block_size_bytes(); - /* Find the node id for this address */ - nid = memory_add_physaddr_to_nid(lmb->base_addr); - /* Add the memory */ - rc = __add_memory(nid, lmb->base_addr, block_sz); + rc = __add_memory(lmb->nid, lmb->base_addr, block_sz); if (rc) { invalidate_lmb_associativity_index(lmb); return rc; @@ -678,8 +676,9 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) rc = dlpar_online_lmb(lmb); if (rc) { - __remove_memory(nid, lmb->base_addr, block_sz); + __remove_memory(lmb->nid, lmb->base_addr, block_sz); invalidate_lmb_associativity_index(lmb); + lmb_clear_nid(lmb); } else { lmb->flags |= DRCONF_MEM_ASSIGNED; } From 7ae3f6e130e8dc6188b59e3b4ebc2f16e9c8d053 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 9 Apr 2019 14:40:05 +1000 Subject: [PATCH 062/205] powerpc/watchdog: Use hrtimers for per-CPU heartbeat Using a jiffies timer creates a dependency on the tick_do_timer_cpu incrementing jiffies. If that CPU has locked up and jiffies is not incrementing, the watchdog heartbeat timer for all CPUs stops and creates false positives and confusing warnings on local CPUs, and also causes the SMP detector to stop, so the root cause is never detected. Fix this by using hrtimer based timers for the watchdog heartbeat, like the generic kernel hardlockup detector. Cc: Gautham R. Shenoy Reported-by: Ravikumar Bangoria Signed-off-by: Nicholas Piggin Tested-by: Ravi Bangoria Reported-by: Ravi Bangoria Reviewed-by: Gautham R. 
Shenoy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/watchdog.c | 81 +++++++++++++++++----------------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 3c6ab22a0c4e3..af3c15a1d41eb 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -77,7 +77,7 @@ static u64 wd_smp_panic_timeout_tb __read_mostly; /* panic other CPUs */ static u64 wd_timer_period_ms __read_mostly; /* interval between heartbeat */ -static DEFINE_PER_CPU(struct timer_list, wd_timer); +static DEFINE_PER_CPU(struct hrtimer, wd_hrtimer); static DEFINE_PER_CPU(u64, wd_timer_tb); /* SMP checker bits */ @@ -293,21 +293,21 @@ void soft_nmi_interrupt(struct pt_regs *regs) nmi_exit(); } -static void wd_timer_reset(unsigned int cpu, struct timer_list *t) -{ - t->expires = jiffies + msecs_to_jiffies(wd_timer_period_ms); - if (wd_timer_period_ms > 1000) - t->expires = __round_jiffies_up(t->expires, cpu); - add_timer_on(t, cpu); -} - -static void wd_timer_fn(struct timer_list *t) +static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { int cpu = smp_processor_id(); + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + return HRTIMER_NORESTART; + + if (!cpumask_test_cpu(cpu, &watchdog_cpumask)) + return HRTIMER_NORESTART; + watchdog_timer_interrupt(cpu); - wd_timer_reset(cpu, t); + hrtimer_forward_now(hrtimer, ms_to_ktime(wd_timer_period_ms)); + + return HRTIMER_RESTART; } void arch_touch_nmi_watchdog(void) @@ -323,37 +323,22 @@ void arch_touch_nmi_watchdog(void) } EXPORT_SYMBOL(arch_touch_nmi_watchdog); -static void start_watchdog_timer_on(unsigned int cpu) -{ - struct timer_list *t = per_cpu_ptr(&wd_timer, cpu); - - per_cpu(wd_timer_tb, cpu) = get_tb(); - - timer_setup(t, wd_timer_fn, TIMER_PINNED); - wd_timer_reset(cpu, t); -} - -static void stop_watchdog_timer_on(unsigned int cpu) -{ - struct timer_list *t = per_cpu_ptr(&wd_timer, cpu); - - del_timer_sync(t); -} - -static int start_wd_on_cpu(unsigned int cpu) +static void start_watchdog(void *arg) { + struct hrtimer *hrtimer = this_cpu_ptr(&wd_hrtimer); + int cpu = smp_processor_id(); unsigned long flags; if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) { WARN_ON(1); - return 0; + return; } if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - return 0; + return; if (!cpumask_test_cpu(cpu, &watchdog_cpumask)) - return 0; + return; wd_smp_lock(&flags); cpumask_set_cpu(cpu, &wd_cpus_enabled); @@ -363,27 +348,40 @@ static int start_wd_on_cpu(unsigned int cpu) } wd_smp_unlock(&flags); - start_watchdog_timer_on(cpu); + *this_cpu_ptr(&wd_timer_tb) = get_tb(); - return 0; + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = watchdog_timer_fn; + hrtimer_start(hrtimer, ms_to_ktime(wd_timer_period_ms), + HRTIMER_MODE_REL_PINNED); } -static int stop_wd_on_cpu(unsigned int cpu) +static int start_watchdog_on_cpu(unsigned int cpu) { + return smp_call_function_single(cpu, start_watchdog, NULL, true); +} + +static void stop_watchdog(void *arg) +{ + struct hrtimer *hrtimer = this_cpu_ptr(&wd_hrtimer); + int cpu = smp_processor_id(); unsigned long flags; if (!cpumask_test_cpu(cpu, &wd_cpus_enabled)) - return 0; /* Can happen in CPU unplug case */ + return; /* Can happen in CPU unplug case */ - stop_watchdog_timer_on(cpu); + hrtimer_cancel(hrtimer); wd_smp_lock(&flags); cpumask_clear_cpu(cpu, &wd_cpus_enabled); wd_smp_unlock(&flags); wd_smp_clear_cpu_pending(cpu, get_tb()); +} - return 0; +static int stop_watchdog_on_cpu(unsigned int cpu) +{ + 
return smp_call_function_single(cpu, stop_watchdog, NULL, true); } static void watchdog_calc_timeouts(void) @@ -402,7 +400,7 @@ void watchdog_nmi_stop(void) int cpu; for_each_cpu(cpu, &wd_cpus_enabled) - stop_wd_on_cpu(cpu); + stop_watchdog_on_cpu(cpu); } void watchdog_nmi_start(void) @@ -411,7 +409,7 @@ watchdog_calc_timeouts(); for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) - start_wd_on_cpu(cpu); + start_watchdog_on_cpu(cpu); } /* @@ -423,7 +421,8 @@ int __init watchdog_nmi_probe(void) err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "powerpc/watchdog:online", - start_wd_on_cpu, stop_wd_on_cpu); + start_watchdog_on_cpu, + stop_watchdog_on_cpu); if (err < 0) { pr_warn("could not be initialized"); return err; From 10d91611f426d4bafd2a83d966c36da811b2f7ad Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 13 Apr 2019 00:30:52 +1000 Subject: [PATCH 063/205] powerpc/64s: Reimplement book3s idle code in C Reimplement Book3S idle code in C, moving POWER7/8/9 implementation specific HV idle code to the powernv platform code. Book3S assembly stubs are kept in common code and used only to save the stack frame and non-volatile GPRs before executing architected idle instructions, and restoring the stack and reloading GPRs then returning to C after waking from idle. The complex logic dealing with threads and subcores, locking, SPRs, HMIs, timebase resync, etc., is all done in C, which makes it more maintainable. This is not a strict translation to C code; there are some significant differences: - Idle wakeup no longer uses the ->cpu_restore call to reinit SPRs, but saves and restores them itself. - The optimisation where EC=ESL=0 idle modes did not have to save GPRs or change MSR is restored, because it's now simple to do. ESL=1 sleeps that do not lose GPRs can use this optimization too. - KVM secondary entry and cede is now more of a call/return style rather than branchy. nap_state_lost is not required because KVM always returns via the NVGPR-restoring path. - KVM secondary wakeup from offline sequence is moved entirely into the offline wakeup, which avoids a hwsync in the normal idle wakeup path. Performance measured with context switch ping-pong on different threads or cores is possibly improved a small amount, 1-3% depending on stop state and core vs thread test for shallow states. For deep states it's in the noise compared with other latencies. KVM improvements: - Idle sleepers now always return to caller rather than branch out to KVM first. - This allows optimisations like very fast return to caller when no state has been lost. - KVM no longer requires nap_state_lost because it controls NVGPR save/restore itself on the way in and out. - The heavy idle wakeup KVM request check can be moved out of the normal host idle code and into the not-performance-critical offline code. - KVM nap code now returns from where it is called, which makes the flow a bit easier to follow. Reviewed-by: Gautham R.
Shenoy Signed-off-by: Nicholas Piggin [mpe: Squash the KVM changes in] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/cpuidle.h | 19 +- arch/powerpc/include/asm/paca.h | 40 +- arch/powerpc/include/asm/processor.h | 9 +- arch/powerpc/include/asm/reg.h | 8 +- arch/powerpc/kernel/asm-offsets.c | 18 - arch/powerpc/kernel/exceptions-64s.S | 23 +- arch/powerpc/kernel/idle_book3s.S | 1060 +++------------------- arch/powerpc/kernel/setup-common.c | 4 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 118 ++- arch/powerpc/platforms/powernv/idle.c | 862 ++++++++++++++---- arch/powerpc/platforms/powernv/subcore.c | 2 +- arch/powerpc/xmon/xmon.c | 24 +- 12 files changed, 969 insertions(+), 1218 deletions(-) diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h index 43e5f31fe64d1..9844b3ded187c 100644 --- a/arch/powerpc/include/asm/cpuidle.h +++ b/arch/powerpc/include/asm/cpuidle.h @@ -27,10 +27,11 @@ * the THREAD_WINKLE_BITS are set, which indicate which threads have not * yet woken from the winkle state. */ -#define PNV_CORE_IDLE_LOCK_BIT 0x10000000 +#define NR_PNV_CORE_IDLE_LOCK_BIT 28 +#define PNV_CORE_IDLE_LOCK_BIT (1ULL << NR_PNV_CORE_IDLE_LOCK_BIT) +#define PNV_CORE_IDLE_WINKLE_COUNT_SHIFT 16 #define PNV_CORE_IDLE_WINKLE_COUNT 0x00010000 -#define PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT 0x00080000 #define PNV_CORE_IDLE_WINKLE_COUNT_BITS 0x000F0000 #define PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT 8 #define PNV_CORE_IDLE_THREAD_WINKLE_BITS 0x0000FF00 @@ -68,16 +69,6 @@ #define ERR_DEEP_STATE_ESL_MISMATCH -2 #ifndef __ASSEMBLY__ -/* Additional SPRs that need to be saved/restored during stop */ -struct stop_sprs { - u64 pid; - u64 ldbar; - u64 fscr; - u64 hfscr; - u64 mmcr1; - u64 mmcr2; - u64 mmcra; -}; #define PNV_IDLE_NAME_LEN 16 struct pnv_idle_states_t { @@ -92,10 +83,6 @@ struct pnv_idle_states_t { extern struct pnv_idle_states_t *pnv_idle_states; extern int nr_pnv_idle_states; -extern u32 pnv_fastsleep_workaround_at_entry[]; -extern u32 pnv_fastsleep_workaround_at_exit[]; - -extern u64 pnv_first_deep_stop_state; unsigned long pnv_cpu_offline(unsigned int cpu); int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index e843bc5d1a0f2..245d11a71784a 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -173,7 +173,6 @@ struct paca_struct { u8 irq_happened; /* irq happened while soft-disabled */ u8 io_sync; /* writel() needs spin_unlock sync */ u8 irq_work_pending; /* IRQ_WORK interrupt while soft-disable */ - u8 nap_state_lost; /* NV GPR values lost in power7_idle */ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE u8 pmcregs_in_use; /* pseries puts this in lppaca */ #endif @@ -183,23 +182,28 @@ struct paca_struct { #endif #ifdef CONFIG_PPC_POWERNV - /* Per-core mask tracking idle threads and a lock bit-[L][TTTTTTTT] */ - u32 *core_idle_state_ptr; - u8 thread_idle_state; /* PNV_THREAD_RUNNING/NAP/SLEEP */ - /* Mask to indicate thread id in core */ - u8 thread_mask; - /* Mask to denote subcore sibling threads */ - u8 subcore_sibling_mask; - /* Flag to request this thread not to stop */ - atomic_t dont_stop; - /* The PSSCR value that the kernel requested before going to stop */ - u64 requested_psscr; - - /* - * Save area for additional SPRs that need to be - * saved/restored during cpuidle stop. 
- */ - struct stop_sprs stop_sprs; + /* PowerNV idle fields */ + /* PNV_CORE_IDLE_* bits, all siblings work on thread 0 paca */ + unsigned long idle_state; + union { + /* P7/P8 specific fields */ + struct { + /* PNV_THREAD_RUNNING/NAP/SLEEP */ + u8 thread_idle_state; + /* Mask to denote subcore sibling threads */ + u8 subcore_sibling_mask; + }; + + /* P9 specific fields */ + struct { +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + /* The PSSCR value that the kernel requested before going to stop */ + u64 requested_psscr; + /* Flag to request this thread not to stop */ + atomic_t dont_stop; +#endif + }; + }; #endif #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 3351bcf42f2db..3120cca72e1f0 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -411,14 +411,17 @@ static inline unsigned long get_clean_sp(unsigned long sp, int is_32) } #endif +/* asm stubs */ +extern unsigned long isa300_idle_stop_noloss(unsigned long psscr_val); +extern unsigned long isa300_idle_stop_mayloss(unsigned long psscr_val); +extern unsigned long isa206_idle_insn_mayloss(unsigned long type); + extern unsigned long cpuidle_disable; enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF}; extern int powersave_nap; /* set if nap mode can be used in idle loop */ -extern unsigned long power7_idle_insn(unsigned long type); /* PNV_THREAD_NAP/etc*/ + extern void power7_idle_type(unsigned long type); -extern unsigned long power9_idle_stop(unsigned long psscr_val); -extern unsigned long power9_offline_stop(unsigned long psscr_val); extern void power9_idle_type(unsigned long stop_psscr_val, unsigned long stop_psscr_mask); diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index c5b2aff0ce8e1..10caa145f98b8 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -168,6 +168,7 @@ #define PSSCR_ESL 0x00200000 /* Enable State Loss */ #define PSSCR_SD 0x00400000 /* Status Disable */ #define PSSCR_PLS 0xf000000000000000 /* Power-saving Level Status */ +#define PSSCR_PLS_SHIFT 60 #define PSSCR_GUEST_VIS 0xf0000000000003ffUL /* Guest-visible PSSCR fields */ #define PSSCR_FAKE_SUSPEND 0x00000400 /* Fake-suspend bit (P9 DD2.2) */ #define PSSCR_FAKE_SUSPEND_LG 10 /* Fake-suspend bit position */ @@ -758,10 +759,9 @@ #define SRR1_WAKERESET 0x00100000 /* System reset */ #define SRR1_WAKEHDBELL 0x000c0000 /* Hypervisor doorbell on P8 */ #define SRR1_WAKESTATE 0x00030000 /* Powersave exit mask [46:47] */ -#define SRR1_WS_DEEPEST 0x00030000 /* Some resources not maintained, - * may not be recoverable */ -#define SRR1_WS_DEEPER 0x00020000 /* Some resources not maintained */ -#define SRR1_WS_DEEP 0x00010000 /* All resources maintained */ +#define SRR1_WS_HVLOSS 0x00030000 /* HV resources not maintained */ +#define SRR1_WS_GPRLOSS 0x00020000 /* GPRs not maintained */ +#define SRR1_WS_NOLOSS 0x00010000 /* All resources maintained */ #define SRR1_PROGTM 0x00200000 /* TM Bad Thing */ #define SRR1_PROGFPE 0x00100000 /* Floating Point Enabled */ #define SRR1_PROGILL 0x00080000 /* Illegal instruction */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 86a61e5f8285b..83ad99f9f05d0 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -268,7 +268,6 @@ int main(void) OFFSET(ACCOUNT_USER_TIME, paca_struct, accounting.utime); OFFSET(ACCOUNT_SYSTEM_TIME, paca_struct, accounting.stime); OFFSET(PACA_TRAP_SAVE, 
paca_struct, trap_save); - OFFSET(PACA_NAPSTATELOST, paca_struct, nap_state_lost); OFFSET(PACA_SPRG_VDSO, paca_struct, sprg_vdso); #else /* CONFIG_PPC64 */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE @@ -766,23 +765,6 @@ int main(void) OFFSET(VCPU_TIMING_LAST_ENTER_TBL, kvm_vcpu, arch.timing_last_enter.tv32.tbl); #endif -#ifdef CONFIG_PPC_POWERNV - OFFSET(PACA_CORE_IDLE_STATE_PTR, paca_struct, core_idle_state_ptr); - OFFSET(PACA_THREAD_IDLE_STATE, paca_struct, thread_idle_state); - OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask); - OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask); - OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr); - OFFSET(PACA_DONT_STOP, paca_struct, dont_stop); -#define STOP_SPR(x, f) OFFSET(x, paca_struct, stop_sprs.f) - STOP_SPR(STOP_PID, pid); - STOP_SPR(STOP_LDBAR, ldbar); - STOP_SPR(STOP_FSCR, fscr); - STOP_SPR(STOP_HFSCR, hfscr); - STOP_SPR(STOP_MMCR1, mmcr1); - STOP_SPR(STOP_MMCR2, mmcr2); - STOP_SPR(STOP_MMCRA, mmcra); -#endif - DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); DEFINE(PPC_DBELL_MSGTYPE, PPC_DBELL_MSGTYPE); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index a5b8fbae56a03..6247b5bbfa5cf 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -120,7 +120,9 @@ EXC_VIRT_NONE(0x4000, 0x100) mfspr r10,SPRN_SRR1 ; \ rlwinm. r10,r10,47-31,30,31 ; \ beq- 1f ; \ - cmpwi cr3,r10,2 ; \ + cmpwi cr1,r10,2 ; \ + mfspr r3,SPRN_SRR1 ; \ + bltlr cr1 ; /* no state loss, return to idle caller */ \ BRANCH_TO_C000(r10, system_reset_idle_common) ; \ 1: \ KVMTEST_PR(n) ; \ @@ -144,8 +146,11 @@ TRAMP_KVM(PACA_EXNMI, 0x100) #ifdef CONFIG_PPC_P7_NAP EXC_COMMON_BEGIN(system_reset_idle_common) - mfspr r12,SPRN_SRR1 - b pnv_powersave_wakeup + /* + * This must be a direct branch (without linker branch stub) because + * we can not use TOC at this point as r2 may not be restored yet. + */ + b idle_return_gpr_loss #endif /* @@ -427,17 +432,17 @@ EXC_COMMON_BEGIN(machine_check_idle_common) * Then decrement MCE nesting after finishing with the stack. */ ld r3,_MSR(r1) + ld r4,_LINK(r1) lhz r11,PACA_IN_MCE(r13) subi r11,r11,1 sth r11,PACA_IN_MCE(r13) - /* Turn off the RI bit because SRR1 is used by idle wakeup code. */ - /* Recoverability could be improved by reducing the use of SRR1. */ - li r11,0 - mtmsrd r11,1 - - b pnv_powersave_wakeup_mce + mtlr r4 + rlwinm r10,r3,47-31,30,31 + cmpwi cr1,r10,2 + bltlr cr1 /* no state loss, return to idle caller */ + b idle_return_gpr_loss #endif /* * Handle machine check early in real mode. We come here with diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 7f5ac2e8581be..2dfbd5d5b932b 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -1,956 +1,188 @@ /* - * This file contains idle entry/exit functions for POWER7, - * POWER8 and POWER9 CPUs. + * Copyright 2018, IBM Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. + * + * This file contains general idle entry/exit functions to save + * and restore stack and NVGPRs which allows C code to call idle + * states that lose GPRs, and it will return transparently with + * SRR1 wakeup reason return value. 
+ * + * The platform / CPU caller must ensure SPRs and any other non-GPR + * state is saved and restored correctly, handle KVM, interrupts, etc. */ -#include -#include -#include -#include -#include #include #include #include -#include -#include -#include #include -#include -#include -#include -#include -#include - -#undef DEBUG - -/* - * Use unused space in the interrupt stack to save and restore - * registers for winkle support. - */ -#define _MMCR0 GPR0 -#define _SDR1 GPR3 -#define _PTCR GPR3 -#define _RPR GPR4 -#define _SPURR GPR5 -#define _PURR GPR6 -#define _TSCR GPR7 -#define _DSCR GPR8 -#define _AMOR GPR9 -#define _WORT GPR10 -#define _WORC GPR11 -#define _LPCR GPR12 - -#define PSSCR_EC_ESL_MASK_SHIFTED (PSSCR_EC | PSSCR_ESL) >> 16 - .text - -/* - * Used by threads before entering deep idle states. Saves SPRs - * in interrupt stack frame - */ -save_sprs_to_stack: - /* - * Note all register i.e per-core, per-subcore or per-thread is saved - * here since any thread in the core might wake up first - */ -BEGIN_FTR_SECTION - /* - * Note - SDR1 is dropped in Power ISA v3. Hence not restoring - * SDR1 here - */ - mfspr r3,SPRN_PTCR - std r3,_PTCR(r1) - mfspr r3,SPRN_LPCR - std r3,_LPCR(r1) -FTR_SECTION_ELSE - mfspr r3,SPRN_SDR1 - std r3,_SDR1(r1) -ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) - mfspr r3,SPRN_RPR - std r3,_RPR(r1) - mfspr r3,SPRN_SPURR - std r3,_SPURR(r1) - mfspr r3,SPRN_PURR - std r3,_PURR(r1) - mfspr r3,SPRN_TSCR - std r3,_TSCR(r1) - mfspr r3,SPRN_DSCR - std r3,_DSCR(r1) - mfspr r3,SPRN_AMOR - std r3,_AMOR(r1) - mfspr r3,SPRN_WORT - std r3,_WORT(r1) - mfspr r3,SPRN_WORC - std r3,_WORC(r1) /* - * On POWER9, there are idle states such as stop4, invoked via cpuidle, - * that lose hypervisor resources. In such cases, we need to save - * additional SPRs before entering those idle states so that they can - * be restored to their older values on wakeup from the idle state. + * Desired PSSCR in r3 * - * On POWER8, the only such deep idle state is winkle which is used - * only in the context of CPU-Hotplug, where these additional SPRs are - * reinitiazed to a sane value. Hence there is no need to save/restore - * these SPRs. + * No state will be lost regardless of wakeup mechanism (interrupt or NIA). + * + * An EC=0 type wakeup will return with a value of 0. SRESET wakeup (which can + * happen with xscom SRESET and possibly MCE) may clobber volatiles except LR, + * and must blr, to return to caller with r3 set according to caller's expected + * return code (for Book3S/64 that is SRR1). 
*/ -BEGIN_FTR_SECTION - blr -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) - -power9_save_additional_sprs: - mfspr r3, SPRN_PID - mfspr r4, SPRN_LDBAR - std r3, STOP_PID(r13) - std r4, STOP_LDBAR(r13) - - mfspr r3, SPRN_FSCR - mfspr r4, SPRN_HFSCR - std r3, STOP_FSCR(r13) - std r4, STOP_HFSCR(r13) - - mfspr r3, SPRN_MMCRA - mfspr r4, SPRN_MMCR0 - std r3, STOP_MMCRA(r13) - std r4, _MMCR0(r1) - - mfspr r3, SPRN_MMCR1 - mfspr r4, SPRN_MMCR2 - std r3, STOP_MMCR1(r13) - std r4, STOP_MMCR2(r13) - blr - -power9_restore_additional_sprs: - ld r3,_LPCR(r1) - ld r4, STOP_PID(r13) - mtspr SPRN_LPCR,r3 - mtspr SPRN_PID, r4 - - ld r3, STOP_LDBAR(r13) - ld r4, STOP_FSCR(r13) - mtspr SPRN_LDBAR, r3 - mtspr SPRN_FSCR, r4 - - ld r3, STOP_HFSCR(r13) - ld r4, STOP_MMCRA(r13) - mtspr SPRN_HFSCR, r3 - mtspr SPRN_MMCRA, r4 - - ld r3, _MMCR0(r1) - ld r4, STOP_MMCR1(r13) - mtspr SPRN_MMCR0, r3 - mtspr SPRN_MMCR1, r4 - - ld r3, STOP_MMCR2(r13) - ld r4, PACA_SPRG_VDSO(r13) - mtspr SPRN_MMCR2, r3 - mtspr SPRN_SPRG3, r4 +_GLOBAL(isa300_idle_stop_noloss) + mtspr SPRN_PSSCR,r3 + PPC_STOP + li r3,0 blr /* - * Used by threads when the lock bit of core_idle_state is set. - * Threads will spin in HMT_LOW until the lock bit is cleared. - * r14 - pointer to core_idle_state - * r15 - used to load contents of core_idle_state - * r9 - used as a temporary variable + * Desired PSSCR in r3 + * + * GPRs may be lost, so they are saved here. Wakeup is by interrupt only. + * The SRESET wakeup returns to this function's caller by calling + * idle_return_gpr_loss with r3 set to desired return value. + * + * A wakeup without GPR loss may alteratively be handled as in + * isa300_idle_stop_noloss and blr directly, as an optimisation. + * + * The caller is responsible for saving/restoring SPRs, MSR, timebase, + * etc. */ - -core_idle_lock_held: - HMT_LOW -3: lwz r15,0(r14) - andis. r15,r15,PNV_CORE_IDLE_LOCK_BIT@h - bne 3b - HMT_MEDIUM - lwarx r15,0,r14 - andis. r9,r15,PNV_CORE_IDLE_LOCK_BIT@h - bne- core_idle_lock_held - blr +_GLOBAL(isa300_idle_stop_mayloss) + mtspr SPRN_PSSCR,r3 + std r1,PACAR1(r13) + mflr r4 + mfcr r5 + /* use stack red zone rather than a new frame for saving regs */ + std r2,-8*0(r1) + std r14,-8*1(r1) + std r15,-8*2(r1) + std r16,-8*3(r1) + std r17,-8*4(r1) + std r18,-8*5(r1) + std r19,-8*6(r1) + std r20,-8*7(r1) + std r21,-8*8(r1) + std r22,-8*9(r1) + std r23,-8*10(r1) + std r24,-8*11(r1) + std r25,-8*12(r1) + std r26,-8*13(r1) + std r27,-8*14(r1) + std r28,-8*15(r1) + std r29,-8*16(r1) + std r30,-8*17(r1) + std r31,-8*18(r1) + std r4,-8*19(r1) + std r5,-8*20(r1) + /* 168 bytes */ + PPC_STOP + b . /* catch bugs */ /* - * Pass requested state in r3: - * r3 - PNV_THREAD_NAP/SLEEP/WINKLE in POWER8 - * - Requested PSSCR value in POWER9 + * Desired return value in r3 + * + * The idle wakeup SRESET interrupt can call this after calling + * to return to the idle sleep function caller with r3 as the return code. * - * Address of idle handler to branch to in realmode in r4 + * This must not be used if idle was entered via a _noloss function (use + * a simple blr instead). */ -pnv_powersave_common: - /* Use r3 to pass state nap/sleep/winkle */ - /* NAP is a state loss, we create a regs frame on the - * stack, fill it up with the state we care about and - * stick a pointer to it in PACAR1. We really only - * need to save PC, some CR bits and the NV GPRs, - * but for now an interrupt frame will do. 
- */ - mtctr r4 - - mflr r0 - std r0,16(r1) - stdu r1,-INT_FRAME_SIZE(r1) - std r0,_LINK(r1) - std r0,_NIP(r1) - - /* We haven't lost state ... yet */ - li r0,0 - stb r0,PACA_NAPSTATELOST(r13) - - /* Continue saving state */ - SAVE_GPR(2, r1) - SAVE_NVGPRS(r1) - mfcr r5 - std r5,_CCR(r1) - std r1,PACAR1(r13) - -BEGIN_FTR_SECTION - /* - * POWER9 does not require real mode to stop, and presently does not - * set hwthread_state for KVM (threads don't share MMU context), so - * we can remain in virtual mode for this. - */ - bctr -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) - /* - * POWER8 - * Go to real mode to do the nap, as required by the architecture. - * Also, we need to be in real mode before setting hwthread_state, - * because as soon as we do that, another thread can switch - * the MMU context to the guest. - */ - LOAD_REG_IMMEDIATE(r7, MSR_IDLE) - mtmsrd r7,0 - bctr +_GLOBAL(idle_return_gpr_loss) + ld r1,PACAR1(r13) + ld r4,-8*19(r1) + ld r5,-8*20(r1) + mtlr r4 + mtcr r5 + /* + * KVM nap requires r2 to be saved, rather than just restoring it + * from PACATOC. This could be avoided for that less common case + * if KVM saved its r2. + */ + ld r2,-8*0(r1) + ld r14,-8*1(r1) + ld r15,-8*2(r1) + ld r16,-8*3(r1) + ld r17,-8*4(r1) + ld r18,-8*5(r1) + ld r19,-8*6(r1) + ld r20,-8*7(r1) + ld r21,-8*8(r1) + ld r22,-8*9(r1) + ld r23,-8*10(r1) + ld r24,-8*11(r1) + ld r25,-8*12(r1) + ld r26,-8*13(r1) + ld r27,-8*14(r1) + ld r28,-8*15(r1) + ld r29,-8*16(r1) + ld r30,-8*17(r1) + ld r31,-8*18(r1) + blr /* * This is the sequence required to execute idle instructions, as * specified in ISA v2.07 (and earlier). MSR[IR] and MSR[DR] must be 0. + * + * The 0(r1) slot is used to save r2 in isa206, so use that here. */ #define IDLE_STATE_ENTER_SEQ_NORET(IDLE_INST) \ /* Magic NAP/SLEEP/WINKLE mode enter sequence */ \ - std r0,0(r1); \ + std r2,0(r1); \ ptesync; \ - ld r0,0(r1); \ -236: cmpd cr0,r0,r0; \ + ld r2,0(r1); \ +236: cmpd cr0,r2,r2; \ bne 236b; \ - IDLE_INST; - - - .globl pnv_enter_arch207_idle_mode -pnv_enter_arch207_idle_mode: -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - /* Tell KVM we're entering idle */ - li r4,KVM_HWTHREAD_IN_IDLE - /******************************************************/ - /* N O T E W E L L ! ! ! N O T E W E L L */ - /* The following store to HSTATE_HWTHREAD_STATE(r13) */ - /* MUST occur in real mode, i.e. with the MMU off, */ - /* and the MMU must stay off until we clear this flag */ - /* and test HSTATE_HWTHREAD_REQ(r13) in */ - /* pnv_powersave_wakeup in this file. */ - /* The reason is that another thread can switch the */ - /* MMU to a guest context whenever this flag is set */ - /* to KVM_HWTHREAD_IN_IDLE, and if the MMU was on, */ - /* that would potentially cause this thread to start */ - /* executing instructions from guest memory in */ - /* hypervisor mode, leading to a host crash or data */ - /* corruption, or worse. */ - /******************************************************/ - stb r4,HSTATE_HWTHREAD_STATE(r13) -#endif - stb r3,PACA_THREAD_IDLE_STATE(r13) - cmpwi cr3,r3,PNV_THREAD_SLEEP - bge cr3,2f - IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP) - /* No return */ -2: - /* Sleep or winkle */ - lbz r7,PACA_THREAD_MASK(r13) - ld r14,PACA_CORE_IDLE_STATE_PTR(r13) - li r5,0 - beq cr3,3f - lis r5,PNV_CORE_IDLE_WINKLE_COUNT@h -3: -lwarx_loop1: - lwarx r15,0,r14 - - andis. r9,r15,PNV_CORE_IDLE_LOCK_BIT@h - bnel- core_idle_lock_held - - add r15,r15,r5 /* Add if winkle */ - andc r15,r15,r7 /* Clear thread bit */ - - andi. 
r9,r15,PNV_CORE_IDLE_THREAD_BITS - -/* - * If cr0 = 0, then current thread is the last thread of the core entering - * sleep. Last thread needs to execute the hardware bug workaround code if - * required by the platform. - * Make the workaround call unconditionally here. The below branch call is - * patched out when the idle states are discovered if the platform does not - * require it. - */ -.global pnv_fastsleep_workaround_at_entry -pnv_fastsleep_workaround_at_entry: - beq fastsleep_workaround_at_entry - - stwcx. r15,0,r14 - bne- lwarx_loop1 - isync - -common_enter: /* common code for all the threads entering sleep or winkle */ - bgt cr3,enter_winkle - IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP) - -fastsleep_workaround_at_entry: - oris r15,r15,PNV_CORE_IDLE_LOCK_BIT@h - stwcx. r15,0,r14 - bne- lwarx_loop1 - isync - - /* Fast sleep workaround */ - li r3,1 - li r4,1 - bl opal_config_cpu_idle_state - - /* Unlock */ - xoris r15,r15,PNV_CORE_IDLE_LOCK_BIT@h - lwsync - stw r15,0(r14) - b common_enter - -enter_winkle: - bl save_sprs_to_stack - - IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE) - -/* - * r3 - PSSCR value corresponding to the requested stop state. - */ -power_enter_stop: -/* - * Check if we are executing the lite variant with ESL=EC=0 - */ - andis. r4,r3,PSSCR_EC_ESL_MASK_SHIFTED - clrldi r3,r3,60 /* r3 = Bits[60:63] = Requested Level (RL) */ - bne .Lhandle_esl_ec_set - PPC_STOP - li r3,0 /* Since we didn't lose state, return 0 */ - std r3, PACA_REQ_PSSCR(r13) - - /* - * pnv_wakeup_noloss() expects r12 to contain the SRR1 value so - * it can determine if the wakeup reason is an HMI in - * CHECK_HMI_INTERRUPT. - * - * However, when we wakeup with ESL=0, SRR1 will not contain the wakeup - * reason, so there is no point setting r12 to SRR1. - * - * Further, we clear r12 here, so that we don't accidentally enter the - * HMI in pnv_wakeup_noloss() if the value of r12[42:45] == WAKE_HMI. - */ - li r12, 0 - b pnv_wakeup_noloss - -.Lhandle_esl_ec_set: -BEGIN_FTR_SECTION - /* - * POWER9 DD2.0 or earlier can incorrectly set PMAO when waking up after - * a state-loss idle. Saving and restoring MMCR0 over idle is a - * workaround. - */ - mfspr r4,SPRN_MMCR0 - std r4,_MMCR0(r1) -END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1) + IDLE_INST; \ + b . /* catch bugs */ /* - * Check if the requested state is a deep idle state. - */ - LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state) - ld r4,ADDROFF(pnv_first_deep_stop_state)(r5) - cmpd r3,r4 - bge .Lhandle_deep_stop - PPC_STOP /* Does not return (system reset interrupt) */ - -.Lhandle_deep_stop: -/* - * Entering deep idle state. - * Clear thread bit in PACA_CORE_IDLE_STATE, save SPRs to - * stack and enter stop - */ - lbz r7,PACA_THREAD_MASK(r13) - ld r14,PACA_CORE_IDLE_STATE_PTR(r13) - -lwarx_loop_stop: - lwarx r15,0,r14 - andis. r9,r15,PNV_CORE_IDLE_LOCK_BIT@h - bnel- core_idle_lock_held - andc r15,r15,r7 /* Clear thread bit */ - - stwcx. r15,0,r14 - bne- lwarx_loop_stop - isync - - bl save_sprs_to_stack - - PPC_STOP /* Does not return (system reset interrupt) */ - -/* - * Entered with MSR[EE]=0 and no soft-masked interrupts pending. - * r3 contains desired idle state (PNV_THREAD_NAP/SLEEP/WINKLE). 
- */ -_GLOBAL(power7_idle_insn) - /* Now check if user or arch enabled NAP mode */ - LOAD_REG_ADDR(r4, pnv_enter_arch207_idle_mode) - b pnv_powersave_common - -#define CHECK_HMI_INTERRUPT \ -BEGIN_FTR_SECTION_NESTED(66); \ - rlwinm r0,r12,45-31,0xf; /* extract wake reason field (P8) */ \ -FTR_SECTION_ELSE_NESTED(66); \ - rlwinm r0,r12,45-31,0xe; /* P7 wake reason field is 3 bits */ \ -ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ - cmpwi r0,0xa; /* Hypervisor maintenance ? */ \ - bne+ 20f; \ - /* Invoke opal call to handle hmi */ \ - ld r2,PACATOC(r13); \ - ld r1,PACAR1(r13); \ - std r3,ORIG_GPR3(r1); /* Save original r3 */ \ - li r3,0; /* NULL argument */ \ - bl hmi_exception_realmode; \ - nop; \ - ld r3,ORIG_GPR3(r1); /* Restore original r3 */ \ -20: nop; - -/* - * Entered with MSR[EE]=0 and no soft-masked interrupts pending. - * r3 contains desired PSSCR register value. + * Desired instruction type in r3 * - * Offline (CPU unplug) case also must notify KVM that the CPU is - * idle. - */ -_GLOBAL(power9_offline_stop) -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - /* - * Tell KVM we're entering idle. - * This does not have to be done in real mode because the P9 MMU - * is independent per-thread. Some steppings share radix/hash mode - * between threads, but in that case KVM has a barrier sync in real - * mode before and after switching between radix and hash. - */ - li r4,KVM_HWTHREAD_IN_IDLE - stb r4,HSTATE_HWTHREAD_STATE(r13) -#endif - /* fall through */ - -_GLOBAL(power9_idle_stop) - std r3, PACA_REQ_PSSCR(r13) -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -BEGIN_FTR_SECTION - sync - lwz r5, PACA_DONT_STOP(r13) - cmpwi r5, 0 - bne 1f -END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) -#endif - mtspr SPRN_PSSCR,r3 - LOAD_REG_ADDR(r4,power_enter_stop) - b pnv_powersave_common - /* No return */ -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -1: - /* - * We get here when TM / thread reconfiguration bug workaround - * code wants to get the CPU into SMT4 mode, and therefore - * we are being asked not to stop. - */ - li r3, 0 - std r3, PACA_REQ_PSSCR(r13) - blr /* return 0 for wakeup cause / SRR1 value */ -#endif - -/* - * Called from machine check handler for powersave wakeups. - * Low level machine check processing has already been done. Now just - * go through the wake up path to get everything in order. + * GPRs may be lost, so they are saved here. Wakeup is by interrupt only. + * The SRESET wakeup returns to this function's caller by calling + * idle_return_gpr_loss with r3 set to desired return value. * - * r3 - The original SRR1 value. - * Original SRR[01] have been clobbered. - * MSR_RI is clear. - */ -.global pnv_powersave_wakeup_mce -pnv_powersave_wakeup_mce: - /* Set cr3 for pnv_powersave_wakeup */ - rlwinm r11,r3,47-31,30,31 - cmpwi cr3,r11,2 - - /* - * Now put the original SRR1 with SRR1_WAKEMCE_RESVD as the wake - * reason into r12, which allows reuse of the system reset wakeup - * code without being mistaken for another type of wakeup. - */ - oris r12,r3,SRR1_WAKEMCE_RESVD@h - - b pnv_powersave_wakeup - -/* - * Called from reset vector for powersave wakeups. 
- * cr3 - set to gt if waking up with partial/complete hypervisor state loss - * r12 - SRR1 - */ -.global pnv_powersave_wakeup -pnv_powersave_wakeup: - ld r2, PACATOC(r13) - -BEGIN_FTR_SECTION - bl pnv_restore_hyp_resource_arch300 -FTR_SECTION_ELSE - bl pnv_restore_hyp_resource_arch207 -ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) - - li r0,PNV_THREAD_RUNNING - stb r0,PACA_THREAD_IDLE_STATE(r13) /* Clear thread state */ - - mr r3,r12 - -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - lbz r0,HSTATE_HWTHREAD_STATE(r13) - cmpwi r0,KVM_HWTHREAD_IN_KERNEL - beq 0f - li r0,KVM_HWTHREAD_IN_KERNEL - stb r0,HSTATE_HWTHREAD_STATE(r13) - /* Order setting hwthread_state vs. testing hwthread_req */ - sync -0: lbz r0,HSTATE_HWTHREAD_REQ(r13) - cmpwi r0,0 - beq 1f - b kvm_start_guest -1: -#endif - - /* Return SRR1 from power7_nap() */ - blt cr3,pnv_wakeup_noloss - b pnv_wakeup_loss - -/* - * Check whether we have woken up with hypervisor state loss. - * If yes, restore hypervisor state and return back to link. + * A wakeup without GPR loss may alteratively be handled as in + * isa300_idle_stop_noloss and blr directly, as an optimisation. * - * cr3 - set to gt if waking up with partial/complete hypervisor state loss - */ -pnv_restore_hyp_resource_arch300: - /* - * Workaround for POWER9, if we lost resources, the ERAT - * might have been mixed up and needs flushing. We also need - * to reload MMCR0 (see comment above). We also need to set - * then clear bit 60 in MMCRA to ensure the PMU starts running. - */ - blt cr3,1f -BEGIN_FTR_SECTION - PPC_INVALIDATE_ERAT - ld r1,PACAR1(r13) - ld r4,_MMCR0(r1) - mtspr SPRN_MMCR0,r4 -END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1) - mfspr r4,SPRN_MMCRA - ori r4,r4,(1 << (63-60)) - mtspr SPRN_MMCRA,r4 - xori r4,r4,(1 << (63-60)) - mtspr SPRN_MMCRA,r4 -1: - /* - * POWER ISA 3. Use PSSCR to determine if we - * are waking up from deep idle state - */ - LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state) - ld r4,ADDROFF(pnv_first_deep_stop_state)(r5) - - /* - * 0-3 bits correspond to Power-Saving Level Status - * which indicates the idle state we are waking up from - */ - mfspr r5, SPRN_PSSCR - rldicl r5,r5,4,60 - li r0, 0 /* clear requested_psscr to say we're awake */ - std r0, PACA_REQ_PSSCR(r13) - cmpd cr4,r5,r4 - bge cr4,pnv_wakeup_tb_loss /* returns to caller */ - - blr /* Waking up without hypervisor state loss. */ - -/* Same calling convention as arch300 */ -pnv_restore_hyp_resource_arch207: - /* - * POWER ISA 2.07 or less. - * Check if we slept with sleep or winkle. - */ - lbz r4,PACA_THREAD_IDLE_STATE(r13) - cmpwi cr2,r4,PNV_THREAD_NAP - bgt cr2,pnv_wakeup_tb_loss /* Either sleep or Winkle */ - - /* - * We fall through here if PACA_THREAD_IDLE_STATE shows we are waking - * up from nap. At this stage CR3 shouldn't contains 'gt' since that - * indicates we are waking with hypervisor state loss from nap. - */ - bgt cr3,. - - blr /* Waking up without hypervisor state loss */ - -/* - * Called if waking up from idle state which can cause either partial or - * complete hyp state loss. - * In POWER8, called if waking up from fastsleep or winkle - * In POWER9, called if waking up from stop state >= pnv_first_deep_stop_state - * - * r13 - PACA - * cr3 - gt if waking up with partial/complete hypervisor state loss - * - * If ISA300: - * cr4 - gt or eq if waking up from complete hypervisor state loss. + * The caller is responsible for saving/restoring SPRs, MSR, timebase, + * etc. * - * If ISA207: - * r4 - PACA_THREAD_IDLE_STATE + * This must be called in real-mode (MSR_IDLE). 
*/ -pnv_wakeup_tb_loss: - ld r1,PACAR1(r13) - /* - * Before entering any idle state, the NVGPRs are saved in the stack. - * If there was a state loss, or PACA_NAPSTATELOST was set, then the - * NVGPRs are restored. If we are here, it is likely that state is lost, - * but not guaranteed -- neither ISA207 nor ISA300 tests to reach - * here are the same as the test to restore NVGPRS: - * PACA_THREAD_IDLE_STATE test for ISA207, PSSCR test for ISA300, - * and SRR1 test for restoring NVGPRs. - * - * We are about to clobber NVGPRs now, so set NAPSTATELOST to - * guarantee they will always be restored. This might be tightened - * with careful reading of specs (particularly for ISA300) but this - * is already a slow wakeup path and it's simpler to be safe. - */ - li r0,1 - stb r0,PACA_NAPSTATELOST(r13) - - /* - * - * Save SRR1 and LR in NVGPRs as they might be clobbered in - * opal_call() (called in CHECK_HMI_INTERRUPT). SRR1 is required - * to determine the wakeup reason if we branch to kvm_start_guest. LR - * is required to return back to reset vector after hypervisor state - * restore is complete. - */ - mr r19,r12 - mr r18,r4 - mflr r17 -BEGIN_FTR_SECTION - CHECK_HMI_INTERRUPT -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) - - ld r14,PACA_CORE_IDLE_STATE_PTR(r13) - lbz r7,PACA_THREAD_MASK(r13) - - /* - * Take the core lock to synchronize against other threads. - * - * Lock bit is set in one of the 2 cases- - * a. In the sleep/winkle enter path, the last thread is executing - * fastsleep workaround code. - * b. In the wake up path, another thread is executing fastsleep - * workaround undo code or resyncing timebase or restoring context - * In either case loop until the lock bit is cleared. - */ -1: - lwarx r15,0,r14 - andis. r9,r15,PNV_CORE_IDLE_LOCK_BIT@h - bnel- core_idle_lock_held - oris r15,r15,PNV_CORE_IDLE_LOCK_BIT@h - stwcx. r15,0,r14 - bne- 1b - isync - - andi. r9,r15,PNV_CORE_IDLE_THREAD_BITS - cmpwi cr2,r9,0 - - /* - * At this stage - * cr2 - eq if first thread to wakeup in core - * cr3- gt if waking up with partial/complete hypervisor state loss - * ISA300: - * cr4 - gt or eq if waking up from complete hypervisor state loss. - */ - -BEGIN_FTR_SECTION - /* - * Were we in winkle? - * If yes, check if all threads were in winkle, decrement our - * winkle count, set all thread winkle bits if all were in winkle. - * Check if our thread has a winkle bit set, and set cr4 accordingly - * (to match ISA300, above). 
Pseudo-code for core idle state - * transitions for ISA207 is as follows (everything happens atomically - * due to store conditional and/or lock bit): - * - * nap_idle() { } - * nap_wake() { } - * - * sleep_idle() - * { - * core_idle_state &= ~thread_in_core - * } - * - * sleep_wake() - * { - * bool first_in_core, first_in_subcore; - * - * first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0; - * first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0; - * - * core_idle_state |= thread_in_core; - * } - * - * winkle_idle() - * { - * core_idle_state &= ~thread_in_core; - * core_idle_state += 1 << WINKLE_COUNT_SHIFT; - * } - * - * winkle_wake() - * { - * bool first_in_core, first_in_subcore, winkle_state_lost; - * - * first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0; - * first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0; - * - * core_idle_state |= thread_in_core; - * - * if ((core_idle_state & WINKLE_MASK) == (8 << WINKLE_COUNT_SIHFT)) - * core_idle_state |= THREAD_WINKLE_BITS; - * core_idle_state -= 1 << WINKLE_COUNT_SHIFT; - * - * winkle_state_lost = core_idle_state & - * (thread_in_core << WINKLE_THREAD_SHIFT); - * core_idle_state &= ~(thread_in_core << WINKLE_THREAD_SHIFT); - * } - * - */ - cmpwi r18,PNV_THREAD_WINKLE +_GLOBAL(isa206_idle_insn_mayloss) + std r1,PACAR1(r13) + mflr r4 + mfcr r5 + /* use stack red zone rather than a new frame for saving regs */ + std r2,-8*0(r1) + std r14,-8*1(r1) + std r15,-8*2(r1) + std r16,-8*3(r1) + std r17,-8*4(r1) + std r18,-8*5(r1) + std r19,-8*6(r1) + std r20,-8*7(r1) + std r21,-8*8(r1) + std r22,-8*9(r1) + std r23,-8*10(r1) + std r24,-8*11(r1) + std r25,-8*12(r1) + std r26,-8*13(r1) + std r27,-8*14(r1) + std r28,-8*15(r1) + std r29,-8*16(r1) + std r30,-8*17(r1) + std r31,-8*18(r1) + std r4,-8*19(r1) + std r5,-8*20(r1) + cmpwi r3,PNV_THREAD_NAP + bne 1f + IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP) +1: cmpwi r3,PNV_THREAD_SLEEP bne 2f - andis. r9,r15,PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT@h - subis r15,r15,PNV_CORE_IDLE_WINKLE_COUNT@h - beq 2f - ori r15,r15,PNV_CORE_IDLE_THREAD_WINKLE_BITS /* all were winkle */ -2: - /* Shift thread bit to winkle mask, then test if this thread is set, - * and remove it from the winkle bits */ - slwi r8,r7,8 - and r8,r8,r15 - andc r15,r15,r8 - cmpwi cr4,r8,1 /* cr4 will be gt if our bit is set, lt if not */ - - lbz r4,PACA_SUBCORE_SIBLING_MASK(r13) - and r4,r4,r15 - cmpwi r4,0 /* Check if first in subcore */ - - or r15,r15,r7 /* Set thread bit */ - beq first_thread_in_subcore -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) - - or r15,r15,r7 /* Set thread bit */ - beq cr2,first_thread_in_core - - /* Not first thread in core or subcore to wake up */ - b clear_lock - -first_thread_in_subcore: - /* - * If waking up from sleep, subcore state is not lost. Hence - * skip subcore state restore - */ - blt cr4,subcore_state_restored - - /* Restore per-subcore state */ - ld r4,_SDR1(r1) - mtspr SPRN_SDR1,r4 - - ld r4,_RPR(r1) - mtspr SPRN_RPR,r4 - ld r4,_AMOR(r1) - mtspr SPRN_AMOR,r4 - -subcore_state_restored: - /* - * Check if the thread is also the first thread in the core. If not, - * skip to clear_lock. - */ - bne cr2,clear_lock - -first_thread_in_core: - - /* - * First thread in the core waking up from any state which can cause - * partial or complete hypervisor state loss. It needs to - * call the fastsleep workaround code if the platform requires it. - * Call it unconditionally here. 
The below branch instruction will - * be patched out if the platform does not have fastsleep or does not - * require the workaround. Patching will be performed during the - * discovery of idle-states. - */ -.global pnv_fastsleep_workaround_at_exit -pnv_fastsleep_workaround_at_exit: - b fastsleep_workaround_at_exit - -timebase_resync: - /* - * Use cr3 which indicates that we are waking up with atleast partial - * hypervisor state loss to determine if TIMEBASE RESYNC is needed. - */ - ble cr3,.Ltb_resynced - /* Time base re-sync */ - bl opal_resync_timebase; - /* - * If waking up from sleep (POWER8), per core state - * is not lost, skip to clear_lock. - */ -.Ltb_resynced: - blt cr4,clear_lock - - /* - * First thread in the core to wake up and its waking up with - * complete hypervisor state loss. Restore per core hypervisor - * state. - */ -BEGIN_FTR_SECTION - ld r4,_PTCR(r1) - mtspr SPRN_PTCR,r4 - ld r4,_RPR(r1) - mtspr SPRN_RPR,r4 - ld r4,_AMOR(r1) - mtspr SPRN_AMOR,r4 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) - - ld r4,_TSCR(r1) - mtspr SPRN_TSCR,r4 - ld r4,_WORC(r1) - mtspr SPRN_WORC,r4 - -clear_lock: - xoris r15,r15,PNV_CORE_IDLE_LOCK_BIT@h - lwsync - stw r15,0(r14) - -common_exit: - /* - * Common to all threads. - * - * If waking up from sleep, hypervisor state is not lost. Hence - * skip hypervisor state restore. - */ - blt cr4,hypervisor_state_restored - - /* Waking up from winkle */ - -BEGIN_MMU_FTR_SECTION - b no_segments -END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) - /* Restore SLB from PACA */ - ld r8,PACA_SLBSHADOWPTR(r13) - - .rept SLB_NUM_BOLTED - li r3, SLBSHADOW_SAVEAREA - LDX_BE r5, r8, r3 - addi r3, r3, 8 - LDX_BE r6, r8, r3 - andis. r7,r5,SLB_ESID_V@h - beq 1f - slbmte r6,r5 -1: addi r8,r8,16 - .endr -no_segments: - - /* Restore per thread state */ - - ld r4,_SPURR(r1) - mtspr SPRN_SPURR,r4 - ld r4,_PURR(r1) - mtspr SPRN_PURR,r4 - ld r4,_DSCR(r1) - mtspr SPRN_DSCR,r4 - ld r4,_WORT(r1) - mtspr SPRN_WORT,r4 - - /* Call cur_cpu_spec->cpu_restore() */ - LOAD_REG_ADDR(r4, cur_cpu_spec) - ld r4,0(r4) - ld r12,CPU_SPEC_RESTORE(r4) -#ifdef PPC64_ELF_ABI_v1 - ld r12,0(r12) -#endif - mtctr r12 - bctrl - -/* - * On POWER9, we can come here on wakeup from a cpuidle stop state. - * Hence restore the additional SPRs to the saved value. - * - * On POWER8, we come here only on winkle. Since winkle is used - * only in the case of CPU-Hotplug, we don't need to restore - * the additional SPRs. - */ -BEGIN_FTR_SECTION - bl power9_restore_additional_sprs -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) -hypervisor_state_restored: - - mr r12,r19 - mtlr r17 - blr /* return to pnv_powersave_wakeup */ - -fastsleep_workaround_at_exit: - li r3,1 - li r4,0 - bl opal_config_cpu_idle_state - b timebase_resync - -/* - * R3 here contains the value that will be returned to the caller - * of power7_nap. - * R12 contains SRR1 for CHECK_HMI_INTERRUPT. - */ -.global pnv_wakeup_loss -pnv_wakeup_loss: - ld r1,PACAR1(r13) -BEGIN_FTR_SECTION - CHECK_HMI_INTERRUPT -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) - REST_NVGPRS(r1) - REST_GPR(2, r1) - ld r4,PACAKMSR(r13) - ld r5,_LINK(r1) - ld r6,_CCR(r1) - addi r1,r1,INT_FRAME_SIZE - mtlr r5 - mtcr r6 - mtmsrd r4 - blr + IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP) +2: IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE) -/* - * R3 here contains the value that will be returned to the caller - * of power7_nap. - * R12 contains SRR1 for CHECK_HMI_INTERRUPT. 
- */ -pnv_wakeup_noloss: - lbz r0,PACA_NAPSTATELOST(r13) - cmpwi r0,0 - bne pnv_wakeup_loss - ld r1,PACAR1(r13) -BEGIN_FTR_SECTION - CHECK_HMI_INTERRUPT -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) - ld r4,PACAKMSR(r13) - ld r5,_NIP(r1) - ld r6,_CCR(r1) - addi r1,r1,INT_FRAME_SIZE - mtlr r5 - mtcr r6 - mtmsrd r4 - blr diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 2e5dfb6e08239..8b4858f822293 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -401,8 +401,8 @@ void __init check_for_initrd(void) #ifdef CONFIG_SMP -int threads_per_core, threads_per_subcore, threads_shift; -cpumask_t threads_core_mask; +int threads_per_core, threads_per_subcore, threads_shift __read_mostly; +cpumask_t threads_core_mask __read_mostly; EXPORT_SYMBOL_GPL(threads_per_core); EXPORT_SYMBOL_GPL(threads_per_subcore); EXPORT_SYMBOL_GPL(threads_shift); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 139027c62dc2f..dd014308f0650 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -35,6 +35,7 @@ #include #include #include +#include /* Sign-extend HDEC if not on POWER9 */ #define EXTEND_HDEC(reg) \ @@ -45,6 +46,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) /* Values in HSTATE_NAPPING(r13) */ #define NAPPING_CEDE 1 #define NAPPING_NOVCPU 2 +#define NAPPING_UNSPLIT 3 /* Stack frame offsets for kvmppc_hv_entry */ #define SFS 208 @@ -290,17 +292,19 @@ kvm_novcpu_exit: b kvmhv_switch_to_host /* - * We come in here when wakened from nap mode. - * Relocation is off and most register values are lost. - * r13 points to the PACA. + * We come in here when wakened from Linux offline idle code. + * Relocation is off * r3 contains the SRR1 wakeup value, SRR1 is trashed. */ - .globl kvm_start_guest -kvm_start_guest: - /* Set runlatch bit the minute you wake up from nap */ - mfspr r0, SPRN_CTRLF - ori r0, r0, 1 - mtspr SPRN_CTRLT, r0 +_GLOBAL(idle_kvm_start_guest) + ld r4,PACAEMERGSP(r13) + mfcr r5 + mflr r0 + std r1,0(r4) + std r5,8(r4) + std r0,16(r4) + subi r1,r4,STACK_FRAME_OVERHEAD + SAVE_NVGPRS(r1) /* * Could avoid this and pass it through in r3. For now, @@ -308,27 +312,23 @@ kvm_start_guest: */ mtspr SPRN_SRR1,r3 - ld r2,PACATOC(r13) - li r0,0 stb r0,PACA_FTRACE_ENABLED(r13) li r0,KVM_HWTHREAD_IN_KVM stb r0,HSTATE_HWTHREAD_STATE(r13) - /* NV GPR values from power7_idle() will no longer be valid */ - li r0,1 - stb r0,PACA_NAPSTATELOST(r13) - - /* were we napping due to cede? */ + /* kvm cede / napping does not come through here */ lbz r0,HSTATE_NAPPING(r13) - cmpwi r0,NAPPING_CEDE - beq kvm_end_cede - cmpwi r0,NAPPING_NOVCPU - beq kvm_novcpu_wakeup + twnei r0,0 + + b 1f - ld r1,PACAEMERGSP(r13) - subi r1,r1,STACK_FRAME_OVERHEAD +kvm_unsplit_wakeup: + li r0, 0 + stb r0, HSTATE_NAPPING(r13) + +1: /* * We weren't napping due to cede, so this must be a secondary @@ -437,19 +437,25 @@ kvm_no_guest: lbz r3, HSTATE_HWTHREAD_REQ(r13) cmpwi r3, 0 bne 54f -/* - * We jump to pnv_wakeup_loss, which will return to the caller - * of power7_nap in the powernv cpu offline loop. The value we - * put in r3 becomes the return value for power7_nap. pnv_wakeup_loss - * requires SRR1 in r12. - */ + + /* + * Jump to idle_return_gpr_loss, which returns to the + * idle_kvm_start_guest caller. 
+ */ li r3, LPCR_PECE0 mfspr r4, SPRN_LPCR rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 mtspr SPRN_LPCR, r4 - li r3, 0 - mfspr r12,SPRN_SRR1 - b pnv_wakeup_loss + /* set up r3 for return */ + mfspr r3,SPRN_SRR1 + REST_NVGPRS(r1) + addi r1, r1, STACK_FRAME_OVERHEAD + ld r0, 16(r1) + ld r5, 8(r1) + ld r1, 0(r1) + mtlr r0 + mtcr r5 + blr 53: HMT_LOW ld r5, HSTATE_KVM_VCORE(r13) @@ -534,6 +540,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) lbz r0, KVM_SPLIT_DO_NAP(r3) cmpwi r0, 0 beq 57f + li r3, NAPPING_UNSPLIT + stb r3, HSTATE_NAPPING(r13) li r3, (LPCR_PECEDH | LPCR_PECE0) >> 4 mfspr r5, SPRN_LPCR rlwimi r5, r3, 4, (LPCR_PECEDP | LPCR_PECEDH | LPCR_PECE0 | LPCR_PECE1) @@ -2657,6 +2665,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) lis r3, LPCR_PECEDP@h /* Do wake on privileged doorbell */ + /* Go back to host stack */ + ld r1, HSTATE_HOST_R1(r13) + /* * Take a nap until a decrementer or external or doobell interrupt * occurs, with PECE1 and PECE0 set in LPCR. @@ -2685,26 +2696,42 @@ BEGIN_FTR_SECTION * requested level = 0 (just stop dispatching) */ lis r3, (PSSCR_EC | PSSCR_ESL)@h - mtspr SPRN_PSSCR, r3 /* Set LPCR_PECE_HVEE bit to enable wakeup by HV interrupts */ li r4, LPCR_PECE_HVEE@higher sldi r4, r4, 32 or r5, r5, r4 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) +FTR_SECTION_ELSE + li r3, PNV_THREAD_NAP +ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) mtspr SPRN_LPCR,r5 isync - li r0, 0 - std r0, HSTATE_SCRATCH0(r13) - ptesync - ld r0, HSTATE_SCRATCH0(r13) -1: cmpd r0, r0 - bne 1b + BEGIN_FTR_SECTION - nap + bl isa300_idle_stop_mayloss FTR_SECTION_ELSE - PPC_STOP -ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) - b . + bl isa206_idle_insn_mayloss +ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) + + mfspr r0, SPRN_CTRLF + ori r0, r0, 1 + mtspr SPRN_CTRLT, r0 + + mtspr SPRN_SRR1, r3 + + li r0, 0 + stb r0, PACA_FTRACE_ENABLED(r13) + + li r0, KVM_HWTHREAD_IN_KVM + stb r0, HSTATE_HWTHREAD_STATE(r13) + + lbz r0, HSTATE_NAPPING(r13) + cmpwi r0, NAPPING_CEDE + beq kvm_end_cede + cmpwi r0, NAPPING_NOVCPU + beq kvm_novcpu_wakeup + cmpwi r0, NAPPING_UNSPLIT + beq kvm_unsplit_wakeup + twi 31,0,0 /* Nap state must not be zero */ 33: mr r4, r3 li r3, 0 @@ -2712,12 +2739,11 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) b 34f kvm_end_cede: + /* Woken by external or decrementer interrupt */ + /* get vcpu pointer */ ld r4, HSTATE_KVM_VCPU(r13) - /* Woken by external or decrementer interrupt */ - ld r1, HSTATE_HOST_R1(r13) - #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING addi r3, r4, VCPU_TB_RMINTR bl kvmhv_accumulate_time diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index e52f9b06dd9c3..87f5f4ae60caa 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -48,10 +49,10 @@ static u64 pnv_default_stop_mask; static bool default_stop_found; /* - * First deep stop state. Used to figure out when to save/restore - * hypervisor context. + * First stop state levels when SPR and TB loss can occur. */ -u64 pnv_first_deep_stop_state = MAX_STOP_STATE; +static u64 pnv_first_tb_loss_level = MAX_STOP_STATE + 1; +static u64 pnv_first_spr_loss_level = MAX_STOP_STATE + 1; /* * psscr value and mask of the deepest stop idle state. 
@@ -62,6 +63,8 @@ static u64 pnv_deepest_stop_psscr_mask; static u64 pnv_deepest_stop_flag; static bool deepest_stop_found; +static unsigned long power7_offline_type; + static int pnv_save_sprs_for_deep_states(void) { int cpu; @@ -72,12 +75,12 @@ static int pnv_save_sprs_for_deep_states(void) * all cpus at boot. Get these reg values of current cpu and use the * same across all cpus. */ - uint64_t lpcr_val = mfspr(SPRN_LPCR); - uint64_t hid0_val = mfspr(SPRN_HID0); - uint64_t hid1_val = mfspr(SPRN_HID1); - uint64_t hid4_val = mfspr(SPRN_HID4); - uint64_t hid5_val = mfspr(SPRN_HID5); - uint64_t hmeer_val = mfspr(SPRN_HMEER); + uint64_t lpcr_val = mfspr(SPRN_LPCR); + uint64_t hid0_val = mfspr(SPRN_HID0); + uint64_t hid1_val = mfspr(SPRN_HID1); + uint64_t hid4_val = mfspr(SPRN_HID4); + uint64_t hid5_val = mfspr(SPRN_HID5); + uint64_t hmeer_val = mfspr(SPRN_HMEER); uint64_t msr_val = MSR_IDLE; uint64_t psscr_val = pnv_deepest_stop_psscr_val; @@ -137,89 +140,6 @@ static int pnv_save_sprs_for_deep_states(void) return 0; } -static void pnv_alloc_idle_core_states(void) -{ - int i, j; - int nr_cores = cpu_nr_cores(); - u32 *core_idle_state; - - /* - * core_idle_state - The lower 8 bits track the idle state of - * each thread of the core. - * - * The most significant bit is the lock bit. - * - * Initially all the bits corresponding to threads_per_core - * are set. They are cleared when the thread enters deep idle - * state like sleep and winkle/stop. - * - * Initially the lock bit is cleared. The lock bit has 2 - * purposes: - * a. While the first thread in the core waking up from - * idle is restoring core state, it prevents other - * threads in the core from switching to process - * context. - * b. While the last thread in the core is saving the - * core state, it prevents a different thread from - * waking up. - */ - for (i = 0; i < nr_cores; i++) { - int first_cpu = i * threads_per_core; - int node = cpu_to_node(first_cpu); - size_t paca_ptr_array_size; - - core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node); - *core_idle_state = (1 << threads_per_core) - 1; - paca_ptr_array_size = (threads_per_core * - sizeof(struct paca_struct *)); - - for (j = 0; j < threads_per_core; j++) { - int cpu = first_cpu + j; - - paca_ptrs[cpu]->core_idle_state_ptr = core_idle_state; - paca_ptrs[cpu]->thread_idle_state = PNV_THREAD_RUNNING; - paca_ptrs[cpu]->thread_mask = 1 << j; - } - } - - update_subcore_sibling_mask(); - - if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) { - int rc = pnv_save_sprs_for_deep_states(); - - if (likely(!rc)) - return; - - /* - * The stop-api is unable to restore hypervisor - * resources on wakeup from platform idle states which - * lose full context. So disable such states. - */ - supported_cpuidle_states &= ~OPAL_PM_LOSE_FULL_CONTEXT; - pr_warn("cpuidle-powernv: Disabling idle states that lose full context\n"); - pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug affected\n"); - - if (cpu_has_feature(CPU_FTR_ARCH_300) && - (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) { - /* - * Use the default stop state for CPU-Hotplug - * if available. 
-	 */
-			if (default_stop_found) {
-				pnv_deepest_stop_psscr_val =
-					pnv_default_stop_val;
-				pnv_deepest_stop_psscr_mask =
-					pnv_default_stop_mask;
-				pr_warn("cpuidle-powernv: Offlined CPUs will stop with psscr = 0x%016llx\n",
-					pnv_deepest_stop_psscr_val);
-			} else { /* Fallback to snooze loop for CPU-Hotplug */
-				deepest_stop_found = false;
-				pr_warn("cpuidle-powernv: Offlined CPUs will busy wait\n");
-			}
-		}
-	}
-}
-
 u32 pnv_get_supported_cpuidle_states(void)
 {
 	return supported_cpuidle_states;
@@ -238,6 +158,9 @@ static void pnv_fastsleep_workaround_apply(void *info)
 		*err = 1;
 }

+static bool power7_fastsleep_workaround_entry = true;
+static bool power7_fastsleep_workaround_exit = true;
+
 /*
  * Used to store fastsleep workaround state
  * 0 - Workaround applied/undone at fastsleep entry/exit path (Default)
@@ -269,21 +192,15 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev,
 	 * fastsleep_workaround_applyonce = 1 implies
 	 * fastsleep workaround needs to be left in 'applied' state on all
 	 * the cores. Do this by-
-	 * 1. Patching out the call to 'undo' workaround in fastsleep exit path
-	 * 2. Sending ipi to all the cores which have at least one online thread
-	 * 3. Patching out the call to 'apply' workaround in fastsleep entry
-	 * path
+	 * 1. Disable the 'undo' workaround in fastsleep exit path
+	 * 2. Send IPIs to all the cores which have at least one online thread
+	 * 3. Disable the 'apply' workaround in fastsleep entry path
+	 *
 	 * There is no need to send ipi to cores which have all threads
 	 * offlined, as last thread of the core entering fastsleep or deeper
 	 * state would have applied workaround.
 	 */
-	err = patch_instruction(
-		(unsigned int *)pnv_fastsleep_workaround_at_exit,
-		PPC_INST_NOP);
-	if (err) {
-		pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_exit");
-		goto fail;
-	}
+	power7_fastsleep_workaround_exit = false;

 	get_online_cpus();
 	primary_thread_mask = cpu_online_cores_map();
@@ -296,13 +213,7 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev,
 		goto fail;
 	}

-	err = patch_instruction(
-		(unsigned int *)pnv_fastsleep_workaround_at_entry,
-		PPC_INST_NOP);
-	if (err) {
-		pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_entry");
-		goto fail;
-	}
+	power7_fastsleep_workaround_entry = false;

 	fastsleep_workaround_applyonce = 1;

@@ -315,27 +226,323 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600,
 			show_fastsleep_workaround_applyonce,
 			store_fastsleep_workaround_applyonce);

-static unsigned long __power7_idle_type(unsigned long type)
+static inline void atomic_start_thread_idle(void)
 {
+	int cpu = raw_smp_processor_id();
+	int first = cpu_first_thread_sibling(cpu);
+	int thread_nr = cpu_thread_in_core(cpu);
+	unsigned long *state = &paca_ptrs[first]->idle_state;
+
+	clear_bit(thread_nr, state);
+}
+
+static inline void atomic_stop_thread_idle(void)
+{
+	int cpu = raw_smp_processor_id();
+	int first = cpu_first_thread_sibling(cpu);
+	int thread_nr = cpu_thread_in_core(cpu);
+	unsigned long *state = &paca_ptrs[first]->idle_state;
+
+	set_bit(thread_nr, state);
+}
+
+static inline void atomic_lock_thread_idle(void)
+{
+	int cpu = raw_smp_processor_id();
+	int first = cpu_first_thread_sibling(cpu);
+	unsigned long *state = &paca_ptrs[first]->idle_state;
+
+	while (unlikely(test_and_set_bit_lock(NR_PNV_CORE_IDLE_LOCK_BIT, state)))
+		barrier();
+}
+
+static inline void atomic_unlock_and_stop_thread_idle(void)
+{
+	int cpu = raw_smp_processor_id();
+	int 
first = cpu_first_thread_sibling(cpu); + unsigned long thread = 1UL << cpu_thread_in_core(cpu); + unsigned long *state = &paca_ptrs[first]->idle_state; + u64 s = READ_ONCE(*state); + u64 new, tmp; + + BUG_ON(!(s & PNV_CORE_IDLE_LOCK_BIT)); + BUG_ON(s & thread); + +again: + new = (s | thread) & ~PNV_CORE_IDLE_LOCK_BIT; + tmp = cmpxchg(state, s, new); + if (unlikely(tmp != s)) { + s = tmp; + goto again; + } +} + +static inline void atomic_unlock_thread_idle(void) +{ + int cpu = raw_smp_processor_id(); + int first = cpu_first_thread_sibling(cpu); + unsigned long *state = &paca_ptrs[first]->idle_state; + + BUG_ON(!test_bit(NR_PNV_CORE_IDLE_LOCK_BIT, state)); + clear_bit_unlock(NR_PNV_CORE_IDLE_LOCK_BIT, state); +} + +/* P7 and P8 */ +struct p7_sprs { + /* per core */ + u64 tscr; + u64 worc; + + /* per subcore */ + u64 sdr1; + u64 rpr; + u64 amor; + + /* per thread */ + u64 lpcr; + u64 hfscr; + u64 fscr; + u64 purr; + u64 spurr; + u64 dscr; + u64 wort; +}; + +static unsigned long power7_idle_insn(unsigned long type) +{ + int cpu = raw_smp_processor_id(); + int first = cpu_first_thread_sibling(cpu); + unsigned long *state = &paca_ptrs[first]->idle_state; + unsigned long thread = 1UL << cpu_thread_in_core(cpu); + unsigned long core_thread_mask = (1UL << threads_per_core) - 1; unsigned long srr1; + bool full_winkle; + struct p7_sprs sprs = {}; /* avoid false use-uninitialised */ + bool sprs_saved = false; + int rc; - if (!prep_irq_for_idle_irqsoff()) - return 0; + if (unlikely(type != PNV_THREAD_NAP)) { + atomic_lock_thread_idle(); + + BUG_ON(!(*state & thread)); + *state &= ~thread; + + if (power7_fastsleep_workaround_entry) { + if ((*state & core_thread_mask) == 0) { + rc = opal_config_cpu_idle_state( + OPAL_CONFIG_IDLE_FASTSLEEP, + OPAL_CONFIG_IDLE_APPLY); + BUG_ON(rc); + } + } + + if (type == PNV_THREAD_WINKLE) { + sprs.tscr = mfspr(SPRN_TSCR); + sprs.worc = mfspr(SPRN_WORC); + + sprs.sdr1 = mfspr(SPRN_SDR1); + sprs.rpr = mfspr(SPRN_RPR); + sprs.amor = mfspr(SPRN_AMOR); + + sprs.lpcr = mfspr(SPRN_LPCR); + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + sprs.hfscr = mfspr(SPRN_HFSCR); + sprs.fscr = mfspr(SPRN_FSCR); + } + sprs.purr = mfspr(SPRN_PURR); + sprs.spurr = mfspr(SPRN_SPURR); + sprs.dscr = mfspr(SPRN_DSCR); + sprs.wort = mfspr(SPRN_WORT); + + sprs_saved = true; + + /* + * Increment winkle counter and set all winkle bits if + * all threads are winkling. This allows wakeup side to + * distinguish between fast sleep and winkle state + * loss. Fast sleep still has to resync the timebase so + * this may not be a really big win. 
+ */ + *state += 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT; + if ((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) + >> PNV_CORE_IDLE_WINKLE_COUNT_SHIFT + == threads_per_core) + *state |= PNV_CORE_IDLE_THREAD_WINKLE_BITS; + WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0); + } + + atomic_unlock_thread_idle(); + } + + local_paca->thread_idle_state = type; + srr1 = isa206_idle_insn_mayloss(type); /* go idle */ + local_paca->thread_idle_state = PNV_THREAD_RUNNING; + + WARN_ON_ONCE(!srr1); + WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR)); + + if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI)) + hmi_exception_realmode(NULL); + + if (likely((srr1 & SRR1_WAKESTATE) != SRR1_WS_HVLOSS)) { + if (unlikely(type != PNV_THREAD_NAP)) { + atomic_lock_thread_idle(); + if (type == PNV_THREAD_WINKLE) { + WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0); + *state -= 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT; + *state &= ~(thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT); + } + atomic_unlock_and_stop_thread_idle(); + } + return srr1; + } + + /* HV state loss */ + BUG_ON(type == PNV_THREAD_NAP); + + atomic_lock_thread_idle(); + + full_winkle = false; + if (type == PNV_THREAD_WINKLE) { + WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0); + *state -= 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT; + if (*state & (thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT)) { + *state &= ~(thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT); + full_winkle = true; + BUG_ON(!sprs_saved); + } + } + + WARN_ON(*state & thread); + + if ((*state & core_thread_mask) != 0) + goto core_woken; + + /* Per-core SPRs */ + if (full_winkle) { + mtspr(SPRN_TSCR, sprs.tscr); + mtspr(SPRN_WORC, sprs.worc); + } + + if (power7_fastsleep_workaround_exit) { + rc = opal_config_cpu_idle_state(OPAL_CONFIG_IDLE_FASTSLEEP, + OPAL_CONFIG_IDLE_UNDO); + BUG_ON(rc); + } + + /* TB */ + if (opal_resync_timebase() != OPAL_SUCCESS) + BUG(); + +core_woken: + if (!full_winkle) + goto subcore_woken; + + if ((*state & local_paca->subcore_sibling_mask) != 0) + goto subcore_woken; + + /* Per-subcore SPRs */ + mtspr(SPRN_SDR1, sprs.sdr1); + mtspr(SPRN_RPR, sprs.rpr); + mtspr(SPRN_AMOR, sprs.amor); + +subcore_woken: + /* + * isync after restoring shared SPRs and before unlocking. Unlock + * only contains hwsync which does not necessarily do the right + * thing for SPRs. + */ + isync(); + atomic_unlock_and_stop_thread_idle(); + + /* Fast sleep does not lose SPRs */ + if (!full_winkle) + return srr1; + + /* Per-thread SPRs */ + mtspr(SPRN_LPCR, sprs.lpcr); + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + mtspr(SPRN_HFSCR, sprs.hfscr); + mtspr(SPRN_FSCR, sprs.fscr); + } + mtspr(SPRN_PURR, sprs.purr); + mtspr(SPRN_SPURR, sprs.spurr); + mtspr(SPRN_DSCR, sprs.dscr); + mtspr(SPRN_WORT, sprs.wort); + + mtspr(SPRN_SPRG3, local_paca->sprg_vdso); + + /* + * The SLB has to be restored here, but it sometimes still + * contains entries, so the __ variant must be used to prevent + * multi hits. + */ + __slb_restore_bolted_realmode(); + + return srr1; +} + +extern unsigned long idle_kvm_start_guest(unsigned long srr1); + +#ifdef CONFIG_HOTPLUG_CPU +static unsigned long power7_offline(void) +{ + unsigned long srr1; + + mtmsr(MSR_IDLE); + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + /* Tell KVM we're entering idle. */ + /******************************************************/ + /* N O T E W E L L ! ! ! N O T E W E L L */ + /* The following store to HSTATE_HWTHREAD_STATE(r13) */ + /* MUST occur in real mode, i.e. 
with the MMU off, */ + /* and the MMU must stay off until we clear this flag */ + /* and test HSTATE_HWTHREAD_REQ(r13) in */ + /* pnv_powersave_wakeup in this file. */ + /* The reason is that another thread can switch the */ + /* MMU to a guest context whenever this flag is set */ + /* to KVM_HWTHREAD_IN_IDLE, and if the MMU was on, */ + /* that would potentially cause this thread to start */ + /* executing instructions from guest memory in */ + /* hypervisor mode, leading to a host crash or data */ + /* corruption, or worse. */ + /******************************************************/ + local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE; +#endif __ppc64_runlatch_off(); - srr1 = power7_idle_insn(type); + srr1 = power7_idle_insn(power7_offline_type); __ppc64_runlatch_on(); - fini_irq_for_idle_irqsoff(); +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL; + /* Order setting hwthread_state vs. testing hwthread_req */ + smp_mb(); + if (local_paca->kvm_hstate.hwthread_req) + srr1 = idle_kvm_start_guest(srr1); +#endif + + mtmsr(MSR_KERNEL); return srr1; } +#endif void power7_idle_type(unsigned long type) { unsigned long srr1; - srr1 = __power7_idle_type(type); + if (!prep_irq_for_idle_irqsoff()) + return; + + mtmsr(MSR_IDLE); + __ppc64_runlatch_off(); + srr1 = power7_idle_insn(type); + __ppc64_runlatch_on(); + mtmsr(MSR_KERNEL); + + fini_irq_for_idle_irqsoff(); irq_set_pending_from_srr1(srr1); } @@ -347,33 +554,275 @@ void power7_idle(void) power7_idle_type(PNV_THREAD_NAP); } -static unsigned long __power9_idle_type(unsigned long stop_psscr_val, - unsigned long stop_psscr_mask) +struct p9_sprs { + /* per core */ + u64 ptcr; + u64 rpr; + u64 tscr; + u64 ldbar; + u64 amor; + + /* per thread */ + u64 lpcr; + u64 hfscr; + u64 fscr; + u64 pid; + u64 purr; + u64 spurr; + u64 dscr; + u64 wort; + + u64 mmcra; + u32 mmcr0; + u32 mmcr1; + u64 mmcr2; +}; + +static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) { - unsigned long psscr; + int cpu = raw_smp_processor_id(); + int first = cpu_first_thread_sibling(cpu); + unsigned long *state = &paca_ptrs[first]->idle_state; + unsigned long core_thread_mask = (1UL << threads_per_core) - 1; unsigned long srr1; + unsigned long pls; + unsigned long mmcr0 = 0; + struct p9_sprs sprs = {}; /* avoid false used-uninitialised */ + bool sprs_saved = false; - if (!prep_irq_for_idle_irqsoff()) - return 0; + if (!(psscr & (PSSCR_EC|PSSCR_ESL))) { + /* EC=ESL=0 case */ + + BUG_ON(!mmu_on); + + /* + * Wake synchronously. SRESET via xscom may still cause + * a 0x100 powersave wakeup with SRR1 reason! + */ + srr1 = isa300_idle_stop_noloss(psscr); /* go idle */ + if (likely(!srr1)) + return 0; + + /* + * Registers not saved, can't recover! + * This would be a hardware bug + */ + BUG_ON((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS); + + goto out; + } + + /* EC=ESL=1 case */ +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + if (cpu_has_feature(CPU_FTR_P9_TM_XER_SO_BUG)) { + local_paca->requested_psscr = psscr; + /* order setting requested_psscr vs testing dont_stop */ + smp_mb(); + if (atomic_read(&local_paca->dont_stop)) { + local_paca->requested_psscr = 0; + return 0; + } + } +#endif + + if (!cpu_has_feature(CPU_FTR_POWER9_DD2_1)) { + /* + * POWER9 DD2 can incorrectly set PMAO when waking up + * after a state-loss idle. Saving and restoring MMCR0 + * over idle is a workaround. 
+ */ + mmcr0 = mfspr(SPRN_MMCR0); + } + if ((psscr & PSSCR_RL_MASK) >= pnv_first_spr_loss_level) { + sprs.lpcr = mfspr(SPRN_LPCR); + sprs.hfscr = mfspr(SPRN_HFSCR); + sprs.fscr = mfspr(SPRN_FSCR); + sprs.pid = mfspr(SPRN_PID); + sprs.purr = mfspr(SPRN_PURR); + sprs.spurr = mfspr(SPRN_SPURR); + sprs.dscr = mfspr(SPRN_DSCR); + sprs.wort = mfspr(SPRN_WORT); + + sprs.mmcra = mfspr(SPRN_MMCRA); + sprs.mmcr0 = mfspr(SPRN_MMCR0); + sprs.mmcr1 = mfspr(SPRN_MMCR1); + sprs.mmcr2 = mfspr(SPRN_MMCR2); + + sprs.ptcr = mfspr(SPRN_PTCR); + sprs.rpr = mfspr(SPRN_RPR); + sprs.tscr = mfspr(SPRN_TSCR); + sprs.ldbar = mfspr(SPRN_LDBAR); + sprs.amor = mfspr(SPRN_AMOR); + + sprs_saved = true; + + atomic_start_thread_idle(); + } + + srr1 = isa300_idle_stop_mayloss(psscr); /* go idle */ + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + local_paca->requested_psscr = 0; +#endif psscr = mfspr(SPRN_PSSCR); - psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val; + + WARN_ON_ONCE(!srr1); + WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR)); + + if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) { + unsigned long mmcra; + + /* + * Workaround for POWER9 DD2.0, if we lost resources, the ERAT + * might have been corrupted and needs flushing. We also need + * to reload MMCR0 (see mmcr0 comment above). + */ + if (!cpu_has_feature(CPU_FTR_POWER9_DD2_1)) { + asm volatile(PPC_INVALIDATE_ERAT); + mtspr(SPRN_MMCR0, mmcr0); + } + + /* + * DD2.2 and earlier need to set then clear bit 60 in MMCRA + * to ensure the PMU starts running. + */ + mmcra = mfspr(SPRN_MMCRA); + mmcra |= PPC_BIT(60); + mtspr(SPRN_MMCRA, mmcra); + mmcra &= ~PPC_BIT(60); + mtspr(SPRN_MMCRA, mmcra); + } + + if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI)) + hmi_exception_realmode(NULL); + + /* + * On POWER9, SRR1 bits do not match exactly as expected. + * SRR1_WS_GPRLOSS (10b) can also result in SPR loss, so + * just always test PSSCR for SPR/TB state loss. + */ + pls = (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT; + if (likely(pls < pnv_first_spr_loss_level)) { + if (sprs_saved) + atomic_stop_thread_idle(); + goto out; + } + + /* HV state loss */ + BUG_ON(!sprs_saved); + + atomic_lock_thread_idle(); + + if ((*state & core_thread_mask) != 0) + goto core_woken; + + /* Per-core SPRs */ + mtspr(SPRN_PTCR, sprs.ptcr); + mtspr(SPRN_RPR, sprs.rpr); + mtspr(SPRN_TSCR, sprs.tscr); + mtspr(SPRN_LDBAR, sprs.ldbar); + mtspr(SPRN_AMOR, sprs.amor); + + if (pls >= pnv_first_tb_loss_level) { + /* TB loss */ + if (opal_resync_timebase() != OPAL_SUCCESS) + BUG(); + } + + /* + * isync after restoring shared SPRs and before unlocking. Unlock + * only contains hwsync which does not necessarily do the right + * thing for SPRs. 
+ */ + isync(); + +core_woken: + atomic_unlock_and_stop_thread_idle(); + + /* Per-thread SPRs */ + mtspr(SPRN_LPCR, sprs.lpcr); + mtspr(SPRN_HFSCR, sprs.hfscr); + mtspr(SPRN_FSCR, sprs.fscr); + mtspr(SPRN_PID, sprs.pid); + mtspr(SPRN_PURR, sprs.purr); + mtspr(SPRN_SPURR, sprs.spurr); + mtspr(SPRN_DSCR, sprs.dscr); + mtspr(SPRN_WORT, sprs.wort); + + mtspr(SPRN_MMCRA, sprs.mmcra); + mtspr(SPRN_MMCR0, sprs.mmcr0); + mtspr(SPRN_MMCR1, sprs.mmcr1); + mtspr(SPRN_MMCR2, sprs.mmcr2); + + mtspr(SPRN_SPRG3, local_paca->sprg_vdso); + + if (!radix_enabled()) + __slb_restore_bolted_realmode(); + +out: + if (mmu_on) + mtmsr(MSR_KERNEL); + + return srr1; +} + +#ifdef CONFIG_HOTPLUG_CPU +static unsigned long power9_offline_stop(unsigned long psscr) +{ + unsigned long srr1; + +#ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE + __ppc64_runlatch_off(); + srr1 = power9_idle_stop(psscr, true); + __ppc64_runlatch_on(); +#else + /* + * Tell KVM we're entering idle. + * This does not have to be done in real mode because the P9 MMU + * is independent per-thread. Some steppings share radix/hash mode + * between threads, but in that case KVM has a barrier sync in real + * mode before and after switching between radix and hash. + * + * kvm_start_guest must still be called in real mode though, hence + * the false argument. + */ + local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE; __ppc64_runlatch_off(); - srr1 = power9_idle_stop(psscr); + srr1 = power9_idle_stop(psscr, false); __ppc64_runlatch_on(); - fini_irq_for_idle_irqsoff(); + local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL; + /* Order setting hwthread_state vs. testing hwthread_req */ + smp_mb(); + if (local_paca->kvm_hstate.hwthread_req) + srr1 = idle_kvm_start_guest(srr1); + mtmsr(MSR_KERNEL); +#endif return srr1; } +#endif void power9_idle_type(unsigned long stop_psscr_val, unsigned long stop_psscr_mask) { + unsigned long psscr; unsigned long srr1; - srr1 = __power9_idle_type(stop_psscr_val, stop_psscr_mask); + if (!prep_irq_for_idle_irqsoff()) + return; + + psscr = mfspr(SPRN_PSSCR); + psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val; + + __ppc64_runlatch_off(); + srr1 = power9_idle_stop(psscr, true); + __ppc64_runlatch_on(); + + fini_irq_for_idle_irqsoff(); + irq_set_pending_from_srr1(srr1); } @@ -409,7 +858,7 @@ void pnv_power9_force_smt4_catch(void) atomic_inc(&paca_ptrs[cpu0+thr]->dont_stop); } /* order setting dont_stop vs testing requested_psscr */ - mb(); + smp_mb(); for (thr = 0; thr < threads_per_core; ++thr) { if (!paca_ptrs[cpu0+thr]->requested_psscr) ++awake_threads; @@ -481,7 +930,6 @@ void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val) unsigned long pnv_cpu_offline(unsigned int cpu) { unsigned long srr1; - u32 idle_states = pnv_get_supported_cpuidle_states(); __ppc64_runlatch_off(); @@ -492,15 +940,8 @@ unsigned long pnv_cpu_offline(unsigned int cpu) psscr = (psscr & ~pnv_deepest_stop_psscr_mask) | pnv_deepest_stop_psscr_val; srr1 = power9_offline_stop(psscr); - - } else if ((idle_states & OPAL_PM_WINKLE_ENABLED) && - (idle_states & OPAL_PM_LOSE_FULL_CONTEXT)) { - srr1 = power7_idle_insn(PNV_THREAD_WINKLE); - } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || - (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { - srr1 = power7_idle_insn(PNV_THREAD_SLEEP); - } else if (idle_states & OPAL_PM_NAP_ENABLED) { - srr1 = power7_idle_insn(PNV_THREAD_NAP); + } else if (cpu_has_feature(CPU_FTR_ARCH_206) && power7_offline_type) { + srr1 = power7_offline(); } else { /* This is the fallback method. 
We emulate snooze */
 		while (!generic_check_cpu_restart(cpu)) {
@@ -596,33 +1037,44 @@ int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags)
  * @dt_idle_states: Number of idle state entries
  * Returns 0 on success
  */
-static int __init pnv_power9_idle_init(void)
+static void __init pnv_power9_idle_init(void)
 {
 	u64 max_residency_ns = 0;
 	int i;

 	/*
-	 * Set pnv_first_deep_stop_state, pnv_deepest_stop_psscr_{val,mask},
-	 * and the pnv_default_stop_{val,mask}.
-	 *
-	 * pnv_first_deep_stop_state should be set to the first stop
-	 * level to cause hypervisor state loss.
-	 *
 	 * pnv_deepest_stop_{val,mask} should be set to values corresponding to
 	 * the deepest stop state.
 	 *
 	 * pnv_default_stop_{val,mask} should be set to values corresponding to
-	 * the shallowest (OPAL_PM_STOP_INST_FAST) loss-less stop state.
+	 * the deepest loss-less (OPAL_PM_STOP_INST_FAST) stop state.
 	 */
-	pnv_first_deep_stop_state = MAX_STOP_STATE;
+	pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
+	pnv_first_spr_loss_level = MAX_STOP_STATE + 1;
 	for (i = 0; i < nr_pnv_idle_states; i++) {
 		int err;
 		struct pnv_idle_states_t *state = &pnv_idle_states[i];
 		u64 psscr_rl = state->psscr_val & PSSCR_RL_MASK;

+		if ((state->flags & OPAL_PM_TIMEBASE_STOP) &&
+		     (pnv_first_tb_loss_level > psscr_rl))
+			pnv_first_tb_loss_level = psscr_rl;
+
 		if ((state->flags & OPAL_PM_LOSE_FULL_CONTEXT) &&
-		    pnv_first_deep_stop_state > psscr_rl)
-			pnv_first_deep_stop_state = psscr_rl;
+		     (pnv_first_spr_loss_level > psscr_rl))
+			pnv_first_spr_loss_level = psscr_rl;
+
+		/*
+		 * The idle code does not deal with TB loss occurring
+		 * in a shallower state than SPR loss, so force it to
+		 * behave like SPRs are lost if TB is lost. POWER9 would
+		 * never encounter this, but a POWER8 core would if it
+		 * implemented the stop instruction. So this is for forward
+		 * compatibility.
+		 */
+		if ((state->flags & OPAL_PM_TIMEBASE_STOP) &&
+		     (pnv_first_spr_loss_level > psscr_rl))
+			pnv_first_spr_loss_level = psscr_rl;

 		err = validate_psscr_val_mask(&state->psscr_val,
 					      &state->psscr_mask,
@@ -647,6 +1099,7 @@ static int __init pnv_power9_idle_init(void)
 			pnv_default_stop_val = state->psscr_val;
 			pnv_default_stop_mask = state->psscr_mask;
 			default_stop_found = true;
+			WARN_ON(state->flags & OPAL_PM_LOSE_FULL_CONTEXT);
 		}
 	}

@@ -666,10 +1119,40 @@ static int __init pnv_power9_idle_init(void)
 			pnv_deepest_stop_psscr_mask);
 	}

-	pr_info("cpuidle-powernv: Requested Level (RL) value of first deep stop = 0x%llx\n",
-		pnv_first_deep_stop_state);
+	pr_info("cpuidle-powernv: First stop level that may lose SPRs = 0x%llx\n",
+		pnv_first_spr_loss_level);

-	return 0;
+	pr_info("cpuidle-powernv: First stop level that may lose timebase = 0x%llx\n",
+		pnv_first_tb_loss_level);
+}
+
+static void __init pnv_disable_deep_states(void)
+{
+	/*
+	 * The stop-api is unable to restore hypervisor
+	 * resources on wakeup from platform idle states which
+	 * lose full context. So disable such states.
+	 */
+	supported_cpuidle_states &= ~OPAL_PM_LOSE_FULL_CONTEXT;
+	pr_warn("cpuidle-powernv: Disabling idle states that lose full context\n");
+	pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug affected\n");
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+	    (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) {
+		/*
+		 * Use the default stop state for CPU-Hotplug
+		 * if available.
+ */ + if (default_stop_found) { + pnv_deepest_stop_psscr_val = pnv_default_stop_val; + pnv_deepest_stop_psscr_mask = pnv_default_stop_mask; + pr_warn("cpuidle-powernv: Offlined CPUs will stop with psscr = 0x%016llx\n", + pnv_deepest_stop_psscr_val); + } else { /* Fallback to snooze loop for CPU-Hotplug */ + deepest_stop_found = false; + pr_warn("cpuidle-powernv: Offlined CPUs will busy wait\n"); + } + } } /* @@ -684,10 +1167,8 @@ static void __init pnv_probe_idle_states(void) return; } - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - if (pnv_power9_idle_init()) - return; - } + if (cpu_has_feature(CPU_FTR_ARCH_300)) + pnv_power9_idle_init(); for (i = 0; i < nr_pnv_idle_states; i++) supported_cpuidle_states |= pnv_idle_states[i].flags; @@ -807,11 +1288,33 @@ static int pnv_parse_cpuidle_dt(void) static int __init pnv_init_idle_states(void) { + int cpu; int rc = 0; - supported_cpuidle_states = 0; + + /* Set up PACA fields */ + for_each_present_cpu(cpu) { + struct paca_struct *p = paca_ptrs[cpu]; + + p->idle_state = 0; + if (cpu == cpu_first_thread_sibling(cpu)) + p->idle_state = (1 << threads_per_core) - 1; + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + /* P7/P8 nap */ + p->thread_idle_state = PNV_THREAD_RUNNING; + } else { + /* P9 stop */ +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + p->requested_psscr = 0; + atomic_set(&p->dont_stop, 0); +#endif + } + } /* In case we error out nr_pnv_idle_states will be zero */ nr_pnv_idle_states = 0; + supported_cpuidle_states = 0; + if (cpuidle_disable != IDLE_NO_OVERRIDE) goto out; rc = pnv_parse_cpuidle_dt(); @@ -819,27 +1322,40 @@ static int __init pnv_init_idle_states(void) return rc; pnv_probe_idle_states(); - if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { - patch_instruction( - (unsigned int *)pnv_fastsleep_workaround_at_entry, - PPC_INST_NOP); - patch_instruction( - (unsigned int *)pnv_fastsleep_workaround_at_exit, - PPC_INST_NOP); - } else { - /* - * OPAL_PM_SLEEP_ENABLED_ER1 is set. It indicates that - * workaround is needed to use fastsleep. Provide sysfs - * control to choose how this workaround has to be applied. - */ - device_create_file(cpu_subsys.dev_root, + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { + power7_fastsleep_workaround_entry = false; + power7_fastsleep_workaround_exit = false; + } else { + /* + * OPAL_PM_SLEEP_ENABLED_ER1 is set. It indicates that + * workaround is needed to use fastsleep. Provide sysfs + * control to choose how this workaround has to be + * applied. 
+ */ + device_create_file(cpu_subsys.dev_root, &dev_attr_fastsleep_workaround_applyonce); - } + } + + update_subcore_sibling_mask(); + + if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED) { + ppc_md.power_save = power7_idle; + power7_offline_type = PNV_THREAD_NAP; + } - pnv_alloc_idle_core_states(); + if ((supported_cpuidle_states & OPAL_PM_WINKLE_ENABLED) && + (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT)) + power7_offline_type = PNV_THREAD_WINKLE; + else if ((supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED) || + (supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) + power7_offline_type = PNV_THREAD_SLEEP; + } - if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED) - ppc_md.power_save = power7_idle; + if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) { + if (pnv_save_sprs_for_deep_states()) + pnv_disable_deep_states(); + } out: return 0; diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c index 45563004feda6..1d7a9fd30dd14 100644 --- a/arch/powerpc/platforms/powernv/subcore.c +++ b/arch/powerpc/platforms/powernv/subcore.c @@ -183,7 +183,7 @@ static void unsplit_core(void) cpu = smp_processor_id(); if (cpu_thread_in_core(cpu) != 0) { while (mfspr(SPRN_HID0) & mask) - power7_idle_insn(PNV_THREAD_NAP); + power7_idle_type(PNV_THREAD_NAP); per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT; return; diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index a0f44f9923608..e583ed3f6b93a 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2431,7 +2431,6 @@ static void dump_one_paca(int cpu) DUMP(p, irq_happened, "%#-*x"); DUMP(p, io_sync, "%#-*x"); DUMP(p, irq_work_pending, "%#-*x"); - DUMP(p, nap_state_lost, "%#-*x"); DUMP(p, sprg_vdso, "%#-*llx"); #ifdef CONFIG_PPC_TRANSACTIONAL_MEM @@ -2439,19 +2438,16 @@ static void dump_one_paca(int cpu) #endif #ifdef CONFIG_PPC_POWERNV - DUMP(p, core_idle_state_ptr, "%-*px"); - DUMP(p, thread_idle_state, "%#-*x"); - DUMP(p, thread_mask, "%#-*x"); - DUMP(p, subcore_sibling_mask, "%#-*x"); - DUMP(p, requested_psscr, "%#-*llx"); - DUMP(p, stop_sprs.pid, "%#-*llx"); - DUMP(p, stop_sprs.ldbar, "%#-*llx"); - DUMP(p, stop_sprs.fscr, "%#-*llx"); - DUMP(p, stop_sprs.hfscr, "%#-*llx"); - DUMP(p, stop_sprs.mmcr1, "%#-*llx"); - DUMP(p, stop_sprs.mmcr2, "%#-*llx"); - DUMP(p, stop_sprs.mmcra, "%#-*llx"); - DUMP(p, dont_stop.counter, "%#-*x"); + DUMP(p, idle_state, "%#-*lx"); + if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) { + DUMP(p, thread_idle_state, "%#-*x"); + DUMP(p, subcore_sibling_mask, "%#-*x"); + } else { +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + DUMP(p, requested_psscr, "%#-*llx"); + DUMP(p, dont_stop.counter, "%#-*x"); +#endif + } #endif DUMP(p, accounting.utime, "%#-*lx"); From e9cef0189c5b217fcd4788562862defc27632a01 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 30 Apr 2019 14:28:17 +1000 Subject: [PATCH 064/205] powerpc/powernv/idle: Restore AMR/UAMOR/AMOR/IAMR after idle This is an implementation of commits 53a712bae5dd ("powerpc/powernv/idle: Restore AMR/UAMOR/AMOR after idle") and a3f3072db6ca ("powerpc/powernv/idle: Restore IAMR after idle") using the new C-based idle code. 
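In outline, the fix follows the save/restore pattern already used for the other
SPRs in the C idle code: read the four authority-mask SPRs before executing the
idle instruction, and write them back on any wakeup that reports state loss.
Below is a minimal sketch of that shape; the container struct and helper name
are hypothetical, while the SPR accessors, idle entry point and SRR1 tests are
the ones visible in the diff that follows.

	/* Sketch only: the save/restore shape this patch adds to the idle path. */
	struct idle_amr_sprs {				/* hypothetical container */
		u64 amr, iamr, amor, uamor;
	};

	static u64 stop_with_amr_save(u64 psscr)	/* hypothetical helper */
	{
		struct idle_amr_sprs s;
		u64 srr1;

		s.amr   = mfspr(SPRN_AMR);
		s.iamr  = mfspr(SPRN_IAMR);
		s.amor  = mfspr(SPRN_AMOR);
		s.uamor = mfspr(SPRN_UAMOR);

		srr1 = isa300_idle_stop_mayloss(psscr);	/* go idle */

		if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) {
			/* State was lost: restore; the later mtmsrd synchronizes. */
			mtspr(SPRN_AMR, s.amr);
			mtspr(SPRN_IAMR, s.iamr);
			mtspr(SPRN_AMOR, s.amor);
			mtspr(SPRN_UAMOR, s.uamor);
		}
		return srr1;
	}

On POWER7/P8 the patch applies the same shape around isa206_idle_insn_mayloss(),
guarded by cpu_has_feature(CPU_FTR_ARCH_207S).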
Signed-off-by: Nicholas Piggin [mpe: Extract from Nick's patch] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/idle.c | 52 +++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 87f5f4ae60caa..c9133f7908ca1 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -296,7 +296,6 @@ struct p7_sprs { /* per subcore */ u64 sdr1; u64 rpr; - u64 amor; /* per thread */ u64 lpcr; @@ -306,6 +305,12 @@ struct p7_sprs { u64 spurr; u64 dscr; u64 wort; + + /* per thread SPRs that get lost in shallow states */ + u64 amr; + u64 iamr; + u64 amor; + u64 uamor; }; static unsigned long power7_idle_insn(unsigned long type) @@ -342,7 +347,6 @@ static unsigned long power7_idle_insn(unsigned long type) sprs.sdr1 = mfspr(SPRN_SDR1); sprs.rpr = mfspr(SPRN_RPR); - sprs.amor = mfspr(SPRN_AMOR); sprs.lpcr = mfspr(SPRN_LPCR); if (cpu_has_feature(CPU_FTR_ARCH_207S)) { @@ -374,6 +378,13 @@ static unsigned long power7_idle_insn(unsigned long type) atomic_unlock_thread_idle(); } + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + sprs.amr = mfspr(SPRN_AMR); + sprs.iamr = mfspr(SPRN_IAMR); + sprs.amor = mfspr(SPRN_AMOR); + sprs.uamor = mfspr(SPRN_UAMOR); + } + local_paca->thread_idle_state = type; srr1 = isa206_idle_insn_mayloss(type); /* go idle */ local_paca->thread_idle_state = PNV_THREAD_RUNNING; @@ -381,6 +392,19 @@ static unsigned long power7_idle_insn(unsigned long type) WARN_ON_ONCE(!srr1); WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR)); + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) { + /* + * We don't need an isync after the mtsprs here because + * the upcoming mtmsrd is execution synchronizing. + */ + mtspr(SPRN_AMR, sprs.amr); + mtspr(SPRN_IAMR, sprs.iamr); + mtspr(SPRN_AMOR, sprs.amor); + mtspr(SPRN_UAMOR, sprs.uamor); + } + } + if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI)) hmi_exception_realmode(NULL); @@ -444,7 +468,6 @@ static unsigned long power7_idle_insn(unsigned long type) /* Per-subcore SPRs */ mtspr(SPRN_SDR1, sprs.sdr1); mtspr(SPRN_RPR, sprs.rpr); - mtspr(SPRN_AMOR, sprs.amor); subcore_woken: /* @@ -560,7 +583,6 @@ struct p9_sprs { u64 rpr; u64 tscr; u64 ldbar; - u64 amor; /* per thread */ u64 lpcr; @@ -576,6 +598,12 @@ struct p9_sprs { u32 mmcr0; u32 mmcr1; u64 mmcr2; + + /* per thread SPRs that get lost in shallow states */ + u64 amr; + u64 iamr; + u64 amor; + u64 uamor; }; static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) @@ -652,13 +680,17 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) sprs.rpr = mfspr(SPRN_RPR); sprs.tscr = mfspr(SPRN_TSCR); sprs.ldbar = mfspr(SPRN_LDBAR); - sprs.amor = mfspr(SPRN_AMOR); sprs_saved = true; atomic_start_thread_idle(); } + sprs.amr = mfspr(SPRN_AMR); + sprs.iamr = mfspr(SPRN_IAMR); + sprs.amor = mfspr(SPRN_AMOR); + sprs.uamor = mfspr(SPRN_UAMOR); + srr1 = isa300_idle_stop_mayloss(psscr); /* go idle */ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -673,6 +705,15 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) { unsigned long mmcra; + /* + * We don't need an isync after the mtsprs here because the + * upcoming mtmsrd is execution synchronizing. 
+		 */
+		mtspr(SPRN_AMR, sprs.amr);
+		mtspr(SPRN_IAMR, sprs.iamr);
+		mtspr(SPRN_AMOR, sprs.amor);
+		mtspr(SPRN_UAMOR, sprs.uamor);
+
 		/*
 		 * Workaround for POWER9 DD2.0, if we lost resources, the ERAT
 		 * might have been corrupted and needs flushing. We also need
@@ -722,7 +763,6 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
 	mtspr(SPRN_RPR, sprs.rpr);
 	mtspr(SPRN_TSCR, sprs.tscr);
 	mtspr(SPRN_LDBAR, sprs.ldbar);
-	mtspr(SPRN_AMOR, sprs.amor);

 	if (pls >= pnv_first_tb_loss_level) {
 		/* TB loss */

From b511cdd1c12d1e450baeba5373dd3a8897396e2b Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy
Date: Wed, 10 Apr 2019 16:48:00 +1000
Subject: [PATCH 065/205] powerpc/powernv/ioda: Handle failures correctly in
 pnv_pci_ioda_iommu_bypass_supported()

When the return value type was changed from int to bool, a few places
were left unchanged; this fixes them.

We did not hit these failures in practice: the first one does not happen
at all, and the second one is only slightly more likely to happen, when
a user switches a 33..58-bit DMA capable device between the VFIO and
vendor drivers, and there are not many such devices.

Fixes: 2d6ad41b2c21 ("powerpc/powernv: use the generic iommu bypass code")
Signed-off-by: Alexey Kardashevskiy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 3ead4c237ed0e..9a9076a5686cd 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1836,7 +1836,7 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
 	struct pnv_ioda_pe *pe;

 	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
-		return -ENODEV;
+		return false;

 	pe = &phb->ioda.pe_array[pdn->pe_number];
 	if (pe->tce_bypass_enabled) {
@@ -1859,7 +1859,7 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
 		/* Configure the bypass mode */
 		s64 rc = pnv_pci_ioda_dma_64bit_bypass(pe);
 		if (rc)
-			return rc;
+			return false;
 		/* 4GB offset bypasses 32-bit space */
 		pdev->dev.archdata.dma_offset = (1ULL << 32);

 		return true;

From 33dda8c32714c1a8f318450af4d1f9f123e2ed24 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers
Date: Tue, 23 Apr 2019 14:11:14 -0700
Subject: [PATCH 066/205] powerpc/vdso: Drop unnecessary cc-ldoption

Towards the goal of removing cc-ldoption, it seems that --hash-style=
was added to binutils 2.17.50.0.2 in 2006. The minimal required version
of binutils for the kernel according to
Documentation/process/changes.rst is 2.20.
Suggested-by: Masahiro Yamada
Signed-off-by: Nick Desaulniers
Reviewed-by: Nicholas Piggin
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/vdso32/Makefile | 5 ++---
 arch/powerpc/kernel/vdso64/Makefile | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile
index ce199f6e4256d..06f54d947057f 100644
--- a/arch/powerpc/kernel/vdso32/Makefile
+++ b/arch/powerpc/kernel/vdso32/Makefile
@@ -26,9 +26,8 @@ GCOV_PROFILE := n
 KCOV_INSTRUMENT := n
 UBSAN_SANITIZE := n

-ccflags-y := -shared -fno-common -fno-builtin
-ccflags-y += -nostdlib -Wl,-soname=linux-vdso32.so.1 \
-	$(call cc-ldoption, -Wl$(comma)--hash-style=both)
+ccflags-y := -shared -fno-common -fno-builtin -nostdlib \
+	-Wl,-soname=linux-vdso32.so.1 -Wl,--hash-style=both
 asflags-y := -D__VDSO32__ -s

 obj-y += vdso32_wrapper.o
diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile
index 28e7d112aa2fc..32ebb3522ea19 100644
--- a/arch/powerpc/kernel/vdso64/Makefile
+++ b/arch/powerpc/kernel/vdso64/Makefile
@@ -12,9 +12,8 @@ GCOV_PROFILE := n
 KCOV_INSTRUMENT := n
 UBSAN_SANITIZE := n

-ccflags-y := -shared -fno-common -fno-builtin
-ccflags-y += -nostdlib -Wl,-soname=linux-vdso64.so.1 \
-	$(call cc-ldoption, -Wl$(comma)--hash-style=both)
+ccflags-y := -shared -fno-common -fno-builtin -nostdlib \
+	-Wl,-soname=linux-vdso64.so.1 -Wl,--hash-style=both
 asflags-y := -D__VDSO64__ -s

 obj-y += vdso64_wrapper.o

From 7e8039795a80bdf1418964b9cabef6168bc5d9a4 Mon Sep 17 00:00:00 2001
From: "Tobin C. Harding"
Date: Tue, 30 Apr 2019 11:09:23 +1000
Subject: [PATCH 067/205] powerpc/cacheinfo: Fix kobject memleak

Currently an error return from kobject_init_and_add() is not followed
by a call to kobject_put(), so the kobject's memory is leaked. Add a
call to kobject_put() in the error path of kobject_init_and_add().

Signed-off-by: Tobin C. Harding
Reviewed-by: Greg Kroah-Hartman
Reviewed-by: Tyrel Datwyler
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/cacheinfo.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index 53102764fd2f8..f2ed3ef4b129d 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -759,23 +759,22 @@ static void cacheinfo_create_index_dir(struct cache *cache, int index,

 	index_dir = kzalloc(sizeof(*index_dir), GFP_KERNEL);
 	if (!index_dir)
-		goto err;
+		return;

 	index_dir->cache = cache;

 	rc = kobject_init_and_add(&index_dir->kobj, &cache_index_type,
 				  cache_dir->kobj, "index%d", index);
-	if (rc)
-		goto err;
+	if (rc) {
+		kobject_put(&index_dir->kobj);
+		kfree(index_dir);
+		return;
+	}

 	index_dir->next = cache_dir->index;
 	cache_dir->index = index_dir;

 	cacheinfo_create_index_opt_attrs(index_dir);
-
-	return;
-err:
-	kfree(index_dir);
 }

 static void cacheinfo_sysfs_populate(unsigned int cpu_id,

From a5ae043de7678f189303559782f6057078459a41 Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre
Date: Wed, 13 Mar 2019 21:00:30 +0100
Subject: [PATCH 068/205] powerpc/64s: Remove 'dummy_copy_buffer'

Commit 2bf1071a8d50 ("powerpc/64s: Remove POWER9 DD1 support") removed
the use of 'dummy_copy_buffer' from __switch_to(). Since it is not used
anywhere else, remove it completely.
This removes the following warning:

  arch/powerpc/kernel/process.c:1156:17: error: 'dummy_copy_buffer' defined but not used

Suggested-by: Christophe Leroy
Signed-off-by: Mathieu Malaterre
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/process.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 64e1494d3a1d4..0c20173570731 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1152,11 +1152,6 @@ static inline void restore_sprs(struct thread_struct *old_thread,
 	thread_pkey_regs_restore(new_thread, old_thread);
 }

-#ifdef CONFIG_PPC_BOOK3S_64
-#define CP_SIZE 128
-static const u8 dummy_copy_buffer[CP_SIZE] __attribute__((aligned(CP_SIZE)));
-#endif
-
 struct task_struct *__switch_to(struct task_struct *prev,
 	struct task_struct *new)
 {

From 32eeb5614d3bf166e84fe69bb5f3a51a48cac7a1 Mon Sep 17 00:00:00 2001
From: YueHaibing
Date: Fri, 29 Mar 2019 23:44:56 +0800
Subject: [PATCH 069/205] ocxl: remove set but not used variables 'tid' and
 'lpid'

Fixes gcc '-Wunused-but-set-variable' warnings:

  drivers/misc/ocxl/link.c: In function 'xsl_fault_handler':
  drivers/misc/ocxl/link.c:187:17: warning: variable 'tid' set but not used
  drivers/misc/ocxl/link.c:187:6: warning: variable 'lpid' set but not used

They are never used and can be removed.

Signed-off-by: YueHaibing
Reviewed-by: Mukesh Ojha
Acked-by: Andrew Donnellan
Acked-by: Frederic Barrat
Signed-off-by: Michael Ellerman
---
 drivers/misc/ocxl/link.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c
index 04ec3d74f828c..c2eaa5d64176c 100644
--- a/drivers/misc/ocxl/link.c
+++ b/drivers/misc/ocxl/link.c
@@ -184,7 +184,7 @@ static irqreturn_t xsl_fault_handler(int irq, void *data)
 	u64 dsisr, dar, pe_handle;
 	struct pe_data *pe_data;
 	struct ocxl_process_element *pe;
-	int lpid, pid, tid;
+	int pid;
 	bool schedule = false;

 	read_irq(spa, &dsisr, &dar, &pe_handle);
@@ -192,9 +192,7 @@ static irqreturn_t xsl_fault_handler(int irq, void *data)
 	WARN_ON(pe_handle > SPA_PE_MASK);
 	pe = spa->spa_mem + pe_handle;

-	lpid = be32_to_cpu(pe->lpid);
 	pid = be32_to_cpu(pe->pid);
-	tid = be32_to_cpu(pe->tid);

 	/* We could be reading all null values here if the PE is being
 	 * removed while an interrupt kicks in. It's not supposed to
 	 * happen if the driver notified the AFU to terminate the

From 5b2a15296210d3b70e06d0f09a8e701ff74ccbe8 Mon Sep 17 00:00:00 2001
From: Anton Blanchard
Date: Thu, 4 Oct 2018 16:23:37 +1000
Subject: [PATCH 070/205] powerpc: Add doorbell tracepoints

When analysing sources of OS jitter, I noticed that doorbells cannot be
traced.
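For context, the two tracepoints below are stamped out of an existing event
class, so they inherit the class's prototype and output format and only add
new hook names. A sketch of the pattern, assuming ppc64_interrupt_class
records just the regs pointer, as its existing timer_interrupt users suggest:

	/* Sketch of the event-class pattern; the field layout is an assumption. */
	DECLARE_EVENT_CLASS(ppc64_interrupt_class,
		TP_PROTO(struct pt_regs *regs),
		TP_ARGS(regs),
		TP_STRUCT__entry(
			__field(struct pt_regs *, regs)
		),
		TP_fast_assign(
			__entry->regs = regs;
		),
		TP_printk("pt_regs=%p", __entry->regs)
	);

	/*
	 * Each DEFINE_EVENT below then generates trace_doorbell_entry() and
	 * trace_doorbell_exit() call sites for doorbell_exception() to use.
	 */

Once wired up, the events should be controllable like any other tracepoint,
typically via the tracefs files under events/powerpc/doorbell_entry/ and
events/powerpc/doorbell_exit/.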
Signed-off-by: Anton Blanchard
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/trace.h | 16 ++++++++++++++++
 arch/powerpc/kernel/dbell.c      |  3 +++
 2 files changed, 19 insertions(+)

diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h
index 58ef8c43a89d8..08cd60cd70b74 100644
--- a/arch/powerpc/include/asm/trace.h
+++ b/arch/powerpc/include/asm/trace.h
@@ -54,6 +54,22 @@ DEFINE_EVENT(ppc64_interrupt_class, timer_interrupt_exit,
 	TP_ARGS(regs)
 );

+#ifdef CONFIG_PPC_DOORBELL
+DEFINE_EVENT(ppc64_interrupt_class, doorbell_entry,
+
+	TP_PROTO(struct pt_regs *regs),
+
+	TP_ARGS(regs)
+);
+
+DEFINE_EVENT(ppc64_interrupt_class, doorbell_exit,
+
+	TP_PROTO(struct pt_regs *regs),
+
+	TP_ARGS(regs)
+);
+#endif
+
 #ifdef CONFIG_PPC_PSERIES
 extern int hcall_tracepoint_regfunc(void);
 extern void hcall_tracepoint_unregfunc(void);
diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c
index b6fe883b1016d..5ec3b38359251 100644
--- a/arch/powerpc/kernel/dbell.c
+++ b/arch/powerpc/kernel/dbell.c
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include <asm/trace.h>

 #ifdef CONFIG_SMP
@@ -81,6 +82,7 @@ void doorbell_exception(struct pt_regs *regs)
 	struct pt_regs *old_regs = set_irq_regs(regs);

 	irq_enter();
+	trace_doorbell_entry(regs);

 	ppc_msgsync();

@@ -91,6 +93,7 @@ void doorbell_exception(struct pt_regs *regs)

 	smp_ipi_demux_relaxed(); /* already performed the barrier */

+	trace_doorbell_exit(regs);
 	irq_exit();
 	set_irq_regs(old_regs);
 }

From d6e8a150850601277039a548ffcdddd1bfe3e365 Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar
Date: Mon, 29 Apr 2019 23:45:48 +0530
Subject: [PATCH 071/205] powerpc/powernv/mce: Reduce MCE console logs to
 fewer lines.

Also add the CPU number when displaying the MCE log. This helps keep the
logs readable when an MCE hits on multiple CPUs simultaneously.
Before the changes the MCE output was: Severe Machine check interrupt [Recovered] NIP [d00000000ba80280]: insert_slb_entry.constprop.0+0x278/0x2c0 [mcetest_slb] Initiator: CPU Error type: SLB [Multihit] Effective address: d00000000ba80280 After this patch series changes the MCE output will be: MCE: CPU80: machine check (Warning) Host SLB Multihit [Recovered] MCE: CPU80: NIP: [d00000000b550280] insert_slb_entry.constprop.0+0x278/0x2c0 [mcetest_slb] MCE: CPU80: Probable software error (some chance of hardware cause) UE in host application: MCE: CPU48: machine check (Severe) Host UE Load/Store DAR: 00007fffc6079a80 paddr: 0000000f8e260000 [Not recovered] MCE: CPU48: PID: 4584 Comm: find NIP: [0000000010023368] MCE: CPU48: Hardware error and for MCE in Guest: MCE: CPU80: machine check (Warning) Guest SLB Multihit DAR: 000001001b6e0320 [Recovered] MCE: CPU80: PID: 24765 Comm: qemu-system-ppc Guest NIP: [00007fffa309dc60] MCE: CPU80: Probable software error (some chance of hardware cause) Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mce.h | 2 +- arch/powerpc/kernel/mce.c | 89 ++++++++++++++++++---------------- 2 files changed, 49 insertions(+), 42 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index ad47fa8653240..c888ef9a3eafb 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -116,7 +116,7 @@ struct machine_check_event { enum MCE_Initiator initiator:8; /* 0x03 */ enum MCE_ErrorType error_type:8; /* 0x04 */ enum MCE_Disposition disposition:8; /* 0x05 */ - uint8_t reserved_1[2]; /* 0x06 */ + uint16_t cpu; /* 0x06 */ uint64_t gpr3; /* 0x08 */ uint64_t srr0; /* 0x10 */ uint64_t srr1; /* 0x18 */ diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index b5fec1f9751a1..25a8b20cbbdc3 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -112,6 +112,7 @@ void save_mce_event(struct pt_regs *regs, long handled, mce->srr1 = regs->msr; mce->gpr3 = regs->gpr[3]; mce->in_use = 1; + mce->cpu = get_paca()->paca_index; /* Mark it recovered if we have handled it and MSR(RI=1). */ if (handled && (regs->msr & MSR_RI)) @@ -310,7 +311,11 @@ static void machine_check_process_queued_event(struct irq_work *work) void machine_check_print_event_info(struct machine_check_event *evt, bool user_mode, bool in_guest) { - const char *level, *sevstr, *subtype; + const char *level, *sevstr, *subtype, *err_type; + uint64_t ea = 0, pa = 0; + int n = 0; + char dar_str[50]; + char pa_str[50]; static const char *mc_ue_types[] = { "Indeterminate", "Instruction fetch", @@ -384,101 +389,103 @@ void machine_check_print_event_info(struct machine_check_event *evt, break; } - printk("%s%s Machine check interrupt [%s]\n", level, sevstr, - evt->disposition == MCE_DISPOSITION_RECOVERED ? - "Recovered" : "Not recovered"); - - if (in_guest) { - printk("%s Guest NIP: %016llx\n", level, evt->srr0); - } else if (user_mode) { - printk("%s NIP: [%016llx] PID: %d Comm: %s\n", level, - evt->srr0, current->pid, current->comm); - } else { - printk("%s NIP [%016llx]: %pS\n", level, evt->srr0, - (void *)evt->srr0); - } - - printk("%s Initiator: %s\n", level, - evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown"); switch (evt->error_type) { case MCE_ERROR_TYPE_UE: + err_type = "UE"; subtype = evt->u.ue_error.ue_error_type < ARRAY_SIZE(mc_ue_types) ? 
mc_ue_types[evt->u.ue_error.ue_error_type] : "Unknown"; - printk("%s Error type: UE [%s]\n", level, subtype); if (evt->u.ue_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt->u.ue_error.effective_address); + ea = evt->u.ue_error.effective_address; if (evt->u.ue_error.physical_address_provided) - printk("%s Physical address: %016llx\n", - level, evt->u.ue_error.physical_address); + pa = evt->u.ue_error.physical_address; break; case MCE_ERROR_TYPE_SLB: + err_type = "SLB"; subtype = evt->u.slb_error.slb_error_type < ARRAY_SIZE(mc_slb_types) ? mc_slb_types[evt->u.slb_error.slb_error_type] : "Unknown"; - printk("%s Error type: SLB [%s]\n", level, subtype); if (evt->u.slb_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt->u.slb_error.effective_address); + ea = evt->u.slb_error.effective_address; break; case MCE_ERROR_TYPE_ERAT: + err_type = "ERAT"; subtype = evt->u.erat_error.erat_error_type < ARRAY_SIZE(mc_erat_types) ? mc_erat_types[evt->u.erat_error.erat_error_type] : "Unknown"; - printk("%s Error type: ERAT [%s]\n", level, subtype); if (evt->u.erat_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt->u.erat_error.effective_address); + ea = evt->u.erat_error.effective_address; break; case MCE_ERROR_TYPE_TLB: + err_type = "TLB"; subtype = evt->u.tlb_error.tlb_error_type < ARRAY_SIZE(mc_tlb_types) ? mc_tlb_types[evt->u.tlb_error.tlb_error_type] : "Unknown"; - printk("%s Error type: TLB [%s]\n", level, subtype); if (evt->u.tlb_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt->u.tlb_error.effective_address); + ea = evt->u.tlb_error.effective_address; break; case MCE_ERROR_TYPE_USER: + err_type = "User"; subtype = evt->u.user_error.user_error_type < ARRAY_SIZE(mc_user_types) ? mc_user_types[evt->u.user_error.user_error_type] : "Unknown"; - printk("%s Error type: User [%s]\n", level, subtype); if (evt->u.user_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt->u.user_error.effective_address); + ea = evt->u.user_error.effective_address; break; case MCE_ERROR_TYPE_RA: + err_type = "Real address"; subtype = evt->u.ra_error.ra_error_type < ARRAY_SIZE(mc_ra_types) ? mc_ra_types[evt->u.ra_error.ra_error_type] : "Unknown"; - printk("%s Error type: Real address [%s]\n", level, subtype); if (evt->u.ra_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt->u.ra_error.effective_address); + ea = evt->u.ra_error.effective_address; break; case MCE_ERROR_TYPE_LINK: + err_type = "Link"; subtype = evt->u.link_error.link_error_type < ARRAY_SIZE(mc_link_types) ? mc_link_types[evt->u.link_error.link_error_type] : "Unknown"; - printk("%s Error type: Link [%s]\n", level, subtype); if (evt->u.link_error.effective_address_provided) - printk("%s Effective address: %016llx\n", - level, evt->u.link_error.effective_address); + ea = evt->u.link_error.effective_address; break; default: case MCE_ERROR_TYPE_UNKNOWN: - printk("%s Error type: Unknown\n", level); + err_type = "Unknown"; + subtype = ""; break; } + + dar_str[0] = pa_str[0] = '\0'; + if (ea && evt->srr0 != ea) { + /* Load/Store address */ + n = sprintf(dar_str, "DAR: %016llx ", ea); + if (pa) + sprintf(dar_str + n, "paddr: %016llx ", pa); + } else if (pa) { + sprintf(pa_str, " paddr: %016llx", pa); + } + + printk("%sMCE: CPU%d: machine check (%s) %s %s %s %s[%s]\n", + level, evt->cpu, sevstr, in_guest ? 
"Guest" : "Host", + err_type, subtype, dar_str, + evt->disposition == MCE_DISPOSITION_RECOVERED ? + "Recovered" : "Not recovered"); + + if (in_guest || user_mode) { + printk("%sMCE: CPU%d: PID: %d Comm: %s %sNIP: [%016llx]%s\n", + level, evt->cpu, current->pid, current->comm, + in_guest ? "Guest " : "", evt->srr0, pa_str); + } else { + printk("%sMCE: CPU%d: NIP: [%016llx] %pS%s\n", + level, evt->cpu, evt->srr0, (void *)evt->srr0, pa_str); + } } EXPORT_SYMBOL_GPL(machine_check_print_event_info); From cda6618d060b5e8afc93e691d4bcd987f3dd4393 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Mon, 29 Apr 2019 23:45:55 +0530 Subject: [PATCH 072/205] powerpc/powernv/mce: Print correct severity for MCE error. Currently all machine check errors are printed as severe errors which isn't correct. Print soft errors as warning instead of severe errors. Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mce.h | 86 +++++++-------- arch/powerpc/kernel/mce.c | 5 +- arch/powerpc/kernel/mce_power.c | 144 ++++++++++++++------------ arch/powerpc/platforms/powernv/opal.c | 2 +- 4 files changed, 123 insertions(+), 114 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index c888ef9a3eafb..d6dc75f9e9bb2 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -31,7 +31,7 @@ enum MCE_Version { enum MCE_Severity { MCE_SEV_NO_ERROR = 0, MCE_SEV_WARNING = 1, - MCE_SEV_ERROR_SYNC = 2, + MCE_SEV_SEVERE = 2, MCE_SEV_FATAL = 3, }; @@ -110,73 +110,74 @@ enum MCE_LinkErrorType { }; struct machine_check_event { - enum MCE_Version version:8; /* 0x00 */ - uint8_t in_use; /* 0x01 */ - enum MCE_Severity severity:8; /* 0x02 */ - enum MCE_Initiator initiator:8; /* 0x03 */ - enum MCE_ErrorType error_type:8; /* 0x04 */ - enum MCE_Disposition disposition:8; /* 0x05 */ - uint16_t cpu; /* 0x06 */ - uint64_t gpr3; /* 0x08 */ - uint64_t srr0; /* 0x10 */ - uint64_t srr1; /* 0x18 */ - union { /* 0x20 */ + enum MCE_Version version:8; + u8 in_use; + enum MCE_Severity severity:8; + enum MCE_Initiator initiator:8; + enum MCE_ErrorType error_type:8; + enum MCE_Disposition disposition:8; + bool sync_error; + u16 cpu; + u64 gpr3; + u64 srr0; + u64 srr1; + union { struct { enum MCE_UeErrorType ue_error_type:8; - uint8_t effective_address_provided; - uint8_t physical_address_provided; - uint8_t reserved_1[5]; - uint64_t effective_address; - uint64_t physical_address; - uint8_t reserved_2[8]; + u8 effective_address_provided; + u8 physical_address_provided; + u8 reserved_1[5]; + u64 effective_address; + u64 physical_address; + u8 reserved_2[8]; } ue_error; struct { enum MCE_SlbErrorType slb_error_type:8; - uint8_t effective_address_provided; - uint8_t reserved_1[6]; - uint64_t effective_address; - uint8_t reserved_2[16]; + u8 effective_address_provided; + u8 reserved_1[6]; + u64 effective_address; + u8 reserved_2[16]; } slb_error; struct { enum MCE_EratErrorType erat_error_type:8; - uint8_t effective_address_provided; - uint8_t reserved_1[6]; - uint64_t effective_address; - uint8_t reserved_2[16]; + u8 effective_address_provided; + u8 reserved_1[6]; + u64 effective_address; + u8 reserved_2[16]; } erat_error; struct { enum MCE_TlbErrorType tlb_error_type:8; - uint8_t effective_address_provided; - uint8_t reserved_1[6]; - uint64_t effective_address; - uint8_t reserved_2[16]; + u8 effective_address_provided; + u8 reserved_1[6]; + u64 effective_address; + u8 reserved_2[16]; } tlb_error; struct { enum MCE_UserErrorType user_error_type:8; - 
uint8_t effective_address_provided; - uint8_t reserved_1[6]; - uint64_t effective_address; - uint8_t reserved_2[16]; + u8 effective_address_provided; + u8 reserved_1[6]; + u64 effective_address; + u8 reserved_2[16]; } user_error; struct { enum MCE_RaErrorType ra_error_type:8; - uint8_t effective_address_provided; - uint8_t reserved_1[6]; - uint64_t effective_address; - uint8_t reserved_2[16]; + u8 effective_address_provided; + u8 reserved_1[6]; + u64 effective_address; + u8 reserved_2[16]; } ra_error; struct { enum MCE_LinkErrorType link_error_type:8; - uint8_t effective_address_provided; - uint8_t reserved_1[6]; - uint64_t effective_address; - uint8_t reserved_2[16]; + u8 effective_address_provided; + u8 reserved_1[6]; + u64 effective_address; + u8 reserved_2[16]; } link_error; } u; }; @@ -194,6 +195,7 @@ struct mce_error_info { } u; enum MCE_Severity severity:8; enum MCE_Initiator initiator:8; + bool sync_error; }; #define MAX_MC_EVT 100 diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 25a8b20cbbdc3..71d245a387abd 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -122,6 +122,7 @@ void save_mce_event(struct pt_regs *regs, long handled, mce->initiator = mce_err->initiator; mce->severity = mce_err->severity; + mce->sync_error = mce_err->sync_error; /* * Populate the mce error_type and type-specific error_type. @@ -376,9 +377,9 @@ void machine_check_print_event_info(struct machine_check_event *evt, break; case MCE_SEV_WARNING: level = KERN_WARNING; - sevstr = ""; + sevstr = "Warning"; break; - case MCE_SEV_ERROR_SYNC: + case MCE_SEV_SEVERE: level = KERN_ERR; sevstr = "Severe"; break; diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 367fbfa2e835d..6647a31b85b29 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -133,106 +133,107 @@ struct mce_ierror_table { unsigned int error_subtype; unsigned int initiator; unsigned int severity; + bool sync_error; }; static const struct mce_ierror_table mce_p7_ierror_table[] = { { 0x00000000001c0000, 0x0000000000040000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000001c0000, 0x0000000000080000, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000001c0000, 0x00000000000c0000, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000001c0000, 0x0000000000100000, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */ - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000001c0000, 0x0000000000140000, true, MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000001c0000, 0x0000000000180000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000001c0000, 0x00000000001c0000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, -{ 0, 0, 0, 0, 0, 0 } }; + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0, 0, 0, 0, 0, 0, 0 } }; static const struct mce_ierror_table mce_p8_ierror_table[] = { { 0x00000000081c0000, 0x0000000000040000, true, MCE_ERROR_TYPE_UE, 
MCE_UE_ERROR_IFETCH, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000000080000, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x00000000000c0000, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000100000, true, MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000140000, true, MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000180000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x00000000001c0000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008000000, true, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008040000, true, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, -{ 0, 0, 0, 0, 0, 0 } }; + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0, 0, 0, 0, 0, 0, 0 } }; static const struct mce_ierror_table mce_p9_ierror_table[] = { { 0x00000000081c0000, 0x0000000000040000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000000080000, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x00000000000c0000, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000100000, true, MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000140000, true, MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000180000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x00000000001c0000, true, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH_FOREIGN, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008000000, true, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008040000, true, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x00000000080c0000, true, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, 
MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008100000, true, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008140000, false, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_STORE, - MCE_INITIATOR_CPU, MCE_SEV_FATAL, }, /* ASYNC is fatal */ + MCE_INITIATOR_CPU, MCE_SEV_FATAL, false }, /* ASYNC is fatal */ { 0x00000000081c0000, 0x0000000008180000, false, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_STORE_TIMEOUT, - MCE_INITIATOR_CPU, MCE_SEV_FATAL, }, /* ASYNC is fatal */ + MCE_INITIATOR_CPU, MCE_SEV_FATAL, false }, /* ASYNC is fatal */ { 0x00000000081c0000, 0x00000000081c0000, true, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, -{ 0, 0, 0, 0, 0, 0 } }; + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0, 0, 0, 0, 0, 0, 0 } }; struct mce_derror_table { unsigned long dsisr_value; @@ -241,103 +242,104 @@ struct mce_derror_table { unsigned int error_subtype; unsigned int initiator; unsigned int severity; + bool sync_error; }; static const struct mce_derror_table mce_p7_derror_table[] = { { 0x00008000, false, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00004000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000800, true, MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000400, true, MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000080, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000100, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000040, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */ - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, -{ 0, false, 0, 0, 0, 0 } }; + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, +{ 0, false, 0, 0, 0, 0, 0 } }; static const struct mce_derror_table mce_p8_derror_table[] = { { 0x00008000, false, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00004000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00002000, true, MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00001000, true, MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000800, true, MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000400, true, MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000200, true, MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, /* SECONDARY ERAT */ - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, 
MCE_SEV_WARNING, true }, { 0x00000080, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000100, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, -{ 0, false, 0, 0, 0, 0 } }; + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0, false, 0, 0, 0, 0, 0 } }; static const struct mce_derror_table mce_p9_derror_table[] = { { 0x00008000, false, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00004000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00002000, true, MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00001000, true, MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000800, true, MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000400, true, MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000200, false, MCE_ERROR_TYPE_USER, MCE_USER_ERROR_TLBIE, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000080, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000100, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000040, true, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000020, false, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000010, false, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000008, false, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD_STORE_FOREIGN, - MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, -{ 0, false, 0, 0, 0, 0 } }; + MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, +{ 0, false, 0, 0, 0, 0, 0 } }; static int mce_find_instr_ea_and_pfn(struct pt_regs *regs, uint64_t *addr, uint64_t *phys_addr) @@ -427,11 +429,12 @@ static int mce_handle_ierror(struct pt_regs *regs, mce_err->u.link_error_type = table[i].error_subtype; break; } + mce_err->sync_error = table[i].sync_error; mce_err->severity = table[i].severity; mce_err->initiator = table[i].initiator; if (table[i].nip_valid) { *addr = regs->nip; - if (mce_err->severity == MCE_SEV_ERROR_SYNC && + if (mce_err->sync_error && table[i].error_type == MCE_ERROR_TYPE_UE) { unsigned long pfn; @@ -448,8 +451,9 @@ static int mce_handle_ierror(struct pt_regs *regs, } mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN; - mce_err->severity = MCE_SEV_ERROR_SYNC; + mce_err->severity = MCE_SEV_SEVERE; mce_err->initiator = MCE_INITIATOR_CPU; + mce_err->sync_error = true; return 0; } @@ -519,11 +523,12 @@ static int mce_handle_derror(struct pt_regs *regs, 
mce_err->u.link_error_type = table[i].error_subtype; break; } + mce_err->sync_error = table[i].sync_error; mce_err->severity = table[i].severity; mce_err->initiator = table[i].initiator; if (table[i].dar_valid) *addr = regs->dar; - else if (mce_err->severity == MCE_SEV_ERROR_SYNC && + else if (mce_err->sync_error && table[i].error_type == MCE_ERROR_TYPE_UE) { /* * We do a maximum of 4 nested MCE calls, see @@ -539,8 +544,9 @@ static int mce_handle_derror(struct pt_regs *regs, return handled; mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN; - mce_err->severity = MCE_SEV_ERROR_SYNC; + mce_err->severity = MCE_SEV_SEVERE; mce_err->initiator = MCE_INITIATOR_CPU; + mce_err->sync_error = true; return 0; } diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 2b0eca104f86a..737c51d634801 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -505,7 +505,7 @@ static int opal_recover_mce(struct pt_regs *regs, recovered = 0; } - if (!recovered && evt->severity == MCE_SEV_ERROR_SYNC) { + if (!recovered && evt->sync_error) { /* * Try to kill processes if we get a synchronous machine check * (e.g., one caused by execution of this instruction). This From 50dbabe06a6e1c35980154ea1fac2ed6ad28652b Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Mon, 29 Apr 2019 23:46:02 +0530 Subject: [PATCH 073/205] powerpc/powernv/mce: Print additional information about MCE error. Print more information about an MCE error, namely whether it is a hardware or a software error. Some MCE errors can easily be categorized as hardware or software errors, e.g. UEs are due to a hardware error, whereas an error triggered by invalid usage of tlbie is a pure software bug. But not all MCE errors can easily be categorized as either software or hardware. There are errors like multihit errors which are usually the result of a software bug, but in some rare cases a hardware failure can cause a multihit error. In the past, we have seen a case where, after replacing a faulty chip, multihit errors stopped occurring. The same applies to parity errors, which are usually due to faulty hardware, but there is a chance that a multihit can also cause a parity error. For such errors it is difficult to determine what really caused them. Hence this patch classifies MCE errors into the following four categories: 1. Hardware error: UE and Link timeout failure errors. 2. Probable hardware error (some chance of software cause): SLB/ERAT/TLB Parity errors. 3. Software error: Invalid tlbie form. 4. Probable software error (some chance of hardware cause): SLB/ERAT/TLB Multihit errors.
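As an aside, this classification ends up as a plain bounds-checked table lookup in machine_check_print_event_info(). The following minimal user-space sketch shows that lookup (illustration only, not patch code; the enum values and strings mirror the patch, while the main() harness around them is hypothetical):

#include <stdio.h>

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

enum MCE_ErrorClass {
	MCE_ECLASS_UNKNOWN = 0,
	MCE_ECLASS_HARDWARE,
	MCE_ECLASS_HARD_INDETERMINATE,
	MCE_ECLASS_SOFTWARE,
	MCE_ECLASS_SOFT_INDETERMINATE,
};

static const char *mc_error_class[] = {
	"Unknown",
	"Hardware error",
	"Probable Hardware error (some chance of software cause)",
	"Software error",
	"Probable Software error (some chance of hardware cause)",
};

int main(void)
{
	/* A class outside the table (e.g. from newer firmware) falls
	 * back to "Unknown", exactly as the bounds check in the patch
	 * does. */
	unsigned int classes[] = { MCE_ECLASS_SOFT_INDETERMINATE, 7 };
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(classes); i++) {
		const char *s = classes[i] < ARRAY_SIZE(mc_error_class) ?
				mc_error_class[classes[i]] : "Unknown";
		printf("MCE: CPU%d: %s\n", 80, s);
	}
	return 0;
}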
Sample output: MCE: CPU80: machine check (Warning) Guest SLB Multihit DAR: 000001001b6e0320 [Recovered] MCE: CPU80: PID: 24765 Comm: qemu-system-ppc Guest NIP: [00007fffa309dc60] MCE: CPU80: Probable Software error (some chance of hardware cause) Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mce.h | 10 +++ arch/powerpc/kernel/mce.c | 12 ++++ arch/powerpc/kernel/mce_power.c | 107 +++++++++++++++++++------------- 3 files changed, 86 insertions(+), 43 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index d6dc75f9e9bb2..23247a132ce86 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -56,6 +56,14 @@ enum MCE_ErrorType { MCE_ERROR_TYPE_LINK = 7, }; +enum MCE_ErrorClass { + MCE_ECLASS_UNKNOWN = 0, + MCE_ECLASS_HARDWARE, + MCE_ECLASS_HARD_INDETERMINATE, + MCE_ECLASS_SOFTWARE, + MCE_ECLASS_SOFT_INDETERMINATE, +}; + enum MCE_UeErrorType { MCE_UE_ERROR_INDETERMINATE = 0, MCE_UE_ERROR_IFETCH = 1, @@ -115,6 +123,7 @@ struct machine_check_event { enum MCE_Severity severity:8; enum MCE_Initiator initiator:8; enum MCE_ErrorType error_type:8; + enum MCE_ErrorClass error_class:8; enum MCE_Disposition disposition:8; bool sync_error; u16 cpu; @@ -195,6 +204,7 @@ struct mce_error_info { } u; enum MCE_Severity severity:8; enum MCE_Initiator initiator:8; + enum MCE_ErrorClass error_class:8; bool sync_error; }; diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 71d245a387abd..4581377cfc986 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -123,6 +123,7 @@ void save_mce_event(struct pt_regs *regs, long handled, mce->initiator = mce_err->initiator; mce->severity = mce_err->severity; mce->sync_error = mce_err->sync_error; + mce->error_class = mce_err->error_class; /* * Populate the mce error_type and type-specific error_type. @@ -363,6 +364,13 @@ void machine_check_print_event_info(struct machine_check_event *evt, "Store (timeout)", "Page table walk Load/Store (timeout)", }; + static const char *mc_error_class[] = { + "Unknown", + "Hardware error", + "Probable Hardware error (some chance of software cause)", + "Software error", + "Probable Software error (some chance of hardware cause)", + }; /* Print things out */ if (evt->version != MCE_V1) { @@ -487,6 +495,10 @@ void machine_check_print_event_info(struct machine_check_event *evt, printk("%sMCE: CPU%d: NIP: [%016llx] %pS%s\n", level, evt->cpu, evt->srr0, (void *)evt->srr0, pa_str); } + + subtype = evt->error_class < ARRAY_SIZE(mc_error_class) ? 
+ mc_error_class[evt->error_class] : "Unknown"; + printk("%sMCE: CPU%d: %s\n", level, evt->cpu, subtype); } EXPORT_SYMBOL_GPL(machine_check_print_event_info); diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 6647a31b85b29..b5e876efe864a 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -131,6 +131,7 @@ struct mce_ierror_table { bool nip_valid; /* nip is a valid indicator of faulting address */ unsigned int error_type; unsigned int error_subtype; + unsigned int error_class; unsigned int initiator; unsigned int severity; bool sync_error; @@ -138,99 +139,103 @@ struct mce_ierror_table { static const struct mce_ierror_table mce_p7_ierror_table[] = { { 0x00000000001c0000, 0x0000000000040000, true, - MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000001c0000, 0x0000000000080000, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_ECLASS_HARD_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000001c0000, 0x00000000000c0000, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000001c0000, 0x0000000000100000, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */ + MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000001c0000, 0x0000000000140000, true, - MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000001c0000, 0x0000000000180000, true, - MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000001c0000, 0x00000000001c0000, true, - MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0, 0, 0, 0, 0, 0, 0 } }; static const struct mce_ierror_table mce_p8_ierror_table[] = { { 0x00000000081c0000, 0x0000000000040000, true, - MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000000080000, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_ECLASS_HARD_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x00000000000c0000, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000100000, true, - MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT, + MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000140000, true, - MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000180000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, + MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x00000000001c0000, true, - MCE_ERROR_TYPE_UE, 
MCE_UE_ERROR_IFETCH, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008000000, true, - MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT, + MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_IFETCH_TIMEOUT, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008040000, true, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT, + MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0, 0, 0, 0, 0, 0, 0 } }; static const struct mce_ierror_table mce_p9_ierror_table[] = { { 0x00000000081c0000, 0x0000000000040000, true, - MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000000080000, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_ECLASS_HARD_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x00000000000c0000, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000100000, true, - MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT, + MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000140000, true, - MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000000081c0000, 0x0000000000180000, true, - MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x00000000001c0000, true, - MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH_FOREIGN, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH_FOREIGN, MCE_ECLASS_SOFTWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008000000, true, - MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT, + MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_IFETCH_TIMEOUT, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008040000, true, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT, + MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x00000000080c0000, true, - MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH, MCE_ECLASS_SOFTWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008100000, true, - MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH, MCE_ECLASS_SOFTWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000000081c0000, 0x0000000008140000, false, - MCE_ERROR_TYPE_RA, MCE_RA_ERROR_STORE, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_STORE, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_FATAL, false }, /* ASYNC is fatal */ { 0x00000000081c0000, 0x0000000008180000, false, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_STORE_TIMEOUT, MCE_INITIATOR_CPU, MCE_SEV_FATAL, false }, /* ASYNC is fatal */ -{ 0x00000000081c0000, 0x00000000081c0000, true, +{ 0x00000000081c0000, 0x00000000081c0000, true, MCE_ECLASS_HARDWARE, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0, 0, 0, 0, 
0, 0, 0 } }; @@ -240,6 +245,7 @@ struct mce_derror_table { bool dar_valid; /* dar is a valid indicator of faulting address */ unsigned int error_type; unsigned int error_subtype; + unsigned int error_class; unsigned int initiator; unsigned int severity; bool sync_error; @@ -247,97 +253,108 @@ struct mce_derror_table { static const struct mce_derror_table mce_p7_derror_table[] = { { 0x00008000, false, - MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00004000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE, + MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000800, true, - MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, + MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000400, true, - MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000080, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000100, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_ECLASS_HARD_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000040, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */ + MCE_ECLASS_HARD_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0, false, 0, 0, 0, 0, 0 } }; static const struct mce_derror_table mce_p8_derror_table[] = { { 0x00008000, false, - MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00004000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE, + MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00002000, true, - MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT, + MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00001000, true, MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT, + MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000800, true, - MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, + MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000400, true, - MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000200, true, MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, /* SECONDARY ERAT */ + MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000080, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ + MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000100, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_ECLASS_HARD_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0, false, 0, 0, 0, 0, 0 } }; static const struct mce_derror_table mce_p9_derror_table[] = { { 0x00008000, false, - MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 
0x00004000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE, + MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00002000, true, - MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT, + MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00001000, true, MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT, + MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000800, true, - MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, + MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000400, true, - MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000200, false, - MCE_ERROR_TYPE_USER, MCE_USER_ERROR_TLBIE, + MCE_ERROR_TYPE_USER, MCE_USER_ERROR_TLBIE, MCE_ECLASS_SOFTWARE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000080, true, MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */ + MCE_ECLASS_SOFT_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_WARNING, true }, { 0x00000100, true, - MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, MCE_ECLASS_HARD_INDETERMINATE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000040, true, - MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000020, false, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE, + MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000010, false, MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN, + MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0x00000008, false, - MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD_STORE_FOREIGN, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD_STORE_FOREIGN, MCE_ECLASS_HARDWARE, MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0, false, 0, 0, 0, 0, 0 } }; @@ -406,6 +423,7 @@ static int mce_handle_ierror(struct pt_regs *regs, /* now fill in mce_error_info */ mce_err->error_type = table[i].error_type; + mce_err->error_class = table[i].error_class; switch (table[i].error_type) { case MCE_ERROR_TYPE_UE: mce_err->u.ue_error_type = table[i].error_subtype; @@ -451,6 +469,7 @@ static int mce_handle_ierror(struct pt_regs *regs, } mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN; + mce_err->error_class = MCE_ECLASS_UNKNOWN; mce_err->severity = MCE_SEV_SEVERE; mce_err->initiator = MCE_INITIATOR_CPU; mce_err->sync_error = true; @@ -500,6 +519,7 @@ static int mce_handle_derror(struct pt_regs *regs, /* now fill in mce_error_info */ mce_err->error_type = table[i].error_type; + mce_err->error_class = table[i].error_class; switch (table[i].error_type) { case MCE_ERROR_TYPE_UE: mce_err->u.ue_error_type = table[i].error_subtype; @@ -544,6 +564,7 @@ static int mce_handle_derror(struct pt_regs *regs, return handled; mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN; + mce_err->error_class = MCE_ECLASS_UNKNOWN; mce_err->severity = MCE_SEV_SEVERE; mce_err->initiator = MCE_INITIATOR_CPU; mce_err->sync_error = true; From 2c474c03505677cfd987d52e8bf42abe8c270529 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 30 Apr 2019 13:29:07 +0530 Subject: [PATCH 074/205] powerpc/mm/radix: Fix kernel crash when running subpage protect test This patch fixes the below crash by making sure we touch the subpage protection related structures 
only if we know they are allocated on the platform. With radix translation we don't allocate the hash context at all, and trying to access subpage_prot_table results in: Faulting instruction address: 0xc00000000008bdb4 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV .... NIP [c00000000008bdb4] sys_subpage_prot+0x74/0x590 LR [c00000000000b688] system_call+0x5c/0x70 Call Trace: [c00020002c6b7d30] [c00020002c6b7d90] 0xc00020002c6b7d90 (unreliable) [c00020002c6b7e20] [c00000000000b688] system_call+0x5c/0x70 Instruction dump: fb61ffd8 fb81ffe0 fba1ffe8 fbc1fff0 fbe1fff8 f821ff11 e92d1178 f9210068 39200000 e92d0968 ebe90630 e93f03e8 60000000 3860fffe e9410068 We also move the subpage_prot_table access under mmap_sem, to avoid a race between two parallel subpage_prot syscalls. Fixes: 701101865f5d ("powerpc/mm: Reduce memory usage for mm_context_t for radix") Reported-by: Sachin Sant Signed-off-by: Aneesh Kumar K.V Tested-by: Sachin Sant Signed-off-by: Michael Ellerman --- arch/powerpc/mm/subpage-prot.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index c9dff4e1f295c..473dd430e3063 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -90,16 +90,18 @@ static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, static void subpage_prot_clear(unsigned long addr, unsigned long len) { struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); + struct subpage_prot_table *spt; u32 **spm, *spp; unsigned long i; size_t nw; unsigned long next, limit; + down_write(&mm->mmap_sem); + + spt = mm_ctx_subpage_prot(&mm->context); if (!spt) - return ; + goto err_out; - down_write(&mm->mmap_sem); limit = addr + len; if (limit > spt->maxaddr) limit = spt->maxaddr; @@ -127,6 +129,8 @@ static void subpage_prot_clear(unsigned long addr, unsigned long len) /* now flush any existing HPTEs for the range */ hpte_flush_range(mm, addr, nw); } + +err_out: up_write(&mm->mmap_sem); } @@ -189,7 +193,7 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, unsigned long, len, u32 __user *, map) { struct mm_struct *mm = current->mm; - struct subpage_prot_table *spt = mm_ctx_subpage_prot(&mm->context); + struct subpage_prot_table *spt; u32 **spm, *spp; unsigned long i; size_t nw; @@ -219,6 +223,7 @@ SYSCALL_DEFINE3(subpage_prot, unsigned long, addr, down_write(&mm->mmap_sem); + spt = mm_ctx_subpage_prot(&mm->context); if (!spt) { /* * Allocate subpage prot table if not already done. From e620d45065c7b5b8d6ae11217c09c09380103b83 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 16 Jan 2019 14:47:44 -0200 Subject: [PATCH 075/205] powerpc/tm: Avoid machine crash on rt_sigreturn() There is a kernel crash that happens if rt_sigreturn() is called inside a transactional block. This crash happens if the kernel hits an in-kernel page fault when accessing userspace memory, usually through copy_ckvsx_to_user(). A major page fault calls the might_sleep() function, which can cause a task reschedule. A task reschedule (switch_to()) reclaims and recheckpoints the TM state, but, in the signal return path, the checkpointed memory was already reclaimed, thus the exception stack holds an MSR with MSR[TS]=0. When the code returns from might_sleep() after a task reschedule has happened, the task resumes with the memory recheckpointed and CPU MSR[TS] = suspended.
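A minimal user-space sketch of the state mismatch just described (illustration only, not patch code; the MSR_TS_S/MSR_TS_T bit positions follow the kernel's reg.h definitions, the rest is a hypothetical model):

#include <stdio.h>
#include <stdint.h>

#define MSR_TS_S	(1ULL << 33)	/* transaction suspended */
#define MSR_TS_T	(1ULL << 34)	/* transaction active */
#define MSR_TS_MASK	(MSR_TS_S | MSR_TS_T)

int main(void)
{
	uint64_t stack_msr = 0;		/* MSR saved at exception entry: TS = 0 */
	uint64_t regs_msr = MSR_TS_S;	/* regs->msr still carries TS != 0 */
	uint64_t cpu_msr = 0;		/* CPU MSR[TS] when schedule() runs */

	/* might_sleep() -> schedule() -> switch_to(): the task gets
	 * recheckpointed because regs->msr[TS] != 0 ... */
	cpu_msr |= regs_msr & MSR_TS_MASK;

	/* ... so at RFID time the saved stack MSR says TS = 0 while the
	 * CPU is suspended: the invalid transition raising the "TM Bad
	 * Thing" shown below. */
	printf("stack TS=%#llx cpu TS=%#llx -> %s\n",
	       (unsigned long long)(stack_msr & MSR_TS_MASK),
	       (unsigned long long)(cpu_msr & MSR_TS_MASK),
	       (stack_msr & MSR_TS_MASK) == (cpu_msr & MSR_TS_MASK) ?
	       "ok" : "TM Bad Thing");
	return 0;
}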
This means that there is a side effect at might_sleep() if it is called with CPU MSR[TS] = 0 and the task has regs->msr[TS] != 0. This side effect can cause a TM Bad Thing: at the exception entrance the stack saves MSR[TS]=0, and this is what will be used at RFID, but the processor has MSR[TS] = Suspended, so this transition is invalid and a TM Bad Thing is raised, causing the following crash: Unexpected TM Bad Thing exception at c00000000000e9ec (msr 0x8000000302a03031) tm_scratch=800000010280b033 cpu 0xc: Vector: 700 (Program Check) at [c00000003ff1fd70] pc: c00000000000e9ec: fast_exception_return+0x100/0x1bc lr: c000000000032948: handle_rt_signal64+0xb8/0xaf0 sp: c0000004263ebc40 msr: 8000000302a03031 current = 0xc000000415050300 paca = 0xc00000003ffc4080 irqmask: 0x03 irq_happened: 0x01 pid = 25006, comm = sigfuz Linux version 5.0.0-rc1-00001-g3bd6e94bec12 (breno@debian) (gcc version 8.2.0 (Debian 8.2.0-3)) #899 SMP Mon Jan 7 11:30:07 EST 2019 WARNING: exception is not recoverable, can't continue enter ? for help [c0000004263ebc40] c000000000032948 handle_rt_signal64+0xb8/0xaf0 (unreliable) [c0000004263ebd30] c000000000022780 do_notify_resume+0x2f0/0x430 [c0000004263ebe20] c00000000000e844 ret_from_except_lite+0x70/0x74 --- Exception: c00 (System Call) at 00007fffbaac400c SP (7fffeca90f40) is in userspace The solution for this problem is to run the sigreturn code with regs->msr[TS] disabled, thus avoiding the side effect above. This does not seem to be a problem, since regs->msr will be replaced by the ucontext value anyway, so it is being flushed already; in this case, it is just flushed earlier. Signed-off-by: Breno Leitao Acked-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/signal_64.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 6794466f64200..06c299ef61322 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -565,7 +565,7 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, preempt_disable(); /* pull in MSR TS bits from user context */ - regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK); + regs->msr |= msr & MSR_TS_MASK; /* * Ensure that TM is enabled in regs->msr before we leave the signal @@ -745,6 +745,31 @@ SYSCALL_DEFINE0(rt_sigreturn) if (MSR_TM_SUSPENDED(mfmsr())) tm_reclaim_current(0); + /* + * Disable MSR[TS] bit also, so, if there is an exception in the + * code below (as a page fault in copy_ckvsx_to_user()), it does + * not recheckpoint this task if there was a context switch inside + * the exception. + * + * A major page fault can indirectly call schedule(). A reschedule + * process in the middle of an exception can have a side effect + * (Changing the CPU MSR[TS] state), since schedule() is called + * with the CPU MSR[TS] disable and returns with MSR[TS]=Suspended + * (switch_to() calls tm_recheckpoint() for the 'new' process). In + * this case, the process continues to be the same in the CPU, but + * the CPU state just changed. + * + * This can cause a TM Bad Thing, since the MSR in the stack will + * have the MSR[TS]=0, and this is what will be used to RFID. + * + * Clearing MSR[TS] state here will avoid a recheckpoint if there + * is any process reschedule in kernel space. The MSR[TS] state + * does not need to be saved also, since it will be replaced with + * the MSR[TS] that came from user context later, at + * restore_tm_sigcontexts. 
+ */ + regs->msr &= ~MSR_TS_MASK; + if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR])) goto badframe; if (MSR_TM_ACTIVE(msr)) { From a1ac2a9c4f98482e49305ab5551b7b32f9cac39b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 28 Mar 2019 13:03:45 +0000 Subject: [PATCH 076/205] powerpc/book3e: drop BUG_ON() in map_kernel_page() early_alloc_pgtable() never returns NULL as it panics on failure. This patch drops the three BUG_ON() calls which check that the value returned by early_alloc_pgtable() is not NULL. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable-book3e.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/pgtable-book3e.c index 1032ef7aaf62a..390a6d0b216d2 100644 --- a/arch/powerpc/mm/pgtable-book3e.c +++ b/arch/powerpc/mm/pgtable-book3e.c @@ -98,20 +98,17 @@ int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) #ifndef __PAGETABLE_PUD_FOLDED if (pgd_none(*pgdp)) { pudp = early_alloc_pgtable(PUD_TABLE_SIZE); - BUG_ON(pudp == NULL); pgd_populate(&init_mm, pgdp, pudp); } #endif /* !__PAGETABLE_PUD_FOLDED */ pudp = pud_offset(pgdp, ea); if (pud_none(*pudp)) { pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); - BUG_ON(pmdp == NULL); pud_populate(&init_mm, pudp, pmdp); } pmdp = pmd_offset(pudp, ea); if (!pmd_present(*pmdp)) { ptep = early_alloc_pgtable(PAGE_SIZE); - BUG_ON(ptep == NULL); pmd_populate_kernel(&init_mm, pmdp, ptep); } ptep = pte_offset_kernel(pmdp, ea); From 71faf8145cdc20f22aa398eb7b206b33826cf2bd Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 28 Mar 2019 13:19:47 +0000 Subject: [PATCH 077/205] powerpc/nohash64: clean pgtable.h TRANSPARENT_HUGEPAGE is only supported by book3s. VMEMMAP_REGION_ID is never used. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/64/pgtable.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 0384a3302fb60..c8e6a9a5bc33f 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -23,11 +23,7 @@ PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT) #define PGTABLE_RANGE (ASM_CONST(1) << PGTABLE_EADDR_SIZE) -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define PMD_CACHE_INDEX (PMD_INDEX_SIZE + 1) -#else #define PMD_CACHE_INDEX PMD_INDEX_SIZE -#endif #define PUD_CACHE_INDEX PUD_INDEX_SIZE /* @@ -73,7 +69,6 @@ #define VMALLOC_REGION_ID (REGION_ID(VMALLOC_START)) #define KERNEL_REGION_ID (REGION_ID(PAGE_OFFSET)) -#define VMEMMAP_REGION_ID (0xfUL) /* Server only */ #define USER_REGION_ID (0UL) /* From 9d9f2cccde952126185e3336af0d4dc62eb254ad Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 29 Mar 2019 09:59:59 +0000 Subject: [PATCH 078/205] powerpc/mm: change #include "mmu_decl.h" to <mm/mmu_decl.h> This patch makes inclusion of mmu_decl.h independent of the location of the file including it. 
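As a toy illustration of that difference (not patch code; the paths and the -I flag are assumptions standing in for the kernel build's real header search setup): a quoted include is looked up relative to the including file's directory, so it breaks when the file moves, while the bracketed form is resolved against the compiler's search path and works from any subdirectory, which the move to mm/book3s64/ in the next patch relies on.

/* Hypothetical tree:
 *   demo/mm/mmu_decl.h         - the shared header
 *   demo/mm/book3s64/example.c - this file, moved into a subdirectory
 * Built as: gcc -Idemo -c demo/mm/book3s64/example.c
 */
#include <mm/mmu_decl.h>   /* resolved via -Idemo, works from anywhere */
/* #include "mmu_decl.h"      would be searched relative to book3s64/
 *                            first, and no longer resolve after the move */

int main(void) { return 0; }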
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/40x_mmu.c | 2 +- arch/powerpc/mm/44x_mmu.c | 2 +- arch/powerpc/mm/8xx_mmu.c | 2 +- arch/powerpc/mm/dma-noncoherent.c | 2 +- arch/powerpc/mm/fsl_booke_mmu.c | 2 +- arch/powerpc/mm/init_32.c | 2 +- arch/powerpc/mm/init_64.c | 2 +- arch/powerpc/mm/mem.c | 2 +- arch/powerpc/mm/mmu_context_nohash.c | 2 +- arch/powerpc/mm/pgtable-book3e.c | 2 +- arch/powerpc/mm/pgtable-book3s64.c | 2 +- arch/powerpc/mm/pgtable-hash64.c | 2 +- arch/powerpc/mm/pgtable_32.c | 2 +- arch/powerpc/mm/pgtable_64.c | 2 +- arch/powerpc/mm/ppc_mmu_32.c | 2 +- arch/powerpc/mm/tlb_hash32.c | 2 +- arch/powerpc/mm/tlb_nohash.c | 2 +- 17 files changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c index b9cf6f8764b00..460459b6f53ee 100644 --- a/arch/powerpc/mm/40x_mmu.c +++ b/arch/powerpc/mm/40x_mmu.c @@ -49,7 +49,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> extern int __map_without_ltlbs; /* diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c index aad127acdbaaa..c07983ebc02e3 100644 --- a/arch/powerpc/mm/44x_mmu.c +++ b/arch/powerpc/mm/44x_mmu.c @@ -31,7 +31,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> /* Used by the 44x TLB replacement exception handler. * Just needed it declared someplace. diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index 87648b58d2958..70d55b615b628 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -17,7 +17,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #define IMMR_SIZE (FIX_IMMR_SIZE << PAGE_SHIFT) diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index b5d2658c26afb..2f6154b763286 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -36,7 +36,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> /* * This address range defaults to a value that is safe for all diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index 210cbc1faf638..71a1a36751dde 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -54,7 +54,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> unsigned int tlbcam_index; diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 80cc97cd88782..3eb4cb09749c6 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -47,7 +47,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL) /* The amount of lowmem must be within 0xF0000000 - KERNELBASE. 
*/ diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index a4c155af15975..45b02fa11cd81 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -66,7 +66,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> phys_addr_t memstart_addr = ~0; EXPORT_SYMBOL_GPL(memstart_addr); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index e12bec98366fa..105c58f8900aa 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -54,7 +54,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #ifndef CPU_FTR_COHERENT_ICACHE #define CPU_FTR_COHERENT_ICACHE 0 /* XXX for now */ diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 1945c5f19f5ef..ae4505d5b4b87 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -52,7 +52,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> /* * The MPC8xx has only 16 contexts. We rotate through them on each task switch. diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/pgtable-book3e.c index 390a6d0b216d2..f296c2e88b09c 100644 --- a/arch/powerpc/mm/pgtable-book3e.c +++ b/arch/powerpc/mm/pgtable-book3e.c @@ -15,7 +15,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #ifdef CONFIG_SPARSEMEM_VMEMMAP /* diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index a4341aba0af4d..16bda049187ab 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -17,7 +17,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #include unsigned long __pmd_frag_nr; diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 097a3b3538b15..1fd025dba4a3a 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -19,7 +19,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #define CREATE_TRACE_POINTS #include diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 6e56a6240bfa4..c9cdbb84d31f8 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -36,7 +36,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> unsigned long ioremap_bot; EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 95ad2a09501c8..95ed765194115 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -52,7 +52,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index bf1de3ca39bce..1db55159031ce 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -34,7 +34,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> struct hash_pte *Hash, *Hash_end; unsigned long Hash_size, Hash_mask; diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c index cf8472cf3d593..8d56f0417f871 100644 --- a/arch/powerpc/mm/tlb_hash32.c +++ b/arch/powerpc/mm/tlb_hash32.c @@ -32,7 +32,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> /* * Called when unmapping pages to flush entries from the TLB/hash table. diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index 088e0a6b5adea..704e613a0b144 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -46,7 +46,7 @@ #include #include -#include "mmu_decl.h" +#include <mm/mmu_decl.h> /* * This struct lists the sw-supported page sizes. 
The hardawre MMU may support From 47d99948eee48a84a4b242c17915a4ff59a29b5d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 29 Mar 2019 10:00:00 +0000 Subject: [PATCH 079/205] powerpc/mm: Move book3s64 specifics in subdirectory mm/book3s64 Many files in arch/powerpc/mm are only for book3S64. This patch creates a subdirectory for them. Signed-off-by: Christophe Leroy [mpe: Update the selftest sym links, shorten new filenames, cleanup some whitespace and formatting in the new files.] Signed-off-by: Michael Ellerman --- arch/powerpc/mm/Makefile | 25 +------- arch/powerpc/mm/book3s64/Makefile | 24 +++++++ .../mm/{hash64_4k.c => book3s64/hash_4k.c} | 2 +- .../mm/{hash64_64k.c => book3s64/hash_64k.c} | 2 +- .../hash_hugepage.c} | 2 +- .../hash_hugetlbpage.c} | 15 +++-- .../hash_native.c} | 0 .../hash_pgtable.c} | 0 .../mm/{tlb_hash64.c => book3s64/hash_tlb.c} | 18 ++++-- .../hash_utils.c} | 50 ++++++++++----- .../iommu_api.c} | 0 .../mmu_context.c} | 0 .../pgtable.c} | 0 arch/powerpc/mm/{ => book3s64}/pkeys.c | 0 .../radix_hugetlbpage.c} | 0 .../radix_pgtable.c} | 62 +++++++++---------- .../mm/{tlb-radix.c => book3s64/radix_tlb.c} | 0 arch/powerpc/mm/{ => book3s64}/slb.c | 3 +- .../subpage_prot.c} | 0 arch/powerpc/mm/{ => book3s64}/vphn.c | 6 +- arch/powerpc/mm/{ => book3s64}/vphn.h | 3 +- arch/powerpc/mm/numa.c | 2 +- tools/testing/selftests/powerpc/vphn/vphn.c | 2 +- tools/testing/selftests/powerpc/vphn/vphn.h | 2 +- 24 files changed, 126 insertions(+), 92 deletions(-) create mode 100644 arch/powerpc/mm/book3s64/Makefile rename arch/powerpc/mm/{hash64_4k.c => book3s64/hash_4k.c} (98%) rename arch/powerpc/mm/{hash64_64k.c => book3s64/hash_64k.c} (99%) rename arch/powerpc/mm/{hugepage-hash64.c => book3s64/hash_hugepage.c} (98%) rename arch/powerpc/mm/{hugetlbpage-hash64.c => book3s64/hash_hugetlbpage.c} (93%) rename arch/powerpc/mm/{hash_native_64.c => book3s64/hash_native.c} (100%) rename arch/powerpc/mm/{pgtable-hash64.c => book3s64/hash_pgtable.c} (100%) rename arch/powerpc/mm/{tlb_hash64.c => book3s64/hash_tlb.c} (95%) rename arch/powerpc/mm/{hash_utils_64.c => book3s64/hash_utils.c} (98%) rename arch/powerpc/mm/{mmu_context_iommu.c => book3s64/iommu_api.c} (100%) rename arch/powerpc/mm/{mmu_context_book3s64.c => book3s64/mmu_context.c} (100%) rename arch/powerpc/mm/{pgtable-book3s64.c => book3s64/pgtable.c} (100%) rename arch/powerpc/mm/{ => book3s64}/pkeys.c (100%) rename arch/powerpc/mm/{hugetlbpage-radix.c => book3s64/radix_hugetlbpage.c} (100%) rename arch/powerpc/mm/{pgtable-radix.c => book3s64/radix_pgtable.c} (96%) rename arch/powerpc/mm/{tlb-radix.c => book3s64/radix_tlb.c} (100%) rename arch/powerpc/mm/{ => book3s64}/slb.c (99%) rename arch/powerpc/mm/{subpage-prot.c => book3s64/subpage_prot.c} (100%) rename arch/powerpc/mm/{ => book3s64}/vphn.c (94%) rename arch/powerpc/mm/{ => book3s64}/vphn.h (98%) diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 3c1bd9fa23cd9..a137fdf775e25 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -5,53 +5,34 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) -CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) - obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ init-common.o mmu_context.o drmem.o obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ tlb_nohash_low.o obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o -hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o -obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o 
slb.o \ - $(hash64-y) mmu_context_book3s64.o \ - pgtable-book3s64.o pgtable-frag.o +obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ +obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-frag.o obj-$(CONFIG_PPC32) += pgtable-frag.o -obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o obj-$(CONFIG_PPC_BOOK3S_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o -obj-$(CONFIG_PPC_BOOK3S) += tlb_hash$(BITS).o -ifdef CONFIG_PPC_BOOK3S_64 -obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o -obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o -endif +obj-$(CONFIG_PPC_BOOK3S_32) += tlb_hash32.o obj-$(CONFIG_40x) += 40x_mmu.o obj-$(CONFIG_44x) += 44x_mmu.o obj-$(CONFIG_PPC_8xx) += 8xx_mmu.o obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o -obj-$(CONFIG_PPC_SPLPAR) += vphn.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-y += hugetlbpage.o ifdef CONFIG_HUGETLB_PAGE -obj-$(CONFIG_PPC_BOOK3S_64) += hugetlbpage-hash64.o -obj-$(CONFIG_PPC_RADIX_MMU) += hugetlbpage-radix.o obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o endif -obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o -obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o -obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o obj-$(CONFIG_PPC_PTDUMP) += ptdump/ -obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o # Disable kcov instrumentation on sensitive code # This is necessary for booting with kcov enabled on book3e machines KCOV_INSTRUMENT_tlb_nohash.o := n KCOV_INSTRUMENT_fsl_booke_mmu.o := n - -# Instrumenting the SLB fault path can lead to duplicate SLB entries -KCOV_INSTRUMENT_slb.o := n diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile new file mode 100644 index 0000000000000..974b4fc19f4f1 --- /dev/null +++ b/arch/powerpc/mm/book3s64/Makefile @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-y := $(NO_MINIMAL_TOC) + +CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) + +obj-y += hash_pgtable.o hash_utils.o slb.o \ + mmu_context.o pgtable.o hash_tlb.o +obj-$(CONFIG_PPC_NATIVE) += hash_native.o +obj-$(CONFIG_PPC_RADIX_MMU) += radix_pgtable.o radix_tlb.o +obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o +obj-$(CONFIG_PPC_64K_PAGES) += hash_64k.o +obj-$(CONFIG_PPC_SPLPAR) += vphn.o +obj-$(CONFIG_HUGETLB_PAGE) += hash_hugetlbpage.o +ifdef CONFIG_HUGETLB_PAGE +obj-$(CONFIG_PPC_RADIX_MMU) += radix_hugetlbpage.o +endif +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hash_hugepage.o +obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage_prot.o +obj-$(CONFIG_SPAPR_TCE_IOMMU) += iommu_api.o +obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o + +# Instrumenting the SLB fault path can lead to duplicate SLB entries +KCOV_INSTRUMENT_slb.o := n diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/book3s64/hash_4k.c similarity index 98% rename from arch/powerpc/mm/hash64_4k.c rename to arch/powerpc/mm/book3s64/hash_4k.c index 6fa6765a10eb3..22e787123cdf8 100644 --- a/arch/powerpc/mm/hash64_4k.c +++ b/arch/powerpc/mm/book3s64/hash_4k.c @@ -1,6 +1,6 @@ /* * Copyright IBM Corporation, 2015 - * Author Aneesh Kumar K.V + * Author Aneesh Kumar K.V * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2 of the GNU Lesser General Public License diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/book3s64/hash_64k.c similarity index 99% rename from arch/powerpc/mm/hash64_64k.c rename to arch/powerpc/mm/book3s64/hash_64k.c index 3afa253d7f52a..7084ce2951e64 100644 --- 
a/arch/powerpc/mm/hash64_64k.c +++ b/arch/powerpc/mm/book3s64/hash_64k.c @@ -1,6 +1,6 @@ /* * Copyright IBM Corporation, 2015 - * Author Aneesh Kumar K.V + * Author Aneesh Kumar K.V * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2 of the GNU Lesser General Public License diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/book3s64/hash_hugepage.c similarity index 98% rename from arch/powerpc/mm/hugepage-hash64.c rename to arch/powerpc/mm/book3s64/hash_hugepage.c index dfbc3b32f09b8..440823797de73 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/book3s64/hash_hugepage.c @@ -1,6 +1,6 @@ /* * Copyright IBM Corporation, 2013 - * Author Aneesh Kumar K.V + * Author Aneesh Kumar K.V * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2.1 of the GNU Lesser General Public License diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c similarity index 93% rename from arch/powerpc/mm/hugetlbpage-hash64.c rename to arch/powerpc/mm/book3s64/hash_hugetlbpage.c index b0d9209d9a86f..2d4e02aa15a37 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c @@ -34,7 +34,8 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, /* Search the Linux page table for a match with va */ vpn = hpt_vpn(ea, vsid, ssize); - /* At this point, we have a pte (old_pte) which can be used to build + /* + * At this point, we have a pte (old_pte) which can be used to build * or update an HPTE. There are 2 cases: * * 1. There is a valid (present) pte with no associated HPTE (this is @@ -55,8 +56,10 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, if (unlikely(!check_pte_access(access, old_pte))) return 1; - /* Try to lock the PTE, add ACCESSED and DIRTY if it was - * a write access */ + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access + */ new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; @@ -74,8 +77,10 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, rpte = __real_pte(__pte(old_pte), ptep, offset); if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - /* No CPU has hugepages but lacks no execute, so we - * don't need to worry about that case */ + /* + * No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case + */ rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); /* Check if pte already has an hpte (case 2) */ diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/book3s64/hash_native.c similarity index 100% rename from arch/powerpc/mm/hash_native_64.c rename to arch/powerpc/mm/book3s64/hash_native.c diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/book3s64/hash_pgtable.c similarity index 100% rename from arch/powerpc/mm/pgtable-hash64.c rename to arch/powerpc/mm/book3s64/hash_pgtable.c diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/book3s64/hash_tlb.c similarity index 95% rename from arch/powerpc/mm/tlb_hash64.c rename to arch/powerpc/mm/book3s64/hash_tlb.c index 87d71dd254410..d4f0101447b1a 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/book3s64/hash_tlb.c @@ -55,7 +55,8 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, i = batch->index; - /* Get page size (maybe move back to caller). 
+ /* + * Get page size (maybe move back to caller). * * NOTE: when using special 64K mappings in 4K environment like * for SPEs, we obtain the page size from the slice, which thus @@ -77,10 +78,12 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, #endif } else { psize = pte_pagesize_index(mm, addr, pte); - /* Mask the address for the standard page size. If we + /* + * Mask the address for the standard page size. If we * have a 64k page kernel, but the hardware does not * support 64k pages, this might be different from the - * hardware page size encoded in the slice table. */ + * hardware page size encoded in the slice table. + */ addr &= PAGE_MASK; offset = PTRS_PER_PTE; } @@ -161,7 +164,8 @@ void hash__tlb_flush(struct mmu_gather *tlb) { struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch); - /* If there's a TLB batch pending, then we must flush it because the + /* + * If there's a TLB batch pending, then we must flush it because the * pages are going to be freed and we really don't want to have a CPU * access a freed page because it has a stale TLB */ @@ -201,7 +205,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, BUG_ON(!mm->pgd); - /* Note: Normally, we should only ever use a batch within a + /* + * Note: Normally, we should only ever use a batch within a * PTE locked section. This violates the rule, but will work * since we don't actually modify the PTEs, we just flush the * hash while leaving the PTEs intact (including their reference @@ -238,7 +243,8 @@ void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) unsigned long flags; addr = _ALIGN_DOWN(addr, PMD_SIZE); - /* Note: Normally, we should only ever use a batch within a + /* + * Note: Normally, we should only ever use a batch within a * PTE locked section. This violates the rule, but will work * since we don't actually modify the PTEs, we just flush the * hash while leaving the PTEs intact (including their reference diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/book3s64/hash_utils.c similarity index 98% rename from arch/powerpc/mm/hash_utils_64.c rename to arch/powerpc/mm/book3s64/hash_utils.c index 6eb89643ce583..b21a81d42f152 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -128,7 +128,8 @@ static DEFINE_SPINLOCK(linear_map_hash_lock); struct mmu_hash_ops mmu_hash_ops; EXPORT_SYMBOL(mmu_hash_ops); -/* There are definitions of page sizes arrays to be used when none +/* + * These are definitions of page sizes arrays to be used when none * is provided by the firmware. */ @@ -145,7 +146,8 @@ static struct mmu_psize_def mmu_psize_defaults[] = { }, }; -/* POWER4, GPUL, POWER5 +/* + * POWER4, GPUL, POWER5 * * Support for 16Mb large pages */ @@ -479,7 +481,8 @@ static int __init htab_dt_scan_page_sizes(unsigned long node, } #ifdef CONFIG_HUGETLB_PAGE -/* Scan for 16G memory blocks that have been set aside for huge pages +/* + * Scan for 16G memory blocks that have been set aside for huge pages * and reserve those blocks for 16G huge pages. */ static int __init htab_dt_scan_hugepage_blocks(unsigned long node, @@ -496,8 +499,10 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node, if (type == NULL || strcmp(type, "memory") != 0) return 0; - /* This property is the log base 2 of the number of virtual pages that - * will represent this memory block. */ + /* + * This property is the log base 2 of the number of virtual pages that + * will represent this memory block. 
+ */ page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL); if (page_count_prop == NULL) return 0; @@ -673,7 +678,8 @@ static void __init htab_init_page_sizes(void) #endif /* CONFIG_PPC_64K_PAGES */ #ifdef CONFIG_SPARSEMEM_VMEMMAP - /* We try to use 16M pages for vmemmap if that is supported + /* + * We try to use 16M pages for vmemmap if that is supported * and we have at least 1G of RAM at boot */ if (mmu_psize_defs[MMU_PAGE_16M].shift && @@ -742,7 +748,8 @@ unsigned htab_shift_for_mem_size(unsigned long mem_size) static unsigned long __init htab_get_table_size(void) { - /* If hash size isn't already provided by the platform, we try to + /* + * If hash size isn't already provided by the platform, we try to * retrieve it from the device-tree. If it's not there neither, we * calculate it now based on the total RAM size */ @@ -1043,7 +1050,8 @@ void __init hash__early_init_mmu(void) if (!mmu_hash_ops.hpte_insert) panic("hash__early_init_mmu: No MMU hash ops defined!\n"); - /* Initialize the MMU Hash table and create the linear mapping + /* + * Initialize the MMU Hash table and create the linear mapping * of memory. Has to be done before SLB initialization as this is * currently where the page size encoding is obtained. */ @@ -1228,7 +1236,8 @@ static void check_paca_psize(unsigned long ea, struct mm_struct *mm, } } -/* Result code is: +/* + * Result code is: * 0 - handled * 1 - normal page fault * -1 - critical hash insertion error @@ -1276,8 +1285,9 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, ssize = mmu_kernel_ssize; break; default: - /* Not a valid range - * Send the problem up to do_page_fault + /* + * Not a valid range + * Send the problem up to do_page_fault() */ rc = 1; goto bail; @@ -1302,7 +1312,8 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, flags |= HPTE_LOCAL_UPDATE; #ifndef CONFIG_PPC_64K_PAGES - /* If we use 4K pages and our psize is not 4K, then we might + /* + * If we use 4K pages and our psize is not 4K, then we might * be hitting a special driver mapping, and need to align the * address before we fetch the PTE. * @@ -1324,7 +1335,8 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, /* Add _PAGE_PRESENT to the required access perm */ access |= _PAGE_PRESENT; - /* Pre-check access permissions (will be re-checked atomically + /* + * Pre-check access permissions (will be re-checked atomically * in __hash_page_XX but this pre-check is a fast path */ if (!check_pte_access(access, pte_val(*ptep))) { @@ -1371,7 +1383,8 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, psize = MMU_PAGE_4K; } - /* If this PTE is non-cacheable and we have restrictions on + /* + * If this PTE is non-cacheable and we have restrictions on * using non cacheable large pages, then we switch to 4k */ if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) { @@ -1412,7 +1425,8 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, flags, ssize, spp); } - /* Dump some info in case of hash insertion failure, they should + /* + * Dump some info in case of hash insertion failure, they should * never happen so it is really useful to know if/when they do */ if (rc == -1) @@ -1653,7 +1667,8 @@ unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift, return gslot; } -/* WARNING: This is called from hash_low_64.S, if you change this prototype, +/* + * WARNING: This is called from hash_low_64.S, if you change this prototype, * do not forget to update the assembly call site ! 
*/ void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize, @@ -1874,7 +1889,8 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) { - /* We don't currently support the first MEMBLOCK not mapping 0 + /* + * We don't currently support the first MEMBLOCK not mapping 0 * physical on those processors */ BUG_ON(first_memblock_base != 0); diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/book3s64/iommu_api.c similarity index 100% rename from arch/powerpc/mm/mmu_context_iommu.c rename to arch/powerpc/mm/book3s64/iommu_api.c diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/book3s64/mmu_context.c similarity index 100% rename from arch/powerpc/mm/mmu_context_book3s64.c rename to arch/powerpc/mm/book3s64/mmu_context.c diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/book3s64/pgtable.c similarity index 100% rename from arch/powerpc/mm/pgtable-book3s64.c rename to arch/powerpc/mm/book3s64/pgtable.c diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c similarity index 100% rename from arch/powerpc/mm/pkeys.c rename to arch/powerpc/mm/book3s64/pkeys.c diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c similarity index 100% rename from arch/powerpc/mm/hugetlbpage-radix.c rename to arch/powerpc/mm/book3s64/radix_hugetlbpage.c diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/book3s64/radix_pgtable.c similarity index 96% rename from arch/powerpc/mm/pgtable-radix.c rename to arch/powerpc/mm/book3s64/radix_pgtable.c index fcb0169e2d324..c9bcf428dd2b3 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -681,7 +681,8 @@ void radix__mmu_cleanup_all(void) void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) { - /* We don't currently support the first MEMBLOCK not mapping 0 + /* + * We don't currently support the first MEMBLOCK not mapping 0 * physical on those processors */ BUG_ON(first_memblock_base != 0); @@ -1003,45 +1004,44 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) { - struct list_head *lh = (struct list_head *) pgtable; + struct list_head *lh = (struct list_head *) pgtable; - assert_spin_locked(pmd_lockptr(mm, pmdp)); + assert_spin_locked(pmd_lockptr(mm, pmdp)); - /* FIFO */ - if (!pmd_huge_pte(mm, pmdp)) - INIT_LIST_HEAD(lh); - else - list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); - pmd_huge_pte(mm, pmdp) = pgtable; + /* FIFO */ + if (!pmd_huge_pte(mm, pmdp)) + INIT_LIST_HEAD(lh); + else + list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); + pmd_huge_pte(mm, pmdp) = pgtable; } pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { - pte_t *ptep; - pgtable_t pgtable; - struct list_head *lh; - - assert_spin_locked(pmd_lockptr(mm, pmdp)); - - /* FIFO */ - pgtable = pmd_huge_pte(mm, pmdp); - lh = (struct list_head *) pgtable; - if (list_empty(lh)) - pmd_huge_pte(mm, pmdp) = NULL; - else { - pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; - list_del(lh); - } - ptep = (pte_t *) pgtable; - *ptep = __pte(0); - ptep++; - *ptep = __pte(0); - return pgtable; -} + pte_t *ptep; + pgtable_t pgtable; + struct list_head *lh; + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO 
*/ + pgtable = pmd_huge_pte(mm, pmdp); + lh = (struct list_head *) pgtable; + if (list_empty(lh)) + pmd_huge_pte(mm, pmdp) = NULL; + else { + pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; + list_del(lh); + } + ptep = (pte_t *) pgtable; + *ptep = __pte(0); + ptep++; + *ptep = __pte(0); + return pgtable; +} pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) + unsigned long addr, pmd_t *pmdp) { pmd_t old_pmd; unsigned long old; diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/book3s64/radix_tlb.c similarity index 100% rename from arch/powerpc/mm/tlb-radix.c rename to arch/powerpc/mm/book3s64/radix_tlb.c diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/book3s64/slb.c similarity index 99% rename from arch/powerpc/mm/slb.c rename to arch/powerpc/mm/book3s64/slb.c index 89e4531de64b4..c22742218bd3e 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -554,7 +554,8 @@ void slb_initialize(void) asm volatile("isync; slbia; isync":::"memory"); create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, LINEAR_INDEX); - /* For the boot cpu, we're running on the stack in init_thread_union, + /* + * For the boot cpu, we're running on the stack in init_thread_union, * which is in the first segment of the linear mapping, and also * get_paca()->kstack hasn't been initialized yet. * For secondary cpus, we need to bolt the kernel stack entry now. diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c similarity index 100% rename from arch/powerpc/mm/subpage-prot.c rename to arch/powerpc/mm/book3s64/subpage_prot.c diff --git a/arch/powerpc/mm/vphn.c b/arch/powerpc/mm/book3s64/vphn.c similarity index 94% rename from arch/powerpc/mm/vphn.c rename to arch/powerpc/mm/book3s64/vphn.c index f83044faac23d..0ee7734afb50b 100644 --- a/arch/powerpc/mm/vphn.c +++ b/arch/powerpc/mm/book3s64/vphn.c @@ -42,7 +42,8 @@ int vphn_unpack_associativity(const long *packed, __be32 *unpacked) u16 new = be16_to_cpup(field++); if (is_32bit) { - /* Let's concatenate the 16 bits of this field to the + /* + * Let's concatenate the 16 bits of this field to the * 15 lower bits of the previous field */ unpacked[++nr_assoc_doms] = @@ -56,7 +57,8 @@ int vphn_unpack_associativity(const long *packed, __be32 *unpacked) unpacked[++nr_assoc_doms] = cpu_to_be32(new & VPHN_FIELD_MASK); } else { - /* Data is in the lower 15 bits of this field + /* + * Data is in the lower 15 bits of this field * concatenated with the next 16 bit field */ last = new; diff --git a/arch/powerpc/mm/vphn.h b/arch/powerpc/mm/book3s64/vphn.h similarity index 98% rename from arch/powerpc/mm/vphn.h rename to arch/powerpc/mm/book3s64/vphn.h index f9ffdb3942fc9..f0b93c2dd5782 100644 --- a/arch/powerpc/mm/vphn.h +++ b/arch/powerpc/mm/book3s64/vphn.h @@ -2,8 +2,7 @@ #ifndef _ARCH_POWERPC_MM_VPHN_H_ #define _ARCH_POWERPC_MM_VPHN_H_ -/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers. - */ +/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers. 
*/ #define VPHN_REGISTER_COUNT 6 /* diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 6ef36d553cdee..57e64273cb33b 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1068,7 +1068,7 @@ u64 memory_hotplug_max(void) /* Virtual Processor Home Node (VPHN) support */ #ifdef CONFIG_PPC_SPLPAR -#include "vphn.h" +#include "book3s64/vphn.h" struct topology_update_data { struct topology_update_data *next; diff --git a/tools/testing/selftests/powerpc/vphn/vphn.c b/tools/testing/selftests/powerpc/vphn/vphn.c index 186b906e66d5a..1d1f5f2be3b29 120000 --- a/tools/testing/selftests/powerpc/vphn/vphn.c +++ b/tools/testing/selftests/powerpc/vphn/vphn.c @@ -1 +1 @@ -../../../../../arch/powerpc/mm/vphn.c \ No newline at end of file +../../../../../arch/powerpc/mm/book3s64/vphn.c \ No newline at end of file diff --git a/tools/testing/selftests/powerpc/vphn/vphn.h b/tools/testing/selftests/powerpc/vphn/vphn.h index 7131efe38c65f..45fe160f82881 120000 --- a/tools/testing/selftests/powerpc/vphn/vphn.h +++ b/tools/testing/selftests/powerpc/vphn/vphn.h @@ -1 +1 @@ -../../../../../arch/powerpc/mm/vphn.h \ No newline at end of file +../../../../../arch/powerpc/mm/book3s64/vphn.h \ No newline at end of file From 17312f258cf6eb584f276ad592972ade7e16e318 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 29 Mar 2019 10:00:01 +0000 Subject: [PATCH 080/205] powerpc/mm: Move book3s32 specifics in subdirectory mm/book3s32 Several files in arch/powerpc/mm are only for book3s32. This patch creates a subdirectory for them. Signed-off-by: Christophe Leroy [mpe: Shorten new filenames] Signed-off-by: Michael Ellerman --- arch/powerpc/mm/Makefile | 3 +-- arch/powerpc/mm/book3s32/Makefile | 3 +++ arch/powerpc/mm/{hash_low_32.S => book3s32/hash_low.S} | 0 arch/powerpc/mm/{ppc_mmu_32.c => book3s32/mmu.c} | 0 .../mm/{mmu_context_hash32.c => book3s32/mmu_context.c} | 0 arch/powerpc/mm/{tlb_hash32.c => book3s32/tlb.c} | 0 6 files changed, 4 insertions(+), 2 deletions(-) create mode 100644 arch/powerpc/mm/book3s32/Makefile rename arch/powerpc/mm/{hash_low_32.S => book3s32/hash_low.S} (100%) rename arch/powerpc/mm/{ppc_mmu_32.c => book3s32/mmu.c} (100%) rename arch/powerpc/mm/{mmu_context_hash32.c => book3s32/mmu_context.c} (100%) rename arch/powerpc/mm/{tlb_hash32.c => book3s32/tlb.c} (100%) diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index a137fdf775e25..68cb1e840b5e9 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -12,11 +12,10 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ tlb_nohash_low.o obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o +obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-frag.o obj-$(CONFIG_PPC32) += pgtable-frag.o -obj-$(CONFIG_PPC_BOOK3S_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o -obj-$(CONFIG_PPC_BOOK3S_32) += tlb_hash32.o obj-$(CONFIG_40x) += 40x_mmu.o obj-$(CONFIG_44x) += 44x_mmu.o obj-$(CONFIG_PPC_8xx) += 8xx_mmu.o diff --git a/arch/powerpc/mm/book3s32/Makefile b/arch/powerpc/mm/book3s32/Makefile new file mode 100644 index 0000000000000..a4e217d0f3b73 --- /dev/null +++ b/arch/powerpc/mm/book3s32/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y += mmu.o hash_low.o mmu_context.o tlb.o diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/book3s32/hash_low.S similarity index 100% rename from arch/powerpc/mm/hash_low_32.S rename to
arch/powerpc/mm/book3s32/hash_low.S diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/book3s32/mmu.c similarity index 100% rename from arch/powerpc/mm/ppc_mmu_32.c rename to arch/powerpc/mm/book3s32/mmu.c diff --git a/arch/powerpc/mm/mmu_context_hash32.c b/arch/powerpc/mm/book3s32/mmu_context.c similarity index 100% rename from arch/powerpc/mm/mmu_context_hash32.c rename to arch/powerpc/mm/book3s32/mmu_context.c diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/book3s32/tlb.c similarity index 100% rename from arch/powerpc/mm/tlb_hash32.c rename to arch/powerpc/mm/book3s32/tlb.c From 27e23b5f5f6f22292347901303aab2a1d458bcb5 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 29 Mar 2019 10:00:02 +0000 Subject: [PATCH 081/205] powerpc/mm: Move nohash specifics in subdirectory mm/nohash Many files in arch/powerpc/mm are only for nohash. This patch creates a subdirectory for them. Signed-off-by: Christophe Leroy [mpe: Shorten new filenames] Signed-off-by: Michael Ellerman --- arch/powerpc/mm/Makefile | 17 +---------------- arch/powerpc/mm/{40x_mmu.c => nohash/40x.c} | 0 arch/powerpc/mm/{44x_mmu.c => nohash/44x.c} | 0 arch/powerpc/mm/{8xx_mmu.c => nohash/8xx.c} | 0 arch/powerpc/mm/nohash/Makefile | 18 ++++++++++++++++++ .../book3e_hugetlbpage.c} | 0 .../book3e_pgtable.c} | 0 .../mm/{fsl_booke_mmu.c => nohash/fsl_booke.c} | 0 .../mmu_context.c} | 0 arch/powerpc/mm/{tlb_nohash.c => nohash/tlb.c} | 0 .../mm/{tlb_nohash_low.S => nohash/tlb_low.S} | 0 arch/powerpc/mm/{ => nohash}/tlb_low_64e.S | 0 12 files changed, 19 insertions(+), 16 deletions(-) rename arch/powerpc/mm/{40x_mmu.c => nohash/40x.c} (100%) rename arch/powerpc/mm/{44x_mmu.c => nohash/44x.c} (100%) rename arch/powerpc/mm/{8xx_mmu.c => nohash/8xx.c} (100%) create mode 100644 arch/powerpc/mm/nohash/Makefile rename arch/powerpc/mm/{hugetlbpage-book3e.c => nohash/book3e_hugetlbpage.c} (100%) rename arch/powerpc/mm/{pgtable-book3e.c => nohash/book3e_pgtable.c} (100%) rename arch/powerpc/mm/{fsl_booke_mmu.c => nohash/fsl_booke.c} (100%) rename arch/powerpc/mm/{mmu_context_nohash.c => nohash/mmu_context.c} (100%) rename arch/powerpc/mm/{tlb_nohash.c => nohash/tlb.c} (100%) rename arch/powerpc/mm/{tlb_nohash_low.S => nohash/tlb_low.S} (100%) rename arch/powerpc/mm/{ => nohash}/tlb_low_64e.S (100%) diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 68cb1e840b5e9..08557bae6fa16 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -8,30 +8,15 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ init-common.o mmu_context.o drmem.o -obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ - tlb_nohash_low.o -obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o -obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o +obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/ obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-frag.o obj-$(CONFIG_PPC32) += pgtable-frag.o -obj-$(CONFIG_40x) += 40x_mmu.o -obj-$(CONFIG_44x) += 44x_mmu.o -obj-$(CONFIG_PPC_8xx) += 8xx_mmu.o -obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-y += hugetlbpage.o -ifdef CONFIG_HUGETLB_PAGE -obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o -endif obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o obj-$(CONFIG_PPC_PTDUMP) += 
ptdump/ - -# Disable kcov instrumentation on sensitive code -# This is necessary for booting with kcov enabled on book3e machines -KCOV_INSTRUMENT_tlb_nohash.o := n -KCOV_INSTRUMENT_fsl_booke_mmu.o := n diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/nohash/40x.c similarity index 100% rename from arch/powerpc/mm/40x_mmu.c rename to arch/powerpc/mm/nohash/40x.c diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/nohash/44x.c similarity index 100% rename from arch/powerpc/mm/44x_mmu.c rename to arch/powerpc/mm/nohash/44x.c diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/nohash/8xx.c similarity index 100% rename from arch/powerpc/mm/8xx_mmu.c rename to arch/powerpc/mm/nohash/8xx.c diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile new file mode 100644 index 0000000000000..b2228ff81b8af --- /dev/null +++ b/arch/powerpc/mm/nohash/Makefile @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) + +obj-y += mmu_context.o tlb.o tlb_low.o +obj-$(CONFIG_PPC_BOOK3E_64) += tlb_low_64e.o book3e_pgtable.o +obj-$(CONFIG_40x) += 40x.o +obj-$(CONFIG_44x) += 44x.o +obj-$(CONFIG_PPC_8xx) += 8xx.o +obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke.o +ifdef CONFIG_HUGETLB_PAGE +obj-$(CONFIG_PPC_BOOK3E_MMU) += book3e_hugetlbpage.o +endif + +# Disable kcov instrumentation on sensitive code +# This is necessary for booting with kcov enabled on book3e machines +KCOV_INSTRUMENT_tlb.o := n +KCOV_INSTRUMENT_fsl_booke.o := n diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c similarity index 100% rename from arch/powerpc/mm/hugetlbpage-book3e.c rename to arch/powerpc/mm/nohash/book3e_hugetlbpage.c diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/nohash/book3e_pgtable.c similarity index 100% rename from arch/powerpc/mm/pgtable-book3e.c rename to arch/powerpc/mm/nohash/book3e_pgtable.c diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/nohash/fsl_booke.c similarity index 100% rename from arch/powerpc/mm/fsl_booke_mmu.c rename to arch/powerpc/mm/nohash/fsl_booke.c diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/nohash/mmu_context.c similarity index 100% rename from arch/powerpc/mm/mmu_context_nohash.c rename to arch/powerpc/mm/nohash/mmu_context.c diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/nohash/tlb.c similarity index 100% rename from arch/powerpc/mm/tlb_nohash.c rename to arch/powerpc/mm/nohash/tlb.c diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/nohash/tlb_low.S similarity index 100% rename from arch/powerpc/mm/tlb_nohash_low.S rename to arch/powerpc/mm/nohash/tlb_low.S diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S similarity index 100% rename from arch/powerpc/mm/tlb_low_64e.S rename to arch/powerpc/mm/nohash/tlb_low_64e.S From 5ba666d56c4ff9b011c1b029dcc689cff8b176fb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:27 +0000 Subject: [PATCH 082/205] powerpc/mm: fix erroneous duplicate slb_addr_limit init Commit 67fda38f0d68 ("powerpc/mm: Move slb_addr_linit to early_init_mmu") moved slb_addr_limit init out of setup_arch(). Commit 701101865f5d ("powerpc/mm: Reduce memory usage for mm_context_t for radix") brought it back into setup_arch() by error. This patch reverts that erroneous regress. 
Fixes: 701101865f5d ("powerpc/mm: Reduce memory usage for mm_context_t for radix") Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 70dc10aa0ccf3..19d68a9b5f372 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -950,12 +950,6 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = klimit; -#ifdef CONFIG_PPC_MM_SLICES -#if defined(CONFIG_PPC_8xx) - init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW; -#endif -#endif - #ifdef CONFIG_SPAPR_TCE_IOMMU mm_iommu_init(&init_mm); #endif From 02f89aed6b829d73980bb633d9f4e3de9eb45543 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:28 +0000 Subject: [PATCH 083/205] powerpc/mm: no slice for nohash/64 Only nohash/32 and book3s/64 support mm slices. Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/64/slice.h | 12 ------------ arch/powerpc/include/asm/slice.h | 4 +--- arch/powerpc/platforms/Kconfig.cputype | 4 ++++ 3 files changed, 5 insertions(+), 15 deletions(-) delete mode 100644 arch/powerpc/include/asm/nohash/64/slice.h diff --git a/arch/powerpc/include/asm/nohash/64/slice.h b/arch/powerpc/include/asm/nohash/64/slice.h deleted file mode 100644 index ad0d6e3cc1c5c..0000000000000 --- a/arch/powerpc/include/asm/nohash/64/slice.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_POWERPC_NOHASH_64_SLICE_H -#define _ASM_POWERPC_NOHASH_64_SLICE_H - -#ifdef CONFIG_PPC_64K_PAGES -#define get_slice_psize(mm, addr) MMU_PAGE_64K -#else /* CONFIG_PPC_64K_PAGES */ -#define get_slice_psize(mm, addr) MMU_PAGE_4K -#endif /* !CONFIG_PPC_64K_PAGES */ -#define slice_set_user_psize(mm, psize) do { BUG(); } while (0) - -#endif /* _ASM_POWERPC_NOHASH_64_SLICE_H */ diff --git a/arch/powerpc/include/asm/slice.h b/arch/powerpc/include/asm/slice.h index 44816cbc4198e..be8af667098fc 100644 --- a/arch/powerpc/include/asm/slice.h +++ b/arch/powerpc/include/asm/slice.h @@ -4,9 +4,7 @@ #ifdef CONFIG_PPC_BOOK3S_64 #include -#elif defined(CONFIG_PPC64) -#include -#elif defined(CONFIG_PPC_MMU_NOHASH) +#elif defined(CONFIG_PPC_MMU_NOHASH_32) #include #endif diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 60a7c7095b057..cd28b045c0f3c 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -391,6 +391,10 @@ config PPC_MMU_NOHASH def_bool y depends on !PPC_BOOK3S +config PPC_MMU_NOHASH_32 + def_bool y + depends on PPC_MMU_NOHASH && PPC32 + config PPC_BOOK3E_MMU def_bool y depends on FSL_BOOKE || PPC_BOOK3E From 6f60cc98df2be7f082bd786aa824ceabd24d24cb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:29 +0000 Subject: [PATCH 084/205] powerpc/mm: hand a context_t over to slice_mask_for_size() instead of mm_struct slice_mask_for_size() only uses mm->context, so hand directly a pointer to the context. 
This will help moving the function in subarch mmu.h in the next patch by avoiding having to include the definition of struct mm_struct Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slice.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 35b2780823916..8eb7e8b09c759 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -151,32 +151,32 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret, } #ifdef CONFIG_PPC_BOOK3S_64 -static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize) +static struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) { #ifdef CONFIG_PPC_64K_PAGES if (psize == MMU_PAGE_64K) - return mm_ctx_slice_mask_64k(&mm->context); + return mm_ctx_slice_mask_64k(ctx); #endif if (psize == MMU_PAGE_4K) - return mm_ctx_slice_mask_4k(&mm->context); + return mm_ctx_slice_mask_4k(ctx); #ifdef CONFIG_HUGETLB_PAGE if (psize == MMU_PAGE_16M) - return mm_ctx_slice_mask_16m(&mm->context); + return mm_ctx_slice_mask_16m(ctx); if (psize == MMU_PAGE_16G) - return mm_ctx_slice_mask_16g(&mm->context); + return mm_ctx_slice_mask_16g(ctx); #endif BUG(); } #elif defined(CONFIG_PPC_8xx) -static struct slice_mask *slice_mask_for_size(struct mm_struct *mm, int psize) +static struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) { if (psize == mmu_virtual_psize) - return &mm->context.mask_base_psize; + return &ctx->mask_base_psize; #ifdef CONFIG_HUGETLB_PAGE if (psize == MMU_PAGE_512K) - return &mm->context.mask_512k; + return &ctx->mask_512k; if (psize == MMU_PAGE_8M) - return &mm->context.mask_8m; + return &ctx->mask_8m; #endif BUG(); } @@ -246,7 +246,7 @@ static void slice_convert(struct mm_struct *mm, slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize); slice_print_mask(" mask", mask); - psize_mask = slice_mask_for_size(mm, psize); + psize_mask = slice_mask_for_size(&mm->context, psize); /* We need to use a spinlock here to protect against * concurrent 64k -> 4k demotion ... @@ -263,7 +263,7 @@ static void slice_convert(struct mm_struct *mm, /* Update the slice_mask */ old_psize = (lpsizes[index] >> (mask_index * 4)) & 0xf; - old_mask = slice_mask_for_size(mm, old_psize); + old_mask = slice_mask_for_size(&mm->context, old_psize); old_mask->low_slices &= ~(1u << i); psize_mask->low_slices |= 1u << i; @@ -282,7 +282,7 @@ static void slice_convert(struct mm_struct *mm, /* Update the slice_mask */ old_psize = (hpsizes[index] >> (mask_index * 4)) & 0xf; - old_mask = slice_mask_for_size(mm, old_psize); + old_mask = slice_mask_for_size(&mm->context, old_psize); __clear_bit(i, old_mask->high_slices); __set_bit(i, psize_mask->high_slices); @@ -538,7 +538,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* First make up a "good" mask of slices that have the right size * already */ - maskp = slice_mask_for_size(mm, psize); + maskp = slice_mask_for_size(&mm->context, psize); /* * Here "good" means slices that are already the right page size, * @@ -565,7 +565,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, * a pointer to good mask for the next code to use.
*/ if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) { - compat_maskp = slice_mask_for_size(mm, MMU_PAGE_4K); + compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K); if (fixed) slice_or_mask(&good_mask, maskp, compat_maskp); else @@ -760,7 +760,7 @@ void slice_init_new_context_exec(struct mm_struct *mm) /* * Slice mask cache starts zeroed, fill the default size cache. */ - mask = slice_mask_for_size(mm, psize); + mask = slice_mask_for_size(&mm->context, psize); mask->low_slices = ~0UL; if (SLICE_NUM_HIGH) bitmap_fill(mask->high_slices, SLICE_NUM_HIGH); @@ -819,14 +819,14 @@ int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, VM_BUG_ON(radix_enabled()); - maskp = slice_mask_for_size(mm, psize); + maskp = slice_mask_for_size(&mm->context, psize); #ifdef CONFIG_PPC_64K_PAGES /* We need to account for 4k slices too */ if (psize == MMU_PAGE_64K) { const struct slice_mask *compat_maskp; struct slice_mask available; - compat_maskp = slice_mask_for_size(mm, MMU_PAGE_4K); + compat_maskp = slice_mask_for_size(&mm->context, MMU_PAGE_4K); slice_or_mask(&available, maskp, compat_maskp); return !slice_check_range_fits(mm, &available, addr, len); } From fca5c1e9eb5e263c1b4def0b5ae4ce5b2e1a9877 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:30 +0000 Subject: [PATCH 085/205] powerpc/mm: move slice_mask_for_size() into mmu.h Move slice_mask_for_size() into subarch mmu.h Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V [mpe: Retain the BUG_ON()s, rather than converting to VM_BUG_ON()] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu.h | 17 ++++++++ arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 42 ++++++++++++++------ arch/powerpc/mm/slice.c | 34 ---------------- 3 files changed, 46 insertions(+), 47 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 230a9dec76774..a6d5b5ed11709 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -203,6 +203,23 @@ static inline struct slice_mask *mm_ctx_slice_mask_16g(mm_context_t *ctx) } #endif +static inline struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) +{ +#ifdef CONFIG_PPC_64K_PAGES + if (psize == MMU_PAGE_64K) + return mm_ctx_slice_mask_64k(ctx); +#endif +#ifdef CONFIG_HUGETLB_PAGE + if (psize == MMU_PAGE_16M) + return mm_ctx_slice_mask_16m(ctx); + if (psize == MMU_PAGE_16G) + return mm_ctx_slice_mask_16g(ctx); +#endif + BUG_ON(psize != MMU_PAGE_4K); + + return mm_ctx_slice_mask_4k(ctx); +} + #ifdef CONFIG_PPC_SUBPAGE_PROT static inline struct subpage_prot_table *mm_ctx_subpage_prot(mm_context_t *ctx) { diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index c503e2f05e611..114f50d995dc8 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -184,7 +184,23 @@ #define LOW_SLICE_ARRAY_SZ SLICE_ARRAY_SIZE #endif +#if defined(CONFIG_PPC_4K_PAGES) +#define mmu_virtual_psize MMU_PAGE_4K +#elif defined(CONFIG_PPC_16K_PAGES) +#define mmu_virtual_psize MMU_PAGE_16K +#define PTE_FRAG_NR 4 +#define PTE_FRAG_SIZE_SHIFT 12 +#define PTE_FRAG_SIZE (1UL << 12) +#else +#error "Unsupported PAGE_SIZE" +#endif + +#define mmu_linear_psize MMU_PAGE_8M + #ifndef __ASSEMBLY__ + +#include + struct slice_mask { u64 low_slices; DECLARE_BITMAP(high_slices, 0); @@ -255,6 +271,19 @@ static inline struct slice_mask
*mm_ctx_slice_mask_8m(mm_context_t *ctx) return &ctx->mask_8m; } #endif + +static inline struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) +{ +#ifdef CONFIG_HUGETLB_PAGE + if (psize == MMU_PAGE_512K) + return &ctx->mask_512k; + if (psize == MMU_PAGE_8M) + return &ctx->mask_8m; +#endif + BUG_ON(psize != mmu_virtual_psize); + + return &ctx->mask_base_psize; +} #endif /* CONFIG_PPC_MM_SLICE */ #define PHYS_IMMR_BASE (mfspr(SPRN_IMMR) & 0xfff80000) @@ -306,17 +335,4 @@ extern s32 patch__itlbmiss_perf, patch__dtlbmiss_perf; #endif /* !__ASSEMBLY__ */ -#if defined(CONFIG_PPC_4K_PAGES) -#define mmu_virtual_psize MMU_PAGE_4K -#elif defined(CONFIG_PPC_16K_PAGES) -#define mmu_virtual_psize MMU_PAGE_16K -#define PTE_FRAG_NR 4 -#define PTE_FRAG_SIZE_SHIFT 12 -#define PTE_FRAG_SIZE (1UL << 12) -#else -#error "Unsupported PAGE_SIZE" -#endif - -#define mmu_linear_psize MMU_PAGE_8M - #endif /* _ASM_POWERPC_MMU_8XX_H_ */ diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 8eb7e8b09c759..31de91b65a646 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -150,40 +150,6 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret, __set_bit(i, ret->high_slices); } -#ifdef CONFIG_PPC_BOOK3S_64 -static struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) -{ -#ifdef CONFIG_PPC_64K_PAGES - if (psize == MMU_PAGE_64K) - return mm_ctx_slice_mask_64k(ctx); -#endif - if (psize == MMU_PAGE_4K) - return mm_ctx_slice_mask_4k(ctx); -#ifdef CONFIG_HUGETLB_PAGE - if (psize == MMU_PAGE_16M) - return mm_ctx_slice_mask_16m(ctx); - if (psize == MMU_PAGE_16G) - return mm_ctx_slice_mask_16g(ctx); -#endif - BUG(); -} -#elif defined(CONFIG_PPC_8xx) -static struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) -{ - if (psize == mmu_virtual_psize) - return &ctx->mask_base_psize; -#ifdef CONFIG_HUGETLB_PAGE - if (psize == MMU_PAGE_512K) - return &ctx->mask_512k; - if (psize == MMU_PAGE_8M) - return &ctx->mask_8m; -#endif - BUG(); -} -#else -#error "Must define the slice masks for page sizes supported by the platform" -#endif - static bool slice_check_range_fits(struct mm_struct *mm, const struct slice_mask *available, unsigned long start, unsigned long len) From 877461210ea1c92f159bf261924e58d7d27edadc Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:31 +0000 Subject: [PATCH 086/205] powerpc/mm: get rid of mm_ctx_slice_mask_xxx() Now that slice_mask_for_size() is in mmu.h, the mm_ctx_slice_mask_xxx() are not needed anymore, so drop them. Note that the 8xx ones were not used anyway.
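For illustration, a minimal stand-alone sketch of the call pattern that patches 084-086 converge on (the types, page-size constants and mask set below are simplified placeholders, not the kernel's real mm_context_t or slice masks):

	/*
	 * Illustrative model only: the context, not the mm_struct, is
	 * handed to slice_mask_for_size(), so a header defining it never
	 * needs the full definition of struct mm_struct.
	 */
	#include <assert.h>

	struct slice_mask { unsigned long long low_slices; };

	typedef struct {
		struct slice_mask mask_base_psize;
		struct slice_mask mask_8m;
	} mm_context_t;

	struct mm_struct { mm_context_t context; };

	enum { MMU_PAGE_BASE, MMU_PAGE_8M };

	static struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize)
	{
		if (psize == MMU_PAGE_8M)
			return &ctx->mask_8m;
		assert(psize == MMU_PAGE_BASE);	/* stands in for the kernel's BUG_ON() */
		return &ctx->mask_base_psize;
	}

	int main(void)
	{
		struct mm_struct mm = { { { 0 }, { 0 } } };

		/* Callers hand over &mm->context instead of the whole mm. */
		slice_mask_for_size(&mm.context, MMU_PAGE_8M)->low_slices |= 1;
		return (int)mm.context.mask_8m.low_slices - 1;	/* 0 on success */
	}

Since slice_mask_for_size() now only dereferences the context, the subarch mmu.h headers can define it without pulling in struct mm_struct, which is what motivated handing over mm_context_t in the first place.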
Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu.h | 32 +++----------------- arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 17 ----------- 2 files changed, 4 insertions(+), 45 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index a6d5b5ed11709..51b2d60efc1ba 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -179,45 +179,21 @@ static inline void mm_ctx_set_slb_addr_limit(mm_context_t *ctx, unsigned long li ctx->hash_context->slb_addr_limit = limit; } -#ifdef CONFIG_PPC_64K_PAGES -static inline struct slice_mask *mm_ctx_slice_mask_64k(mm_context_t *ctx) -{ - return &ctx->hash_context->mask_64k; -} -#endif - -static inline struct slice_mask *mm_ctx_slice_mask_4k(mm_context_t *ctx) -{ - return &ctx->hash_context->mask_4k; -} - -#ifdef CONFIG_HUGETLB_PAGE -static inline struct slice_mask *mm_ctx_slice_mask_16m(mm_context_t *ctx) -{ - return &ctx->hash_context->mask_16m; -} - -static inline struct slice_mask *mm_ctx_slice_mask_16g(mm_context_t *ctx) -{ - return &ctx->hash_context->mask_16g; -} -#endif - static inline struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) { #ifdef CONFIG_PPC_64K_PAGES if (psize == MMU_PAGE_64K) - return mm_ctx_slice_mask_64k(ctx); + return &ctx->hash_context->mask_64k; #endif #ifdef CONFIG_HUGETLB_PAGE if (psize == MMU_PAGE_16M) - return mm_ctx_slice_mask_16m(ctx); + return &ctx->hash_context->mask_16m; if (psize == MMU_PAGE_16G) - return mm_ctx_slice_mask_16g(ctx); + return &ctx->hash_context->mask_16g; #endif BUG_ON(psize != MMU_PAGE_4K); - return mm_ctx_slice_mask_4k(ctx); + return &ctx->hash_context->mask_4k; } #ifdef CONFIG_PPC_SUBPAGE_PROT diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index 114f50d995dc8..77ccf7cb6fcc9 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -255,23 +255,6 @@ static inline void mm_ctx_set_slb_addr_limit(mm_context_t *ctx, unsigned long li ctx->slb_addr_limit = limit; } -static inline struct slice_mask *mm_ctx_slice_mask_base(mm_context_t *ctx) -{ - return &ctx->mask_base_psize; -} - -#ifdef CONFIG_HUGETLB_PAGE -static inline struct slice_mask *mm_ctx_slice_mask_512k(mm_context_t *ctx) -{ - return &ctx->mask_512k; -} - -static inline struct slice_mask *mm_ctx_slice_mask_8m(mm_context_t *ctx) -{ - return &ctx->mask_8m; -} -#endif - static inline struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) { #ifdef CONFIG_HUGETLB_PAGE From b4baad0b2712471740c58a1bc9578ab057af7514 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:32 +0000 Subject: [PATCH 087/205] powerpc/mm: remove unnecessary #ifdef CONFIG_PPC64 For PPC32 that's a noop, gcc should be smart enough to ignore it.
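The constant folding relied on here can be checked in isolation with a stand-alone sketch (the 4GB constant mirrors SLICE_LOW_TOP but is hard-coded below rather than taken from a kernel header):

	/*
	 * Sketch only: SLICE_LOW_TOP is a compile-time constant, so the
	 * comparison and the assignment fold away on 32-bit builds.
	 */
	#include <stdio.h>

	#define SLICE_LOW_TOP	0x100000000ull	/* first high slice starts at 4GB */

	static unsigned long slice_start_fixup(unsigned long start)
	{
		if (start == 0)
			start = (unsigned long)SLICE_LOW_TOP;	/* still 0 when long is 32-bit */
		return start;
	}

	int main(void)
	{
		printf("%#lx\n", slice_start_fixup(0));	/* 0 on 32-bit, 0x100000000 on 64-bit */
		return 0;
	}

On a 32-bit build the cast truncates 0x100000000 to 0, so the assignment stores the value start already has and gcc drops the branch entirely; on 64-bit it behaves exactly like the old #ifdef CONFIG_PPC64 version.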
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slice.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 31de91b65a646..840c4118a1851 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -118,13 +118,11 @@ static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice) unsigned long start = slice << SLICE_HIGH_SHIFT; unsigned long end = start + (1ul << SLICE_HIGH_SHIFT); -#ifdef CONFIG_PPC64 /* Hack, so that each addresses is controlled by exactly one * of the high or low area bitmaps, the first high area starts * at 4GB, not 0 */ if (start == 0) - start = SLICE_LOW_TOP; -#endif + start = (unsigned long)SLICE_LOW_TOP; return !slice_area_is_free(mm, start, end - start); } From 203a1fa6286671900698485ddffbb435901aa75b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:33 +0000 Subject: [PATCH 088/205] powerpc/mm: remove a couple of #ifdef CONFIG_PPC_64K_PAGES in mm/slice.c This patch replaces a couple of #ifdef CONFIG_PPC_64K_PAGES by IS_ENABLED(CONFIG_PPC_64K_PAGES) to improve code maintainability. Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slice.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 840c4118a1851..ace97d9530403 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -606,14 +606,13 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, newaddr = slice_find_area(mm, len, &potential_mask, psize, topdown, high_limit); -#ifdef CONFIG_PPC_64K_PAGES - if (newaddr == -ENOMEM && psize == MMU_PAGE_64K) { + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && newaddr == -ENOMEM && + psize == MMU_PAGE_64K) { /* retry the search with 4k-page slices included */ slice_or_mask(&potential_mask, &potential_mask, compat_maskp); newaddr = slice_find_area(mm, len, &potential_mask, psize, topdown, high_limit); } -#endif if (newaddr == -ENOMEM) return -ENOMEM; @@ -784,9 +783,9 @@ int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, VM_BUG_ON(radix_enabled()); maskp = slice_mask_for_size(&mm->context, psize); -#ifdef CONFIG_PPC_64K_PAGES + /* We need to account for 4k slices too */ - if (psize == MMU_PAGE_64K) { + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && psize == MMU_PAGE_64K) { const struct slice_mask *compat_maskp; struct slice_mask available; @@ -794,7 +793,6 @@ int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, slice_or_mask(&available, maskp, compat_maskp); return !slice_check_range_fits(mm, &available, addr, len); } -#endif return !slice_check_range_fits(mm, maskp, addr, len); } From 33f128c64919736164e70eb024da3ae5e5768cd6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:34 +0000 Subject: [PATCH 089/205] powerpc/8xx: get rid of #ifdef CONFIG_HUGETLB_PAGE for slices The 8xx only selects CONFIG_PPC_MM_SLICES when CONFIG_HUGETLB_PAGE is set. 
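That Kconfig invariant can be expressed as a build-time check (an illustrative sketch only; the actual rule lives in arch/powerpc/platforms/Kconfig.cputype and its exact form may differ):

	/*
	 * Sketch: on the 8xx, slice code is only compiled in together with
	 * hugetlb support, so an #ifdef CONFIG_HUGETLB_PAGE inside 8xx
	 * slice code can never be false and adds nothing.
	 */
	#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_MM_SLICES) && \
		!defined(CONFIG_HUGETLB_PAGE)
	#error "unreachable: 8xx selects CONFIG_PPC_MM_SLICES only with hugetlb"
	#endif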
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index 77ccf7cb6fcc9..76af5b0cb16ec 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -216,10 +216,8 @@ typedef struct { unsigned char high_slices_psize[0]; unsigned long slb_addr_limit; struct slice_mask mask_base_psize; /* 4k or 16k */ -# ifdef CONFIG_HUGETLB_PAGE struct slice_mask mask_512k; struct slice_mask mask_8m; -# endif #endif void *pte_frag; } mm_context_t; @@ -257,12 +255,11 @@ static inline void mm_ctx_set_slb_addr_limit(mm_context_t *ctx, unsigned long li static inline struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize) { -#ifdef CONFIG_HUGETLB_PAGE if (psize == MMU_PAGE_512K) return &ctx->mask_512k; if (psize == MMU_PAGE_8M) return &ctx->mask_8m; -#endif + BUG_ON(psize != mmu_virtual_psize); return &ctx->mask_base_psize; From 43ed7909d70a61c621cadb5d808dc392ad537e5a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:35 +0000 Subject: [PATCH 090/205] powerpc/mm: define get_slice_psize() all the time get_slice_psize() can be defined regardless of CONFIG_PPC_MM_SLICES to avoid ifdefs Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/slice.h | 5 +++++ arch/powerpc/mm/hugetlbpage.c | 4 +--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/slice.h b/arch/powerpc/include/asm/slice.h index be8af667098fc..c6f466f4c2410 100644 --- a/arch/powerpc/include/asm/slice.h +++ b/arch/powerpc/include/asm/slice.h @@ -36,6 +36,11 @@ void slice_setup_new_exec(void); static inline void slice_init_new_context_exec(struct mm_struct *mm) {} +static inline unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) +{ + return 0; +} + #endif /* CONFIG_PPC_MM_SLICES */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 9e732bb2c84ae..5f67e7a4d1cc6 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -578,14 +578,12 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { -#ifdef CONFIG_PPC_MM_SLICES /* With radix we don't use slice, so derive it from vma*/ - if (!radix_enabled()) { + if (IS_ENABLED(CONFIG_PPC_MM_SLICES) && !radix_enabled()) { unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); return 1UL << mmu_psize_to_shift(psize); } -#endif return vma_kernel_pagesize(vma); } From 5953fb4f4671d7d755a81017a76766c00922d059 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Apr 2019 14:29:36 +0000 Subject: [PATCH 091/205] powerpc/mm: define subarch SLB_ADDR_LIMIT_DEFAULT This patch defines a subarch specific SLB_ADDR_LIMIT_DEFAULT to remove the #ifdefs around the setup of mm->context.slb_addr_limit It also generalises the use of mm_ctx_set_slb_addr_limit() helper. 
Signed-off-by: Christophe Leroy Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/slice.h | 2 ++ arch/powerpc/include/asm/nohash/32/slice.h | 2 ++ arch/powerpc/mm/book3s64/hash_utils.c | 2 +- arch/powerpc/mm/nohash/tlb.c | 4 +--- arch/powerpc/mm/slice.c | 6 +----- 5 files changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/slice.h b/arch/powerpc/include/asm/book3s/64/slice.h index 062e11136e9c4..f0d3194ba41bd 100644 --- a/arch/powerpc/include/asm/book3s/64/slice.h +++ b/arch/powerpc/include/asm/book3s/64/slice.h @@ -11,4 +11,6 @@ #define SLICE_NUM_HIGH (H_PGTABLE_RANGE >> SLICE_HIGH_SHIFT) #define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT) +#define SLB_ADDR_LIMIT_DEFAULT DEFAULT_MAP_WINDOW_USER64 + #endif /* _ASM_POWERPC_BOOK3S_64_SLICE_H */ diff --git a/arch/powerpc/include/asm/nohash/32/slice.h b/arch/powerpc/include/asm/nohash/32/slice.h index 777d62e40ac04..39eb0154ae2d2 100644 --- a/arch/powerpc/include/asm/nohash/32/slice.h +++ b/arch/powerpc/include/asm/nohash/32/slice.h @@ -13,6 +13,8 @@ #define SLICE_NUM_HIGH 0ul #define GET_HIGH_SLICE_INDEX(addr) (addr & 0) +#define SLB_ADDR_LIMIT_DEFAULT DEFAULT_MAP_WINDOW + #endif /* CONFIG_PPC_MM_SLICES */ #endif /* _ASM_POWERPC_NOHASH_32_SLICE_H */ diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index b21a81d42f152..23ed8db645adf 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1058,7 +1058,7 @@ void __init hash__early_init_mmu(void) htab_initialize(); init_mm.context.hash_context = &init_hash_mm_context; - init_mm.context.hash_context->slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; + mm_ctx_set_slb_addr_limit(&init_mm.context, SLB_ADDR_LIMIT_DEFAULT); pr_info("Initializing hash mmu with SLB\n"); /* Initialize SLB management */ diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index 704e613a0b144..c2494b8380089 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -802,9 +802,7 @@ void __init early_init_mmu(void) #endif #ifdef CONFIG_PPC_MM_SLICES -#if defined(CONFIG_PPC_8xx) - init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW; -#endif + mm_ctx_set_slb_addr_limit(&init_mm.context, SLB_ADDR_LIMIT_DEFAULT); #endif } #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index ace97d9530403..97fbf7b544221 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -704,11 +704,7 @@ void slice_init_new_context_exec(struct mm_struct *mm) * case of fork it is just inherited from the mm being * duplicated. */ -#ifdef CONFIG_PPC64 - mm_ctx_set_slb_addr_limit(&mm->context, DEFAULT_MAP_WINDOW_USER64); -#else - mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW; -#endif + mm_ctx_set_slb_addr_limit(&mm->context, SLB_ADDR_LIMIT_DEFAULT); mm_ctx_set_user_psize(&mm->context, psize); /* From a521c44c3ded9fe184c5de3eed3a442af2d26f00 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:38 +0000 Subject: [PATCH 092/205] powerpc/book3e: drop mmu_get_tsize() This function is not used anymore, drop it. 
Fixes: b42279f0165c ("powerpc/mm/nohash: MM_SLICE is only used by book3s 64") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/nohash/book3e_hugetlbpage.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c index f84ec46cdb26f..c911fe9bfa0ed 100644 --- a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c +++ b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c @@ -49,11 +49,6 @@ static inline int tlb1_next(void) #endif /* !PPC64 */ #endif /* FSL */ -static inline int mmu_get_tsize(int psize) -{ - return mmu_psize_defs[psize].enc; -} - #if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_PPC64) #include From 5874cabe29079b72b192a28d266adf1a460fc5d6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:39 +0000 Subject: [PATCH 093/205] powerpc/64: only book3s/64 supports CONFIG_PPC_64K_PAGES CONFIG_PPC_64K_PAGES cannot be selected by nohash/64. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 1 - arch/powerpc/include/asm/nohash/64/pgalloc.h | 3 -- arch/powerpc/include/asm/nohash/64/pgtable.h | 4 --- arch/powerpc/include/asm/nohash/pte-book3e.h | 5 ---- arch/powerpc/include/asm/pgtable-be-types.h | 9 ++---- arch/powerpc/include/asm/pgtable-types.h | 9 ++---- arch/powerpc/include/asm/task_size_64.h | 2 +- arch/powerpc/mm/nohash/tlb.c | 13 -------- arch/powerpc/mm/nohash/tlb_low_64e.S | 31 -------------------- 9 files changed, 5 insertions(+), 72 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2d0be82c30619..5d8e692d64703 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -375,7 +375,6 @@ config ZONE_DMA config PGTABLE_LEVELS int default 2 if !PPC64 - default 3 if PPC_64K_PAGES && !PPC_BOOK3S_64 default 4 source "arch/powerpc/sysdev/Kconfig" diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index 66d086f85bd5b..ded453f9b5a8a 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -171,12 +171,9 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, #define __pmd_free_tlb(tlb, pmd, addr) \ pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX) -#ifndef CONFIG_PPC_64K_PAGES #define __pud_free_tlb(tlb, pud, addr) \ pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE) -#endif /* CONFIG_PPC_64K_PAGES */ - #define check_pgt_cache() do { } while (0) #endif /* _ASM_POWERPC_PGALLOC_64_H */ diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index c8e6a9a5bc33f..b9f66cf15c310 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -10,10 +10,6 @@ #include #include -#ifdef CONFIG_PPC_64K_PAGES -#error "Page size not supported" -#endif - #define FIRST_USER_ADDRESS 0UL /* diff --git a/arch/powerpc/include/asm/nohash/pte-book3e.h b/arch/powerpc/include/asm/nohash/pte-book3e.h index dd40d200f2742..813918f407653 100644 --- a/arch/powerpc/include/asm/nohash/pte-book3e.h +++ b/arch/powerpc/include/asm/nohash/pte-book3e.h @@ -60,13 +60,8 @@ #define _PAGE_SPECIAL _PAGE_SW0 /* Base page size */ -#ifdef CONFIG_PPC_64K_PAGES -#define _PAGE_PSIZE _PAGE_PSIZE_64K -#define PTE_RPN_SHIFT (28) -#else #define _PAGE_PSIZE _PAGE_PSIZE_4K #define PTE_RPN_SHIFT (24) -#endif #define PTE_WIMGE_SHIFT (19) #define PTE_BAP_SHIFT (2) diff --git a/arch/powerpc/include/asm/pgtable-be-types.h 
b/arch/powerpc/include/asm/pgtable-be-types.h index a89c67b626801..b169bbf95fcbe 100644 --- a/arch/powerpc/include/asm/pgtable-be-types.h +++ b/arch/powerpc/include/asm/pgtable-be-types.h @@ -33,11 +33,7 @@ static inline __be64 pmd_raw(pmd_t x) return x.pmd; } -/* - * 64 bit hash always use 4 level table. Everybody else use 4 level - * only for 4K page size. - */ -#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES) +/* 64 bit always use 4 level table. */ typedef struct { __be64 pud; } pud_t; #define __pud(x) ((pud_t) { cpu_to_be64(x) }) #define __pud_raw(x) ((pud_t) { (x) }) @@ -51,7 +47,6 @@ static inline __be64 pud_raw(pud_t x) return x.pud; } -#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */ #endif /* CONFIG_PPC64 */ /* PGD level */ @@ -77,7 +72,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; * With hash config 64k pages additionally define a bigger "real PTE" type that * gathers the "second half" part of the PTE for pseudo 64k pages */ -#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_BOOK3S_64) +#ifdef CONFIG_PPC_64K_PAGES typedef struct { pte_t pte; unsigned long hidx; } real_pte_t; #else typedef struct { pte_t pte; } real_pte_t; diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h index 3b0edf041b2e1..d11b4c61d686a 100644 --- a/arch/powerpc/include/asm/pgtable-types.h +++ b/arch/powerpc/include/asm/pgtable-types.h @@ -23,18 +23,13 @@ static inline unsigned long pmd_val(pmd_t x) return x.pmd; } -/* - * 64 bit hash always use 4 level table. Everybody else use 4 level - * only for 4K page size. - */ -#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES) +/* 64 bit always use 4 level table. */ typedef struct { unsigned long pud; } pud_t; #define __pud(x) ((pud_t) { (x) }) static inline unsigned long pud_val(pud_t x) { return x.pud; } -#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */ #endif /* CONFIG_PPC64 */ /* PGD level */ @@ -54,7 +49,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; * With hash config 64k pages additionally define a bigger "real PTE" type that * gathers the "second half" part of the PTE for pseudo 64k pages */ -#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_BOOK3S_64) +#ifdef CONFIG_PPC_64K_PAGES typedef struct { pte_t pte; unsigned long hidx; } real_pte_t; #else typedef struct { pte_t pte; } real_pte_t; diff --git a/arch/powerpc/include/asm/task_size_64.h b/arch/powerpc/include/asm/task_size_64.h index eab4779f6b849..c993482237edc 100644 --- a/arch/powerpc/include/asm/task_size_64.h +++ b/arch/powerpc/include/asm/task_size_64.h @@ -20,7 +20,7 @@ /* * For now 512TB is only supported with book3s and 64K linux page size. 
*/ -#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_64K_PAGES) +#ifdef CONFIG_PPC_64K_PAGES /* * Max value currently used: */ diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index c2494b8380089..24f88efb05bf2 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -433,11 +433,7 @@ void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address) unsigned long rid = (address & rmask) | 0x1000000000000000ul; unsigned long vpte = address & ~rmask; -#ifdef CONFIG_PPC_64K_PAGES - vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful; -#else vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful; -#endif vpte |= rid; __flush_tlb_page(tlb->mm, vpte, tsize, 0); } @@ -625,21 +621,12 @@ static void early_init_this_mmu(void) case PPC_HTW_IBM: mas4 |= MAS4_INDD; -#ifdef CONFIG_PPC_64K_PAGES - mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT; - mmu_pte_psize = MMU_PAGE_256M; -#else mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT; mmu_pte_psize = MMU_PAGE_1M; -#endif break; case PPC_HTW_NONE: -#ifdef CONFIG_PPC_64K_PAGES - mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT; -#else mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT; -#endif mmu_pte_psize = mmu_virtual_psize; break; } diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S index 9ed90064f5429..58959ce15415c 100644 --- a/arch/powerpc/mm/nohash/tlb_low_64e.S +++ b/arch/powerpc/mm/nohash/tlb_low_64e.S @@ -24,11 +24,7 @@ #include #include -#ifdef CONFIG_PPC_64K_PAGES -#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1) -#else #define VPTE_PMD_SHIFT (PTE_INDEX_SIZE) -#endif #define VPTE_PUD_SHIFT (VPTE_PMD_SHIFT + PMD_INDEX_SIZE) #define VPTE_PGD_SHIFT (VPTE_PUD_SHIFT + PUD_INDEX_SIZE) #define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE) @@ -167,13 +163,11 @@ MMU_FTR_SECTION_ELSE ldx r14,r14,r15 /* grab pgd entry */ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) -#ifndef CONFIG_PPC_64K_PAGES rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 clrrdi r15,r15,3 cmpdi cr0,r14,0 bge tlb_miss_fault_bolted /* Bad pgd entry or hugepage; bail */ ldx r14,r14,r15 /* grab pud entry */ -#endif /* CONFIG_PPC_64K_PAGES */ rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 clrrdi r15,r15,3 @@ -682,18 +676,7 @@ normal_tlb_miss: * order to handle the weird page table format used by linux */ ori r10,r15,0x1 -#ifdef CONFIG_PPC_64K_PAGES - /* For the top bits, 16 bytes per PTE */ - rldicl r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4 - /* Now create the bottom bits as 0 in position 0x8000 and - * the rest calculated for 8 bytes per PTE - */ - rldicl r15,r16,64-(PAGE_SHIFT-3),64-15 - /* Insert the bottom bits in */ - rlwimi r14,r15,0,16,31 -#else rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4 -#endif sldi r15,r10,60 clrrdi r14,r14,3 or r10,r15,r14 @@ -732,11 +715,7 @@ finish_normal_tlb_miss: /* Check page size, if not standard, update MAS1 */ rldicl r11,r14,64-8,64-8 -#ifdef CONFIG_PPC_64K_PAGES - cmpldi cr0,r11,BOOK3E_PAGESZ_64K -#else cmpldi cr0,r11,BOOK3E_PAGESZ_4K -#endif beq- 1f mfspr r11,SPRN_MAS1 rlwimi r11,r14,31,21,24 @@ -857,14 +836,12 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) cmpdi cr0,r15,0 bge virt_page_table_tlb_miss_fault -#ifndef CONFIG_PPC_64K_PAGES /* Get to PUD entry */ rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 cmpdi cr0,r15,0 bge virt_page_table_tlb_miss_fault -#endif /* CONFIG_PPC_64K_PAGES */ /* Get to PMD entry */ rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3 @@ -1106,14 +1083,12 @@ htw_tlb_miss: cmpdi cr0,r15,0 bge 
htw_tlb_miss_fault -#ifndef CONFIG_PPC_64K_PAGES /* Get to PUD entry */ rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3 clrrdi r10,r11,3 ldx r15,r10,r15 cmpdi cr0,r15,0 bge htw_tlb_miss_fault -#endif /* CONFIG_PPC_64K_PAGES */ /* Get to PMD entry */ rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3 @@ -1132,9 +1107,7 @@ htw_tlb_miss: * 4K page we need to extract a bit from the virtual address and * insert it into the "PA52" bit of the RPN. */ -#ifndef CONFIG_PPC_64K_PAGES rlwimi r15,r16,32-9,20,20 -#endif /* Now we build the MAS: * * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG @@ -1144,11 +1117,7 @@ htw_tlb_miss: * MAS 2 : Use defaults * MAS 3+7 : Needs to be done */ -#ifdef CONFIG_PPC_64K_PAGES - ori r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT) -#else ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) -#endif BEGIN_MMU_FTR_SECTION srdi r16,r10,32 From 3dea7332ccac1f701a6f8cd4fe44faa9be2e6014 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:40 +0000 Subject: [PATCH 094/205] powerpc/book3e: hugetlbpage is only for CONFIG_PPC_FSL_BOOK3E As per Kconfig.cputype, only CONFIG_PPC_FSL_BOOK3E gets to select SYS_SUPPORTS_HUGETLBFS so simplify accordingly. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/nohash/Makefile | 2 +- arch/powerpc/mm/nohash/book3e_hugetlbpage.c | 47 +++++++++------------ 2 files changed, 20 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile index b2228ff81b8af..33b6f6f29d3f2 100644 --- a/arch/powerpc/mm/nohash/Makefile +++ b/arch/powerpc/mm/nohash/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_44x) += 44x.o obj-$(CONFIG_PPC_8xx) += 8xx.o obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke.o ifdef CONFIG_HUGETLB_PAGE -obj-$(CONFIG_PPC_BOOK3E_MMU) += book3e_hugetlbpage.o +obj-$(CONFIG_PPC_FSL_BOOK3E) += book3e_hugetlbpage.o endif # Disable kcov instrumentation on sensitive code diff --git a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c index c911fe9bfa0ed..61915f4d3c7f0 100644 --- a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c +++ b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c @@ -11,8 +11,9 @@ #include -#ifdef CONFIG_PPC_FSL_BOOK3E #ifdef CONFIG_PPC64 +#include + static inline int tlb1_next(void) { struct paca_struct *paca = get_paca(); @@ -29,28 +30,6 @@ static inline int tlb1_next(void) tcd->esel_next = next; return this; } -#else -static inline int tlb1_next(void) -{ - int index, ncams; - - ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; - - index = this_cpu_read(next_tlbcam_idx); - - /* Just round-robin the entries and wrap when we hit the end */ - if (unlikely(index == ncams - 1)) - __this_cpu_write(next_tlbcam_idx, tlbcam_index); - else - __this_cpu_inc(next_tlbcam_idx); - - return index; -} -#endif /* !PPC64 */ -#endif /* FSL */ - -#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_PPC64) -#include static inline void book3e_tlb_lock(void) { @@ -93,6 +72,23 @@ static inline void book3e_tlb_unlock(void) paca->tcd_ptr->lock = 0; } #else +static inline int tlb1_next(void) +{ + int index, ncams; + + ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + + index = this_cpu_read(next_tlbcam_idx); + + /* Just round-robin the entries and wrap when we hit the end */ + if (unlikely(index == ncams - 1)) + __this_cpu_write(next_tlbcam_idx, tlbcam_index); + else + __this_cpu_inc(next_tlbcam_idx); + + return index; +} + static inline void book3e_tlb_lock(void) { } @@ -134,10 +130,7 @@ void book3e_hugetlb_preload(struct 
vm_area_struct *vma, unsigned long ea, unsigned long psize, tsize, shift; unsigned long flags; struct mm_struct *mm; - -#ifdef CONFIG_PPC_FSL_BOOK3E int index; -#endif if (unlikely(is_kernel_addr(ea))) return; @@ -161,11 +154,9 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, return; } -#ifdef CONFIG_PPC_FSL_BOOK3E /* We have to use the CAM(TLB1) on FSL parts for hugepages */ index = tlb1_next(); mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1)); -#endif mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize); mas2 = ea & ~((1UL << shift) - 1); From 0caed4de502c7699b7faeaea0a93b39e4f19e11a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:41 +0000 Subject: [PATCH 095/205] powerpc/mm: move __find_linux_pte() out of hugetlbpage.c __find_linux_pte() is the only function in hugetlbpage.c which is compiled in regardless of CONFIG_HUGETLB_PAGE. This patch moves it into pgtable.c. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hugetlbpage.c | 103 --------------------------------- arch/powerpc/mm/pgtable.c | 104 ++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 103 deletions(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 5f67e7a4d1cc6..17915fc389ff1 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -756,109 +756,6 @@ void flush_dcache_icache_hugepage(struct page *page) #endif /* CONFIG_HUGETLB_PAGE */ -/* - * We have 4 cases for pgds and pmds: - * (1) invalid (all zeroes) - * (2) pointer to next table, as normal; bottom 6 bits == 0 - * (3) leaf pte for huge page _PAGE_PTE set - * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table - * - * So long as we atomically load page table pointers we are safe against teardown, - * we can follow the address down to the the page and take a ref on it. - * This function need to be called with interrupts disabled. We use this variant - * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED - */ -pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, - bool *is_thp, unsigned *hpage_shift) -{ - pgd_t pgd, *pgdp; - pud_t pud, *pudp; - pmd_t pmd, *pmdp; - pte_t *ret_pte; - hugepd_t *hpdp = NULL; - unsigned pdshift = PGDIR_SHIFT; - - if (hpage_shift) - *hpage_shift = 0; - - if (is_thp) - *is_thp = false; - - pgdp = pgdir + pgd_index(ea); - pgd = READ_ONCE(*pgdp); - /* - * Always operate on the local stack value. This make sure the - * value don't get updated by a parallel THP split/collapse, - * page fault or a page unmap. The return pte_t * is still not - * stable. So should be checked there for above conditions. - */ - if (pgd_none(pgd)) - return NULL; - else if (pgd_huge(pgd)) { - ret_pte = (pte_t *) pgdp; - goto out; - } else if (is_hugepd(__hugepd(pgd_val(pgd)))) - hpdp = (hugepd_t *)&pgd; - else { - /* - * Even if we end up with an unmap, the pgtable will not - * be freed, because we do an rcu free and here we are - * irq disabled - */ - pdshift = PUD_SHIFT; - pudp = pud_offset(&pgd, ea); - pud = READ_ONCE(*pudp); - - if (pud_none(pud)) - return NULL; - else if (pud_huge(pud)) { - ret_pte = (pte_t *) pudp; - goto out; - } else if (is_hugepd(__hugepd(pud_val(pud)))) - hpdp = (hugepd_t *)&pud; - else { - pdshift = PMD_SHIFT; - pmdp = pmd_offset(&pud, ea); - pmd = READ_ONCE(*pmdp); - /* - * A hugepage collapse is captured by pmd_none, because - * it mark the pmd none and do a hpte invalidate.
- */ - if (pmd_none(pmd)) - return NULL; - - if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { - if (is_thp) - *is_thp = true; - ret_pte = (pte_t *) pmdp; - goto out; - } - /* - * pmd_large check below will handle the swap pmd pte - * we need to do both the check because they are config - * dependent. - */ - if (pmd_huge(pmd) || pmd_large(pmd)) { - ret_pte = (pte_t *) pmdp; - goto out; - } else if (is_hugepd(__hugepd(pmd_val(pmd)))) - hpdp = (hugepd_t *)&pmd; - else - return pte_offset_kernel(&pmd, ea); - } - } - if (!hpdp) - return NULL; - - ret_pte = hugepte_offset(*hpdp, ea, pdshift); - pdshift = hugepd_shift(*hpdp); -out: - if (hpage_shift) - *hpage_shift = pdshift; - return ret_pte; -} -EXPORT_SYMBOL_GPL(__find_linux_pte); - int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index d3d61d29b4f13..9f4ccd15849f0 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -30,6 +30,7 @@ #include #include #include +#include static inline int is_exec_fault(void) { @@ -299,3 +300,106 @@ unsigned long vmalloc_to_phys(void *va) return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va); } EXPORT_SYMBOL_GPL(vmalloc_to_phys); + +/* + * We have 4 cases for pgds and pmds: + * (1) invalid (all zeroes) + * (2) pointer to next table, as normal; bottom 6 bits == 0 + * (3) leaf pte for huge page _PAGE_PTE set + * (4) hugepd pointer, _PAGE_PTE = 0 and bits [2..6] indicate size of table + * + * So long as we atomically load page table pointers we are safe against teardown, + * we can follow the address down to the the page and take a ref on it. + * This function need to be called with interrupts disabled. We use this variant + * when we have MSR[EE] = 0 but the paca->irq_soft_mask = IRQS_ENABLED + */ +pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, + bool *is_thp, unsigned *hpage_shift) +{ + pgd_t pgd, *pgdp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; + pte_t *ret_pte; + hugepd_t *hpdp = NULL; + unsigned pdshift = PGDIR_SHIFT; + + if (hpage_shift) + *hpage_shift = 0; + + if (is_thp) + *is_thp = false; + + pgdp = pgdir + pgd_index(ea); + pgd = READ_ONCE(*pgdp); + /* + * Always operate on the local stack value. This make sure the + * value don't get updated by a parallel THP split/collapse, + * page fault or a page unmap. The return pte_t * is still not + * stable. So should be checked there for above conditions. + */ + if (pgd_none(pgd)) + return NULL; + else if (pgd_huge(pgd)) { + ret_pte = (pte_t *) pgdp; + goto out; + } else if (is_hugepd(__hugepd(pgd_val(pgd)))) + hpdp = (hugepd_t *)&pgd; + else { + /* + * Even if we end up with an unmap, the pgtable will not + * be freed, because we do an rcu free and here we are + * irq disabled + */ + pdshift = PUD_SHIFT; + pudp = pud_offset(&pgd, ea); + pud = READ_ONCE(*pudp); + + if (pud_none(pud)) + return NULL; + else if (pud_huge(pud)) { + ret_pte = (pte_t *) pudp; + goto out; + } else if (is_hugepd(__hugepd(pud_val(pud)))) + hpdp = (hugepd_t *)&pud; + else { + pdshift = PMD_SHIFT; + pmdp = pmd_offset(&pud, ea); + pmd = READ_ONCE(*pmdp); + /* + * A hugepage collapse is captured by pmd_none, because + * it mark the pmd none and do a hpte invalidate. 
+ */ + if (pmd_none(pmd)) + return NULL; + + if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { + if (is_thp) + *is_thp = true; + ret_pte = (pte_t *) pmdp; + goto out; + } + /* + * pmd_large check below will handle the swap pmd pte + * we need to do both the check because they are config + * dependent. + */ + if (pmd_huge(pmd) || pmd_large(pmd)) { + ret_pte = (pte_t *) pmdp; + goto out; + } else if (is_hugepd(__hugepd(pmd_val(pmd)))) + hpdp = (hugepd_t *)&pmd; + else + return pte_offset_kernel(&pmd, ea); + } + } + if (!hpdp) + return NULL; + + ret_pte = hugepte_offset(*hpdp, ea, pdshift); + pdshift = hugepd_shift(*hpdp); +out: + if (hpage_shift) + *hpage_shift = pdshift; + return ret_pte; +} +EXPORT_SYMBOL_GPL(__find_linux_pte); From b7dcf96ce03e2cab7eb6cda2ca8c66e1529e9bc3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:42 +0000 Subject: [PATCH 096/205] powerpc/mm: make hugetlbpage.c depend on CONFIG_HUGETLB_PAGE The only function in hugetlbpage.c which doesn't depend on CONFIG_HUGETLB_PAGE is gup_hugepte(), and this function is only called from gup_huge_pd() which depends on CONFIG_HUGETLB_PAGE so all the content of hugetlbpage.c depends on CONFIG_HUGETLB_PAGE. This patch modifies Makefile to only compile hugetlbpage.c when CONFIG_HUGETLB_PAGE is set. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/Makefile | 2 +- arch/powerpc/mm/hugetlbpage.c | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 08557bae6fa16..3daea8da0c7fd 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -15,7 +15,7 @@ obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-frag.o obj-$(CONFIG_PPC32) += pgtable-frag.o obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o -obj-y += hugetlbpage.o +obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 17915fc389ff1..9f69594f5d09e 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -26,9 +26,6 @@ #include #include - -#ifdef CONFIG_HUGETLB_PAGE - #define PAGE_SHIFT_64K 16 #define PAGE_SHIFT_512K 19 #define PAGE_SHIFT_8M 23 @@ -754,8 +751,6 @@ void flush_dcache_icache_hugepage(struct page *page) } } -#endif /* CONFIG_HUGETLB_PAGE */ - int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { From 0001e5aa5c028c11570f2e641f0198287f4808ba Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:43 +0000 Subject: [PATCH 097/205] powerpc/mm: make gup_hugepte() static gup_huge_pd() is the only user of gup_hugepte() and it is located in the same file. This patch moves gup_huge_pd() after gup_hugepte() and makes gup_hugepte() static. 
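As a hedged sketch of the resulting shape (toy names and logic, not the kernel's actual gup code): the helper loses its extern declaration, gains static, and is defined above its single caller so no forward declaration is needed.

/* toy_gup.c - illustrative only; names and logic are invented */
#include <stdio.h>

/* Previously visible via a header; now static, defined before its caller. */
static int helper_count_present(const unsigned long *tbl, int n)
{
	int i, cnt = 0;

	for (i = 0; i < n; i++)
		if (tbl[i] & 1UL)	/* toy "present" bit */
			cnt++;
	return cnt;
}

/* The only caller, placed after the helper (as gup_huge_pd() now is). */
int walk_table(const unsigned long *tbl, int n)
{
	return helper_count_present(tbl, n);
}

int main(void)
{
	unsigned long tbl[4] = { 1, 0, 3, 4 };

	printf("%d present\n", walk_table(tbl, 4));	/* prints 2 */
	return 0;
}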
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/pgtable.h | 3 --- arch/powerpc/mm/hugetlbpage.c | 38 +++++++++++++++--------------- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 505550fb29356..c51846da41a77 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -89,9 +89,6 @@ extern void paging_init(void); */ extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *); -extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, - unsigned long end, int write, - struct page **pages, int *nr); #ifndef CONFIG_TRANSPARENT_HUGEPAGE #define pmd_large(pmd) 0 #endif diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 9f69594f5d09e..95cc9f3d97e25 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -539,23 +539,6 @@ static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, return (__boundary - 1 < end - 1) ? __boundary : end; } -int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift, - unsigned long end, int write, struct page **pages, int *nr) -{ - pte_t *ptep; - unsigned long sz = 1UL << hugepd_shift(hugepd); - unsigned long next; - - ptep = hugepte_offset(hugepd, addr, pdshift); - do { - next = hugepte_addr_end(addr, end, sz); - if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) - return 0; - } while (ptep++, addr = next, addr != end); - - return 1; -} - #ifdef CONFIG_PPC_MM_SLICES unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, @@ -751,8 +734,8 @@ void flush_dcache_icache_hugepage(struct page *page) } } -int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) +static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) { unsigned long pte_end; struct page *head, *page; @@ -798,3 +781,20 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, return 1; } + +int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned int pdshift, + unsigned long end, int write, struct page **pages, int *nr) +{ + pte_t *ptep; + unsigned long sz = 1UL << hugepd_shift(hugepd); + unsigned long next; + + ptep = hugepte_offset(hugepd, addr, pdshift); + do { + next = hugepte_addr_end(addr, end, sz); + if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) + return 0; + } while (ptep++, addr = next, addr != end); + + return 1; +} From 8197af22be01e7c9ab476138652e0dc8cd22a207 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:44 +0000 Subject: [PATCH 098/205] powerpc/mm: split asm/hugetlb.h into dedicated subarch files Three subarches support hugepages: - fsl book3e - book3s/64 - 8xx This patch splits asm/hugetlb.h to reduce the #ifdef mess. 
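A minimal sketch of the dispatch pattern the split relies on (invented config names, not the real powerpc Kconfig symbols): callers keep including one generic header, the preprocessor selects exactly one subarch implementation, and the per-subarch helpers live in their own files without any #ifdef around them, as the new headers created below do.

/* hugetlb_generic.h - hypothetical sketch of a subarch dispatch header */
#ifndef HUGETLB_GENERIC_H
#define HUGETLB_GENERIC_H

#if defined(CONFIG_SUBARCH_BOOK3S64)	/* stands in for CONFIG_PPC_BOOK3S_64 */
#include "hugetlb_book3s64.h"
#elif defined(CONFIG_SUBARCH_FSL)	/* stands in for CONFIG_PPC_FSL_BOOK3E */
#include "hugetlb_fsl.h"
#elif defined(CONFIG_SUBARCH_8XX)	/* stands in for CONFIG_PPC_8xx */
#include "hugetlb_8xx.h"
#endif

/* Only genuinely common declarations remain here, ifdef-free. */
struct page;
void flush_dcache_icache_hugepage(struct page *page);

#endif /* HUGETLB_GENERIC_H */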
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hugetlb.h | 40 +++++++++ arch/powerpc/include/asm/hugetlb.h | 87 +------------------ .../include/asm/nohash/32/hugetlb-8xx.h | 31 +++++++ .../include/asm/nohash/hugetlb-book3e.h | 31 +++++++ 4 files changed, 106 insertions(+), 83 deletions(-) create mode 100644 arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h create mode 100644 arch/powerpc/include/asm/nohash/hugetlb-book3e.h diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index ec2a55a553c75..cbc8153d6e0ef 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -62,4 +62,44 @@ extern pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, extern void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t new_pte); +/* + * This should work for other subarchs too. But right now we use the + * new format only for 64bit book3s + */ +static inline pte_t *hugepd_page(hugepd_t hpd) +{ + BUG_ON(!hugepd_ok(hpd)); + /* + * We have only four bits to encode, MMU page size + */ + BUILD_BUG_ON((MMU_PAGE_COUNT - 1) > 0xf); + return __va(hpd_val(hpd) & HUGEPD_ADDR_MASK); +} + +static inline unsigned int hugepd_mmu_psize(hugepd_t hpd) +{ + return (hpd_val(hpd) & HUGEPD_SHIFT_MASK) >> 2; +} + +static inline unsigned int hugepd_shift(hugepd_t hpd) +{ + return mmu_psize_to_shift(hugepd_mmu_psize(hpd)); +} +static inline void flush_hugetlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) +{ + if (radix_enabled()) + return radix__flush_hugetlb_page(vma, vmaddr); +} + +static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, + unsigned int pdshift) +{ + unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(hpd); + + return hugepd_page(hpd) + idx; +} + +void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); + #endif diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 8d40565ad0c39..fd5c0873a57d8 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -6,83 +6,13 @@ #include #ifdef CONFIG_PPC_BOOK3S_64 - #include -/* - * This should work for other subarchs too. 
But right now we use the - * new format only for 64bit book3s - */ -static inline pte_t *hugepd_page(hugepd_t hpd) -{ - BUG_ON(!hugepd_ok(hpd)); - /* - * We have only four bits to encode, MMU page size - */ - BUILD_BUG_ON((MMU_PAGE_COUNT - 1) > 0xf); - return __va(hpd_val(hpd) & HUGEPD_ADDR_MASK); -} - -static inline unsigned int hugepd_mmu_psize(hugepd_t hpd) -{ - return (hpd_val(hpd) & HUGEPD_SHIFT_MASK) >> 2; -} - -static inline unsigned int hugepd_shift(hugepd_t hpd) -{ - return mmu_psize_to_shift(hugepd_mmu_psize(hpd)); -} -static inline void flush_hugetlb_page(struct vm_area_struct *vma, - unsigned long vmaddr) -{ - if (radix_enabled()) - return radix__flush_hugetlb_page(vma, vmaddr); -} - -#else - -static inline pte_t *hugepd_page(hugepd_t hpd) -{ - BUG_ON(!hugepd_ok(hpd)); -#ifdef CONFIG_PPC_8xx - return (pte_t *)__va(hpd_val(hpd) & ~HUGEPD_SHIFT_MASK); -#else - return (pte_t *)((hpd_val(hpd) & - ~HUGEPD_SHIFT_MASK) | PD_HUGE); -#endif -} - -static inline unsigned int hugepd_shift(hugepd_t hpd) -{ -#ifdef CONFIG_PPC_8xx - return ((hpd_val(hpd) & _PMD_PAGE_MASK) >> 1) + 17; -#else - return hpd_val(hpd) & HUGEPD_SHIFT_MASK; -#endif -} - +#elif defined(CONFIG_PPC_FSL_BOOK3E) +#include +#elif defined(CONFIG_PPC_8xx) +#include #endif /* CONFIG_PPC_BOOK3S_64 */ - -static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, - unsigned pdshift) -{ - /* - * On FSL BookE, we have multiple higher-level table entries that - * point to the same hugepte. Just use the first one since they're all - * identical. So for that case, idx=0. - */ - unsigned long idx = 0; - - pte_t *dir = hugepd_page(hpd); -#ifdef CONFIG_PPC_8xx - idx = (addr & ((1UL << pdshift) - 1)) >> PAGE_SHIFT; -#elif !defined(CONFIG_PPC_FSL_BOOK3E) - idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(hpd); -#endif - - return dir + idx; -} - void flush_dcache_icache_hugepage(struct page *page); int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, @@ -99,15 +29,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte); -#ifdef CONFIG_PPC_8xx -static inline void flush_hugetlb_page(struct vm_area_struct *vma, - unsigned long vmaddr) -{ - flush_tlb_page(vma, vmaddr); -} -#else -void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); -#endif #define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h new file mode 100644 index 0000000000000..997f5b3d6b99b --- /dev/null +++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H +#define _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H + +static inline pte_t *hugepd_page(hugepd_t hpd) +{ + BUG_ON(!hugepd_ok(hpd)); + + return (pte_t *)__va(hpd_val(hpd) & ~HUGEPD_SHIFT_MASK); +} + +static inline unsigned int hugepd_shift(hugepd_t hpd) +{ + return ((hpd_val(hpd) & _PMD_PAGE_MASK) >> 1) + 17; +} + +static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, + unsigned int pdshift) +{ + unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> PAGE_SHIFT; + + return hugepd_page(hpd) + idx; +} + +static inline void flush_hugetlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) +{ + flush_tlb_page(vma, vmaddr); +} + +#endif /* _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H */ diff --git 
a/arch/powerpc/include/asm/nohash/hugetlb-book3e.h b/arch/powerpc/include/asm/nohash/hugetlb-book3e.h new file mode 100644 index 0000000000000..e94f1cd048ee5 --- /dev/null +++ b/arch/powerpc/include/asm/nohash/hugetlb-book3e.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_NOHASH_HUGETLB_BOOK3E_H +#define _ASM_POWERPC_NOHASH_HUGETLB_BOOK3E_H + +static inline pte_t *hugepd_page(hugepd_t hpd) +{ + if (WARN_ON(!hugepd_ok(hpd))) + return NULL; + + return (pte_t *)((hpd_val(hpd) & ~HUGEPD_SHIFT_MASK) | PD_HUGE); +} + +static inline unsigned int hugepd_shift(hugepd_t hpd) +{ + return hpd_val(hpd) & HUGEPD_SHIFT_MASK; +} + +static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, + unsigned int pdshift) +{ + /* + * On FSL BookE, we have multiple higher-level table entries that + * point to the same hugepte. Just use the first one since they're all + * identical. So for that case, idx=0. + */ + return hugepd_page(hpd); +} + +void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); + +#endif /* _ASM_POWERPC_NOHASH_HUGETLB_BOOK3E_H */ From 5fb84fec46015758271fcd2a746633fd4d48e619 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:45 +0000 Subject: [PATCH 099/205] powerpc/mm: add a helper to populate hugepd This patch adds a subarch helper to populate hugepd. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hugetlb.h | 5 +++++ .../include/asm/nohash/32/hugetlb-8xx.h | 8 ++++++++ .../include/asm/nohash/hugetlb-book3e.h | 6 ++++++ arch/powerpc/mm/hugetlbpage.c | 20 +------------------ 4 files changed, 20 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index cbc8153d6e0ef..def77a45e905b 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -100,6 +100,11 @@ static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, return hugepd_page(hpd) + idx; } +static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift) +{ + *hpdp = __hugepd(__pa(new) | HUGEPD_VAL_BITS | (shift_to_mmu_psize(pshift) << 2)); +} + void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); #endif diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h index 997f5b3d6b99b..75676885bec29 100644 --- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h @@ -2,6 +2,8 @@ #ifndef _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H #define _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H +#define PAGE_SHIFT_8M 23 + static inline pte_t *hugepd_page(hugepd_t hpd) { BUG_ON(!hugepd_ok(hpd)); @@ -28,4 +30,10 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma, flush_tlb_page(vma, vmaddr); } +static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift) +{ + *hpdp = __hugepd(__pa(new) | _PMD_USER | _PMD_PRESENT | + (pshift == PAGE_SHIFT_8M ?
_PMD_PAGE_8M : _PMD_PAGE_512K)); +} + #endif /* _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H */ diff --git a/arch/powerpc/include/asm/nohash/hugetlb-book3e.h b/arch/powerpc/include/asm/nohash/hugetlb-book3e.h index e94f1cd048ee5..51439bcfe3138 100644 --- a/arch/powerpc/include/asm/nohash/hugetlb-book3e.h +++ b/arch/powerpc/include/asm/nohash/hugetlb-book3e.h @@ -28,4 +28,10 @@ static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr, void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshift) +{ + /* We use the old format for PPC_FSL_BOOK3E */ + *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift); +} + #endif /* _ASM_POWERPC_NOHASH_HUGETLB_BOOK3E_H */ diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 95cc9f3d97e25..036f408cfcb02 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -26,12 +26,6 @@ #include #include -#define PAGE_SHIFT_64K 16 -#define PAGE_SHIFT_512K 19 -#define PAGE_SHIFT_8M 23 -#define PAGE_SHIFT_16M 24 -#define PAGE_SHIFT_16G 34 - bool hugetlb_disabled = false; unsigned int HPAGE_SHIFT; @@ -95,19 +89,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, for (i = 0; i < num_hugepd; i++, hpdp++) { if (unlikely(!hugepd_none(*hpdp))) break; - else { -#ifdef CONFIG_PPC_BOOK3S_64 - *hpdp = __hugepd(__pa(new) | HUGEPD_VAL_BITS | - (shift_to_mmu_psize(pshift) << 2)); -#elif defined(CONFIG_PPC_8xx) - *hpdp = __hugepd(__pa(new) | _PMD_USER | - (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M : - _PMD_PAGE_512K) | _PMD_PRESENT); -#else - /* We use the old format for PPC_FSL_BOOK3E */ - *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift); -#endif - } + hugepd_populate(hpdp, new, pshift); } /* If we bailed from the for loop early, an error occurred, clean up */ if (i < num_hugepd) { From 723f268f19daddba56a987b934f3e34a04b6499d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:46 +0000 Subject: [PATCH 100/205] powerpc/mm: cleanup ifdef mess in add_huge_page_size() Introduce a subarch specific helper check_and_get_huge_psize() to check the huge page sizes and cleanup the ifdef mess in add_huge_page_size() Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/hugetlb.h | 27 ++++++++++++++ .../include/asm/nohash/32/hugetlb-8xx.h | 5 +++ .../include/asm/nohash/hugetlb-book3e.h | 8 ++++ arch/powerpc/mm/hugetlbpage.c | 37 ++----------------- 4 files changed, 43 insertions(+), 34 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index def77a45e905b..56140d19c85fc 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -107,4 +107,31 @@ static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshi void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +static inline int check_and_get_huge_psize(int shift) +{ + int mmu_psize; + + if (shift > SLICE_HIGH_SHIFT) + return -EINVAL; + + mmu_psize = shift_to_mmu_psize(shift); + + /* + * We need to make sure that for different page sizes reported by + * firmware we only add hugetlb support for page sizes that can be + * supported by linux page table layout. 
+ * For now we have + * Radix: 2M and 1G + * Hash: 16M and 16G + */ + if (radix_enabled()) { + if (mmu_psize != MMU_PAGE_2M && mmu_psize != MMU_PAGE_1G) + return -EINVAL; + } else { + if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G) + return -EINVAL; + } + return mmu_psize; +} + #endif diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h index 75676885bec29..a46616937d205 100644 --- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h @@ -36,4 +36,9 @@ static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshi (pshift == PAGE_SHIFT_8M ? _PMD_PAGE_8M : _PMD_PAGE_512K)); } +static inline int check_and_get_huge_psize(int shift) +{ + return shift_to_mmu_psize(shift); +} + #endif /* _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H */ diff --git a/arch/powerpc/include/asm/nohash/hugetlb-book3e.h b/arch/powerpc/include/asm/nohash/hugetlb-book3e.h index 51439bcfe3138..ecd8694cb229b 100644 --- a/arch/powerpc/include/asm/nohash/hugetlb-book3e.h +++ b/arch/powerpc/include/asm/nohash/hugetlb-book3e.h @@ -34,4 +34,12 @@ static inline void hugepd_populate(hugepd_t *hpdp, pte_t *new, unsigned int pshi *hpdp = __hugepd(((unsigned long)new & ~PD_HUGE) | pshift); } +static inline int check_and_get_huge_psize(int shift) +{ + if (shift & 1) /* Not a power of 4 */ + return -EINVAL; + + return shift_to_mmu_psize(shift); +} + #endif /* _ASM_POWERPC_NOHASH_HUGETLB_BOOK3E_H */ diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 036f408cfcb02..7b7027aae73fc 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -549,13 +549,6 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) return vma_kernel_pagesize(vma); } -static inline bool is_power_of_4(unsigned long x) -{ - if (is_power_of_2(x)) - return (__ilog2(x) % 2) ? false : true; - return false; -} - static int __init add_huge_page_size(unsigned long long size) { int shift = __ffs(size); @@ -563,37 +556,13 @@ static int __init add_huge_page_size(unsigned long long size) /* Check that it is a page size supported by the hardware and * that it fits within pagetable and slice limits. */ - if (size <= PAGE_SIZE) - return -EINVAL; -#if defined(CONFIG_PPC_FSL_BOOK3E) - if (!is_power_of_4(size)) + if (size <= PAGE_SIZE || !is_power_of_2(size)) return -EINVAL; -#elif !defined(CONFIG_PPC_8xx) - if (!is_power_of_2(size) || (shift > SLICE_HIGH_SHIFT)) - return -EINVAL; -#endif - if ((mmu_psize = shift_to_mmu_psize(shift)) < 0) + mmu_psize = check_and_get_huge_psize(size); + if (mmu_psize < 0) return -EINVAL; -#ifdef CONFIG_PPC_BOOK3S_64 - /* - * We need to make sure that for different page sizes reported by - * firmware we only add hugetlb support for page sizes that can be - * supported by linux page table layout. 
- * For now we have - * Radix: 2M and 1G - * Hash: 16M and 16G - */ - if (radix_enabled()) { - if (mmu_psize != MMU_PAGE_2M && mmu_psize != MMU_PAGE_1G) - return -EINVAL; - } else { - if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G) - return -EINVAL; - } -#endif - BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); /* Return if huge page size has already been setup */ From 45d0ba527b575d47b2be75dd517b57cceda04bfe Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:47 +0000 Subject: [PATCH 101/205] powerpc/mm: move hugetlb_disabled into asm/hugetlb.h No need to have this in asm/page.h, move it into asm/hugetlb.h. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/hugetlb.h | 2 ++ arch/powerpc/include/asm/page.h | 1 - arch/powerpc/kernel/fadump.c | 1 + arch/powerpc/mm/book3s64/hash_utils.c | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index fd5c0873a57d8..84598c6b09595 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -13,6 +13,8 @@ #include #endif /* CONFIG_PPC_BOOK3S_64 */ +extern bool hugetlb_disabled; + void flush_dcache_icache_hugepage(struct page *page); int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 748f5db2e2b74..6b508420d92be 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -29,7 +29,6 @@ #ifndef __ASSEMBLY__ #ifdef CONFIG_HUGETLB_PAGE -extern bool hugetlb_disabled; extern unsigned int HPAGE_SHIFT; #else #define HPAGE_SHIFT PAGE_SHIFT diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 45a8d0be1c96d..25f063f56ec56 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 23ed8db645adf..f0ce860a69ace 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include From c5710cd20735037ba9be0e95530f0d3795ce07e6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:48 +0000 Subject: [PATCH 102/205] powerpc/mm: cleanup HPAGE_SHIFT setup Only book3s/64 may select a default among several HPAGE_SHIFT values at runtime. 8xx always defines 512K pages as the default. FSL_BOOK3E always defines 4M pages as the default. This patch limits HUGETLB_PAGE_SIZE_VARIABLE to book3s/64 and moves the definitions into subarch files.
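To make the payoff concrete, here is a hedged sketch of the post-patch logic with illustrative values only (the config macro is invented; the real selector is CONFIG_PPC_BOOK3S_64): on subarches with a fixed default huge page size, HPAGE_SHIFT becomes a compile-time constant, so HPAGE_SIZE and HPAGE_MASK fold to literals, and only book3s/64 pays for a runtime variable.

/* Illustration only - mirrors the shape of the post-patch asm/page.h. */
#include <stdio.h>

#ifdef VARIABLE_HPAGE			/* stands in for CONFIG_PPC_BOOK3S_64 */
unsigned int hpage_shift = 24;		/* chosen at boot, e.g. 16M */
#define HPAGE_SHIFT hpage_shift
#else
#define HPAGE_SHIFT 22			/* constant, e.g. FSL_BOOK3E's 4M */
#endif

#define HPAGE_SIZE (1UL << HPAGE_SHIFT)
#define HPAGE_MASK (~(HPAGE_SIZE - 1))

int main(void)
{
	/* On the constant branch the compiler folds these to literals. */
	printf("huge page: %lu KiB, mask %#lx\n",
	       HPAGE_SIZE >> 10, HPAGE_MASK);
	return 0;
}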
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/hugetlb.h | 2 ++ arch/powerpc/include/asm/page.h | 11 +++++++--- arch/powerpc/mm/book3s64/hash_hugetlbpage.c | 16 ++++++++++++++ arch/powerpc/mm/hugetlbpage.c | 23 +++------------------ 5 files changed, 30 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 5d8e692d64703..7815eb0cc2a5b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -390,7 +390,7 @@ source "kernel/Kconfig.hz" config HUGETLB_PAGE_SIZE_VARIABLE bool - depends on HUGETLB_PAGE + depends on HUGETLB_PAGE && PPC_BOOK3S_64 default y config MATH_EMULATION diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 84598c6b09595..20a101046cffc 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -15,6 +15,8 @@ extern bool hugetlb_disabled; +void hugetlbpage_init_default(void); + void flush_dcache_icache_hugepage(struct page *page); int slice_is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 6b508420d92be..dbc8c0679480c 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -28,10 +28,15 @@ #define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT) #ifndef __ASSEMBLY__ -#ifdef CONFIG_HUGETLB_PAGE -extern unsigned int HPAGE_SHIFT; -#else +#ifndef CONFIG_HUGETLB_PAGE #define HPAGE_SHIFT PAGE_SHIFT +#elif defined(CONFIG_PPC_BOOK3S_64) +extern unsigned int hpage_shift; +#define HPAGE_SHIFT hpage_shift +#elif defined(CONFIG_PPC_8xx) +#define HPAGE_SHIFT 19 /* 512k pages */ +#elif defined(CONFIG_PPC_FSL_BOOK3E) +#define HPAGE_SHIFT 22 /* 4M pages */ #endif #define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) diff --git a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c index 2d4e02aa15a37..eefa89c6117bd 100644 --- a/arch/powerpc/mm/book3s64/hash_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/hash_hugetlbpage.c @@ -15,6 +15,9 @@ #include #include +unsigned int hpage_shift; +EXPORT_SYMBOL(hpage_shift); + extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn, unsigned long pa, unsigned long rlags, unsigned long vflags, int psize, int ssize); @@ -150,3 +153,16 @@ void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr old_pte, pte); set_huge_pte_at(vma->vm_mm, addr, ptep, pte); } + +void hugetlbpage_init_default(void) +{ + /* Set default large page size. 
Currently, we pick 16M or 1M + * depending on what is available + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift) + hpage_shift = mmu_psize_defs[MMU_PAGE_16M].shift; + else if (mmu_psize_defs[MMU_PAGE_1M].shift) + hpage_shift = mmu_psize_defs[MMU_PAGE_1M].shift; + else if (mmu_psize_defs[MMU_PAGE_2M].shift) + hpage_shift = mmu_psize_defs[MMU_PAGE_2M].shift; +} diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 7b7027aae73fc..847fb495a6289 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -28,9 +28,6 @@ bool hugetlb_disabled = false; -unsigned int HPAGE_SHIFT; -EXPORT_SYMBOL(HPAGE_SHIFT); - #define hugepd_none(hpd) (hpd_val(hpd) == 0) #define PTE_T_ORDER (__builtin_ffs(sizeof(pte_t)) - __builtin_ffs(sizeof(void *))) @@ -645,23 +642,9 @@ static int __init hugetlbpage_init(void) #endif } -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) - /* Default hpage size = 4M on FSL_BOOK3E and 512k on 8xx */ - if (mmu_psize_defs[MMU_PAGE_4M].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift; - else if (mmu_psize_defs[MMU_PAGE_512K].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_512K].shift; -#else - /* Set default large page size. Currently, we pick 16M or 1M - * depending on what is available - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; - else if (mmu_psize_defs[MMU_PAGE_1M].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; - else if (mmu_psize_defs[MMU_PAGE_2M].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift; -#endif + if (IS_ENABLED(HUGETLB_PAGE_SIZE_VARIABLE)) + hugetlbpage_init_default(); + return 0; } From 4df4b27585227c8ba66fdf0dd7531d1e23a37194 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:49 +0000 Subject: [PATCH 103/205] powerpc/mm: cleanup remaining ifdef mess in hugetlbpage.c Only three subarches support huge pages, so a test that covers two of them implies the third by exclusion.
Also, mmu_has_feature() is known to all subarches, so IS_ENABLED() can be used instead of #ifdef. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hugetlbpage.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 847fb495a6289..98db5ec6a1ddf 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -226,7 +226,7 @@ int __init alloc_bootmem_huge_page(struct hstate *h) return __alloc_bootmem_huge_page(h); } -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) +#ifndef CONFIG_PPC_BOOK3S_64 #define HUGEPD_FREELIST_SIZE \ ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) @@ -595,10 +595,10 @@ static int __init hugetlbpage_init(void) return 0; } -#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx) - if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE)) + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled() && + !mmu_has_feature(MMU_FTR_16M_PAGE)) return -ENODEV; -#endif + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { unsigned shift; unsigned pdshift; @@ -636,10 +636,8 @@ static int __init hugetlbpage_init(void) pgtable_cache_add(PTE_INDEX_SIZE); else if (pdshift > shift) pgtable_cache_add(pdshift - shift); -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) - else + else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) || IS_ENABLED(CONFIG_PPC_8xx)) pgtable_cache_add(PTE_T_ORDER); -#endif } if (IS_ENABLED(HUGETLB_PAGE_SIZE_VARIABLE)) From fab9a1165bcda99682e3319d1c83980fd4e72365 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:51 +0000 Subject: [PATCH 104/205] powerpc/mm: flatten function __find_linux_pte() step 1 __find_linux_pte() is full of if/else blocks which are hard to follow, although the handling is pretty simple. This patch flattens the function by getting rid of as much if/else as possible. In order to ease the review, this is done in three steps.
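To make the three-step transformation concrete, here is a hedged sketch with a toy entry encoding (the flag bits are invented for illustration); the point is replacing nested else branches with early exits and a shared out_huge label, which is exactly the shape the diffs below introduce.

/* Toy sketch of the flattening pattern (invented entry encoding). */
enum walk { WALK_NONE, WALK_LEAF, WALK_HUGE, WALK_DESCEND };

static enum walk classify(unsigned long entry)
{
	enum walk ret = WALK_DESCEND;

	if (entry == 0)
		return WALK_NONE;	/* early return, no else needed */

	if (entry & 0x1) {
		ret = WALK_LEAF;	/* leaf at this level */
		goto out;
	}
	if (entry & 0x2)
		goto out_huge;		/* hugepd-style pointer */

	return ret;			/* straight-line path: descend */
out_huge:
	ret = WALK_HUGE;
out:
	return ret;
}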
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 9f4ccd15849f0..d332abeedf0a4 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -339,12 +339,16 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, */ if (pgd_none(pgd)) return NULL; - else if (pgd_huge(pgd)) { - ret_pte = (pte_t *) pgdp; + + if (pgd_huge(pgd)) { + ret_pte = (pte_t *)pgdp; goto out; - } else if (is_hugepd(__hugepd(pgd_val(pgd)))) + } + if (is_hugepd(__hugepd(pgd_val(pgd)))) { hpdp = (hugepd_t *)&pgd; - else { + goto out_huge; + } + { /* * Even if we end up with an unmap, the pgtable will not * be freed, because we do an rcu free and here we are @@ -356,12 +360,16 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, if (pud_none(pud)) return NULL; - else if (pud_huge(pud)) { + + if (pud_huge(pud)) { ret_pte = (pte_t *) pudp; goto out; - } else if (is_hugepd(__hugepd(pud_val(pud)))) + } + if (is_hugepd(__hugepd(pud_val(pud)))) { hpdp = (hugepd_t *)&pud; - else { + goto out_huge; + } + { pdshift = PMD_SHIFT; pmdp = pmd_offset(&pud, ea); pmd = READ_ONCE(*pmdp); @@ -386,12 +394,16 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, if (pmd_huge(pmd) || pmd_large(pmd)) { ret_pte = (pte_t *) pmdp; goto out; - } else if (is_hugepd(__hugepd(pmd_val(pmd)))) + } + if (is_hugepd(__hugepd(pmd_val(pmd)))) { hpdp = (hugepd_t *)&pmd; - else - return pte_offset_kernel(&pmd, ea); + goto out_huge; + } + + return pte_offset_kernel(&pmd, ea); } } +out_huge: if (!hpdp) return NULL; From e2fb2511888b3f7768835de0768c24d1e0d74590 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:52 +0000 Subject: [PATCH 105/205] powerpc/mm: flatten function __find_linux_pte() step 2 __find_linux_pte() is full of if/else blocks which are hard to follow, although the handling is pretty simple. The previous patch left { } blocks. This patch removes the first one by shifting its content to the left. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable.c | 62 +++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index d332abeedf0a4..c1c6d0b79baa4 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -369,39 +369,37 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, hpdp = (hugepd_t *)&pud; goto out_huge; } - { - pdshift = PMD_SHIFT; - pmdp = pmd_offset(&pud, ea); - pmd = READ_ONCE(*pmdp); - /* - * A hugepage collapse is captured by pmd_none, because - * it mark the pmd none and do a hpte invalidate. 
+ */ + if (pmd_none(pmd)) + return NULL; + + if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { + if (is_thp) + *is_thp = true; + ret_pte = (pte_t *)pmdp; + goto out; + } + /* + * pmd_large check below will handle the swap pmd pte + * we need to do both the check because they are config + * dependent. + */ + if (pmd_huge(pmd) || pmd_large(pmd)) { + ret_pte = (pte_t *)pmdp; + goto out; } + if (is_hugepd(__hugepd(pmd_val(pmd)))) { + hpdp = (hugepd_t *)&pmd; + goto out_huge; + } + + return pte_offset_kernel(&pmd, ea); } out_huge: if (!hpdp) return NULL; From 26e66b08c3376b6fb4ad4508d48a4e74d61f0b9b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 05:59:53 +0000 Subject: [PATCH 106/205] powerpc/mm: flatten function __find_linux_pte() step 3 __find_linux_pte() is full of if/else blocks which are hard to follow, although the handling is pretty simple. The previous patches left a { } block. This patch removes it. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable.c | 98 +++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index c1c6d0b79baa4..db4a6253df92a 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -348,59 +348,59 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea, hpdp = (hugepd_t *)&pgd; goto out_huge; } - { - /* - * Even if we end up with an unmap, the pgtable will not - * be freed, because we do an rcu free and here we are - * irq disabled - */ - pdshift = PUD_SHIFT; - pudp = pud_offset(&pgd, ea); - pud = READ_ONCE(*pudp); - if (pud_none(pud)) - return NULL; + /* + * Even if we end up with an unmap, the pgtable will not + * be freed, because we do an rcu free and here we are + * irq disabled + */ + pdshift = PUD_SHIFT; + pudp = pud_offset(&pgd, ea); + pud = READ_ONCE(*pudp); - if (pud_huge(pud)) { - ret_pte = (pte_t *) pudp; - goto out; - } - if (is_hugepd(__hugepd(pud_val(pud)))) { - hpdp = (hugepd_t *)&pud; - goto out_huge; - } - pdshift = PMD_SHIFT; - pmdp = pmd_offset(&pud, ea); - pmd = READ_ONCE(*pmdp); - /* - * A hugepage collapse is captured by pmd_none, because - * it mark the pmd none and do a hpte invalidate. - */ - if (pmd_none(pmd)) - return NULL; - - if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { - if (is_thp) - *is_thp = true; - ret_pte = (pte_t *)pmdp; - goto out; - } - /* - * pmd_large check below will handle the swap pmd pte - * we need to do both the check because they are config - * dependent. - */ - if (pmd_huge(pmd) || pmd_large(pmd)) { - ret_pte = (pte_t *)pmdp; - goto out; - } - if (is_hugepd(__hugepd(pmd_val(pmd)))) { - hpdp = (hugepd_t *)&pmd; - goto out_huge; - } + if (pud_none(pud)) + return NULL; - return pte_offset_kernel(&pmd, ea); + if (pud_huge(pud)) { + ret_pte = (pte_t *)pudp; + goto out; } + if (is_hugepd(__hugepd(pud_val(pud)))) { + hpdp = (hugepd_t *)&pud; + goto out_huge; + } + pdshift = PMD_SHIFT; + pmdp = pmd_offset(&pud, ea); + pmd = READ_ONCE(*pmdp); + /* + * A hugepage collapse is captured by pmd_none, because + * it mark the pmd none and do a hpte invalidate. + */ + if (pmd_none(pmd)) + return NULL; + + if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) { + if (is_thp) + *is_thp = true; + ret_pte = (pte_t *)pmdp; + goto out; + } + /* + * pmd_large check below will handle the swap pmd pte + * we need to do both the check because they are config + * dependent. 
+ */ + if (pmd_huge(pmd) || pmd_large(pmd)) { + ret_pte = (pte_t *)pmdp; + goto out; + } + if (is_hugepd(__hugepd(pmd_val(pmd)))) { + hpdp = (hugepd_t *)&pmd; + goto out_huge; + } + + return pte_offset_kernel(&pmd, ea); + out_huge: if (!hpdp) return NULL; From 447def3b06adab60b999417b316bd2352d7e643e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:57:59 +0000 Subject: [PATCH 107/205] powerpc/mm: drop __bad_pte() This has never been called (since Kernel has been in git at least), drop it. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgalloc.h | 2 -- arch/powerpc/include/asm/nohash/32/pgalloc.h | 2 -- 2 files changed, 4 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h index 3633502e102ce..645af86cd072b 100644 --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -22,8 +22,6 @@ */ #define MAX_PGTABLE_INDEX_SIZE 0xf -extern void __bad_pte(pmd_t *pmd); - extern struct kmem_cache *pgtable_cache[]; #define PGT_CACHE(shift) pgtable_cache[shift] diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index bd186e85b4f70..ea265a578eb06 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -22,8 +22,6 @@ */ #define MAX_PGTABLE_INDEX_SIZE 0xf -extern void __bad_pte(pmd_t *pmd); - extern struct kmem_cache *pgtable_cache[]; #define PGT_CACHE(shift) pgtable_cache[shift] From 737b434d3d55c0b3c23df4eab1ea5b33f8850f30 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:01 +0000 Subject: [PATCH 108/205] powerpc/mm: convert Book3E 64 to pte_fragment Book3E 64 is the only subarch not using pte_fragment. In order to allow refactorisation, this patch converts it to pte_fragment. 
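For readers unfamiliar with pte_fragment, here is a hedged userspace sketch of the core idea (the sizes are illustrative assumptions, and the real kernel additionally refcounts each backing page under a lock so fragments can be freed independently): one page is carved into several PTE-table-sized fragments handed out in turn, instead of dedicating a full page to every PTE table.

#include <stdlib.h>

#define PAGE_SZ		65536			/* assumed 64K base page */
#define FRAG_SZ		4096			/* assumed PTE_FRAG_SIZE */
#define FRAG_NR		(PAGE_SZ / FRAG_SZ)

static char *frag_page;				/* current partially-used page */
static int frag_idx = FRAG_NR;			/* next unused fragment slot */

/* Hand out the next fragment, grabbing a fresh page when exhausted.
 * The real implementation refcounts the page so it is only returned
 * to the allocator once every fragment carved from it has been freed. */
void *pte_frag_alloc(void)
{
	if (frag_idx == FRAG_NR) {
		frag_page = aligned_alloc(PAGE_SZ, PAGE_SZ);
		if (!frag_page)
			return NULL;
		frag_idx = 0;
	}
	return frag_page + FRAG_SZ * frag_idx++;
}

int main(void)
{
	void *a = pte_frag_alloc();
	void *b = pte_frag_alloc();

	return (a && b && a != b) ? 0 : 1;	/* two distinct fragments */
}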
Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mmu_context.h | 6 ---- arch/powerpc/include/asm/nohash/64/mmu.h | 4 ++- arch/powerpc/include/asm/nohash/64/pgalloc.h | 33 +++++++------------- arch/powerpc/mm/Makefile | 2 +- arch/powerpc/mm/mmu_context.c | 2 +- 5 files changed, 17 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 6ee8195a2ffb4..66a3805dc9352 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -228,13 +228,7 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, #endif } -#ifdef CONFIG_PPC_BOOK3E_64 -static inline void arch_exit_mmap(struct mm_struct *mm) -{ -} -#else extern void arch_exit_mmap(struct mm_struct *mm); -#endif static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/arch/powerpc/include/asm/nohash/64/mmu.h b/arch/powerpc/include/asm/nohash/64/mmu.h index 81cf30c370e59..26e05ce8f5aab 100644 --- a/arch/powerpc/include/asm/nohash/64/mmu.h +++ b/arch/powerpc/include/asm/nohash/64/mmu.h @@ -4,11 +4,13 @@ #define MAX_PHYSMEM_BITS 44 +#include + /* Freescale Book-E software loaded TLB or Book-3e (ISA 2.06+) MMU */ #include #ifndef __ASSEMBLY__ -typedef struct page *pgtable_t; +typedef pte_t *pgtable_t; #endif #endif /* _ASM_POWERPC_NOHASH_64_MMU_H_ */ diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index ded453f9b5a8a..7fb87235f8450 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -76,10 +76,10 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_page) { - pmd_set(pmd, (unsigned long)page_address(pte_page)); + pmd_set(pmd, (unsigned long)pte_page); } -#define pmd_pgtable(pmd) pmd_page(pmd) +#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { @@ -92,44 +92,35 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd); } +pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel); static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + return (pte_t *)pte_fragment_alloc(mm, 1); } static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { - struct page *page; - pte_t *pte; - - pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT); - if (!pte) - return NULL; - page = virt_to_page(pte); - if (!pgtable_page_ctor(page)) { - __free_page(page); - return NULL; - } - return page; + return (pgtable_t)pte_fragment_alloc(mm, 0); } +void pte_frag_destroy(void *pte_frag); +void pte_fragment_free(unsigned long *table, int kernel); + static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - free_page((unsigned long)pte); + pte_fragment_free((unsigned long *)pte, 1); } static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) { - pgtable_page_dtor(ptepage); - __free_page(ptepage); + pte_fragment_free((unsigned long *)ptepage, 0); } static inline void pgtable_free(void *table, int shift) { if (!shift) { - pgtable_page_dtor(virt_to_page(table)); - free_page((unsigned long)table); + pte_fragment_free((unsigned long *)table, 0); } else { BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); 
kmem_cache_free(PGT_CACHE(shift), table); @@ -166,7 +157,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, unsigned long address) { tlb_flush_pgtable(tlb, address); - pgtable_free_tlb(tlb, page_address(table), 0); + pgtable_free_tlb(tlb, table, 0); } #define __pmd_free_tlb(tlb, pmd, addr) \ diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 3daea8da0c7fd..c7d5f37f7c52c 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -7,12 +7,12 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ + pgtable-frag.o \ init-common.o mmu_context.o drmem.o obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/ obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-frag.o -obj-$(CONFIG_PPC32) += pgtable-frag.o obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c index bb52320b7369e..6b049d82b98ac 100644 --- a/arch/powerpc/mm/mmu_context.c +++ b/arch/powerpc/mm/mmu_context.c @@ -98,7 +98,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, switch_mmu_context(prev, next, tsk); } -#ifdef CONFIG_PPC32 +#ifndef CONFIG_PPC_BOOK3S_64 void arch_exit_mmap(struct mm_struct *mm) { void *frag = pte_frag_get(&mm->context); From 696dffa24bd0e17c8bccb18467555c17cc15e62c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:02 +0000 Subject: [PATCH 109/205] powerpc/mm: move pgtable_t in asm/mmu.h pgtable_t is now identical for all subarches, move it to the top level asm/mmu.h Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/mmu-hash.h | 4 ---- arch/powerpc/include/asm/book3s/64/mmu.h | 8 -------- arch/powerpc/include/asm/mmu.h | 3 +++ arch/powerpc/include/asm/nohash/32/mmu.h | 6 ------ arch/powerpc/include/asm/nohash/64/mmu.h | 6 ------ 5 files changed, 3 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index f9eae105a9f4c..2e277ca0170fb 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -10,8 +10,6 @@ * BATs */ -#include - /* Block size masks */ #define BL_128K 0x000 #define BL_256K 0x001 @@ -49,8 +47,6 @@ struct ppc_bat { u32 batu; u32 batl; }; - -typedef pte_t *pgtable_t; #endif /* !__ASSEMBLY__ */ /* diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 51b2d60efc1ba..74d24201fc4f3 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -25,14 +25,6 @@ struct mmu_psize_def { }; }; extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; - -/* - * For BOOK3s 64 with 4k and 64K linux page size - * we want to use pointers, because the page table - * actually store pfn - */ -typedef pte_t *pgtable_t; - #endif /* __ASSEMBLY__ */ /* diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index d86c5641bd971..ba94ce8c22d7b 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -129,6 +129,9 @@ #ifndef __ASSEMBLY__ #include #include +#include + +typedef pte_t *pgtable_t; #ifdef CONFIG_PPC_FSL_BOOK3E #include diff --git a/arch/powerpc/include/asm/nohash/32/mmu.h 
b/arch/powerpc/include/asm/nohash/32/mmu.h index 7d94a36d57d27..af0e8b54876ab 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu.h +++ b/arch/powerpc/include/asm/nohash/32/mmu.h @@ -2,8 +2,6 @@ #ifndef _ASM_POWERPC_NOHASH_32_MMU_H_ #define _ASM_POWERPC_NOHASH_32_MMU_H_ -#include - #if defined(CONFIG_40x) /* 40x-style software loaded TLB */ #include @@ -18,8 +16,4 @@ #include #endif -#ifndef __ASSEMBLY__ -typedef pte_t *pgtable_t; -#endif - #endif /* _ASM_POWERPC_NOHASH_32_MMU_H_ */ diff --git a/arch/powerpc/include/asm/nohash/64/mmu.h b/arch/powerpc/include/asm/nohash/64/mmu.h index 26e05ce8f5aab..e490ecdac012a 100644 --- a/arch/powerpc/include/asm/nohash/64/mmu.h +++ b/arch/powerpc/include/asm/nohash/64/mmu.h @@ -4,13 +4,7 @@ #define MAX_PHYSMEM_BITS 44 -#include - /* Freescale Book-E software loaded TLB or Book-3e (ISA 2.06+) MMU */ #include -#ifndef __ASSEMBLY__ -typedef pte_t *pgtable_t; -#endif - #endif /* _ASM_POWERPC_NOHASH_64_MMU_H_ */ From 7a792d5da27f8407c5fe1b3c976106229e0d8bbd Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:03 +0000 Subject: [PATCH 110/205] powerpc/mm: get rid of nohash/32/mmu.h and nohash/64/mmu.h Those files have no real added value, especially the 64-bit one, which only includes the common book3e mmu.h that is also included from the 32-bit side. So let's do the final inclusion directly from nohash/mmu.h. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/mmu.h | 19 ------------------- arch/powerpc/include/asm/nohash/64/mmu.h | 10 ---------- arch/powerpc/include/asm/nohash/mmu-book3e.h | 2 ++ arch/powerpc/include/asm/nohash/mmu.h | 16 ++++++++++++---- 4 files changed, 14 insertions(+), 33 deletions(-) delete mode 100644 arch/powerpc/include/asm/nohash/32/mmu.h delete mode 100644 arch/powerpc/include/asm/nohash/64/mmu.h diff --git a/arch/powerpc/include/asm/nohash/32/mmu.h b/arch/powerpc/include/asm/nohash/32/mmu.h deleted file mode 100644 index af0e8b54876ab..0000000000000 --- a/arch/powerpc/include/asm/nohash/32/mmu.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_POWERPC_NOHASH_32_MMU_H_ -#define _ASM_POWERPC_NOHASH_32_MMU_H_ - -#if defined(CONFIG_40x) -/* 40x-style software loaded TLB */ -#include -#elif defined(CONFIG_44x) -/* 44x-style software loaded TLB */ -#include -#elif defined(CONFIG_PPC_BOOK3E_MMU) -/* Freescale Book-E software loaded TLB or Book-3e (ISA 2.06+) MMU */ -#include -#elif defined (CONFIG_PPC_8xx) -/* Motorola/Freescale 8xx software loaded TLB */ -#include -#endif - -#endif /* _ASM_POWERPC_NOHASH_32_MMU_H_ */ diff --git a/arch/powerpc/include/asm/nohash/64/mmu.h b/arch/powerpc/include/asm/nohash/64/mmu.h deleted file mode 100644 index e490ecdac012a..0000000000000 --- a/arch/powerpc/include/asm/nohash/64/mmu.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_POWERPC_NOHASH_64_MMU_H_ -#define _ASM_POWERPC_NOHASH_64_MMU_H_ - -#define MAX_PHYSMEM_BITS 44 - -/* Freescale Book-E software loaded TLB or Book-3e (ISA 2.06+) MMU */ -#include - -#endif /* _ASM_POWERPC_NOHASH_64_MMU_H_ */ diff --git a/arch/powerpc/include/asm/nohash/mmu-book3e.h b/arch/powerpc/include/asm/nohash/mmu-book3e.h index e20072972e359..4c9777d256fbe 100644 --- a/arch/powerpc/include/asm/nohash/mmu-book3e.h +++ b/arch/powerpc/include/asm/nohash/mmu-book3e.h @@ -306,6 +306,8 @@ extern int book3e_htw_mode; #define mmu_cleanup_all NULL +#define MAX_PHYSMEM_BITS 44 + #endif #endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/nohash/mmu.h b/arch/powerpc/include/asm/nohash/mmu.h
index a037cb1efb57e..edc793e5f08f9 100644
--- a/arch/powerpc/include/asm/nohash/mmu.h
+++ b/arch/powerpc/include/asm/nohash/mmu.h
@@ -2,10 +2,18 @@
 #ifndef _ASM_POWERPC_NOHASH_MMU_H_
 #define _ASM_POWERPC_NOHASH_MMU_H_
 
-#ifdef CONFIG_PPC64
-#include
-#else
-#include
+#if defined(CONFIG_40x)
+/* 40x-style software loaded TLB */
+#include
+#elif defined(CONFIG_44x)
+/* 44x-style software loaded TLB */
+#include
+#elif defined(CONFIG_PPC_BOOK3E_MMU)
+/* Freescale Book-E software loaded TLB or Book-3e (ISA 2.06+) MMU */
+#include
+#elif defined (CONFIG_PPC_8xx)
+/* Motorola/Freescale 8xx software loaded TLB */
+#include
 #endif
 
 #endif /* _ASM_POWERPC_NOHASH_MMU_H_ */

From e7a7be5679a5c5f1817226d8253d971520038b67 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 15:58:04 +0000
Subject: [PATCH 111/205] powerpc/Kconfig: select PPC_MM_SLICES from subarch type

Let's select PPC_MM_SLICES from the subarch config item instead of
doing it via a default declaration in the PPC_MM_SLICES item itself.

Reviewed-by: Aneesh Kumar K.V
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/platforms/Kconfig.cputype | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index cd28b045c0f3c..fa6b03205ae18 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -38,6 +38,7 @@ config PPC_8xx
 	select SYS_SUPPORTS_HUGETLBFS
 	select PPC_HAVE_KUEP
 	select PPC_HAVE_KUAP
+	select PPC_MM_SLICES if HUGETLB_PAGE
 
 config 40x
 	bool "AMCC 40x"
@@ -79,6 +80,7 @@ config PPC_BOOK3S_64
 	select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
 	select ARCH_SUPPORTS_NUMA_BALANCING
 	select IRQ_WORK
+	select PPC_MM_SLICES
 
 config PPC_BOOK3E_64
 	bool "Embedded processors"
@@ -401,8 +403,6 @@ config PPC_BOOK3E_MMU
 
 config PPC_MM_SLICES
 	bool
-	default y if PPC_BOOK3S_64
-	default y if PPC_8xx && HUGETLB_PAGE
 
 config PPC_HAVE_PMU_SUPPORT
 	bool

From 627f06c6f51e6af6ca3f7d1e82154b59583abc15 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 15:58:05 +0000
Subject: [PATCH 112/205] powerpc/book3e: move early_alloc_pgtable() to init section

early_alloc_pgtable() is only used during init.
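Marking it __init puts it in .init.text, which the kernel discards once
boot completes; its caller map_kernel_page() is then tagged __ref so
that modpost knows the reference to init code is intentional, being
guarded at run time. A minimal sketch of the idiom follows -- the guard
and annotations are the real kernel pattern, but the allocator calls,
cache index and sizes are illustrative, not the patch itself:

	static void __init *early_alloc_pgtable(unsigned long size)
	{
		return memblock_alloc(size, size);	/* boot-time allocator */
	}

	/* __ref: may reference __init code, the call is guarded below */
	int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
	{
		void *table;

		if (slab_is_available())	/* normal path after boot */
			table = kmem_cache_alloc(PGT_CACHE(PTE_INDEX_SIZE), GFP_KERNEL);
		else				/* early boot: no slab yet */
			table = early_alloc_pgtable(PAGE_SIZE);

		return table ? 0 : -ENOMEM;
	}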
Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/nohash/book3e_pgtable.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c index f296c2e88b09c..75e9e2c35fe2f 100644 --- a/arch/powerpc/mm/nohash/book3e_pgtable.c +++ b/arch/powerpc/mm/nohash/book3e_pgtable.c @@ -55,7 +55,7 @@ void vmemmap_remove_mapping(unsigned long start, #endif #endif /* CONFIG_SPARSEMEM_VMEMMAP */ -static __ref void *early_alloc_pgtable(unsigned long size) +static void __init *early_alloc_pgtable(unsigned long size) { void *ptr; @@ -74,7 +74,7 @@ static __ref void *early_alloc_pgtable(unsigned long size) * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) +int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) { pgd_t *pgdp; pud_t *pudp; From 4a6d8cf90017019f3b2829b38157cd1a74c64856 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:06 +0000 Subject: [PATCH 113/205] powerpc/mm: don't use pte_alloc_kernel() until slab is available on PPC32 In the same way as PPC64, implement early allocation functions and avoid calling pte_alloc_kernel() before slab is available. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable_32.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index c9cdbb84d31f8..e54b612cbc984 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -43,11 +43,8 @@ EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ extern char etext[], _stext[], _sinittext[], _einittext[]; -__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm) +pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - if (!slab_is_available()) - return memblock_alloc(PTE_FRAG_SIZE, PTE_FRAG_SIZE); - return (pte_t *)pte_fragment_alloc(mm, 1); } @@ -205,7 +202,29 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); -int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot) +static void __init *early_alloc_pgtable(unsigned long size) +{ + void *ptr = memblock_alloc(size, size); + + if (!ptr) + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, size, size); + + return ptr; +} + +static pte_t __init *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va) +{ + if (pmd_none(*pmdp)) { + pte_t *ptep = early_alloc_pgtable(PTE_FRAG_SIZE); + + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + return pte_offset_kernel(pmdp, va); +} + + +int __ref map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot) { pmd_t *pd; pte_t *pg; @@ -214,7 +233,10 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot) /* Use upper 10 bits of VA to index the first level map */ pd = pmd_offset(pud_offset(pgd_offset_k(va), va), va); /* Use middle 10 bits of VA to index the second-level map */ - pg = pte_alloc_kernel(pd, va); + if (likely(slab_is_available())) + pg = pte_alloc_kernel(pd, va); + else + pg = early_pte_alloc_kernel(pd, va); if (pg != 0) { err = 0; /* The PTE should never be already set nor present in the From b0124ff57e9405725b4dfeffbdfa929bb973ad2c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:07 +0000 Subject: [PATCH 114/205] 
powerpc/mm: inline pte_alloc_one_kernel() and pte_alloc_one() on PPC32 pte_alloc_one_kernel() and pte_alloc_one() are simple calls to pte_fragment_alloc(), so they are good candidates for inlining as already done on PPC64. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgalloc.h | 15 ++++++++++++--- arch/powerpc/include/asm/nohash/32/pgalloc.h | 15 ++++++++++++--- arch/powerpc/mm/pgtable_32.c | 10 ---------- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h index 645af86cd072b..0ed856068bb84 100644 --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -59,10 +59,19 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, #define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) -extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); -extern pgtable_t pte_alloc_one(struct mm_struct *mm); -void pte_frag_destroy(void *pte_frag); pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel); + +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) +{ + return (pte_t *)pte_fragment_alloc(mm, 1); +} + +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) +{ + return (pgtable_t)pte_fragment_alloc(mm, 0); +} + +void pte_frag_destroy(void *pte_frag); void pte_fragment_free(unsigned long *table, int kernel); static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index ea265a578eb06..1d41508f06763 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -77,10 +77,19 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, #define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) #endif -extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); -extern pgtable_t pte_alloc_one(struct mm_struct *mm); -void pte_frag_destroy(void *pte_frag); pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel); + +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) +{ + return (pte_t *)pte_fragment_alloc(mm, 1); +} + +static inline pgtable_t pte_alloc_one(struct mm_struct *mm) +{ + return (pgtable_t)pte_fragment_alloc(mm, 0); +} + +void pte_frag_destroy(void *pte_frag); void pte_fragment_free(unsigned long *table, int kernel); static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index e54b612cbc984..2e67f9a1430b3 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -43,16 +43,6 @@ EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ extern char etext[], _stext[], _sinittext[], _einittext[]; -pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - return (pte_t *)pte_fragment_alloc(mm, 1); -} - -pgtable_t pte_alloc_one(struct mm_struct *mm) -{ - return (pgtable_t)pte_fragment_alloc(mm, 0); -} - void __iomem * ioremap(phys_addr_t addr, unsigned long size) { From dc096864ba784c2d3d10480d71f14a53f40f997c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:08 +0000 Subject: [PATCH 115/205] powerpc/mm: refactor pte_alloc_one() and pte_free() families definition. Functions pte_alloc_one(), pte_alloc_one_kernel(), pte_free(), pte_free_kernel() are identical for the four subarches. 
This patch moves their definitions to a common place.

Reviewed-by: Aneesh Kumar K.V
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/book3s/32/pgalloc.h | 25 --------------------
 arch/powerpc/include/asm/book3s/64/pgalloc.h | 22 -----------------
 arch/powerpc/include/asm/nohash/32/pgalloc.h | 25 --------------------
 arch/powerpc/include/asm/nohash/64/pgalloc.h | 25 --------------------
 arch/powerpc/include/asm/pgalloc.h           | 25 ++++++++++++++++++++
 5 files changed, 25 insertions(+), 97 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h
index 0ed856068bb84..46422309d6e0e 100644
--- a/arch/powerpc/include/asm/book3s/32/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
@@ -59,31 +59,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
 
 #define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd))
 
-pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel);
-
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
-{
-	return (pte_t *)pte_fragment_alloc(mm, 1);
-}
-
-static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
-{
-	return (pgtable_t)pte_fragment_alloc(mm, 0);
-}
-
-void pte_frag_destroy(void *pte_frag);
-void pte_fragment_free(unsigned long *table, int kernel);
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
-	pte_fragment_free((unsigned long *)pte, 1);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
-{
-	pte_fragment_free((unsigned long *)ptepage, 0);
-}
-
 static inline void pgtable_free(void *table, unsigned index_size)
 {
 	if (!index_size) {
diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
index 138bc2ecc0c4b..cfd48d8cc055c 100644
--- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
@@ -39,9 +39,7 @@ extern struct vmemmap_backing *vmemmap_list;
 extern struct kmem_cache *pgtable_cache[];
 #define PGT_CACHE(shift) pgtable_cache[shift]
 
-extern pte_t *pte_fragment_alloc(struct mm_struct *, int);
 extern pmd_t *pmd_fragment_alloc(struct mm_struct *, unsigned long);
-extern void pte_fragment_free(unsigned long *, int);
 extern void pmd_fragment_free(unsigned long *);
 extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
 #ifdef CONFIG_SMP
@@ -190,26 +188,6 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd)
 	return (pgtable_t)pmd_page_vaddr(pmd);
 }
 
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
-{
-	return (pte_t *)pte_fragment_alloc(mm, 1);
-}
-
-static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
-{
-	return (pgtable_t)pte_fragment_alloc(mm, 0);
-}
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
-	pte_fragment_free((unsigned long *)pte, 1);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
-{
-	pte_fragment_free((unsigned long *)ptepage, 0);
-}
-
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
 				  unsigned long address)
 {
diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h
index 1d41508f06763..e96ef2fde2ca0 100644
--- a/arch/powerpc/include/asm/nohash/32/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h
@@ -77,31 +77,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
 #define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd))
 #endif
 
-pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel);
-
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
-{
-	return (pte_t *)pte_fragment_alloc(mm, 1);
-}
-
-static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
-{
-	return (pgtable_t)pte_fragment_alloc(mm, 0);
-}
-
-void pte_frag_destroy(void *pte_frag);
-void pte_fragment_free(unsigned long *table, int kernel);
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
-	pte_fragment_free((unsigned long *)pte, 1);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
-{
-	pte_fragment_free((unsigned long *)ptepage, 0);
-}
-
 static inline void pgtable_free(void *table, unsigned index_size)
 {
 	if (!index_size) {
diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h
index 7fb87235f8450..98de4f3b03061 100644
--- a/arch/powerpc/include/asm/nohash/64/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h
@@ -92,31 +92,6 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 	kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd);
 }
 
-pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel);
-
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
-{
-	return (pte_t *)pte_fragment_alloc(mm, 1);
-}
-
-static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
-{
-	return (pgtable_t)pte_fragment_alloc(mm, 0);
-}
-
-void pte_frag_destroy(void *pte_frag);
-void pte_fragment_free(unsigned long *table, int kernel);
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
-	pte_fragment_free((unsigned long *)pte, 1);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
-{
-	pte_fragment_free((unsigned long *)ptepage, 0);
-}
-
 static inline void pgtable_free(void *table, int shift)
 {
 	if (!shift) {
diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h
index e11f03007b575..c2c6fd438840f 100644
--- a/arch/powerpc/include/asm/pgalloc.h
+++ b/arch/powerpc/include/asm/pgalloc.h
@@ -20,6 +20,31 @@ static inline gfp_t pgtable_gfp_flags(struct mm_struct *mm, gfp_t gfp)
 
 #define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO)
 
+pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel);
+
+static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
+{
+	return (pte_t *)pte_fragment_alloc(mm, 1);
+}
+
+static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
+{
+	return (pgtable_t)pte_fragment_alloc(mm, 0);
+}
+
+void pte_frag_destroy(void *pte_frag);
+void pte_fragment_free(unsigned long *table, int kernel);
+
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+	pte_fragment_free((unsigned long *)pte, 1);
+}
+
+static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
+{
+	pte_fragment_free((unsigned long *)ptepage, 0);
+}
+
 #ifdef CONFIG_PPC_BOOK3S
 #include
 #else

From e80789a3c13f9fbc8f361a988868f9b68a8cf134 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 15:58:09 +0000
Subject: [PATCH 116/205] powerpc/mm: refactor definition of pgtable_cache[]

pgtable_cache[] is the same for the 4 subarches, so let's make it common.
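The comment block being consolidated here describes a trick that is
easy to miss in the diffs: because even the smallest page table is
aligned to more than MAX_PGTABLE_INDEX_SIZE bytes, the index size can
ride in the low bits of the table pointer while a free is deferred
through the mmu_gather batch. A sketch of the packing, using only names
visible in this series (illustrative; __tlb_remove_table() later in the
series does exactly this unpacking):

	unsigned long pgf = (unsigned long)table | shift;	/* shift <= 0xf */

	/* later, when the deferred free runs: */
	void *t = (void *)(pgf & ~MAX_PGTABLE_INDEX_SIZE);	/* recover pointer */
	unsigned int index_size = pgf & MAX_PGTABLE_INDEX_SIZE;	/* recover size */
	pgtable_free(t, index_size);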
Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgalloc.h | 21 ------------------- arch/powerpc/include/asm/book3s/64/pgalloc.h | 22 -------------------- arch/powerpc/include/asm/nohash/32/pgalloc.h | 21 ------------------- arch/powerpc/include/asm/nohash/64/pgalloc.h | 22 -------------------- arch/powerpc/include/asm/pgalloc.h | 21 +++++++++++++++++++ 5 files changed, 21 insertions(+), 86 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h index 46422309d6e0e..1b9b5c2282300 100644 --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -5,26 +5,6 @@ #include #include -/* - * Functions that deal with pagetables that could be at any level of - * the table need to be passed an "index_size" so they know how to - * handle allocation. For PTE pages (which are linked to a struct - * page for now, and drawn from the main get_free_pages() pool), the - * allocation size will be (2^index_size * sizeof(pointer)) and - * allocations are drawn from the kmem_cache in PGT_CACHE(index_size). - * - * The maximum index size needs to be big enough to allow any - * pagetable sizes we need, but small enough to fit in the low bits of - * any page table pointer. In other words all pagetables, even tiny - * ones, must be aligned to allow at least enough low 0 bits to - * contain this value. This value is also used as a mask, so it must - * be one less than a power of two. - */ -#define MAX_PGTABLE_INDEX_SIZE 0xf - -extern struct kmem_cache *pgtable_cache[]; -#define PGT_CACHE(shift) pgtable_cache[shift] - static inline pgd_t *pgd_alloc(struct mm_struct *mm) { return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), @@ -69,7 +49,6 @@ static inline void pgtable_free(void *table, unsigned index_size) } } -#define check_pgt_cache() do { } while (0) #define get_hugepd_cache_index(x) (x) #ifdef CONFIG_SMP diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index cfd48d8cc055c..df2dce6afe14f 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -19,26 +19,6 @@ struct vmemmap_backing { }; extern struct vmemmap_backing *vmemmap_list; -/* - * Functions that deal with pagetables that could be at any level of - * the table need to be passed an "index_size" so they know how to - * handle allocation. For PTE pages (which are linked to a struct - * page for now, and drawn from the main get_free_pages() pool), the - * allocation size will be (2^index_size * sizeof(pointer)) and - * allocations are drawn from the kmem_cache in PGT_CACHE(index_size). - * - * The maximum index size needs to be big enough to allow any - * pagetable sizes we need, but small enough to fit in the low bits of - * any page table pointer. In other words all pagetables, even tiny - * ones, must be aligned to allow at least enough low 0 bits to - * contain this value. This value is also used as a mask, so it must - * be one less than a power of two. 
- */ -#define MAX_PGTABLE_INDEX_SIZE 0xf - -extern struct kmem_cache *pgtable_cache[]; -#define PGT_CACHE(shift) pgtable_cache[shift] - extern pmd_t *pmd_fragment_alloc(struct mm_struct *, unsigned long); extern void pmd_fragment_free(unsigned long *); extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); @@ -199,8 +179,6 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, pgtable_free_tlb(tlb, table, PTE_INDEX); } -#define check_pgt_cache() do { } while (0) - extern atomic_long_t direct_pages_count[MMU_PAGE_COUNT]; static inline void update_page_count(int psize, long count) { diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index e96ef2fde2ca0..4615801aa953d 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -5,26 +5,6 @@ #include #include -/* - * Functions that deal with pagetables that could be at any level of - * the table need to be passed an "index_size" so they know how to - * handle allocation. For PTE pages (which are linked to a struct - * page for now, and drawn from the main get_free_pages() pool), the - * allocation size will be (2^index_size * sizeof(pointer)) and - * allocations are drawn from the kmem_cache in PGT_CACHE(index_size). - * - * The maximum index size needs to be big enough to allow any - * pagetable sizes we need, but small enough to fit in the low bits of - * any page table pointer. In other words all pagetables, even tiny - * ones, must be aligned to allow at least enough low 0 bits to - * contain this value. This value is also used as a mask, so it must - * be one less than a power of two. - */ -#define MAX_PGTABLE_INDEX_SIZE 0xf - -extern struct kmem_cache *pgtable_cache[]; -#define PGT_CACHE(shift) pgtable_cache[shift] - static inline pgd_t *pgd_alloc(struct mm_struct *mm) { return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), @@ -87,7 +67,6 @@ static inline void pgtable_free(void *table, unsigned index_size) } } -#define check_pgt_cache() do { } while (0) #define get_hugepd_cache_index(x) (x) #ifdef CONFIG_SMP diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index 98de4f3b03061..ffc86d42816da 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -18,26 +18,6 @@ struct vmemmap_backing { }; extern struct vmemmap_backing *vmemmap_list; -/* - * Functions that deal with pagetables that could be at any level of - * the table need to be passed an "index_size" so they know how to - * handle allocation. For PTE pages (which are linked to a struct - * page for now, and drawn from the main get_free_pages() pool), the - * allocation size will be (2^index_size * sizeof(pointer)) and - * allocations are drawn from the kmem_cache in PGT_CACHE(index_size). - * - * The maximum index size needs to be big enough to allow any - * pagetable sizes we need, but small enough to fit in the low bits of - * any page table pointer. In other words all pagetables, even tiny - * ones, must be aligned to allow at least enough low 0 bits to - * contain this value. This value is also used as a mask, so it must - * be one less than a power of two. 
- */ -#define MAX_PGTABLE_INDEX_SIZE 0xf - -extern struct kmem_cache *pgtable_cache[]; -#define PGT_CACHE(shift) pgtable_cache[shift] - static inline pgd_t *pgd_alloc(struct mm_struct *mm) { return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), @@ -140,6 +120,4 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, #define __pud_free_tlb(tlb, pud, addr) \ pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE) -#define check_pgt_cache() do { } while (0) - #endif /* _ASM_POWERPC_PGALLOC_64_H */ diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index c2c6fd438840f..5761bee0f0049 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -45,6 +45,27 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) pte_fragment_free((unsigned long *)ptepage, 0); } +/* + * Functions that deal with pagetables that could be at any level of + * the table need to be passed an "index_size" so they know how to + * handle allocation. For PTE pages, the allocation size will be + * (2^index_size * sizeof(pointer)) and allocations are drawn from + * the kmem_cache in PGT_CACHE(index_size). + * + * The maximum index size needs to be big enough to allow any + * pagetable sizes we need, but small enough to fit in the low bits of + * any page table pointer. In other words all pagetables, even tiny + * ones, must be aligned to allow at least enough low 0 bits to + * contain this value. This value is also used as a mask, so it must + * be one less than a power of two. + */ +#define MAX_PGTABLE_INDEX_SIZE 0xf + +extern struct kmem_cache *pgtable_cache[]; +#define PGT_CACHE(shift) pgtable_cache[shift] + +static inline void check_pgt_cache(void) { } + #ifdef CONFIG_PPC_BOOK3S #include #else From bf8156c5aef12621e20afa470ae41f92cdca377b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:10 +0000 Subject: [PATCH 117/205] powerpc/mm: Only keep one version of pmd_populate() functions on nohash/32 Use IS_ENABLED(CONFIG_BOOKE) to make single versions of pmd_populate() and pmd_populate_kernel() Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/pgalloc.h | 28 ++++++-------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index 4615801aa953d..7ee8e27070f47 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -25,37 +25,25 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) #define __pmd_free_tlb(tlb,x,a) do { } while (0) /* #define pgd_populate(mm, pmd, pte) BUG() */ -#ifndef CONFIG_BOOKE - static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *pte) { - *pmdp = __pmd(__pa(pte) | _PMD_PRESENT); + if (IS_ENABLED(CONFIG_BOOKE)) + *pmdp = __pmd((unsigned long)pte | _PMD_PRESENT); + else + *pmdp = __pmd(__pa(pte) | _PMD_PRESENT); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pte_page) { - *pmdp = __pmd(__pa(pte_page) | _PMD_USER | _PMD_PRESENT); + if (IS_ENABLED(CONFIG_BOOKE)) + *pmdp = __pmd((unsigned long)pte_page | _PMD_PRESENT); + else + *pmdp = __pmd(__pa(pte_page) | _PMD_USER | _PMD_PRESENT); } #define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) -#else - -static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, - pte_t *pte) -{ - *pmdp = __pmd((unsigned long)pte | 
_PMD_PRESENT); -} - -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pte_page) -{ - *pmdp = __pmd((unsigned long)pte_page | _PMD_PRESENT); -} - -#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) -#endif static inline void pgtable_free(void *table, unsigned index_size) { From 7cec90e9499c25c31b539f8a35d949c8e9043c14 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:11 +0000 Subject: [PATCH 118/205] powerpc/mm: refactor pgtable freeing functions on nohash pgtable_free() and others are identical on nohash/32 and 64, so move them into asm/nohash/pgalloc.h Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nohash/32/pgalloc.h | 43 ------------------- arch/powerpc/include/asm/nohash/64/pgalloc.h | 43 ------------------- arch/powerpc/include/asm/nohash/pgalloc.h | 44 ++++++++++++++++++++ 3 files changed, 44 insertions(+), 86 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index 7ee8e27070f47..6c0f5151dc1d6 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -45,47 +45,4 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, #define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) -static inline void pgtable_free(void *table, unsigned index_size) -{ - if (!index_size) { - pte_fragment_free((unsigned long *)table, 0); - } else { - BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE); - kmem_cache_free(PGT_CACHE(index_size), table); - } -} - -#define get_hugepd_cache_index(x) (x) - -#ifdef CONFIG_SMP -static inline void pgtable_free_tlb(struct mmu_gather *tlb, - void *table, int shift) -{ - unsigned long pgf = (unsigned long)table; - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); - pgf |= shift; - tlb_remove_table(tlb, (void *)pgf); -} - -static inline void __tlb_remove_table(void *_table) -{ - void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); - unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; - - pgtable_free(table, shift); -} -#else -static inline void pgtable_free_tlb(struct mmu_gather *tlb, - void *table, int shift) -{ - pgtable_free(table, shift); -} -#endif - -static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, - unsigned long address) -{ - tlb_flush_pgtable(tlb, address); - pgtable_free_tlb(tlb, table, 0); -} #endif /* _ASM_POWERPC_PGALLOC_32_H */ diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index ffc86d42816da..c636feced1ff0 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -72,49 +72,6 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd); } -static inline void pgtable_free(void *table, int shift) -{ - if (!shift) { - pte_fragment_free((unsigned long *)table, 0); - } else { - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); - kmem_cache_free(PGT_CACHE(shift), table); - } -} - -#define get_hugepd_cache_index(x) (x) -#ifdef CONFIG_SMP -static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) -{ - unsigned long pgf = (unsigned long)table; - - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); - pgf |= shift; - tlb_remove_table(tlb, (void *)pgf); -} - -static inline void __tlb_remove_table(void *_table) -{ - void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); - 
unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; - - pgtable_free(table, shift); -} - -#else -static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) -{ - pgtable_free(table, shift); -} -#endif - -static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, - unsigned long address) -{ - tlb_flush_pgtable(tlb, address); - pgtable_free_tlb(tlb, table, 0); -} - #define __pmd_free_tlb(tlb, pmd, addr) \ pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX) #define __pud_free_tlb(tlb, pud, addr) \ diff --git a/arch/powerpc/include/asm/nohash/pgalloc.h b/arch/powerpc/include/asm/nohash/pgalloc.h index 0634f29494389..4fccac6af3ad5 100644 --- a/arch/powerpc/include/asm/nohash/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/pgalloc.h @@ -21,4 +21,48 @@ static inline void tlb_flush_pgtable(struct mmu_gather *tlb, #else #include #endif + +static inline void pgtable_free(void *table, int shift) +{ + if (!shift) { + pte_fragment_free((unsigned long *)table, 0); + } else { + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); + kmem_cache_free(PGT_CACHE(shift), table); + } +} + +#define get_hugepd_cache_index(x) (x) + +#ifdef CONFIG_SMP +static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) +{ + unsigned long pgf = (unsigned long)table; + + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); + pgf |= shift; + tlb_remove_table(tlb, (void *)pgf); +} + +static inline void __tlb_remove_table(void *_table) +{ + void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); + unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; + + pgtable_free(table, shift); +} + +#else +static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) +{ + pgtable_free(table, shift); +} +#endif + +static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, + unsigned long address) +{ + tlb_flush_pgtable(tlb, address); + pgtable_free_tlb(tlb, table, 0); +} #endif /* _ASM_POWERPC_NOHASH_PGALLOC_H */ From 8a2cc87a24e8c0a823c2e4ec8702c90d743a69d4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:12 +0000 Subject: [PATCH 119/205] powerpc/mm: refactor pmd_pgtable() pmd_pgtable() is identical on the 4 subarches, refactor it. 
Reviewed-by: Aneesh Kumar K.V Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgalloc.h | 2 -- arch/powerpc/include/asm/book3s/64/pgalloc.h | 5 ----- arch/powerpc/include/asm/nohash/32/pgalloc.h | 2 -- arch/powerpc/include/asm/nohash/64/pgalloc.h | 2 -- arch/powerpc/include/asm/pgalloc.h | 5 +++++ 5 files changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h index 1b9b5c2282300..9983177026305 100644 --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -37,8 +37,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, *pmdp = __pmd(__pa(pte_page) | _PMD_PRESENT); } -#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) - static inline void pgtable_free(void *table, unsigned index_size) { if (!index_size) { diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index df2dce6afe14f..053a7940504ef 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -163,11 +163,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, *pmd = __pmd(__pgtable_ptr_val(pte_page) | PMD_VAL_BITS); } -static inline pgtable_t pmd_pgtable(pmd_t pmd) -{ - return (pgtable_t)pmd_page_vaddr(pmd); -} - static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, unsigned long address) { diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index 6c0f5151dc1d6..137761b01588e 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -43,6 +43,4 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, *pmdp = __pmd(__pa(pte_page) | _PMD_USER | _PMD_PRESENT); } -#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) - #endif /* _ASM_POWERPC_PGALLOC_32_H */ diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index c636feced1ff0..5a0ea63c77c7f 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -59,8 +59,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pmd_set(pmd, (unsigned long)pte_page); } -#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) - static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index 5761bee0f0049..2b2c60a1a66df 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -72,4 +72,9 @@ static inline void check_pgt_cache(void) { } #include #endif +static inline pgtable_t pmd_pgtable(pmd_t pmd) +{ + return (pgtable_t)pmd_page_vaddr(pmd); +} + #endif /* _ASM_POWERPC_PGALLOC_H */ From 069239169ab060da4236a59d35aec91084cc694d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 15:58:13 +0000 Subject: [PATCH 120/205] powerpc/mm: refactor pgd_alloc() and pgd_free() on nohash pgd_alloc() and pgd_free() are identical on nohash 32 and 64. 
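Worth noting while merging them: both copies already pass their GFP
flags through pgtable_gfp_flags(), which is what keeps user-process
page tables accounted to their memory cgroup. A sketch of that helper
as commonly defined (an assumption shown only to explain the call; the
exact body may differ):

	static inline gfp_t pgtable_gfp_flags(struct mm_struct *mm, gfp_t gfp)
	{
		if (likely(mm == &init_mm))
			return gfp;		/* kernel page tables */
		return gfp | __GFP_ACCOUNT;	/* user page tables: accounted */
	}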
Reviewed-by: Aneesh Kumar K.V
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/nohash/32/pgalloc.h | 11 -----------
 arch/powerpc/include/asm/nohash/64/pgalloc.h | 11 -----------
 arch/powerpc/include/asm/nohash/pgalloc.h    | 12 ++++++++++++
 3 files changed, 12 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h
index 137761b01588e..11eac371e7e06 100644
--- a/arch/powerpc/include/asm/nohash/32/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h
@@ -5,17 +5,6 @@
 #include
 #include
 
-static inline pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-	return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE),
-		pgtable_gfp_flags(mm, GFP_KERNEL));
-}
-
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-	kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
-}
-
 /*
  * We don't have any real pmd's, and this code never triggers because
  * the pgd will always be present..
diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h
index 5a0ea63c77c7f..62321cd12da93 100644
--- a/arch/powerpc/include/asm/nohash/64/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h
@@ -18,17 +18,6 @@ struct vmemmap_backing {
 };
 extern struct vmemmap_backing *vmemmap_list;
 
-static inline pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-	return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE),
-		pgtable_gfp_flags(mm, GFP_KERNEL));
-}
-
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-	kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
-}
-
 #define pgd_populate(MM, PGD, PUD)	pgd_set(PGD, (unsigned long)PUD)
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
diff --git a/arch/powerpc/include/asm/nohash/pgalloc.h b/arch/powerpc/include/asm/nohash/pgalloc.h
index 4fccac6af3ad5..332b13b4ecdb2 100644
--- a/arch/powerpc/include/asm/nohash/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/pgalloc.h
@@ -3,6 +3,7 @@
 #define _ASM_POWERPC_NOHASH_PGALLOC_H
 
 #include
+#include
 
 extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
 #ifdef CONFIG_PPC64
@@ -16,6 +17,17 @@ static inline void tlb_flush_pgtable(struct mmu_gather *tlb,
 }
 #endif /* !CONFIG_PPC_BOOK3E */
 
+static inline pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+	return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE),
+		pgtable_gfp_flags(mm, GFP_KERNEL));
+}
+
+static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+	kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
+}
+
 #ifdef CONFIG_PPC64
 #include
 #else

From d69ca6bab39e84a84781535b977c7e62c8f84d37 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 16:23:25 +0000
Subject: [PATCH 121/205] powerpc/32: Move early_init() into a separate file

In preparation for KASAN, move early_init() into a separate file in
order to allow deactivation of KASAN for that function.
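The split is what makes the exclusion possible: in kbuild,
instrumentation is switched off per object file, so once early_init()
lives in its own translation unit a single Makefile line can opt it
out. A sketch of the mechanism, assuming the standard KASAN_SANITIZE
knob from scripts/Makefile.lib (the exact line is not part of this
patch):

	# arch/powerpc/kernel/Makefile (illustrative)
	KASAN_SANITIZE_early_32.o := n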
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/Makefile   |  2 +-
 arch/powerpc/kernel/early_32.c | 36 ++++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/setup_32.c | 28 --------------------------
 3 files changed, 37 insertions(+), 29 deletions(-)
 create mode 100644 arch/powerpc/kernel/early_32.c

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index cddadccf551d9..45e47752b692f 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -93,7 +93,7 @@ extra-y += vmlinux.lds
 
 obj-$(CONFIG_RELOCATABLE)	+= reloc_$(BITS).o
 
-obj-$(CONFIG_PPC32)		+= entry_32.o setup_32.o
+obj-$(CONFIG_PPC32)		+= entry_32.o setup_32.o early_32.o
 obj-$(CONFIG_PPC64)		+= dma-iommu.o iommu.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_BOOTX_TEXT)	+= btext.o
diff --git a/arch/powerpc/kernel/early_32.c b/arch/powerpc/kernel/early_32.c
new file mode 100644
index 0000000000000..cf3cdd81dc475
--- /dev/null
+++ b/arch/powerpc/kernel/early_32.c
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Early init before relocation
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * We're called here very early in the boot.
+ *
+ * Note that the kernel may be running at an address which is different
+ * from the address that it was linked at, so we must use RELOC/PTRRELOC
+ * to access static data (including strings). -- paulus
+ */
+notrace unsigned long __init early_init(unsigned long dt_ptr)
+{
+	unsigned long offset = reloc_offset();
+
+	/* First zero the BSS -- use memset_io, some platforms don't have caches on yet */
+	memset_io((void __iomem *)PTRRELOC(&__bss_start), 0, __bss_stop - __bss_start);
+
+	/*
+	 * Identify the CPU type and fix up code sections
+	 * that depend on which cpu we have.
+	 */
+	identify_cpu(offset, mfspr(SPRN_PVR));
+
+	apply_feature_fixups();
+
+	return KERNELBASE + offset;
+}
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 4a65e08a6042f..3fb9f64f88fdd 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -63,34 +63,6 @@ unsigned int DMA_MODE_WRITE;
 EXPORT_SYMBOL(DMA_MODE_READ);
 EXPORT_SYMBOL(DMA_MODE_WRITE);
 
-/*
- * We're called here very early in the boot.
- *
- * Note that the kernel may be running at an address which is different
- * from the address that it was linked at, so we must use RELOC/PTRRELOC
- * to access static data (including strings). -- paulus
- */
-notrace unsigned long __init early_init(unsigned long dt_ptr)
-{
-	unsigned long offset = reloc_offset();
-
-	/* First zero the BSS -- use memset_io, some platforms don't have
-	 * caches on yet */
-	memset_io((void __iomem *)PTRRELOC(&__bss_start), 0,
-			__bss_stop - __bss_start);
-
-	/*
-	 * Identify the CPU type and fix up code sections
-	 * that depend on which cpu we have.
-	 */
-	identify_cpu(offset, mfspr(SPRN_PVR));
-
-	apply_feature_fixups();
-
-	return KERNELBASE + offset;
-}
-
-
 /*
  * This is run before start_kernel(), the kernel has been relocated
  * and we are running with enough of the MMU enabled to have our

From 26deb04342e343ac58ab05bc7d2345ff0be9b667 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 16:23:26 +0000
Subject: [PATCH 122/205] powerpc: prepare string/mem functions for KASAN

CONFIG_KASAN implements wrappers for memcpy(), memmove() and memset().
Those wrappers do the verification and then call __memcpy(), __memmove()
and __memset() respectively. The arches are therefore expected to rename
their optimised functions that way.
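For reference, the generic wrappers that force this renaming look
roughly like the following sketch of mm/kasan's interceptors from this
era (check_memory_region() is KASAN's range-checking helper; the exact
bodies may differ):

	#undef memcpy
	void *memcpy(void *dest, const void *src, size_t len)
	{
		/* flag out-of-bounds / use-after-free on both buffers */
		check_memory_region((unsigned long)src, len, false, _RET_IP_);
		check_memory_region((unsigned long)dest, len, true, _RET_IP_);

		return __memcpy(dest, src, len);	/* arch-optimised copy */
	}

This is why the arch must expose its optimised routine under the
__memcpy() name when CONFIG_KASAN is enabled.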
For files on which KASAN is inhibited, #defines are used to allow them
to directly call the optimised versions of the functions without going
through the KASAN wrappers.

See commit 393f203f5fd5 ("x86_64: kasan: add interceptors for
memset/memmove/memcpy functions") for details.

Other string / mem functions do not (yet) have KASAN wrappers; we
therefore have to fall back to the generic versions when KASAN is
active, otherwise the KASAN checks would be skipped.

Signed-off-by: Christophe Leroy
[mpe: Fixups to keep selftests working]
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/kasan.h              | 15 +++++++++
 arch/powerpc/include/asm/string.h             | 32 +++++++++++++++++--
 arch/powerpc/kernel/prom_init_check.sh        | 10 +++++-
 arch/powerpc/lib/Makefile                     | 11 +++++--
 arch/powerpc/lib/copy_32.S                    | 12 +++++--
 arch/powerpc/lib/mem_64.S                     |  9 ++++--
 arch/powerpc/lib/memcpy_64.S                  |  4 ++-
 .../selftests/powerpc/copyloops/asm/export.h  |  1 +
 .../selftests/powerpc/copyloops/asm/kasan.h   |  0
 .../selftests/powerpc/copyloops/asm/ppc_asm.h |  1 +
 10 files changed, 82 insertions(+), 13 deletions(-)
 create mode 100644 arch/powerpc/include/asm/kasan.h
 create mode 100644 tools/testing/selftests/powerpc/copyloops/asm/kasan.h

diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h
new file mode 100644
index 0000000000000..2c179a39d4ba5
--- /dev/null
+++ b/arch/powerpc/include/asm/kasan.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_KASAN_H
+#define __ASM_KASAN_H
+
+#ifdef CONFIG_KASAN
+#define _GLOBAL_KASAN(fn)	_GLOBAL(__##fn)
+#define _GLOBAL_TOC_KASAN(fn)	_GLOBAL_TOC(__##fn)
+#define EXPORT_SYMBOL_KASAN(fn)	EXPORT_SYMBOL(__##fn)
+#else
+#define _GLOBAL_KASAN(fn)	_GLOBAL(fn)
+#define _GLOBAL_TOC_KASAN(fn)	_GLOBAL_TOC(fn)
+#define EXPORT_SYMBOL_KASAN(fn)
+#endif
+
+#endif
diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index 1647de15a31e7..9bf6dffb40900 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -4,14 +4,17 @@
 
 #ifdef __KERNEL__
 
+#ifndef CONFIG_KASAN
 #define __HAVE_ARCH_STRNCPY
 #define __HAVE_ARCH_STRNCMP
+#define __HAVE_ARCH_MEMCHR
+#define __HAVE_ARCH_MEMCMP
+#define __HAVE_ARCH_MEMSET16
+#endif
+
 #define __HAVE_ARCH_MEMSET
 #define __HAVE_ARCH_MEMCPY
 #define __HAVE_ARCH_MEMMOVE
-#define __HAVE_ARCH_MEMCMP
-#define __HAVE_ARCH_MEMCHR
-#define __HAVE_ARCH_MEMSET16
 #define __HAVE_ARCH_MEMCPY_FLUSHCACHE
 
 extern char * strcpy(char *,const char *);
@@ -27,7 +30,27 @@ extern int memcmp(const void *,const void *,__kernel_size_t);
 extern void * memchr(const void *,int,__kernel_size_t);
 extern void * memcpy_flushcache(void *,const void *,__kernel_size_t);
 
+void *__memset(void *s, int c, __kernel_size_t count);
+void *__memcpy(void *to, const void *from, __kernel_size_t n);
+void *__memmove(void *to, const void *from, __kernel_size_t n);
+
+#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
+/*
+ * For files that are not instrumented (e.g. mm/slub.c) we
+ * should use not instrumented version of mem* functions.
+ */
+#define memcpy(dst, src, len) __memcpy(dst, src, len)
+#define memmove(dst, src, len) __memmove(dst, src, len)
+#define memset(s, c, n) __memset(s, c, n)
+
+#ifndef __NO_FORTIFY
+#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc.
*/ +#endif + +#endif + #ifdef CONFIG_PPC64 +#ifndef CONFIG_KASAN #define __HAVE_ARCH_MEMSET32 #define __HAVE_ARCH_MEMSET64 @@ -49,8 +72,11 @@ static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n) { return __memset64(p, v, n * 8); } +#endif #else +#ifndef CONFIG_KASAN #define __HAVE_ARCH_STRLEN +#endif extern void *memset16(uint16_t *, uint16_t, __kernel_size_t); #endif diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh index 667df97d25954..181fd10008efb 100644 --- a/arch/powerpc/kernel/prom_init_check.sh +++ b/arch/powerpc/kernel/prom_init_check.sh @@ -16,8 +16,16 @@ # If you really need to reference something from prom_init.o add # it to the list below: +grep "^CONFIG_KASAN=y$" .config >/dev/null +if [ $? -eq 0 ] +then + MEM_FUNCS="__memcpy __memset" +else + MEM_FUNCS="memcpy memset" +fi + WHITELIST="add_reloc_offset __bss_start __bss_stop copy_and_flush -_end enter_prom memcpy memset reloc_offset __secondary_hold +_end enter_prom $MEM_FUNCS reloc_offset __secondary_hold __secondary_hold_acknowledge __secondary_hold_spinloop __start strcmp strcpy strlcpy strlen strncmp strstr kstrtobool logo_linux_clut224 reloc_got2 kernstart_addr memstart_addr linux_banner _stext diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 79396e184bcaa..47a4de434c22a 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -8,9 +8,14 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) CFLAGS_REMOVE_code-patching.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE) -obj-y += string.o alloc.o code-patching.o feature-fixups.o +obj-y += alloc.o code-patching.o feature-fixups.o -obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o strlen_32.o +ifndef CONFIG_KASAN +obj-y += string.o memcmp_$(BITS).o +obj-$(CONFIG_PPC32) += strlen_32.o +endif + +obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o @@ -34,7 +39,7 @@ obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o \ test_emulate_step_exec_instr.o obj-y += checksum_$(BITS).o checksum_wrappers.o \ - string_$(BITS).o memcmp_$(BITS).o + string_$(BITS).o obj-y += sstep.o ldstfp.o quad.o obj64-y += quad.o diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S index ba66846fe973f..d5642481fb985 100644 --- a/arch/powerpc/lib/copy_32.S +++ b/arch/powerpc/lib/copy_32.S @@ -14,6 +14,7 @@ #include #include #include +#include #define COPY_16_BYTES \ lwz r7,4(r4); \ @@ -68,6 +69,7 @@ CACHELINE_BYTES = L1_CACHE_BYTES LG_CACHELINE_BYTES = L1_CACHE_SHIFT CACHELINE_MASK = (L1_CACHE_BYTES-1) +#ifndef CONFIG_KASAN _GLOBAL(memset16) rlwinm. r0 ,r5, 31, 1, 31 addi r6, r3, -4 @@ -81,6 +83,7 @@ _GLOBAL(memset16) sth r4, 4(r6) blr EXPORT_SYMBOL(memset16) +#endif /* * Use dcbz on the complete cache lines in the destination @@ -91,7 +94,7 @@ EXPORT_SYMBOL(memset16) * We therefore skip the optimised bloc that uses dcbz. This jump is * replaced by a nop once cache is active. This is done in machine_init() */ -_GLOBAL(memset) +_GLOBAL_KASAN(memset) cmplwi 0,r5,4 blt 7f @@ -151,6 +154,7 @@ _GLOBAL(memset) bdnz 9b blr EXPORT_SYMBOL(memset) +EXPORT_SYMBOL_KASAN(memset) /* * This version uses dcbz on the complete cache lines in the @@ -163,12 +167,12 @@ EXPORT_SYMBOL(memset) * We therefore jump to generic_memcpy which doesn't use dcbz. This jump is * replaced by a nop once cache is active. 
This is done in machine_init()
 */
-_GLOBAL(memmove)
+_GLOBAL_KASAN(memmove)
 	cmplw	0,r3,r4
 	bgt	backwards_memcpy
 	/* fall through */
 
-_GLOBAL(memcpy)
+_GLOBAL_KASAN(memcpy)
 1:	b	generic_memcpy
 	patch_site	1b, patch__memcpy_nocache
@@ -244,6 +248,8 @@ _GLOBAL(memcpy)
 65:	blr
 EXPORT_SYMBOL(memcpy)
 EXPORT_SYMBOL(memmove)
+EXPORT_SYMBOL_KASAN(memcpy)
+EXPORT_SYMBOL_KASAN(memmove)
 
 generic_memcpy:
 	srwi.	r7,r5,3
diff --git a/arch/powerpc/lib/mem_64.S b/arch/powerpc/lib/mem_64.S
index 3c3be02f33b7c..7f6bd031c3062 100644
--- a/arch/powerpc/lib/mem_64.S
+++ b/arch/powerpc/lib/mem_64.S
@@ -12,7 +12,9 @@
 #include
 #include
 #include
+#include
 
+#ifndef CONFIG_KASAN
 _GLOBAL(__memset16)
 	rlwimi	r4,r4,16,0,15
 	/* fall through */
@@ -29,8 +31,9 @@ _GLOBAL(__memset64)
 EXPORT_SYMBOL(__memset16)
 EXPORT_SYMBOL(__memset32)
 EXPORT_SYMBOL(__memset64)
+#endif
 
-_GLOBAL(memset)
+_GLOBAL_KASAN(memset)
 	neg	r0,r3
 	rlwimi	r4,r4,8,16,23
 	andi.	r0,r0,7		/* # bytes to be 8-byte aligned */
@@ -96,8 +99,9 @@ _GLOBAL(memset)
 	stb	r4,0(r6)
 	blr
 EXPORT_SYMBOL(memset)
+EXPORT_SYMBOL_KASAN(memset)
 
-_GLOBAL_TOC(memmove)
+_GLOBAL_TOC_KASAN(memmove)
 	cmplw	0,r3,r4
 	bgt	backwards_memcpy
 	b	memcpy
@@ -139,3 +143,4 @@ _GLOBAL(backwards_memcpy)
 	mtctr	r7
 	b	1b
 EXPORT_SYMBOL(memmove)
+EXPORT_SYMBOL_KASAN(memmove)
diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S
index 273ea67e60a13..25c3772c1dfbc 100644
--- a/arch/powerpc/lib/memcpy_64.S
+++ b/arch/powerpc/lib/memcpy_64.S
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 
 #ifndef SELFTEST_CASE
 /* For big-endian, 0 == most CPUs, 1 == POWER6, 2 == Cell */
 #endif
 
 	.align	7
-_GLOBAL_TOC(memcpy)
+_GLOBAL_TOC_KASAN(memcpy)
 BEGIN_FTR_SECTION
 #ifdef __LITTLE_ENDIAN__
 	cmpdi	cr7,r5,0
@@ -230,3 +231,4 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 	blr
 #endif
 EXPORT_SYMBOL(memcpy)
+EXPORT_SYMBOL_KASAN(memcpy)
diff --git a/tools/testing/selftests/powerpc/copyloops/asm/export.h b/tools/testing/selftests/powerpc/copyloops/asm/export.h
index 0bab35f6777a2..05c1663c89b03 100644
--- a/tools/testing/selftests/powerpc/copyloops/asm/export.h
+++ b/tools/testing/selftests/powerpc/copyloops/asm/export.h
@@ -1,2 +1,3 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #define EXPORT_SYMBOL(x)
+#define EXPORT_SYMBOL_KASAN(x)
diff --git a/tools/testing/selftests/powerpc/copyloops/asm/kasan.h b/tools/testing/selftests/powerpc/copyloops/asm/kasan.h
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
index 0605df8075932..58c1cef3e399d 100644
--- a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
+++ b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
@@ -25,6 +25,7 @@
 
 #define _GLOBAL(A) FUNC_START(test_ ## A)
 #define _GLOBAL_TOC(A) _GLOBAL(A)
+#define _GLOBAL_TOC_KASAN(A) _GLOBAL(A)
 
 #define PPC_MTOCRF(A, B) mtocrf A, B

From cbe46bd4f5104552b612505b73d366f66efc2341 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 16:23:27 +0000
Subject: [PATCH 123/205] powerpc: remove CONFIG_CMDLINE #ifdef mess

This patch makes CONFIG_CMDLINE defined at all times.
It avoids having to enclose related code inside #ifdef CONFIG_CMDLINE Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 6 +++--- arch/powerpc/kernel/prom_init.c | 9 +++------ 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7815eb0cc2a5b..515bd10d32f6d 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -831,9 +831,9 @@ config CMDLINE_BOOL bool "Default bootloader kernel arguments" config CMDLINE - string "Initial kernel command string" - depends on CMDLINE_BOOL - default "console=ttyS0,9600 console=tty0 root=/dev/sda2" + string "Initial kernel command string" if CMDLINE_BOOL + default "console=ttyS0,9600 console=tty0 root=/dev/sda2" if CMDLINE_BOOL + default "" help On some platforms, there is currently no way for the boot loader to pass arguments to the kernel. For these platforms, you can supply diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index f33ff4163a51c..ecf083c46bdb0 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -631,17 +631,14 @@ static void __init early_cmdline_parse(void) const char *opt; char *p; - int l __maybe_unused = 0; + int l = 0; prom_cmd_line[0] = 0; p = prom_cmd_line; if ((long)prom.chosen > 0) l = prom_getprop(prom.chosen, "bootargs", p, COMMAND_LINE_SIZE-1); -#ifdef CONFIG_CMDLINE - if (l <= 0 || p[0] == '\0') /* dbl check */ - strlcpy(prom_cmd_line, - CONFIG_CMDLINE, sizeof(prom_cmd_line)); -#endif /* CONFIG_CMDLINE */ + if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && (l <= 0 || p[0] == '\0')) /* dbl check */ + strlcpy(prom_cmd_line, CONFIG_CMDLINE, sizeof(prom_cmd_line)); prom_printf("command line: %s\n", prom_cmd_line); #ifdef CONFIG_PPC64 From 450e7dd4001f22f796e22422dd1d2cbd5bda21fc Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:23:28 +0000 Subject: [PATCH 124/205] powerpc/prom_init: don't use string functions from lib/ When KASAN is active, the string functions in lib/ are doing the KASAN checks. This is too early for prom_init. This patch implements dedicated string functions for prom_init, which will be compiled in with KASAN disabled. Size of prom_init before the patch: text data bss dec hex filename 12060 488 6960 19508 4c34 arch/powerpc/kernel/prom_init.o Size of prom_init after the patch: text data bss dec hex filename 12460 488 6960 19908 4dc4 arch/powerpc/kernel/prom_init.o This increases the size of prom_init a bit, but as prom_init is in __init section, it is freed after boot anyway. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom_init.c | 211 ++++++++++++++++++++----- arch/powerpc/kernel/prom_init_check.sh | 2 +- 2 files changed, 171 insertions(+), 42 deletions(-) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index ecf083c46bdb0..7017156168e8a 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -224,6 +224,135 @@ static bool __prombss rtas_has_query_cpu_stopped; #define PHANDLE_VALID(p) ((p) != 0 && (p) != PROM_ERROR) #define IHANDLE_VALID(i) ((i) != 0 && (i) != PROM_ERROR) +/* Copied from lib/string.c and lib/kstrtox.c */ + +static int __init prom_strcmp(const char *cs, const char *ct) +{ + unsigned char c1, c2; + + while (1) { + c1 = *cs++; + c2 = *ct++; + if (c1 != c2) + return c1 < c2 ? 
-1 : 1; + if (!c1) + break; + } + return 0; +} + +static char __init *prom_strcpy(char *dest, const char *src) +{ + char *tmp = dest; + + while ((*dest++ = *src++) != '\0') + /* nothing */; + return tmp; +} + +static int __init prom_strncmp(const char *cs, const char *ct, size_t count) +{ + unsigned char c1, c2; + + while (count) { + c1 = *cs++; + c2 = *ct++; + if (c1 != c2) + return c1 < c2 ? -1 : 1; + if (!c1) + break; + count--; + } + return 0; +} + +static size_t __init prom_strlen(const char *s) +{ + const char *sc; + + for (sc = s; *sc != '\0'; ++sc) + /* nothing */; + return sc - s; +} + +static int __init prom_memcmp(const void *cs, const void *ct, size_t count) +{ + const unsigned char *su1, *su2; + int res = 0; + + for (su1 = cs, su2 = ct; 0 < count; ++su1, ++su2, count--) + if ((res = *su1 - *su2) != 0) + break; + return res; +} + +static char __init *prom_strstr(const char *s1, const char *s2) +{ + size_t l1, l2; + + l2 = prom_strlen(s2); + if (!l2) + return (char *)s1; + l1 = prom_strlen(s1); + while (l1 >= l2) { + l1--; + if (!prom_memcmp(s1, s2, l2)) + return (char *)s1; + s1++; + } + return NULL; +} + +static size_t __init prom_strlcpy(char *dest, const char *src, size_t size) +{ + size_t ret = prom_strlen(src); + + if (size) { + size_t len = (ret >= size) ? size - 1 : ret; + memcpy(dest, src, len); + dest[len] = '\0'; + } + return ret; +} + +#ifdef CONFIG_PPC_PSERIES +static int __init prom_strtobool(const char *s, bool *res) +{ + if (!s) + return -EINVAL; + + switch (s[0]) { + case 'y': + case 'Y': + case '1': + *res = true; + return 0; + case 'n': + case 'N': + case '0': + *res = false; + return 0; + case 'o': + case 'O': + switch (s[1]) { + case 'n': + case 'N': + *res = true; + return 0; + case 'f': + case 'F': + *res = false; + return 0; + default: + break; + } + default: + break; + } + + return -EINVAL; +} +#endif /* This is the one and *ONLY* place where we actually call open * firmware. 
@@ -555,7 +684,7 @@ static int __init prom_setprop(phandle node, const char *nodename, add_string(&p, tohex((u32)(unsigned long) value)); add_string(&p, tohex(valuelen)); add_string(&p, tohex(ADDR(pname))); - add_string(&p, tohex(strlen(pname))); + add_string(&p, tohex(prom_strlen(pname))); add_string(&p, "property"); *p = 0; return call_prom("interpret", 1, 1, (u32)(unsigned long) cmd); @@ -638,23 +767,23 @@ static void __init early_cmdline_parse(void) if ((long)prom.chosen > 0) l = prom_getprop(prom.chosen, "bootargs", p, COMMAND_LINE_SIZE-1); if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && (l <= 0 || p[0] == '\0')) /* dbl check */ - strlcpy(prom_cmd_line, CONFIG_CMDLINE, sizeof(prom_cmd_line)); + prom_strlcpy(prom_cmd_line, CONFIG_CMDLINE, sizeof(prom_cmd_line)); prom_printf("command line: %s\n", prom_cmd_line); #ifdef CONFIG_PPC64 - opt = strstr(prom_cmd_line, "iommu="); + opt = prom_strstr(prom_cmd_line, "iommu="); if (opt) { prom_printf("iommu opt is: %s\n", opt); opt += 6; while (*opt && *opt == ' ') opt++; - if (!strncmp(opt, "off", 3)) + if (!prom_strncmp(opt, "off", 3)) prom_iommu_off = 1; - else if (!strncmp(opt, "force", 5)) + else if (!prom_strncmp(opt, "force", 5)) prom_iommu_force_on = 1; } #endif - opt = strstr(prom_cmd_line, "mem="); + opt = prom_strstr(prom_cmd_line, "mem="); if (opt) { opt += 4; prom_memory_limit = prom_memparse(opt, (const char **)&opt); @@ -666,13 +795,13 @@ static void __init early_cmdline_parse(void) #ifdef CONFIG_PPC_PSERIES prom_radix_disable = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT); - opt = strstr(prom_cmd_line, "disable_radix"); + opt = prom_strstr(prom_cmd_line, "disable_radix"); if (opt) { opt += 13; if (*opt && *opt == '=') { bool val; - if (kstrtobool(++opt, &val)) + if (prom_strtobool(++opt, &val)) prom_radix_disable = false; else prom_radix_disable = val; @@ -1025,7 +1154,7 @@ static int __init prom_count_smt_threads(void) type[0] = 0; prom_getprop(node, "device_type", type, sizeof(type)); - if (strcmp(type, "cpu")) + if (prom_strcmp(type, "cpu")) continue; /* * There is an entry for each smt thread, each entry being @@ -1472,7 +1601,7 @@ static void __init prom_init_mem(void) */ prom_getprop(node, "name", type, sizeof(type)); } - if (strcmp(type, "memory")) + if (prom_strcmp(type, "memory")) continue; plen = prom_getprop(node, "reg", regbuf, sizeof(regbuf)); @@ -1753,19 +1882,19 @@ static void __init prom_initialize_tce_table(void) prom_getprop(node, "device_type", type, sizeof(type)); prom_getprop(node, "model", model, sizeof(model)); - if ((type[0] == 0) || (strstr(type, "pci") == NULL)) + if ((type[0] == 0) || (prom_strstr(type, "pci") == NULL)) continue; /* Keep the old logic intact to avoid regression. 
*/ if (compatible[0] != 0) { - if ((strstr(compatible, "python") == NULL) && - (strstr(compatible, "Speedwagon") == NULL) && - (strstr(compatible, "Winnipeg") == NULL)) + if ((prom_strstr(compatible, "python") == NULL) && + (prom_strstr(compatible, "Speedwagon") == NULL) && + (prom_strstr(compatible, "Winnipeg") == NULL)) continue; } else if (model[0] != 0) { - if ((strstr(model, "ython") == NULL) && - (strstr(model, "peedwagon") == NULL) && - (strstr(model, "innipeg") == NULL)) + if ((prom_strstr(model, "ython") == NULL) && + (prom_strstr(model, "peedwagon") == NULL) && + (prom_strstr(model, "innipeg") == NULL)) continue; } @@ -1914,12 +2043,12 @@ static void __init prom_hold_cpus(void) type[0] = 0; prom_getprop(node, "device_type", type, sizeof(type)); - if (strcmp(type, "cpu") != 0) + if (prom_strcmp(type, "cpu") != 0) continue; /* Skip non-configured cpus. */ if (prom_getprop(node, "status", type, sizeof(type)) > 0) - if (strcmp(type, "okay") != 0) + if (prom_strcmp(type, "okay") != 0) continue; reg = cpu_to_be32(-1); /* make sparse happy */ @@ -1995,9 +2124,9 @@ static void __init prom_find_mmu(void) return; version[sizeof(version) - 1] = 0; /* XXX might need to add other versions here */ - if (strcmp(version, "Open Firmware, 1.0.5") == 0) + if (prom_strcmp(version, "Open Firmware, 1.0.5") == 0) of_workarounds = OF_WA_CLAIM; - else if (strncmp(version, "FirmWorks,3.", 12) == 0) { + else if (prom_strncmp(version, "FirmWorks,3.", 12) == 0) { of_workarounds = OF_WA_CLAIM | OF_WA_LONGTRAIL; call_prom("interpret", 1, 1, "dev /memory 0 to allow-reclaim"); } else @@ -2030,7 +2159,7 @@ static void __init prom_init_stdout(void) call_prom("instance-to-path", 3, 1, prom.stdout, path, 255); prom_printf("OF stdout device is: %s\n", of_stdout_device); prom_setprop(prom.chosen, "/chosen", "linux,stdout-path", - path, strlen(path) + 1); + path, prom_strlen(path) + 1); /* instance-to-package fails on PA-Semi */ stdout_node = call_prom("instance-to-package", 1, 1, prom.stdout); @@ -2040,7 +2169,7 @@ static void __init prom_init_stdout(void) /* If it's a display, note it */ memset(type, 0, sizeof(type)); prom_getprop(stdout_node, "device_type", type, sizeof(type)); - if (strcmp(type, "display") == 0) + if (prom_strcmp(type, "display") == 0) prom_setprop(stdout_node, path, "linux,boot-display", NULL, 0); } } @@ -2061,19 +2190,19 @@ static int __init prom_find_machine_type(void) compat[len] = 0; while (i < len) { char *p = &compat[i]; - int sl = strlen(p); + int sl = prom_strlen(p); if (sl == 0) break; - if (strstr(p, "Power Macintosh") || - strstr(p, "MacRISC")) + if (prom_strstr(p, "Power Macintosh") || + prom_strstr(p, "MacRISC")) return PLATFORM_POWERMAC; #ifdef CONFIG_PPC64 /* We must make sure we don't detect the IBM Cell * blades as pSeries due to some firmware issues, * so we do it here. */ - if (strstr(p, "IBM,CBEA") || - strstr(p, "IBM,CPBW-1.0")) + if (prom_strstr(p, "IBM,CBEA") || + prom_strstr(p, "IBM,CPBW-1.0")) return PLATFORM_GENERIC; #endif /* CONFIG_PPC64 */ i += sl + 1; @@ -2090,7 +2219,7 @@ static int __init prom_find_machine_type(void) compat, sizeof(compat)-1); if (len <= 0) return PLATFORM_GENERIC; - if (strcmp(compat, "chrp")) + if (prom_strcmp(compat, "chrp")) return PLATFORM_GENERIC; /* Default to pSeries. 
We need to know if we are running LPAR */ @@ -2152,7 +2281,7 @@ static void __init prom_check_displays(void) for (node = 0; prom_next_node(&node); ) { memset(type, 0, sizeof(type)); prom_getprop(node, "device_type", type, sizeof(type)); - if (strcmp(type, "display") != 0) + if (prom_strcmp(type, "display") != 0) continue; /* It seems OF doesn't null-terminate the path :-( */ @@ -2256,9 +2385,9 @@ static unsigned long __init dt_find_string(char *str) s = os = (char *)dt_string_start; s += 4; while (s < (char *)dt_string_end) { - if (strcmp(s, str) == 0) + if (prom_strcmp(s, str) == 0) return s - os; - s += strlen(s) + 1; + s += prom_strlen(s) + 1; } return 0; } @@ -2291,7 +2420,7 @@ static void __init scan_dt_build_strings(phandle node, } /* skip "name" */ - if (strcmp(namep, "name") == 0) { + if (prom_strcmp(namep, "name") == 0) { *mem_start = (unsigned long)namep; prev_name = "name"; continue; @@ -2303,7 +2432,7 @@ static void __init scan_dt_build_strings(phandle node, namep = sstart + soff; } else { /* Trim off some if we can */ - *mem_start = (unsigned long)namep + strlen(namep) + 1; + *mem_start = (unsigned long)namep + prom_strlen(namep) + 1; dt_string_end = *mem_start; } prev_name = namep; @@ -2372,7 +2501,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, break; /* skip "name" */ - if (strcmp(pname, "name") == 0) { + if (prom_strcmp(pname, "name") == 0) { prev_name = "name"; continue; } @@ -2403,7 +2532,7 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, call_prom("getprop", 4, 1, node, pname, valp, l); *mem_start = _ALIGN(*mem_start, 4); - if (!strcmp(pname, "phandle")) + if (!prom_strcmp(pname, "phandle")) has_phandle = 1; } @@ -2473,8 +2602,8 @@ static void __init flatten_device_tree(void) /* Add "phandle" in there, we'll need it */ namep = make_room(&mem_start, &mem_end, 16, 1); - strcpy(namep, "phandle"); - mem_start = (unsigned long)namep + strlen(namep) + 1; + prom_strcpy(namep, "phandle"); + mem_start = (unsigned long)namep + prom_strlen(namep) + 1; /* Build string array */ prom_printf("Building dt strings...\n"); @@ -2796,7 +2925,7 @@ static void __init fixup_device_tree_efika(void) rv = prom_getprop(node, "model", prop, sizeof(prop)); if (rv == PROM_ERROR) return; - if (strcmp(prop, "EFIKA5K2")) + if (prom_strcmp(prop, "EFIKA5K2")) return; prom_printf("Applying EFIKA device tree fixups\n"); @@ -2804,13 +2933,13 @@ static void __init fixup_device_tree_efika(void) /* Claiming to be 'chrp' is death */ node = call_prom("finddevice", 1, 1, ADDR("/")); rv = prom_getprop(node, "device_type", prop, sizeof(prop)); - if (rv != PROM_ERROR && (strcmp(prop, "chrp") == 0)) + if (rv != PROM_ERROR && (prom_strcmp(prop, "chrp") == 0)) prom_setprop(node, "/", "device_type", "efika", sizeof("efika")); /* CODEGEN,description is exposed in /proc/cpuinfo so fix that too */ rv = prom_getprop(node, "CODEGEN,description", prop, sizeof(prop)); - if (rv != PROM_ERROR && (strstr(prop, "CHRP"))) + if (rv != PROM_ERROR && (prom_strstr(prop, "CHRP"))) prom_setprop(node, "/", "CODEGEN,description", "Efika 5200B PowerPC System", sizeof("Efika 5200B PowerPC System")); diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh index 181fd10008efb..4cac45cb5de53 100644 --- a/arch/powerpc/kernel/prom_init_check.sh +++ b/arch/powerpc/kernel/prom_init_check.sh @@ -27,7 +27,7 @@ fi WHITELIST="add_reloc_offset __bss_start __bss_stop copy_and_flush _end enter_prom $MEM_FUNCS reloc_offset __secondary_hold 
__secondary_hold_acknowledge __secondary_hold_spinloop __start
-strcmp strcpy strlcpy strlen strncmp strstr kstrtobool logo_linux_clut224
+logo_linux_clut224
reloc_got2 kernstart_addr memstart_addr linux_banner _stext
__prom_init_toc_start __prom_init_toc_end btext_setup_display TOC."

From adcf59187e2705721ccf23733a5fa2fb20d91415 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 16:23:29 +0000
Subject: [PATCH 125/205] powerpc: don't use direct assignment during early
 boot.

In kernel/cputable.c, explicitly use memcpy() instead of *y = *x;
This will allow GCC to replace it with __memcpy() when KASAN is
selected.

Acked-by: Dmitry Vyukov
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/cputable.c  | 13 ++++++++++---
 arch/powerpc/kernel/prom_init.c | 10 ++++++++--
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 1eab54bc6ee93..cd12f362b61ff 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -2147,7 +2147,11 @@ void __init set_cur_cpu_spec(struct cpu_spec *s)
 	struct cpu_spec *t = &the_cpu_spec;
 
 	t = PTRRELOC(t);
-	*t = *s;
+	/*
+	 * use memcpy() instead of *t = *s so that GCC replaces it
+	 * by __memcpy() when KASAN is active
+	 */
+	memcpy(t, s, sizeof(*t));
 
 	*PTRRELOC(&cur_cpu_spec) = &the_cpu_spec;
 }
@@ -2161,8 +2165,11 @@ static struct cpu_spec * __init setup_cpu_spec(unsigned long offset,
 	t = PTRRELOC(t);
 	old = *t;
 
-	/* Copy everything, then do fixups */
-	*t = *s;
+	/*
+	 * Copy everything, then do fixups. Use memcpy() instead of *t = *s
+	 * so that GCC replaces it by __memcpy() when KASAN is active
+	 */
+	memcpy(t, s, sizeof(*t));
 
 	/*
 	 * If we are overriding a previous value derived from the real
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 7017156168e8a..d3b0d543d9243 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -1264,8 +1264,14 @@ static void __init prom_check_platform_support(void)
 	int prop_len = prom_getproplen(prom.chosen, "ibm,arch-vec-5-platform-support");
 
-	/* First copy the architecture vec template */
-	ibm_architecture_vec = ibm_architecture_vec_template;
+	/*
+	 * First copy the architecture vec template
+	 *
+	 * use memcpy() instead of *vec = *vec_template so that GCC replaces it
+	 * by __memcpy() when KASAN is active
+	 */
+	memcpy(&ibm_architecture_vec, &ibm_architecture_vec_template,
+	       sizeof(ibm_architecture_vec));
 
 	if (prop_len > 1) {
 		int i;

From 7934cea7f0b93fcfdb3b175df94f539e4af86c9b Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 16:23:30 +0000
Subject: [PATCH 126/205] powerpc/32: use memset() instead of memset_io() to
 zero BSS

Since commit 400c47d81ca38 ("powerpc32: memset: only use dcbz once cache is
enabled"), memset() can be used before activation of the cache, so no need
to use memset_io() for zeroing the BSS.
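Both this change and the preceding memcpy() one keep early boot code away
from memory helpers that are unsafe at that stage. A minimal sketch of the
KASAN half of that argument (illustrative only, not kernel code; the
memcpy()-to-__memcpy() redirection shown is an assumed simplification of
what a KASAN build's headers arrange, and all declarations are hypothetical
stand-ins):

	/* Sketch only: explicit calls can be redirected at preprocessing
	 * time to an uninstrumented variant; compiler-generated calls
	 * cannot. */
	void *__memcpy(void *dst, const void *src, unsigned long len);
	#define memcpy(dst, src, len) __memcpy(dst, src, len)

	struct spec_example {		/* stand-in for struct cpu_spec */
		char name[64];
		unsigned long features;
	};

	void copy_spec(struct spec_example *t, const struct spec_example *s)
	{
		/*
		 * A direct assignment "*t = *s" is free to compile into a
		 * call to the memcpy symbol, which KASAN instruments and
		 * which must not run before the shadow is initialised.
		 */

		/* The explicit call expands through the macro to __memcpy(): */
		memcpy(t, s, sizeof(*t));
	}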
Acked-by: Dmitry Vyukov
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/early_32.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/early_32.c b/arch/powerpc/kernel/early_32.c
index cf3cdd81dc475..3482118ffe763 100644
--- a/arch/powerpc/kernel/early_32.c
+++ b/arch/powerpc/kernel/early_32.c
@@ -21,8 +21,8 @@ notrace unsigned long __init early_init(unsigned long dt_ptr)
 {
 	unsigned long offset = reloc_offset();
 
-	/* First zero the BSS -- use memset_io, some platforms don't have caches on yet */
-	memset_io((void __iomem *)PTRRELOC(&__bss_start), 0, __bss_stop - __bss_start);
+	/* First zero the BSS */
+	memset(PTRRELOC(&__bss_start), 0, __bss_stop - __bss_start);
 
 	/*
 	 * Identify the CPU type and fix up code sections

From a67beca077ef79e971443aa6af6b14d4b3fb3bd6 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 16:23:31 +0000
Subject: [PATCH 127/205] powerpc/32: make KVIRT_TOP dependent on FIXADDR_START

Once we add the KASAN shadow area, KVIRT_TOP can no longer be fixed at
0xfe000000.

This patch uses FIXADDR_START to define KVIRT_TOP.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/book3s/32/pgtable.h | 13 ++++++++++---
 arch/powerpc/include/asm/nohash/32/pgtable.h | 13 ++++++++++---
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index aa8406b8f7ba0..838de59f6754b 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -134,15 +134,24 @@ static inline bool pte_user(pte_t pte)
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 
 #define USER_PTRS_PER_PGD	(TASK_SIZE / PGDIR_SIZE)
+
+#ifndef __ASSEMBLY__
+
+int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
+
+#endif /* !__ASSEMBLY__ */
+
 /*
  * This is the bottom of the PKMAP area with HIGHMEM or an arbitrary
  * value (for now) on others, from where we can start layout kernel
  * virtual space that goes below PKMAP and FIXMAP
  */
+#include <asm/fixmap.h>
+
 #ifdef CONFIG_HIGHMEM
 #define KVIRT_TOP	PKMAP_BASE
 #else
-#define KVIRT_TOP	(0xfe000000UL)	/* for now, could be FIXMAP_BASE ?
*/ +#define KVIRT_TOP FIXADDR_START #endif /* @@ -373,8 +382,6 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 3 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 3 }) -int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); - /* Generic accessors to PTE bits */ static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_RW);} static inline int pte_read(pte_t pte) { return 1; } diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index bed4333582606..0284f8f5305f8 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -64,15 +64,24 @@ extern int icache_44x_need_flush; #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) +#ifndef __ASSEMBLY__ + +int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); + +#endif /* !__ASSEMBLY__ */ + + /* * This is the bottom of the PKMAP area with HIGHMEM or an arbitrary * value (for now) on others, from where we can start layout kernel * virtual space that goes below PKMAP and FIXMAP */ +#include + #ifdef CONFIG_HIGHMEM #define KVIRT_TOP PKMAP_BASE #else -#define KVIRT_TOP (0xfe000000UL) /* for now, could be FIXMAP_BASE ? */ +#define KVIRT_TOP FIXADDR_START #endif /* @@ -379,8 +388,6 @@ static inline int pte_young(pte_t pte) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 3 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 3 }) -int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); - #endif /* !__ASSEMBLY__ */ #endif /* __ASM_POWERPC_NOHASH_32_PGTABLE_H */ From b4abe38fd698ace6942edeeb79a5b8a60a7af4fa Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:23:32 +0000 Subject: [PATCH 128/205] powerpc/32: prepare shadow area for KASAN This patch prepares a shadow area for KASAN. The shadow area will be at the top of the kernel virtual memory space above the fixmap area and will occupy one eighth of the total kernel virtual memory space. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig.debug | 5 +++++ arch/powerpc/include/asm/fixmap.h | 5 +++++ arch/powerpc/include/asm/kasan.h | 16 ++++++++++++++++ arch/powerpc/mm/mem.c | 4 ++++ arch/powerpc/mm/ptdump/ptdump.c | 8 ++++++++ 5 files changed, 38 insertions(+) diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 4e00cb0a54646..61febbbdd02b3 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -366,3 +366,8 @@ config PPC_FAST_ENDIAN_SWITCH depends on DEBUG_KERNEL && PPC_BOOK3S_64 help If you're unsure what this is, say N. 
+
+config KASAN_SHADOW_OFFSET
+	hex
+	depends on KASAN
+	default 0xe0000000
diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h
index b9fbed84ddca4..0cfc365d814ba 100644
--- a/arch/powerpc/include/asm/fixmap.h
+++ b/arch/powerpc/include/asm/fixmap.h
@@ -22,7 +22,12 @@
 #include
 #endif
 
+#ifdef CONFIG_KASAN
+#include <asm/kasan.h>
+#define FIXADDR_TOP	(KASAN_SHADOW_START - PAGE_SIZE)
+#else
 #define FIXADDR_TOP	((unsigned long)(-PAGE_SIZE))
+#endif
 
 /*
  * Here we define all the compile-time 'special' virtual
diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h
index 2c179a39d4ba5..05274dea31091 100644
--- a/arch/powerpc/include/asm/kasan.h
+++ b/arch/powerpc/include/asm/kasan.h
@@ -12,4 +12,20 @@
 #define EXPORT_SYMBOL_KASAN(fn)
 #endif
 
+#ifndef __ASSEMBLY__
+
+#include <asm/page.h>
+
+#define KASAN_SHADOW_SCALE_SHIFT	3
+
+#define KASAN_SHADOW_START	(KASAN_SHADOW_OFFSET + \
+				 (PAGE_OFFSET >> KASAN_SHADOW_SCALE_SHIFT))
+
+#define KASAN_SHADOW_OFFSET	ASM_CONST(CONFIG_KASAN_SHADOW_OFFSET)
+
+#define KASAN_SHADOW_END	0UL
+
+#define KASAN_SHADOW_SIZE	(KASAN_SHADOW_END - KASAN_SHADOW_START)
+
+#endif /* __ASSEMBLY */
 #endif
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 105c58f8900aa..cd525d709072a 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -310,6 +310,10 @@ void __init mem_init(void)
 	mem_init_print_info(NULL);
 #ifdef CONFIG_PPC32
 	pr_info("Kernel virtual memory layout:\n");
+#ifdef CONFIG_KASAN
+	pr_info(" * 0x%08lx..0x%08lx : kasan shadow mem\n",
+		KASAN_SHADOW_START, KASAN_SHADOW_END);
+#endif
 	pr_info(" * 0x%08lx..0x%08lx : fixmap\n", FIXADDR_START, FIXADDR_TOP);
 #ifdef CONFIG_HIGHMEM
 	pr_info(" * 0x%08lx..0x%08lx : highmem PTEs\n",
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 63fc56feea15b..48135ba6fa74d 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -100,6 +100,10 @@ static struct addr_marker address_markers[] = {
 #endif
 	{ 0,	"Fixmap start" },
 	{ 0,	"Fixmap end" },
+#endif
+#ifdef CONFIG_KASAN
+	{ 0,	"kasan shadow mem start" },
+	{ 0,	"kasan shadow mem end" },
 #endif
 	{ -1,	NULL },
 };
@@ -323,6 +327,10 @@ static void populate_markers(void)
 #endif
 	address_markers[i++].start_address = FIXADDR_START;
 	address_markers[i++].start_address = FIXADDR_TOP;
+#ifdef CONFIG_KASAN
+	address_markers[i++].start_address = KASAN_SHADOW_START;
+	address_markers[i++].start_address = KASAN_SHADOW_END;
+#endif
 #endif /* CONFIG_PPC64 */
 }

From f072015c7b742c42aa5649d22f43163cd0eb7024 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 16:23:33 +0000
Subject: [PATCH 129/205] powerpc: disable KASAN instrumentation on
 early/critical files.

All files containing functions run before kasan_early_init() is called
must have KASAN instrumentation disabled.

For those files, branch profiling also has to be disabled, otherwise
each if () generates a call to ftrace_likely_update().
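A rough sketch of why (simplified from the branch-profiling machinery in
include/linux/compiler.h; the struct layout and helper signature below are
approximations with hypothetical "_sketch" names, not the real kernel ones):

	struct ftrace_likely_data_sketch {
		unsigned long hits;
		unsigned long misses;
	};

	void ftrace_likely_update_sketch(struct ftrace_likely_data_sketch *f,
					 int val, int expect, int is_constant);

	/* Every profiled condition is routed through a recording helper,
	 * i.e. a function call that must not happen before the kernel
	 * (and the KASAN shadow) is ready: */
	#define profiled_cond(cond)					\
	({								\
		static struct ftrace_likely_data_sketch __data;	\
		int __r = !!(cond);					\
		ftrace_likely_update_sketch(&__data, __r, 0, 0);	\
		__r;							\
	})

Defining DISABLE_BRANCH_PROFILING for a file, as the CFLAGS additions below
do, suppresses this rewriting for that file.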
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/Makefile | 12 ++++++++++++ arch/powerpc/lib/Makefile | 8 ++++++++ arch/powerpc/mm/Makefile | 6 ++++++ arch/powerpc/platforms/powermac/Makefile | 6 ++++++ arch/powerpc/purgatory/Makefile | 3 +++ arch/powerpc/xmon/Makefile | 1 + 6 files changed, 36 insertions(+) diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 45e47752b692f..0ea6c4aa3a20d 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -31,6 +31,18 @@ CFLAGS_REMOVE_btext.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_prom.o = $(CC_FLAGS_FTRACE) endif +KASAN_SANITIZE_early_32.o := n +KASAN_SANITIZE_cputable.o := n +KASAN_SANITIZE_prom_init.o := n +KASAN_SANITIZE_btext.o := n + +ifdef CONFIG_KASAN +CFLAGS_early_32.o += -DDISABLE_BRANCH_PROFILING +CFLAGS_cputable.o += -DDISABLE_BRANCH_PROFILING +CFLAGS_prom_init.o += -DDISABLE_BRANCH_PROFILING +CFLAGS_btext.o += -DDISABLE_BRANCH_PROFILING +endif + obj-y := cputable.o ptrace.o syscalls.o \ irq.o align.o signal_32.o pmc.o vdso.o \ process.o systbl.o idle.o \ diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 47a4de434c22a..c55f9c27bf798 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -8,6 +8,14 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) CFLAGS_REMOVE_code-patching.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE) +KASAN_SANITIZE_code-patching.o := n +KASAN_SANITIZE_feature-fixups.o := n + +ifdef CONFIG_KASAN +CFLAGS_code-patching.o += -DDISABLE_BRANCH_PROFILING +CFLAGS_feature-fixups.o += -DDISABLE_BRANCH_PROFILING +endif + obj-y += alloc.o code-patching.o feature-fixups.o ifndef CONFIG_KASAN diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index c7d5f37f7c52c..62735f335bce3 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -5,6 +5,12 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) +KASAN_SANITIZE_ppc_mmu_32.o := n + +ifdef CONFIG_KASAN +CFLAGS_ppc_mmu_32.o += -DDISABLE_BRANCH_PROFILING +endif + obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ pgtable-frag.o \ diff --git a/arch/powerpc/platforms/powermac/Makefile b/arch/powerpc/platforms/powermac/Makefile index 20ebf35d79133..f4247ade71caf 100644 --- a/arch/powerpc/platforms/powermac/Makefile +++ b/arch/powerpc/platforms/powermac/Makefile @@ -2,6 +2,12 @@ CFLAGS_bootx_init.o += -fPIC CFLAGS_bootx_init.o += $(call cc-option, -fno-stack-protector) +KASAN_SANITIZE_bootx_init.o := n + +ifdef CONFIG_KASAN +CFLAGS_bootx_init.o += -DDISABLE_BRANCH_PROFILING +endif + ifdef CONFIG_FUNCTION_TRACER # Do not trace early boot code CFLAGS_REMOVE_bootx_init.o = $(CC_FLAGS_FTRACE) diff --git a/arch/powerpc/purgatory/Makefile b/arch/powerpc/purgatory/Makefile index 4314ba5baf435..7c6d8b14f4403 100644 --- a/arch/powerpc/purgatory/Makefile +++ b/arch/powerpc/purgatory/Makefile @@ -1,4 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 + +KASAN_SANITIZE := n + targets += trampoline.o purgatory.ro kexec-purgatory.c LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index 3050f9323254f..f142570ad8606 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -7,6 +7,7 @@ subdir-ccflags-y := $(call cc-disable-warning, builtin-requires-header) GCOV_PROFILE := n KCOV_INSTRUMENT := n UBSAN_SANITIZE := n +KASAN_SANITIZE := n # Disable ftrace for the entire directory ORIG_CFLAGS := $(KBUILD_CFLAGS) From 
2edb16efc899f9c232e2d880930b855e4cf55df4 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 16:23:34 +0000
Subject: [PATCH 130/205] powerpc/32: Add KASAN support

This patch adds KASAN support for PPC32. The following patch
will add an early activation of the hash table for book3s. Until
then, a warning will be raised if trying to use KASAN on a hash 6xx.

To support KASAN, this patch initialises the MMU mappings for accessing
the KASAN shadow area defined in a previous patch.

An early mapping is set as soon as the kernel code has been relocated to
its definitive place.

Then the definitive mapping is set once paging is initialised.

For modules, the shadow area is allocated at module_alloc().

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/Kconfig                  |   1 +
 arch/powerpc/include/asm/kasan.h      |   9 ++
 arch/powerpc/kernel/head_32.S         |   3 +
 arch/powerpc/kernel/head_40x.S        |   3 +
 arch/powerpc/kernel/head_44x.S        |   3 +
 arch/powerpc/kernel/head_8xx.S        |   3 +
 arch/powerpc/kernel/head_fsl_booke.S  |   3 +
 arch/powerpc/kernel/setup-common.c    |   3 +
 arch/powerpc/mm/Makefile              |   1 +
 arch/powerpc/mm/init_32.c             |   3 +
 arch/powerpc/mm/kasan/kasan_init_32.c | 156 ++++++++++++++++++++++++++
 11 files changed, 188 insertions(+)
 create mode 100644 arch/powerpc/mm/kasan/kasan_init_32.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 515bd10d32f6d..2711aac24621b 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -173,6 +173,7 @@ config PPC
 	select GENERIC_TIME_VSYSCALL
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_JUMP_LABEL
+	select HAVE_ARCH_KASAN			if PPC32
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_MMAP_RND_BITS
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS	if COMPAT
diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h
index 05274dea31091..296e51c2f066b 100644
--- a/arch/powerpc/include/asm/kasan.h
+++ b/arch/powerpc/include/asm/kasan.h
@@ -27,5 +27,14 @@
 
 #define KASAN_SHADOW_SIZE	(KASAN_SHADOW_END - KASAN_SHADOW_START)
 
+#ifdef CONFIG_KASAN
+void kasan_early_init(void);
+void kasan_mmu_init(void);
+void kasan_init(void);
+#else
+static inline void kasan_init(void) { }
+static inline void kasan_mmu_init(void) { }
+#endif
+
 #endif /* __ASSEMBLY */
 #endif
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 40aec3f00a052..6e85171e513c8 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -969,6 +969,9 @@ start_here:
  * Do early platform-specific initialization,
  * and set up the MMU.
  */
+#ifdef CONFIG_KASAN
+	bl	kasan_early_init
+#endif
 	li	r3,0
 	mr	r4,r31
 	bl	machine_init
diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S
index a9c934f2319b5..efa219d2136e7 100644
--- a/arch/powerpc/kernel/head_40x.S
+++ b/arch/powerpc/kernel/head_40x.S
@@ -848,6 +848,9 @@ start_here:
 /*
  * Decide what sort of machine this is and initialize the MMU.
  */
+#ifdef CONFIG_KASAN
+	bl	kasan_early_init
+#endif
 	li	r3,0
 	mr	r4,r31
 	bl	machine_init
diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S
index 37117ab11584c..34a5df827b38e 100644
--- a/arch/powerpc/kernel/head_44x.S
+++ b/arch/powerpc/kernel/head_44x.S
@@ -203,6 +203,9 @@ _ENTRY(_start);
 /*
  * Decide what sort of machine this is and initialize the MMU.
*/ +#ifdef CONFIG_KASAN + bl kasan_early_init +#endif li r3,0 mr r4,r31 bl machine_init diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 03c73b4c64359..d25adb6ef2350 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -853,6 +853,9 @@ start_here: /* * Decide what sort of machine this is and initialize the MMU. */ +#ifdef CONFIG_KASAN + bl kasan_early_init +#endif li r3,0 mr r4,r31 bl machine_init diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 32332e24e4216..567e0ed45ca81 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -268,6 +268,9 @@ set_ivor: /* * Decide what sort of machine this is and initialize the MMU. */ +#ifdef CONFIG_KASAN + bl kasan_early_init +#endif mr r3,r30 mr r4,r31 bl machine_init diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 19d68a9b5f372..91d2c6970bdb9 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -67,6 +67,7 @@ #include #include #include +#include #include "setup.h" @@ -871,6 +872,8 @@ static void smp_setup_pacas(void) */ void __init setup_arch(char **cmdline_p) { + kasan_init(); + *cmdline_p = boot_command_line; /* Set a half-reasonable default so udelay does something sensible */ diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 62735f335bce3..d8c0ce9b25576 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -26,3 +26,4 @@ obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o obj-$(CONFIG_PPC_PTDUMP) += ptdump/ +obj-$(CONFIG_KASAN) += kasan/ diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 3eb4cb09749c6..c3121b6c8cbdf 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -46,6 +46,7 @@ #include #include #include +#include #include @@ -179,6 +180,8 @@ void __init MMU_init(void) btext_unmap(); #endif + kasan_mmu_init(); + setup_kup(); /* Shortly after that, the entire linear mapping will be available */ diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c new file mode 100644 index 0000000000000..42617fcad8289 --- /dev/null +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define DISABLE_BRANCH_PROFILING + +#include +#include +#include +#include +#include +#include +#include +#include + +static void kasan_populate_pte(pte_t *ptep, pgprot_t prot) +{ + unsigned long va = (unsigned long)kasan_early_shadow_page; + phys_addr_t pa = __pa(kasan_early_shadow_page); + int i; + + for (i = 0; i < PTRS_PER_PTE; i++, ptep++) + __set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 0); +} + +static int kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end) +{ + pmd_t *pmd; + unsigned long k_cur, k_next; + + pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start); + + for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++) { + pte_t *new; + + k_next = pgd_addr_end(k_cur, k_end); + if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte) + continue; + + new = pte_alloc_one_kernel(&init_mm); + + if (!new) + return -ENOMEM; + kasan_populate_pte(new, PAGE_KERNEL_RO); + pmd_populate_kernel(&init_mm, pmd, new); + } + return 0; +} + +static void __ref *kasan_get_one_page(void) +{ + if (slab_is_available()) + return (void 
*)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+
+	return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+}
+
+static int __ref kasan_init_region(void *start, size_t size)
+{
+	unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start);
+	unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size);
+	unsigned long k_cur;
+	int ret;
+	void *block = NULL;
+
+	ret = kasan_init_shadow_page_tables(k_start, k_end);
+	if (ret)
+		return ret;
+
+	if (!slab_is_available())
+		block = memblock_alloc(k_end - k_start, PAGE_SIZE);
+
+	for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE) {
+		pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur);
+		void *va = block ? block + k_cur - k_start : kasan_get_one_page();
+		pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL);
+
+		if (!va)
+			return -ENOMEM;
+
+		__set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0);
+	}
+	flush_tlb_kernel_range(k_start, k_end);
+	return 0;
+}
+
+static void __init kasan_remap_early_shadow_ro(void)
+{
+	kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL_RO);
+
+	flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END);
+}
+
+void __init kasan_mmu_init(void)
+{
+	int ret;
+	struct memblock_region *reg;
+
+	for_each_memblock(memory, reg) {
+		phys_addr_t base = reg->base;
+		phys_addr_t top = min(base + reg->size, total_lowmem);
+
+		if (base >= top)
+			continue;
+
+		ret = kasan_init_region(__va(base), top - base);
+		if (ret)
+			panic("kasan: kasan_init_region() failed");
+	}
+}
+
+void __init kasan_init(void)
+{
+	kasan_remap_early_shadow_ro();
+
+	clear_page(kasan_early_shadow_page);
+
+	/* At this point kasan is fully initialized. Enable error messages */
+	init_task.kasan_depth = 0;
+	pr_info("KASAN init done\n");
+}
+
+#ifdef CONFIG_MODULES
+void *module_alloc(unsigned long size)
+{
+	void *base = vmalloc_exec(size);
+
+	if (!base)
+		return NULL;
+
+	if (!kasan_init_region(base, size))
+		return base;
+
+	vfree(base);
+
+	return NULL;
+}
+#endif
+
+void __init kasan_early_init(void)
+{
+	unsigned long addr = KASAN_SHADOW_START;
+	unsigned long end = KASAN_SHADOW_END;
+	unsigned long next;
+	pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(addr), addr), addr);
+
+	BUILD_BUG_ON(KASAN_SHADOW_START & ~PGDIR_MASK);
+
+	kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL);
+
+	do {
+		next = pgd_addr_end(addr, end);
+		pmd_populate_kernel(&init_mm, pmd, kasan_early_shadow_pte);
+	} while (pmd++, addr = next, addr != end);
+
+	if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
+		WARN(true, "KASAN not supported on hash 6xx");
+}

From 72f208c6a8f7bc78ef5248babd9e6ed6302bd2a0 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 16:23:35 +0000
Subject: [PATCH 131/205] powerpc/32s: move hash code patching out of
 MMU_init_hw()

For KASAN, hash table handling will be activated early for accessing
the KASAN shadow areas.

In order to avoid any modification of the hash functions while they are
still used with the early hash table, the code patching is moved out of
MMU_init_hw() and put close to the big-bang switch to the final hash
table.
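For reference, a sketch of what each modify_instruction_site() call below
amounts to (an assumed simplification; the real helper locates and rewrites
a recorded instruction site in kernel text):

	/*
	 * Only the bits selected by the mask are replaced, e.g. a 16-bit
	 * immediate (mask 0xffff) or a shift field (mask 0x7c0):
	 */
	static unsigned int patch_insn_field(unsigned int insn,
					     unsigned int mask,
					     unsigned int value)
	{
		return (insn & ~mask) | (value & mask);
	}

Rewriting an instruction that may still be executed against the early hash
table would be racy, hence gathering all the patching at the single
switch-over point.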
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.S | 3 +++ arch/powerpc/mm/book3s32/mmu.c | 36 +++++++++++++++++++++------------- arch/powerpc/mm/mmu_decl.h | 1 + 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 6e85171e513c8..5958ea6859686 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -977,6 +977,9 @@ start_here: bl machine_init bl __save_cpu_setup bl MMU_init +BEGIN_MMU_FTR_SECTION + bl MMU_init_hw_patch +END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) /* * Go back to running unmapped so we can load up new values diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 1db55159031ce..165529cc9087c 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -39,6 +39,7 @@ struct hash_pte *Hash, *Hash_end; unsigned long Hash_size, Hash_mask; unsigned long _SDR1; +static unsigned int hash_mb, hash_mb2; struct ppc_bat BATS[8][2]; /* 8 pairs of IBAT, DBAT */ @@ -308,7 +309,6 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, */ void __init MMU_init_hw(void) { - unsigned int hmask, mb, mb2; unsigned int n_hpteg, lg_n_hpteg; if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) @@ -351,20 +351,30 @@ void __init MMU_init_hw(void) (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash); - /* - * Patch up the instructions in hashtable.S:create_hpte - */ - if ( ppc_md.progress ) ppc_md.progress("hash:patch", 0x345); Hash_mask = n_hpteg - 1; - hmask = Hash_mask >> (16 - LG_HPTEG_SIZE); - mb2 = mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg; + hash_mb2 = hash_mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg; if (lg_n_hpteg > 16) - mb2 = 16 - LG_HPTEG_SIZE; + hash_mb2 = 16 - LG_HPTEG_SIZE; +} + +void __init MMU_init_hw_patch(void) +{ + unsigned int hmask = Hash_mask >> (16 - LG_HPTEG_SIZE); + if (ppc_md.progress) + ppc_md.progress("hash:patch", 0x345); + if (ppc_md.progress) + ppc_md.progress("hash:done", 0x205); + + /* WARNING: Make sure nothing can trigger a KASAN check past this point */ + + /* + * Patch up the instructions in hashtable.S:create_hpte + */ modify_instruction_site(&patch__hash_page_A0, 0xffff, ((unsigned int)Hash - PAGE_OFFSET) >> 16); - modify_instruction_site(&patch__hash_page_A1, 0x7c0, mb << 6); - modify_instruction_site(&patch__hash_page_A2, 0x7c0, mb2 << 6); + modify_instruction_site(&patch__hash_page_A1, 0x7c0, hash_mb << 6); + modify_instruction_site(&patch__hash_page_A2, 0x7c0, hash_mb2 << 6); modify_instruction_site(&patch__hash_page_B, 0xffff, hmask); modify_instruction_site(&patch__hash_page_C, 0xffff, hmask); @@ -373,11 +383,9 @@ void __init MMU_init_hw(void) */ modify_instruction_site(&patch__flush_hash_A0, 0xffff, ((unsigned int)Hash - PAGE_OFFSET) >> 16); - modify_instruction_site(&patch__flush_hash_A1, 0x7c0, mb << 6); - modify_instruction_site(&patch__flush_hash_A2, 0x7c0, mb2 << 6); + modify_instruction_site(&patch__flush_hash_A1, 0x7c0, hash_mb << 6); + modify_instruction_site(&patch__flush_hash_A2, 0x7c0, hash_mb2 << 6); modify_instruction_site(&patch__flush_hash_B, 0xffff, hmask); - - if ( ppc_md.progress ) ppc_md.progress("hash:done", 0x205); } void setup_initial_memory_limit(phys_addr_t first_memblock_base, diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 74ff61dabcb1d..d726ff7760546 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -130,6 +130,7 @@ extern void wii_memory_fixups(void); */ #ifdef CONFIG_PPC32 
extern void MMU_init_hw(void);
+void MMU_init_hw_patch(void);
 unsigned long mmu_mapin_ram(unsigned long base, unsigned long top);
 #endif

From 215b823707ce4e8e52b106915f70357fa474c669 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 16:23:36 +0000
Subject: [PATCH 132/205] powerpc/32s: set up an early static hash table for
 KASAN.

KASAN requires early activation of the hash table, before memblock()
functions are available.

This patch implements an early hash_table statically defined in
__initdata.

During early boot, a single page table is used.

For hash32, when doing the final init, one page table is allocated
for each PGD entry because of the _PAGE_HASHPTE flag which can't be
common to several virt pages. This is done after memblock becomes
available but before switching to the final hash table, otherwise
there are issues with TLB flushing due to the shared entries.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/head_32.S         | 70 +++++++++++++++++----------
 arch/powerpc/mm/kasan/kasan_init_32.c | 23 ++++++++-
 arch/powerpc/mm/mmu_decl.h            |  1 +
 3 files changed, 68 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 5958ea6859686..73288df1c5d66 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -160,6 +160,10 @@ __after_mmu_off:
 	bl	flush_tlbs
 
 	bl	initial_bats
+	bl	load_segment_registers
+#ifdef CONFIG_KASAN
+	bl	early_hash_table
+#endif
 #if defined(CONFIG_BOOTX_TEXT)
 	bl	setup_disp_bat
 #endif
@@ -205,7 +209,7 @@ __after_mmu_off:
 */
turn_on_mmu:
 	mfmsr	r0
-	ori	r0,r0,MSR_DR|MSR_IR
+	ori	r0,r0,MSR_DR|MSR_IR|MSR_RI
 	mtspr	SPRN_SRR1,r0
 	lis	r0,start_here@h
 	ori	r0,r0,start_here@l
@@ -884,11 +888,24 @@ _ENTRY(__restore_cpu_setup)
 	blr
 #endif /* !defined(CONFIG_PPC_BOOK3S_32) */
 
-
/*
 * Load stuff into the MMU.  Intended to be called with
 * IR=0 and DR=0.
 */
+#ifdef CONFIG_KASAN
+early_hash_table:
+	sync			/* Force all PTE updates to finish */
+	isync
+	tlbia			/* Clear all TLB entries */
+	sync			/* wait for tlbia/tlbie to finish */
+	TLBSYNC			/* ... on all CPUs */
+	/* Load the SDR1 register (hash table base & size) */
+	lis	r6, early_hash - PAGE_OFFSET@h
+	ori	r6, r6, 3	/* 256kB table */
+	mtspr	SPRN_SDR1, r6
+	blr
+#endif
+
load_up_mmu:
 	sync			/* Force all PTE updates to finish */
 	isync
 	tophys(r6,r6)
 	lwz	r6,_SDR1@l(r6)
 	mtspr	SPRN_SDR1,r6
-	li	r0, NUM_USER_SEGMENTS /* load up user segment register values */
-	mtctr	r0		/* for context 0 */
-	li	r3, 0		/* Kp = 0, Ks = 0, VSID = 0 */
-#ifdef CONFIG_PPC_KUEP
-	oris	r3, r3, SR_NX@h	/* Set Nx */
-#endif
-#ifdef CONFIG_PPC_KUAP
-	oris	r3, r3, SR_KS@h	/* Set Ks */
-#endif
-	li	r4,0
-3:	mtsrin	r3,r4
-	addi	r3,r3,0x111	/* increment VSID */
-	addis	r4,r4,0x1000	/* address of next segment */
-	bdnz	3b
-	li	r0, 16 - NUM_USER_SEGMENTS /* load up kernel segment registers */
-	mtctr	r0		/* for context 0 */
-	rlwinm	r3, r3, 0, ~SR_NX	/* Nx = 0 */
-	rlwinm	r3, r3, 0, ~SR_KS	/* Ks = 0 */
-	oris	r3, r3, SR_KP@h	/* Kp = 1 */
-3:	mtsrin	r3, r4
-	addi	r3, r3, 0x111	/* increment VSID */
-	addis	r4, r4, 0x1000	/* address of next segment */
-	bdnz	3b
 
/* Load the BAT registers with the values set up by MMU_init.
   MMU_init takes care of whether we're on a 601 or not.
*/ @@ -944,6 +938,32 @@ BEGIN_MMU_FTR_SECTION END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) blr +load_segment_registers: + li r0, NUM_USER_SEGMENTS /* load up user segment register values */ + mtctr r0 /* for context 0 */ + li r3, 0 /* Kp = 0, Ks = 0, VSID = 0 */ +#ifdef CONFIG_PPC_KUEP + oris r3, r3, SR_NX@h /* Set Nx */ +#endif +#ifdef CONFIG_PPC_KUAP + oris r3, r3, SR_KS@h /* Set Ks */ +#endif + li r4, 0 +3: mtsrin r3, r4 + addi r3, r3, 0x111 /* increment VSID */ + addis r4, r4, 0x1000 /* address of next segment */ + bdnz 3b + li r0, 16 - NUM_USER_SEGMENTS /* load up kernel segment registers */ + mtctr r0 /* for context 0 */ + rlwinm r3, r3, 0, ~SR_NX /* Nx = 0 */ + rlwinm r3, r3, 0, ~SR_KS /* Ks = 0 */ + oris r3, r3, SR_KP@h /* Kp = 1 */ +3: mtsrin r3, r4 + addi r3, r3, 0x111 /* increment VSID */ + addis r4, r4, 0x1000 /* address of next segment */ + bdnz 3b + blr + /* * This is where the main kernel code starts. */ diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index 42617fcad8289..ba83614870751 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -94,6 +94,13 @@ void __init kasan_mmu_init(void) int ret; struct memblock_region *reg; + if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) { + ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END); + + if (ret) + panic("kasan: kasan_init_shadow_page_tables() failed"); + } + for_each_memblock(memory, reg) { phys_addr_t base = reg->base; phys_addr_t top = min(base + reg->size, total_lowmem); @@ -135,6 +142,20 @@ void *module_alloc(unsigned long size) } #endif +#ifdef CONFIG_PPC_BOOK3S_32 +u8 __initdata early_hash[256 << 10] __aligned(256 << 10) = {0}; + +static void __init kasan_early_hash_table(void) +{ + modify_instruction_site(&patch__hash_page_A0, 0xffff, __pa(early_hash) >> 16); + modify_instruction_site(&patch__flush_hash_A0, 0xffff, __pa(early_hash) >> 16); + + Hash = (struct hash_pte *)early_hash; +} +#else +static void __init kasan_early_hash_table(void) {} +#endif + void __init kasan_early_init(void) { unsigned long addr = KASAN_SHADOW_START; @@ -152,5 +173,5 @@ void __init kasan_early_init(void) } while (pmd++, addr = next, addr != end); if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) - WARN(true, "KASAN not supported on hash 6xx"); + kasan_early_hash_table(); } diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index d726ff7760546..31fce3914ddc3 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -106,6 +106,7 @@ extern unsigned int rtas_data, rtas_size; struct hash_pte; extern struct hash_pte *Hash, *Hash_end; extern unsigned long Hash_size, Hash_mask; +extern u8 early_hash[]; #endif /* CONFIG_PPC32 */ From da3a3b0a0e38377c98946420acdc7d4ca38cff47 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 26 Apr 2019 16:23:37 +0000 Subject: [PATCH 133/205] powerpc/32s: map kasan zero shadow with PAGE_READONLY instead of PAGE_KERNEL_RO For hash32, the zero shadow page gets mapped with PAGE_READONLY instead of PAGE_KERNEL_RO, because the PP bits don't provide a RO kernel, so PAGE_KERNEL_RO is equivalent to PAGE_KERNEL. By using PAGE_READONLY, the page is RO for both kernel and user, but this is not a security issue as it contains only zeroes. 
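For context, a summary of the hash-32 PP protection encodings this refers
to, assuming the usual Linux key setup (kernel Ks = 0, user Kp = 1); this
table is recalled from the classic PowerPC architecture and is an informal
summary, not a normative reference:

	/*
	 * PP   kernel   user   typical use
	 * 00   RW       none   PAGE_KERNEL (what PAGE_KERNEL_RO degrades to)
	 * 01   RW       RO
	 * 10   RW       RW     PAGE_SHARED
	 * 11   RO       RO     PAGE_READONLY
	 *
	 * There is no "kernel RO, user no access" combination, which is
	 * why PAGE_KERNEL_RO cannot be honoured on hash-32.
	 */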
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/kasan/kasan_init_32.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index ba83614870751..0d62be3cba47b 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -39,7 +39,10 @@ static int kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_
 
 		if (!new)
 			return -ENOMEM;
-		kasan_populate_pte(new, PAGE_KERNEL_RO);
+		if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
+			kasan_populate_pte(new, PAGE_READONLY);
+		else
+			kasan_populate_pte(new, PAGE_KERNEL_RO);
 		pmd_populate_kernel(&init_mm, pmd, new);
 	}
 	return 0;
@@ -84,7 +87,10 @@ static int __ref kasan_init_region(void *start, size_t size)
 
 static void __init kasan_remap_early_shadow_ro(void)
 {
-	kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL_RO);
+	if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
+		kasan_populate_pte(kasan_early_shadow_pte, PAGE_READONLY);
+	else
+		kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL_RO);
 
 	flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END);
 }

From 57e0491b58fa2a217029b696511499008852a642 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 16:36:36 +0000
Subject: [PATCH 134/205] powerpc/32s: drop Hash_end

Hash_end has never been used, drop it.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/book3s32/mmu.c | 4 +---
 arch/powerpc/mm/mmu_decl.h     | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 165529cc9087c..03265de05637d 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -36,7 +36,7 @@
 
 #include
 
-struct hash_pte *Hash, *Hash_end;
+struct hash_pte *Hash;
 unsigned long Hash_size, Hash_mask;
 unsigned long _SDR1;
 static unsigned int hash_mb, hash_mb2;
@@ -345,8 +345,6 @@ void __init MMU_init_hw(void)
 		__func__, Hash_size, Hash_size);
 
 	_SDR1 = __pa(Hash) | SDR1_LOW_BITS;
-	Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size);
-
 	printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n",
 	       (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash);
 
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 31fce3914ddc3..7b8833d695d16 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -104,7 +104,7 @@ extern int __map_without_bats;
 extern unsigned int rtas_data, rtas_size;
 
 struct hash_pte;
-extern struct hash_pte *Hash, *Hash_end;
+extern struct hash_pte *Hash;
 extern unsigned long Hash_size, Hash_mask;
 extern u8 early_hash[];

From 8f156c23f4c04ca51961cd1f6a0edbc80caa2683 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 16:36:37 +0000
Subject: [PATCH 135/205] powerpc/32s: don't try to print hash table address.

Due to %p, (ptrval) is printed in lieu of the hash table address.

Showing the hash table address isn't an operational need, so just
don't print it.
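Concretely, the message removed below would have rendered along these
lines (the values are made up for illustration):

	Total memory = 512MB; using 256kB for hash table (at (ptrval))

since a hashed or placeholder token is substituted for the %p argument,
the "(at %p)" part carries no information.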
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/book3s32/mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 03265de05637d..131cd3acb6b89 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -345,8 +345,8 @@ void __init MMU_init_hw(void)
 		__func__, Hash_size, Hash_size);
 
 	_SDR1 = __pa(Hash) | SDR1_LOW_BITS;
-	printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n",
-	       (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash);
+	pr_info("Total memory = %lldMB; using %ldkB for hash table\n",
+		(unsigned long long)(total_memory >> 20), Hash_size >> 10);
 
 	Hash_mask = n_hpteg - 1;

From e4dccf9092ab48a6f902003b9558c0e45d0e849a Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Fri, 26 Apr 2019 16:36:39 +0000
Subject: [PATCH 136/205] powerpc/mm: print hash info in a helper

Reduce #ifdef mess by defining a helper to print hash info at startup.

In the meantime, remove the display of the hash table address to avoid
leaking unnecessary information.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/setup-common.c    | 22 +---------------------
 arch/powerpc/mm/book3s32/mmu.c        |  9 ++++++++-
 arch/powerpc/mm/book3s64/hash_utils.c | 13 +++++++++++++
 arch/powerpc/mm/mmu_decl.h            |  5 ++++-
 4 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 91d2c6970bdb9..3f8805d3c0c99 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -800,12 +800,6 @@ void arch_setup_pdev_archdata(struct platform_device *pdev)
 static __init void print_system_info(void)
 {
 	pr_info("-----------------------------------------------------\n");
-#ifdef CONFIG_PPC_BOOK3S_64
-	pr_info("ppc64_pft_size = 0x%llx\n", ppc64_pft_size);
-#endif
-#ifdef CONFIG_PPC_BOOK3S_32
-	pr_info("Hash_size = 0x%lx\n", Hash_size);
-#endif
 	pr_info("phys_mem_size = 0x%llx\n",
 		(unsigned long long)memblock_phys_mem_size());
 
@@ -827,21 +821,7 @@ static __init void print_system_info(void)
 	pr_info("firmware_features = 0x%016lx\n", powerpc_firmware_features);
 #endif
-#ifdef CONFIG_PPC_BOOK3S_64
-	if (htab_address)
-		pr_info("htab_address = 0x%p\n", htab_address);
-	if (htab_hash_mask)
-		pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask);
-	pr_info("kernel vmalloc start = 0x%lx\n", KERN_VIRT_START);
-	pr_info("kernel IO start = 0x%lx\n", KERN_IO_START);
-	pr_info("kernel vmemmap start = 0x%lx\n", (unsigned long)vmemmap);
-#endif
-#ifdef CONFIG_PPC_BOOK3S_32
-	if (Hash)
-		pr_info("Hash = 0x%p\n", Hash);
-	if (Hash_mask)
-		pr_info("Hash_mask = 0x%lx\n", Hash_mask);
-#endif
+	print_system_hash_info();
 
 	if (PHYSICAL_START > 0)
 		pr_info("physical_start = 0x%llx\n",
diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 131cd3acb6b89..615f78d355366 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -37,7 +37,7 @@
 #include
 
 struct hash_pte *Hash;
-unsigned long Hash_size, Hash_mask;
+static unsigned long Hash_size, Hash_mask;
 unsigned long _SDR1;
 static unsigned int hash_mb, hash_mb2;
 
@@ -401,6 +401,13 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
 	memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000));
 }
 
+void __init print_system_hash_info(void)
+{
+	pr_info("Hash_size = 0x%lx\n", Hash_size);
+	if (Hash_mask)
+		pr_info("Hash_mask = 0x%lx\n", Hash_mask);
+}
+
 #ifdef CONFIG_PPC_KUEP
 void __init setup_kuep(bool disabled)
 {
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index f0ce860a69ace..919a861a8ec06 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -66,6 +66,8 @@
 #include
 #include
 
+#include
+
 #ifdef DEBUG
 #define DBG(fmt...) udbg_printf(fmt)
 #else
@@ -1945,3 +1947,14 @@ static int __init hash64_debugfs(void)
 }
 machine_device_initcall(pseries, hash64_debugfs);
 #endif /* CONFIG_DEBUG_FS */
+
+void __init print_system_hash_info(void)
+{
+	pr_info("ppc64_pft_size = 0x%llx\n", ppc64_pft_size);
+
+	if (htab_hash_mask)
+		pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask);
+	pr_info("kernel vmalloc start = 0x%lx\n", KERN_VIRT_START);
+	pr_info("kernel IO start = 0x%lx\n", KERN_IO_START);
+	pr_info("kernel vmemmap start = 0x%lx\n", (unsigned long)vmemmap);
+}
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 7b8833d695d16..7bac0aa2026a1 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -83,6 +83,8 @@ static inline void _tlbivax_bcast(unsigned long address, unsigned int pid,
 }
 #endif
 
+static inline void print_system_hash_info(void) {}
+
 #else /* CONFIG_PPC_MMU_NOHASH */
 
 extern void hash_preload(struct mm_struct *mm, unsigned long ea,
@@ -92,6 +94,8 @@ extern void hash_preload(struct mm_struct *mm, unsigned long ea,
 extern void _tlbie(unsigned long address);
 extern void _tlbia(void);
 
+void print_system_hash_info(void);
+
 #endif /* CONFIG_PPC_MMU_NOHASH */
 
 #ifdef CONFIG_PPC32
@@ -105,7 +109,6 @@ extern unsigned int rtas_data, rtas_size;
 
 struct hash_pte;
 extern struct hash_pte *Hash;
-extern unsigned long Hash_size, Hash_mask;
 extern u8 early_hash[];
 
 #endif /* CONFIG_PPC32 */

From 8a23fdec3dbdc8bfde6f806d36e773236dab6663 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Tue, 30 Apr 2019 12:38:50 +0000
Subject: [PATCH 137/205] powerpc/32: Refactor EXCEPTION entry macros for
 head_8xx.S and head_32.S

EXCEPTION_PROLOG is similar in head_8xx.S and head_32.S.

This patch creates head_32.h and moves the EXCEPTION_PROLOG macro into
it. It also converts it from a GCC macro to a GAS macro in order to
ease refactoring with 40x later, since GAS macros allow the use of
#ifdef/#else/#endif inside them. And it also has the advantage of not
requiring the ugly "; \" at the end of each line.

This patch also moves the EXCEPTION() and EXC_XFER_XXXX() macros, which
are also similar, and splits START_EXCEPTION() out of EXCEPTION().

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/head_32.S  |  99 +--------------------------
 arch/powerpc/kernel/head_32.h  | 118 +++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/head_8xx.S |  98 +--------------------------
 3 files changed, 122 insertions(+), 193 deletions(-)
 create mode 100644 arch/powerpc/kernel/head_32.h

diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 73288df1c5d66..f98e6b4612380 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -37,6 +37,8 @@
 #include
 #include
 
+#include "head_32.h"
+
 /* 601 only have IBAT; cr0.eq is set on 601 when using this macro */
 #define LOAD_BAT(n, reg, RA, RB)	\
 /* see the comment for clear_bats() -- Cort */ \
@@ -246,103 +248,6 @@ __secondary_hold_spinloop:
 __secondary_hold_acknowledge:
 	.long	-1
 
-/*
- * Exception entry code.  This code runs with address translation
- * turned off, i.e. using physical addresses.
- * We assume sprg3 has the physical address of the current - * task's thread_struct. - */ -#define EXCEPTION_PROLOG \ - mtspr SPRN_SPRG_SCRATCH0,r10; \ - mtspr SPRN_SPRG_SCRATCH1,r11; \ - mfcr r10; \ - EXCEPTION_PROLOG_1; \ - EXCEPTION_PROLOG_2 - -#define EXCEPTION_PROLOG_1 \ - mfspr r11,SPRN_SRR1; /* check whether user or kernel */ \ - andi. r11,r11,MSR_PR; \ - tophys(r11,r1); /* use tophys(r1) if kernel */ \ - beq 1f; \ - mfspr r11,SPRN_SPRG_THREAD; \ - lwz r11,TASK_STACK-THREAD(r11); \ - addi r11,r11,THREAD_SIZE; \ - tophys(r11,r11); \ -1: subi r11,r11,INT_FRAME_SIZE /* alloc exc. frame */ - - -#define EXCEPTION_PROLOG_2 \ - stw r10,_CCR(r11); /* save registers */ \ - stw r12,GPR12(r11); \ - stw r9,GPR9(r11); \ - mfspr r10,SPRN_SPRG_SCRATCH0; \ - stw r10,GPR10(r11); \ - mfspr r12,SPRN_SPRG_SCRATCH1; \ - stw r12,GPR11(r11); \ - mflr r10; \ - stw r10,_LINK(r11); \ - mfspr r12,SPRN_SRR0; \ - mfspr r9,SPRN_SRR1; \ - stw r1,GPR1(r11); \ - stw r1,0(r11); \ - tovirt(r1,r11); /* set new kernel sp */ \ - li r10,MSR_KERNEL & ~(MSR_IR|MSR_DR); /* can take exceptions */ \ - MTMSRD(r10); /* (except for mach check in rtas) */ \ - stw r0,GPR0(r11); \ - lis r10,STACK_FRAME_REGS_MARKER@ha; /* exception frame marker */ \ - addi r10,r10,STACK_FRAME_REGS_MARKER@l; \ - stw r10,8(r11); \ - SAVE_4GPRS(3, r11); \ - SAVE_2GPRS(7, r11) - -/* - * Note: code which follows this uses cr0.eq (set if from kernel), - * r11, r12 (SRR0), and r9 (SRR1). - * - * Note2: once we have set r1 we are in a position to take exceptions - * again, and we could thus set MSR:RI at that point. - */ - -/* - * Exception vectors. - */ -#define EXCEPTION(n, label, hdlr, xfer) \ - . = n; \ - DO_KVM n; \ -label: \ - EXCEPTION_PROLOG; \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - xfer(n, hdlr) - -#define EXC_XFER_TEMPLATE(n, hdlr, trap, copyee, tfer, ret) \ - li r10,trap; \ - stw r10,_TRAP(r11); \ - li r10,MSR_KERNEL; \ - copyee(r10, r9); \ - bl tfer; \ -i##n: \ - .long hdlr; \ - .long ret - -#define COPY_EE(d, s) rlwimi d,s,0,16,16 -#define NOCOPY(d, s) - -#define EXC_XFER_STD(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n, NOCOPY, transfer_to_handler_full, \ - ret_from_except_full) - -#define EXC_XFER_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n+1, NOCOPY, transfer_to_handler, \ - ret_from_except) - -#define EXC_XFER_EE(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n, COPY_EE, transfer_to_handler_full, \ - ret_from_except_full) - -#define EXC_XFER_EE_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n+1, COPY_EE, transfer_to_handler, \ - ret_from_except) - /* System reset */ /* core99 pmac starts the seconary here by changing the vector, and putting it back to what it was (unknown_exception) when done. */ diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h new file mode 100644 index 0000000000000..7456e2a45acc1 --- /dev/null +++ b/arch/powerpc/kernel/head_32.h @@ -0,0 +1,118 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __HEAD_32_H__ +#define __HEAD_32_H__ + +#include /* for STACK_FRAME_REGS_MARKER */ + +/* + * Exception entry code. This code runs with address translation + * turned off, i.e. using physical addresses. + * We assume sprg3 has the physical address of the current + * task's thread_struct. + */ + +.macro EXCEPTION_PROLOG + mtspr SPRN_SPRG_SCRATCH0,r10 + mtspr SPRN_SPRG_SCRATCH1,r11 + mfcr r10 + EXCEPTION_PROLOG_1 + EXCEPTION_PROLOG_2 +.endm + +.macro EXCEPTION_PROLOG_1 + mfspr r11,SPRN_SRR1 /* check whether user or kernel */ + andi. 
r11,r11,MSR_PR + tophys(r11,r1) /* use tophys(r1) if kernel */ + beq 1f + mfspr r11,SPRN_SPRG_THREAD + lwz r11,TASK_STACK-THREAD(r11) + addi r11,r11,THREAD_SIZE + tophys(r11,r11) +1: subi r11,r11,INT_FRAME_SIZE /* alloc exc. frame */ +.endm + +.macro EXCEPTION_PROLOG_2 + stw r10,_CCR(r11) /* save registers */ + stw r12,GPR12(r11) + stw r9,GPR9(r11) + mfspr r10,SPRN_SPRG_SCRATCH0 + stw r10,GPR10(r11) + mfspr r12,SPRN_SPRG_SCRATCH1 + stw r12,GPR11(r11) + mflr r10 + stw r10,_LINK(r11) + mfspr r12,SPRN_SRR0 + mfspr r9,SPRN_SRR1 + stw r1,GPR1(r11) + stw r1,0(r11) + tovirt(r1,r11) /* set new kernel sp */ + li r10,MSR_KERNEL & ~(MSR_IR|MSR_DR) /* can take exceptions */ + MTMSRD(r10) /* (except for mach check in rtas) */ + stw r0,GPR0(r11) + lis r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ + addi r10,r10,STACK_FRAME_REGS_MARKER@l + stw r10,8(r11) + SAVE_4GPRS(3, r11) + SAVE_2GPRS(7, r11) +.endm + +/* + * Note: code which follows this uses cr0.eq (set if from kernel), + * r11, r12 (SRR0), and r9 (SRR1). + * + * Note2: once we have set r1 we are in a position to take exceptions + * again, and we could thus set MSR:RI at that point. + */ + +/* + * Exception vectors. + */ +#ifdef CONFIG_PPC_BOOK3S +#define START_EXCEPTION(n, label) \ + . = n; \ + DO_KVM n; \ +label: + +#else +#define START_EXCEPTION(n, label) \ + . = n; \ +label: + +#endif + +#define EXCEPTION(n, label, hdlr, xfer) \ + START_EXCEPTION(n, label) \ + EXCEPTION_PROLOG; \ + addi r3,r1,STACK_FRAME_OVERHEAD; \ + xfer(n, hdlr) + +#define EXC_XFER_TEMPLATE(n, hdlr, trap, copyee, tfer, ret) \ + li r10,trap; \ + stw r10,_TRAP(r11); \ + li r10,MSR_KERNEL; \ + copyee(r10, r9); \ + bl tfer; \ +i##n: \ + .long hdlr; \ + .long ret + +#define COPY_EE(d, s) rlwimi d,s,0,MSR_EE +#define NOCOPY(d, s) + +#define EXC_XFER_STD(n, hdlr) \ + EXC_XFER_TEMPLATE(n, hdlr, n, NOCOPY, transfer_to_handler_full, \ + ret_from_except_full) + +#define EXC_XFER_LITE(n, hdlr) \ + EXC_XFER_TEMPLATE(n, hdlr, n+1, NOCOPY, transfer_to_handler, \ + ret_from_except) + +#define EXC_XFER_EE(n, hdlr) \ + EXC_XFER_TEMPLATE(n, hdlr, n, COPY_EE, transfer_to_handler_full, \ + ret_from_except_full) + +#define EXC_XFER_EE_LITE(n, hdlr) \ + EXC_XFER_TEMPLATE(n, hdlr, n+1, COPY_EE, transfer_to_handler, \ + ret_from_except) + +#endif /* __HEAD_32_H__ */ diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index d25adb6ef2350..14c3eb3267b82 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -33,6 +33,8 @@ #include #include +#include "head_32.h" + #if CONFIG_TASK_SIZE <= 0x80000000 && CONFIG_PAGE_OFFSET >= 0x80000000 /* By simply checking Address >= 0x80000000, we know if its a kernel address */ #define SIMPLE_KERNEL_ADDRESS 1 @@ -123,102 +125,6 @@ instruction_counter: .space 4 #endif -/* - * Exception entry code. This code runs with address translation - * turned off, i.e. using physical addresses. - * We assume sprg3 has the physical address of the current - * task's thread_struct. - */ -#define EXCEPTION_PROLOG \ - mtspr SPRN_SPRG_SCRATCH0, r10; \ - mtspr SPRN_SPRG_SCRATCH1, r11; \ - mfcr r10; \ - EXCEPTION_PROLOG_1; \ - EXCEPTION_PROLOG_2 - -#define EXCEPTION_PROLOG_1 \ - mfspr r11,SPRN_SRR1; /* check whether user or kernel */ \ - andi. r11,r11,MSR_PR; \ - tophys(r11,r1); /* use tophys(r1) if kernel */ \ - beq 1f; \ - mfspr r11,SPRN_SPRG_THREAD; \ - lwz r11,TASK_STACK-THREAD(r11); \ - addi r11,r11,THREAD_SIZE; \ - tophys(r11,r11); \ -1: subi r11,r11,INT_FRAME_SIZE /* alloc exc. 
frame */ - - -#define EXCEPTION_PROLOG_2 \ - stw r10,_CCR(r11); /* save registers */ \ - stw r12,GPR12(r11); \ - stw r9,GPR9(r11); \ - mfspr r10,SPRN_SPRG_SCRATCH0; \ - stw r10,GPR10(r11); \ - mfspr r12,SPRN_SPRG_SCRATCH1; \ - stw r12,GPR11(r11); \ - mflr r10; \ - stw r10,_LINK(r11); \ - mfspr r12,SPRN_SRR0; \ - mfspr r9,SPRN_SRR1; \ - stw r1,GPR1(r11); \ - stw r1,0(r11); \ - tovirt(r1,r11); /* set new kernel sp */ \ - li r10,MSR_KERNEL & ~(MSR_IR|MSR_DR); /* can take exceptions */ \ - mtmsr r10; \ - stw r0,GPR0(r11); \ - lis r10, STACK_FRAME_REGS_MARKER@ha; /* exception frame marker */ \ - addi r10, r10, STACK_FRAME_REGS_MARKER@l; \ - stw r10, 8(r11); \ - SAVE_4GPRS(3, r11); \ - SAVE_2GPRS(7, r11) - -/* - * Note: code which follows this uses cr0.eq (set if from kernel), - * r11, r12 (SRR0), and r9 (SRR1). - * - * Note2: once we have set r1 we are in a position to take exceptions - * again, and we could thus set MSR:RI at that point. - */ - -/* - * Exception vectors. - */ -#define EXCEPTION(n, label, hdlr, xfer) \ - . = n; \ -label: \ - EXCEPTION_PROLOG; \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ - xfer(n, hdlr) - -#define EXC_XFER_TEMPLATE(n, hdlr, trap, copyee, tfer, ret) \ - li r10,trap; \ - stw r10,_TRAP(r11); \ - li r10,MSR_KERNEL; \ - copyee(r10, r9); \ - bl tfer; \ -i##n: \ - .long hdlr; \ - .long ret - -#define COPY_EE(d, s) rlwimi d,s,0,16,16 -#define NOCOPY(d, s) - -#define EXC_XFER_STD(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n, NOCOPY, transfer_to_handler_full, \ - ret_from_except_full) - -#define EXC_XFER_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n+1, NOCOPY, transfer_to_handler, \ - ret_from_except) - -#define EXC_XFER_EE(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n, COPY_EE, transfer_to_handler_full, \ - ret_from_except_full) - -#define EXC_XFER_EE_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n+1, COPY_EE, transfer_to_handler, \ - ret_from_except) - /* System reset */ EXCEPTION(0x100, Reset, system_reset_exception, EXC_XFER_STD) From 37737a2afd69c201d0dac334c84fd1f0d596dfc0 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:38:51 +0000 Subject: [PATCH 138/205] powerpc/32: move LOAD_MSR_KERNEL() into head_32.h and use it As preparation for using head_32.h for head_40x.S, move LOAD_MSR_KERNEL() there and use it to load r10 with MSR_KERNEL value. In the mean time, this patch modifies it so that it takes into account the size of the passed value to determine if 'li' can be used or if 'lis/ori' is needed instead of using the size of MSR_KERNEL. This is done by using gas macro. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_32.S | 9 +-------- arch/powerpc/kernel/head_32.h | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 2f3d159c11d79..d0cea3deb86c0 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -38,14 +38,7 @@ #include #include -/* - * MSR_KERNEL is > 0x10000 on 4xx/Book-E since it include MSR_CE. 
- */ -#if MSR_KERNEL >= 0x10000 -#define LOAD_MSR_KERNEL(r, x) lis r,(x)@h; ori r,r,(x)@l -#else -#define LOAD_MSR_KERNEL(r, x) li r,(x) -#endif +#include "head_32.h" /* * Align to 4k in order to ensure that all functions modyfing srr0/srr1 diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 7456e2a45acc1..cf3d008445973 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -4,6 +4,19 @@ #include /* for STACK_FRAME_REGS_MARKER */ +/* + * MSR_KERNEL is > 0x8000 on 4xx/Book-E since it include MSR_CE. + */ +.macro __LOAD_MSR_KERNEL r, x +.if \x >= 0x8000 + lis \r, (\x)@h + ori \r, \r, (\x)@l +.else + li \r, (\x) +.endif +.endm +#define LOAD_MSR_KERNEL(r, x) __LOAD_MSR_KERNEL r, x + /* * Exception entry code. This code runs with address translation * turned off, i.e. using physical addresses. @@ -89,7 +102,7 @@ #define EXC_XFER_TEMPLATE(n, hdlr, trap, copyee, tfer, ret) \ li r10,trap; \ stw r10,_TRAP(r11); \ - li r10,MSR_KERNEL; \ + LOAD_MSR_KERNEL(r10, MSR_KERNEL); \ copyee(r10, r9); \ bl tfer; \ i##n: \ From 1d3034aed4489ae96bc7eec5050096944fd181f6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:38:52 +0000 Subject: [PATCH 139/205] powerpc/32: make the 6xx/8xx EXC_XFER_TEMPLATE() similar to the 40x/booke one 6xx/8xx EXC_XFER_TEMPLATE() macro adds a i##n symbol which is unused and can be removed. 40x and booke EXC_XFER_TEMPLATE() macros takes msr from the caller while the 6xx/8xx version uses only MSR_KERNEL as msr value. This patch modifies the 6xx/8xx version to make it similar to the 40x and booke versions. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index cf3d008445973..985758cbf5779 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -99,13 +99,12 @@ addi r3,r1,STACK_FRAME_OVERHEAD; \ xfer(n, hdlr) -#define EXC_XFER_TEMPLATE(n, hdlr, trap, copyee, tfer, ret) \ +#define EXC_XFER_TEMPLATE(hdlr, trap, msr, copyee, tfer, ret) \ li r10,trap; \ stw r10,_TRAP(r11); \ - LOAD_MSR_KERNEL(r10, MSR_KERNEL); \ + LOAD_MSR_KERNEL(r10, msr); \ copyee(r10, r9); \ bl tfer; \ -i##n: \ .long hdlr; \ .long ret @@ -113,19 +112,19 @@ i##n: \ #define NOCOPY(d, s) #define EXC_XFER_STD(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n, NOCOPY, transfer_to_handler_full, \ + EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, NOCOPY, transfer_to_handler_full, \ ret_from_except_full) #define EXC_XFER_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n+1, NOCOPY, transfer_to_handler, \ + EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, NOCOPY, transfer_to_handler, \ ret_from_except) #define EXC_XFER_EE(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n, COPY_EE, transfer_to_handler_full, \ + EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, COPY_EE, transfer_to_handler_full, \ ret_from_except_full) #define EXC_XFER_EE_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(n, hdlr, n+1, COPY_EE, transfer_to_handler, \ + EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, COPY_EE, transfer_to_handler, \ ret_from_except) #endif /* __HEAD_32_H__ */ From 57bc13acbe11b6d60d5dc4d574c34e1d981a8824 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:38:53 +0000 Subject: [PATCH 140/205] powerpc/40x: Don't use SPRN_SPRG_SCRATCH2 in EXCEPTION_PROLOG Unlike said in the comment, r1 is not reused by the critical exception handler, as it uses a dedicated critirq_ctx stack. 
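(As an illustration, a sketch rather than code from this series: critical
interrupts on these cores run on their own per-CPU stack, conceptually
along the lines below; critirq_ctx[] is the array ppc32 declares for this,
and crit_stack_top() is a hypothetical helper name:

	/* each CPU has a dedicated stack for critical interrupts, so a
	 * critical handler never stores below the interrupted r1;
	 * assumes the usual kernel definitions of struct thread_info,
	 * NR_CPUS and THREAD_SIZE */
	extern struct thread_info *critirq_ctx[NR_CPUS];

	static void *crit_stack_top(int cpu)
	{
		return (void *)critirq_ctx[cpu] + THREAD_SIZE;
	}
)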
Decrementing r1 early is therefore unneeded.

Even if the comment's claim were valid, the code would be buggy anyway,
as r1 takes intermediate values that would jeopardise the whole
sequence (for instance after mfspr r1,SPRN_SPRG_THREAD).

Using SPRN_SPRG_SCRATCH2 to save r1 is thus not needed; r11 can be
used instead. This avoids one mtspr and one mfspr and makes the prolog
closer to what's done on 6xx and 8xx.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/head_40x.S | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S
index efa219d2136e7..fa033203dcdbb 100644
--- a/arch/powerpc/kernel/head_40x.S
+++ b/arch/powerpc/kernel/head_40x.S
@@ -102,23 +102,20 @@ _ENTRY(saved_ksp_limit)
  * Exception vector entry code. This code runs with address translation
  * turned off (i.e. using physical addresses). We assume SPRG_THREAD has
  * the physical address of the current task thread_struct.
- * Note that we have to have decremented r1 before we write to any fields
- * of the exception frame, since a critical interrupt could occur at any
- * time, and it will write to the area immediately below the current r1.
  */
 #define NORMAL_EXCEPTION_PROLOG						     \
	mtspr	SPRN_SPRG_SCRATCH0,r10;	/* save two registers to work with */\
	mtspr	SPRN_SPRG_SCRATCH1,r11;					     \
-	mtspr	SPRN_SPRG_SCRATCH2,r1;					     \
	mfcr	r10;			/* save CR in r10 for now	   */\
	mfspr	r11,SPRN_SRR1;		/* check whether user or kernel    */\
	andi.	r11,r11,MSR_PR;						     \
-	beq	1f;							     \
-	mfspr	r1,SPRN_SPRG_THREAD;	/* if from user, start at top of   */\
-	lwz	r1,TASK_STACK-THREAD(r1); /* this thread's kernel stack   */\
-	addi	r1,r1,THREAD_SIZE;					     \
-1:	subi	r1,r1,INT_FRAME_SIZE;	/* Allocate an exception frame     */\
	tophys(r11,r1);							     \
+	beq	1f;							     \
+	mfspr	r11,SPRN_SPRG_THREAD;	/* if from user, start at top of   */\
+	lwz	r11,TASK_STACK-THREAD(r11); /* this thread's kernel stack */\
+	addi	r11,r11,THREAD_SIZE;					     \
+	tophys(r11,r11);						     \
+1:	subi	r11,r11,INT_FRAME_SIZE;	/* Allocate an exception frame     */\
	stw	r10,_CCR(r11);		/* save various registers	   */\
	stw	r12,GPR12(r11);						     \
	stw	r9,GPR9(r11);						     \
@@ -128,11 +125,11 @@ _ENTRY(saved_ksp_limit)
	stw	r12,GPR11(r11);						     \
	mflr	r10;							     \
	stw	r10,_LINK(r11);						     \
-	mfspr	r10,SPRN_SPRG_SCRATCH2;					     \
	mfspr	r12,SPRN_SRR0;						     \
-	stw	r10,GPR1(r11);						     \
+	stw	r1,GPR1(r11);						     \
	mfspr	r9,SPRN_SRR1;						     \
-	stw	r10,0(r11);						     \
+	stw	r1,0(r11);						     \
+	tovirt(r1,r11);			/* set new kernel sp */		     \
	rlwinm	r9,r9,0,14,12;		/* clear MSR_WE (necessary?)
*/\ stw r0,GPR0(r11); \ SAVE_4GPRS(3, r11); \ From bd82904d465c0655fdc40dfc753209ea54efdd23 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:38:54 +0000 Subject: [PATCH 141/205] powerpc/40x: add exception frame marker This patch adds STACK_FRAME_REGS_MARKER in the stack at exception entry in order to see interrupts in call traces as below: [ 0.013964] Call Trace: [ 0.014014] [c0745db0] [c007a9d4] tick_periodic.constprop.5+0xd8/0x104 (unreliable) [ 0.014086] [c0745dc0] [c007aa20] tick_handle_periodic+0x20/0x9c [ 0.014181] [c0745de0] [c0009cd0] timer_interrupt+0xa0/0x264 [ 0.014258] [c0745e10] [c000e484] ret_from_except+0x0/0x14 [ 0.014390] --- interrupt: 901 at console_unlock.part.7+0x3f4/0x528 [ 0.014390] LR = console_unlock.part.7+0x3f0/0x528 [ 0.014455] [c0745ee0] [c0050334] console_unlock.part.7+0x114/0x528 (unreliable) [ 0.014542] [c0745f30] [c00524e0] register_console+0x3d8/0x44c [ 0.014625] [c0745f60] [c0675aac] cpm_uart_console_init+0x18/0x2c [ 0.014709] [c0745f70] [c06614f4] console_init+0x114/0x1cc [ 0.014795] [c0745fb0] [c0658b68] start_kernel+0x300/0x3d8 [ 0.014864] [c0745ff0] [c00022cc] start_here+0x44/0x98 Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_40x.S | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index fa033203dcdbb..be1fcd6147c12 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -132,6 +132,9 @@ _ENTRY(saved_ksp_limit) tovirt(r1,r11); /* set new kernel sp */ \ rlwinm r9,r9,0,14,12; /* clear MSR_WE (necessary?) */\ stw r0,GPR0(r11); \ + lis r10, STACK_FRAME_REGS_MARKER@ha; /* exception frame marker */\ + addi r10, r10, STACK_FRAME_REGS_MARKER@l; \ + stw r10, 8(r11); \ SAVE_4GPRS(3, r11); \ SAVE_2GPRS(7, r11) @@ -174,6 +177,9 @@ _ENTRY(saved_ksp_limit) tovirt(r1,r11); \ rlwinm r9,r9,0,14,12; /* clear MSR_WE (necessary?) */\ stw r0,GPR0(r11); \ + lis r10, STACK_FRAME_REGS_MARKER@ha; /* exception frame marker */\ + addi r10, r10, STACK_FRAME_REGS_MARKER@l; \ + stw r10, 8(r11); \ SAVE_4GPRS(3, r11); \ SAVE_2GPRS(7, r11) From 7271fc960424a2fed3823a57358f67f650fd708d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:38:55 +0000 Subject: [PATCH 142/205] powerpc/40x: Split and rename NORMAL_EXCEPTION_PROLOG This patch splits NORMAL_EXCEPTION_PROLOG in the same way as in head_8xx.S and head_32.S and renames it EXCEPTION_PROLOG() as well to match head_32.h Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_40x.S | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index be1fcd6147c12..004ebe823bd49 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -103,10 +103,14 @@ _ENTRY(saved_ksp_limit) * turned off (i.e. using physical addresses). We assume SPRG_THREAD has * the physical address of the current task thread_struct. */ -#define NORMAL_EXCEPTION_PROLOG \ +#define EXCEPTION_PROLOG \ mtspr SPRN_SPRG_SCRATCH0,r10; /* save two registers to work with */\ mtspr SPRN_SPRG_SCRATCH1,r11; \ mfcr r10; /* save CR in r10 for now */\ + EXCEPTION_PROLOG_1; \ + EXCEPTION_PROLOG_2 + +#define EXCEPTION_PROLOG_1 \ mfspr r11,SPRN_SRR1; /* check whether user or kernel */\ andi. 
r11,r11,MSR_PR; \ tophys(r11,r1); \ @@ -115,7 +119,9 @@ _ENTRY(saved_ksp_limit) lwz r11,TASK_STACK-THREAD(r11); /* this thread's kernel stack */\ addi r11,r11,THREAD_SIZE; \ tophys(r11,r11); \ -1: subi r11,r11,INT_FRAME_SIZE; /* Allocate an exception frame */\ +1: subi r11,r11,INT_FRAME_SIZE /* Allocate an exception frame */ + +#define EXCEPTION_PROLOG_2 \ stw r10,_CCR(r11); /* save various registers */\ stw r12,GPR12(r11); \ stw r9,GPR9(r11); \ @@ -205,7 +211,7 @@ label: #define EXCEPTION(n, label, hdlr, xfer) \ START_EXCEPTION(n, label); \ - NORMAL_EXCEPTION_PROLOG; \ + EXCEPTION_PROLOG; \ addi r3,r1,STACK_FRAME_OVERHEAD; \ xfer(n, hdlr) @@ -396,7 +402,7 @@ label: * This is caused by a fetch from non-execute or guarded pages. */ START_EXCEPTION(0x0400, InstructionAccess) - NORMAL_EXCEPTION_PROLOG + EXCEPTION_PROLOG mr r4,r12 /* Pass SRR0 as arg2 */ li r5,0 /* Pass zero as arg3 */ EXC_XFER_LITE(0x400, handle_page_fault) @@ -406,7 +412,7 @@ label: /* 0x0600 - Alignment Exception */ START_EXCEPTION(0x0600, Alignment) - NORMAL_EXCEPTION_PROLOG + EXCEPTION_PROLOG mfspr r4,SPRN_DEAR /* Grab the DEAR and save it */ stw r4,_DEAR(r11) addi r3,r1,STACK_FRAME_OVERHEAD @@ -414,7 +420,7 @@ label: /* 0x0700 - Program Exception */ START_EXCEPTION(0x0700, ProgramCheck) - NORMAL_EXCEPTION_PROLOG + EXCEPTION_PROLOG mfspr r4,SPRN_ESR /* Grab the ESR and save it */ stw r4,_ESR(r11) addi r3,r1,STACK_FRAME_OVERHEAD @@ -427,7 +433,7 @@ label: /* 0x0C00 - System Call Exception */ START_EXCEPTION(0x0C00, SystemCall) - NORMAL_EXCEPTION_PROLOG + EXCEPTION_PROLOG EXC_XFER_EE_LITE(0xc00, DoSyscall) EXCEPTION(0x0D00, Trap_0D, unknown_exception, EXC_XFER_EE) @@ -733,7 +739,7 @@ label: /* Programmable Interval Timer (PIT) Exception. (from 0x1000) */ Decrementer: - NORMAL_EXCEPTION_PROLOG + EXCEPTION_PROLOG lis r0,TSR_PIS@h mtspr SPRN_TSR,r0 /* Clear the PIT exception */ addi r3,r1,STACK_FRAME_OVERHEAD @@ -741,7 +747,7 @@ Decrementer: /* Fixed Interval Timer (FIT) Exception. (from 0x1010) */ FITException: - NORMAL_EXCEPTION_PROLOG + EXCEPTION_PROLOG addi r3,r1,STACK_FRAME_OVERHEAD; EXC_XFER_EE(0x1010, unknown_exception) @@ -759,7 +765,7 @@ WDTException: * if they can't resolve the lightweight TLB fault. */ DataAccess: - NORMAL_EXCEPTION_PROLOG + EXCEPTION_PROLOG mfspr r5,SPRN_ESR /* Grab the ESR, save it, pass arg3 */ stw r5,_ESR(r11) mfspr r4,SPRN_DEAR /* Grab the DEAR, save it, pass arg2 */ From 90f204b9a1f2dc81904547a52ba976d3e84dcf59 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:38:56 +0000 Subject: [PATCH 143/205] powerpc/40x: Refactor exception entry macros by using head_32.h Refactor exception entry macros by using the ones defined in head_32.h Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.h | 4 ++ arch/powerpc/kernel/head_40x.S | 88 +--------------------------------- 2 files changed, 6 insertions(+), 86 deletions(-) diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 985758cbf5779..aa0131bb09b50 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -59,8 +59,12 @@ stw r1,GPR1(r11) stw r1,0(r11) tovirt(r1,r11) /* set new kernel sp */ +#ifdef CONFIG_40x + rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) 
*/ +#else li r10,MSR_KERNEL & ~(MSR_IR|MSR_DR) /* can take exceptions */ MTMSRD(r10) /* (except for mach check in rtas) */ +#endif stw r0,GPR0(r11) lis r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ addi r10,r10,STACK_FRAME_REGS_MARKER@l diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 004ebe823bd49..b3a2e55e1c15e 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -44,6 +44,8 @@ #include #include +#include "head_32.h" + /* As with the other PowerPC ports, it is expected that when code * execution begins here, the following registers contain valid, yet * optional, information: @@ -98,52 +100,6 @@ _ENTRY(crit_srr1) _ENTRY(saved_ksp_limit) .space 4 -/* - * Exception vector entry code. This code runs with address translation - * turned off (i.e. using physical addresses). We assume SPRG_THREAD has - * the physical address of the current task thread_struct. - */ -#define EXCEPTION_PROLOG \ - mtspr SPRN_SPRG_SCRATCH0,r10; /* save two registers to work with */\ - mtspr SPRN_SPRG_SCRATCH1,r11; \ - mfcr r10; /* save CR in r10 for now */\ - EXCEPTION_PROLOG_1; \ - EXCEPTION_PROLOG_2 - -#define EXCEPTION_PROLOG_1 \ - mfspr r11,SPRN_SRR1; /* check whether user or kernel */\ - andi. r11,r11,MSR_PR; \ - tophys(r11,r1); \ - beq 1f; \ - mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ - lwz r11,TASK_STACK-THREAD(r11); /* this thread's kernel stack */\ - addi r11,r11,THREAD_SIZE; \ - tophys(r11,r11); \ -1: subi r11,r11,INT_FRAME_SIZE /* Allocate an exception frame */ - -#define EXCEPTION_PROLOG_2 \ - stw r10,_CCR(r11); /* save various registers */\ - stw r12,GPR12(r11); \ - stw r9,GPR9(r11); \ - mfspr r10,SPRN_SPRG_SCRATCH0; \ - stw r10,GPR10(r11); \ - mfspr r12,SPRN_SPRG_SCRATCH1; \ - stw r12,GPR11(r11); \ - mflr r10; \ - stw r10,_LINK(r11); \ - mfspr r12,SPRN_SRR0; \ - stw r1,GPR1(r11); \ - mfspr r9,SPRN_SRR1; \ - stw r1,0(r11); \ - tovirt(r1,r11); /* set new kernel sp */ \ - rlwinm r9,r9,0,14,12; /* clear MSR_WE (necessary?) */\ - stw r0,GPR0(r11); \ - lis r10, STACK_FRAME_REGS_MARKER@ha; /* exception frame marker */\ - addi r10, r10, STACK_FRAME_REGS_MARKER@l; \ - stw r10, 8(r11); \ - SAVE_4GPRS(3, r11); \ - SAVE_2GPRS(7, r11) - /* * Exception prolog for critical exceptions. This is a little different * from the normal exception prolog above since a critical exception @@ -205,16 +161,6 @@ _ENTRY(saved_ksp_limit) /* * Exception vectors. */ -#define START_EXCEPTION(n, label) \ - . 
= n;								     \
-label:
-
-#define EXCEPTION(n, label, hdlr, xfer)				\
-	START_EXCEPTION(n, label);				\
-	EXCEPTION_PROLOG;					\
-	addi	r3,r1,STACK_FRAME_OVERHEAD;			\
-	xfer(n, hdlr)
-
 #define CRITICAL_EXCEPTION(n, label, hdlr)			\
	START_EXCEPTION(n, label);				\
	CRITICAL_EXCEPTION_PROLOG;				\
@@ -223,36 +169,6 @@ label:
			  NOCOPY, crit_transfer_to_handler, \
			  ret_from_crit_exc)

-#define EXC_XFER_TEMPLATE(hdlr, trap, msr, copyee, tfer, ret)	\
-	li	r10,trap;					\
-	stw	r10,_TRAP(r11);					\
-	lis	r10,msr@h;					\
-	ori	r10,r10,msr@l;					\
-	copyee(r10, r9);					\
-	bl	tfer;						\
-	.long	hdlr;						\
-	.long	ret
-
-#define COPY_EE(d, s)		rlwimi d,s,0,16,16
-#define NOCOPY(d, s)
-
-#define EXC_XFER_STD(n, hdlr)		\
-	EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, NOCOPY, transfer_to_handler_full, \
-			  ret_from_except_full)
-
-#define EXC_XFER_LITE(n, hdlr)		\
-	EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, NOCOPY, transfer_to_handler, \
-			  ret_from_except)
-
-#define EXC_XFER_EE(n, hdlr)		\
-	EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, COPY_EE, transfer_to_handler_full, \
-			  ret_from_except_full)
-
-#define EXC_XFER_EE_LITE(n, hdlr)	\
-	EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, COPY_EE, transfer_to_handler, \
-			  ret_from_except)
-
-
 /*
  * 0x0100 - Critical Interrupt Exception
  */

From ef4291243f51d0a69899ee2025de09578c0fcba8 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Tue, 30 Apr 2019 12:38:57 +0000
Subject: [PATCH 144/205] powerpc/fsl_booke: ensure SPEFloatingPointException()
 reenables interrupts

SPEFloatingPointException() is the only exception handler which
'forgets' to re-enable interrupts. This patch makes sure it does.

Suggested-by: Benjamin Herrenschmidt
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/kernel/traps.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 1fd45a8650e17..665f294725cb2 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -2088,6 +2088,10 @@ void SPEFloatingPointException(struct pt_regs *regs)
 	int code = FPE_FLTUNK;
 	int err;
 
+	/* We restore the interrupt state now */
+	if (!arch_irq_disabled_regs(regs))
+		local_irq_enable();
+
 	flush_spe_to_thread(current);
 
 	spefscr = current->thread.spefscr;
@@ -2133,6 +2137,10 @@ void SPEFloatingPointRoundException(struct pt_regs *regs)
 	extern int speround_handler(struct pt_regs *regs);
 	int err;
 
+	/* We restore the interrupt state now */
+	if (!arch_irq_disabled_regs(regs))
+		local_irq_enable();
+
 	preempt_disable();
 	if (regs->msr & MSR_SPE)
 		giveup_spe(current);

From f97dec21a306967edbc49ce46f3ecefa3cd16907 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Tue, 30 Apr 2019 12:38:58 +0000
Subject: [PATCH 145/205] powerpc/32: enter syscall with MSR_EE unconditionally
 set

Syscalls are expected to be entered with MSR_EE set. Let's make this
unconditional by forcing MSR_EE on at syscall entry. This patch adds
EXC_XFER_SYS for that.
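For illustration, in C terms the difference between the existing
EXC_XFER_EE_LITE() entry and the new EXC_XFER_SYS() comes down to how
the MSR used for the handler is computed. A sketch, not the actual
implementation (srr1 stands for the interrupted context's SRR1 value;
MSR_KERNEL and MSR_EE are the usual architecture definitions):

	/* EXC_XFER_EE_LITE() (old): EE is copied in from the
	 * interrupted context, so the handler could start with
	 * interrupts either on or off */
	unsigned long old_msr = MSR_KERNEL | (srr1 & MSR_EE);

	/* EXC_XFER_SYS() (new): EE is forced on, so syscall entry no
	 * longer depends on the caller's MSR */
	unsigned long new_msr = MSR_KERNEL | MSR_EE;

The n+1 trap number used by EXC_XFER_SYS follows the existing LITE
convention, as can be seen in the head_32.h hunk below.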
Suggested-by: Benjamin Herrenschmidt [splited out from benh RFC patch] Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.S | 2 +- arch/powerpc/kernel/head_32.h | 4 ++++ arch/powerpc/kernel/head_40x.S | 2 +- arch/powerpc/kernel/head_44x.S | 2 +- arch/powerpc/kernel/head_8xx.S | 2 +- arch/powerpc/kernel/head_booke.h | 4 ++++ arch/powerpc/kernel/head_fsl_booke.S | 2 +- 7 files changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index f98e6b4612380..c5fd76d0caf6b 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -375,7 +375,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_FPU_UNAVAILABLE) DO_KVM 0xc00 SystemCall: EXCEPTION_PROLOG - EXC_XFER_EE_LITE(0xc00, DoSyscall) + EXC_XFER_SYS(0xc00, DoSyscall) /* Single step - not used on 601 */ EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD) diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index aa0131bb09b50..7221418a883fc 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -123,6 +123,10 @@ EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, NOCOPY, transfer_to_handler, \ ret_from_except) +#define EXC_XFER_SYS(n, hdlr) \ + EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, NOCOPY, transfer_to_handler, \ + ret_from_except) + #define EXC_XFER_EE(n, hdlr) \ EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, COPY_EE, transfer_to_handler_full, \ ret_from_except_full) diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index b3a2e55e1c15e..3e1b8a85cc0db 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -350,7 +350,7 @@ _ENTRY(saved_ksp_limit) /* 0x0C00 - System Call Exception */ START_EXCEPTION(0x0C00, SystemCall) EXCEPTION_PROLOG - EXC_XFER_EE_LITE(0xc00, DoSyscall) + EXC_XFER_SYS(0xc00, DoSyscall) EXCEPTION(0x0D00, Trap_0D, unknown_exception, EXC_XFER_EE) EXCEPTION(0x0E00, Trap_0E, unknown_exception, EXC_XFER_EE) diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index 34a5df827b38e..19268713b6929 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -286,7 +286,7 @@ interrupt_base: /* System Call Interrupt */ START_EXCEPTION(SystemCall) NORMAL_EXCEPTION_PROLOG(BOOKE_INTERRUPT_SYSCALL) - EXC_XFER_EE_LITE(0x0c00, DoSyscall) + EXC_XFER_SYS(0x0c00, DoSyscall) /* Auxiliary Processor Unavailable Interrupt */ EXCEPTION(0x2020, BOOKE_INTERRUPT_AP_UNAVAIL, \ diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 14c3eb3267b82..aa8e629f77258 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -186,7 +186,7 @@ Alignment: . 
= 0xc00 SystemCall: EXCEPTION_PROLOG - EXC_XFER_EE_LITE(0xc00, DoSyscall) + EXC_XFER_SYS(0xc00, DoSyscall) /* Single step - not used on 601 */ EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD) diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 1b22a8dea3996..612f54ba11259 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -251,6 +251,10 @@ END_BTB_FLUSH_SECTION EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, NOCOPY, transfer_to_handler, \ ret_from_except) +#define EXC_XFER_SYS(n, hdlr) \ + EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, NOCOPY, transfer_to_handler, \ + ret_from_except) + #define EXC_XFER_EE(n, hdlr) \ EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, COPY_EE, transfer_to_handler_full, \ ret_from_except_full) diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 567e0ed45ca81..a7bebb9963930 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -414,7 +414,7 @@ interrupt_base: /* System Call Interrupt */ START_EXCEPTION(SystemCall) NORMAL_EXCEPTION_PROLOG(SYSCALL) - EXC_XFER_EE_LITE(0x0c00, DoSyscall) + EXC_XFER_SYS(0x0c00, DoSyscall) /* Auxiliary Processor Unavailable Interrupt */ EXCEPTION(0x2900, AP_UNAVAIL, AuxillaryProcessorUnavailable, \ From 642770dd96cb04e7cf8f7677e35cd528cda0a97b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:38:59 +0000 Subject: [PATCH 146/205] powerpc/32: Enter exceptions with MSR_EE unset All exceptions handlers know when to reenable interrupts, so it is safer to enter all of them with MSR_EE unset, except for syscalls. Suggested-by: Benjamin Herrenschmidt [splited out from benh RFC patch] Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.S | 68 ++++++++++++++-------------- arch/powerpc/kernel/head_32.h | 8 ---- arch/powerpc/kernel/head_40x.S | 44 +++++++++--------- arch/powerpc/kernel/head_44x.S | 6 +-- arch/powerpc/kernel/head_8xx.S | 32 ++++++------- arch/powerpc/kernel/head_booke.h | 12 +---- arch/powerpc/kernel/head_fsl_booke.S | 26 +++++------ 7 files changed, 90 insertions(+), 106 deletions(-) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index c5fd76d0caf6b..7f5555e362a1b 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -341,7 +341,7 @@ Alignment: mfspr r5,SPRN_DSISR stw r5,_DSISR(r11) addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_EE(0x600, alignment_exception) + EXC_XFER_STD(0x600, alignment_exception) /* Program check exception */ EXCEPTION(0x700, ProgramCheck, program_check_exception, EXC_XFER_STD) @@ -362,13 +362,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_FPU_UNAVAILABLE) bl load_up_fpu /* if from user, just load it up */ b fast_exception_return 1: addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_EE_LITE(0x800, kernel_fp_unavailable_exception) + EXC_XFER_LITE(0x800, kernel_fp_unavailable_exception) /* Decrementer */ EXCEPTION(0x900, Decrementer, timer_interrupt, EXC_XFER_LITE) - EXCEPTION(0xa00, Trap_0a, unknown_exception, EXC_XFER_EE) - EXCEPTION(0xb00, Trap_0b, unknown_exception, EXC_XFER_EE) + EXCEPTION(0xa00, Trap_0a, unknown_exception, EXC_XFER_STD) + EXCEPTION(0xb00, Trap_0b, unknown_exception, EXC_XFER_STD) /* System call */ . 
= 0xc00 @@ -379,7 +379,7 @@ SystemCall: /* Single step - not used on 601 */ EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD) - EXCEPTION(0xe00, Trap_0e, unknown_exception, EXC_XFER_EE) + EXCEPTION(0xe00, Trap_0e, unknown_exception, EXC_XFER_STD) /* * The Altivec unavailable trap is at 0x0f20. Foo. @@ -611,35 +611,35 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) #define altivec_assist_exception unknown_exception #endif - EXCEPTION(0x1300, Trap_13, instruction_breakpoint_exception, EXC_XFER_EE) - EXCEPTION(0x1400, SMI, SMIException, EXC_XFER_EE) - EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1600, Trap_16, altivec_assist_exception, EXC_XFER_EE) + EXCEPTION(0x1300, Trap_13, instruction_breakpoint_exception, EXC_XFER_STD) + EXCEPTION(0x1400, SMI, SMIException, EXC_XFER_STD) + EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1600, Trap_16, altivec_assist_exception, EXC_XFER_STD) EXCEPTION(0x1700, Trap_17, TAUException, EXC_XFER_STD) - EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1a00, Trap_1a, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1b00, Trap_1b, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1c00, Trap_1c, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1d00, Trap_1d, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1e00, Trap_1e, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1f00, Trap_1f, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2000, RunMode, RunModeException, EXC_XFER_EE) - EXCEPTION(0x2100, Trap_21, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2200, Trap_22, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2300, Trap_23, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2400, Trap_24, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2500, Trap_25, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2600, Trap_26, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2700, Trap_27, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2800, Trap_28, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2900, Trap_29, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2a00, Trap_2a, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2b00, Trap_2b, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2c00, Trap_2c, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2d00, Trap_2d, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2e00, Trap_2e, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2f00, Trap_2f, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1a00, Trap_1a, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1b00, Trap_1b, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1c00, Trap_1c, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1d00, Trap_1d, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1e00, Trap_1e, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1f00, Trap_1f, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2000, RunMode, RunModeException, EXC_XFER_STD) + EXCEPTION(0x2100, Trap_21, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2200, Trap_22, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2300, Trap_23, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2400, Trap_24, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2500, Trap_25, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2600, Trap_26, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2700, Trap_27, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2800, Trap_28, unknown_exception, EXC_XFER_STD) + 
EXCEPTION(0x2900, Trap_29, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2a00, Trap_2a, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2b00, Trap_2b, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2c00, Trap_2c, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2d00, Trap_2d, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2e00, Trap_2e, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x2f00, Trap_2f, unknown_exception, EXC_XFER_STD) . = 0x3000 @@ -651,7 +651,7 @@ AltiVecUnavailable: b fast_exception_return #endif /* CONFIG_ALTIVEC */ 1: addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_EE_LITE(0xf20, altivec_unavailable_exception) + EXC_XFER_LITE(0xf20, altivec_unavailable_exception) PerformanceMonitor: EXCEPTION_PROLOG diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 7221418a883fc..8881b6887841f 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -127,12 +127,4 @@ EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, NOCOPY, transfer_to_handler, \ ret_from_except) -#define EXC_XFER_EE(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, COPY_EE, transfer_to_handler_full, \ - ret_from_except_full) - -#define EXC_XFER_EE_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, COPY_EE, transfer_to_handler, \ - ret_from_except) - #endif /* __HEAD_32_H__ */ diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 3e1b8a85cc0db..0463f56fc7ab8 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -332,7 +332,7 @@ _ENTRY(saved_ksp_limit) mfspr r4,SPRN_DEAR /* Grab the DEAR and save it */ stw r4,_DEAR(r11) addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_EE(0x600, alignment_exception) + EXC_XFER_STD(0x600, alignment_exception) /* 0x0700 - Program Exception */ START_EXCEPTION(0x0700, ProgramCheck) @@ -342,19 +342,19 @@ _ENTRY(saved_ksp_limit) addi r3,r1,STACK_FRAME_OVERHEAD EXC_XFER_STD(0x700, program_check_exception) - EXCEPTION(0x0800, Trap_08, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x0900, Trap_09, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x0A00, Trap_0A, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x0B00, Trap_0B, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x0800, Trap_08, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x0900, Trap_09, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x0A00, Trap_0A, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x0B00, Trap_0B, unknown_exception, EXC_XFER_STD) /* 0x0C00 - System Call Exception */ START_EXCEPTION(0x0C00, SystemCall) EXCEPTION_PROLOG EXC_XFER_SYS(0xc00, DoSyscall) - EXCEPTION(0x0D00, Trap_0D, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x0E00, Trap_0E, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x0F00, Trap_0F, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x0D00, Trap_0D, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x0E00, Trap_0E, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x0F00, Trap_0F, unknown_exception, EXC_XFER_STD) /* 0x1000 - Programmable Interval Timer (PIT) Exception */ . 
= 0x1000 @@ -571,25 +571,25 @@ _ENTRY(saved_ksp_limit) mfspr r10, SPRN_SPRG_SCRATCH0 b InstructionAccess - EXCEPTION(0x1300, Trap_13, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1400, Trap_14, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1600, Trap_16, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x1300, Trap_13, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1400, Trap_14, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1600, Trap_16, unknown_exception, EXC_XFER_STD) #ifdef CONFIG_IBM405_ERR51 /* 405GP errata 51 */ START_EXCEPTION(0x1700, Trap_17) b DTLBMiss #else - EXCEPTION(0x1700, Trap_17, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x1700, Trap_17, unknown_exception, EXC_XFER_STD) #endif - EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1A00, Trap_1A, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1B00, Trap_1B, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1C00, Trap_1C, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1D00, Trap_1D, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1E00, Trap_1E, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1F00, Trap_1F, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1A00, Trap_1A, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1B00, Trap_1B, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1C00, Trap_1C, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1D00, Trap_1D, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1E00, Trap_1E, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1F00, Trap_1F, unknown_exception, EXC_XFER_STD) /* Check for a single step debug exception while in an exception * handler before state has been saved. This is to catch the case @@ -665,7 +665,7 @@ Decrementer: FITException: EXCEPTION_PROLOG addi r3,r1,STACK_FRAME_OVERHEAD; - EXC_XFER_EE(0x1010, unknown_exception) + EXC_XFER_STD(0x1010, unknown_exception) /* Watchdog Timer (WDT) Exception. 
(from 0x1020) */ WDTException: diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index 19268713b6929..0381fdb294a60 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -281,7 +281,7 @@ interrupt_base: FP_UNAVAILABLE_EXCEPTION #else EXCEPTION(0x2010, BOOKE_INTERRUPT_FP_UNAVAIL, \ - FloatingPointUnavailable, unknown_exception, EXC_XFER_EE) + FloatingPointUnavailable, unknown_exception, EXC_XFER_STD) #endif /* System Call Interrupt */ START_EXCEPTION(SystemCall) @@ -290,7 +290,7 @@ interrupt_base: /* Auxiliary Processor Unavailable Interrupt */ EXCEPTION(0x2020, BOOKE_INTERRUPT_AP_UNAVAIL, \ - AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE) + AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_STD) /* Decrementer Interrupt */ DECREMENTER_EXCEPTION @@ -298,7 +298,7 @@ interrupt_base: /* Fixed Internal Timer Interrupt */ /* TODO: Add FIT support */ EXCEPTION(0x1010, BOOKE_INTERRUPT_FIT, FixedIntervalTimer, \ - unknown_exception, EXC_XFER_EE) + unknown_exception, EXC_XFER_STD) /* Watchdog Timer Interrupt */ /* TODO: Add watchdog support */ diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index aa8e629f77258..16b4791a2a9fa 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -167,7 +167,7 @@ Alignment: mfspr r5,SPRN_DSISR stw r5,_DSISR(r11) addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_EE(0x600, alignment_exception) + EXC_XFER_STD(0x600, alignment_exception) /* Program check exception */ EXCEPTION(0x700, ProgramCheck, program_check_exception, EXC_XFER_STD) @@ -179,8 +179,8 @@ Alignment: /* Decrementer */ EXCEPTION(0x900, Decrementer, timer_interrupt, EXC_XFER_LITE) - EXCEPTION(0xa00, Trap_0a, unknown_exception, EXC_XFER_EE) - EXCEPTION(0xb00, Trap_0b, unknown_exception, EXC_XFER_EE) + EXCEPTION(0xa00, Trap_0a, unknown_exception, EXC_XFER_STD) + EXCEPTION(0xb00, Trap_0b, unknown_exception, EXC_XFER_STD) /* System call */ . = 0xc00 @@ -190,8 +190,8 @@ SystemCall: /* Single step - not used on 601 */ EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD) - EXCEPTION(0xe00, Trap_0e, unknown_exception, EXC_XFER_EE) - EXCEPTION(0xf00, Trap_0f, unknown_exception, EXC_XFER_EE) + EXCEPTION(0xe00, Trap_0e, unknown_exception, EXC_XFER_STD) + EXCEPTION(0xf00, Trap_0f, unknown_exception, EXC_XFER_STD) /* On the MPC8xx, this is a software emulation interrupt. It occurs * for all unimplemented and illegal instructions. 
@@ -521,13 +521,13 @@ DARFixed:/* Return from dcbx instruction bug workaround */ /* 0x300 is DataAccess exception, needed by bad_page_fault() */ EXC_XFER_LITE(0x300, handle_page_fault) - EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1600, Trap_16, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1700, Trap_17, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1a00, Trap_1a, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1b00, Trap_1b, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1600, Trap_16, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1700, Trap_17, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1a00, Trap_1a, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1b00, Trap_1b, unknown_exception, EXC_XFER_STD) /* On the MPC8xx, these next four traps are used for development * support of breakpoints and such. Someday I will get around to @@ -549,7 +549,7 @@ DataBreakpoint: mfspr r4,SPRN_BAR stw r4,_DAR(r11) mfspr r5,SPRN_DSISR - EXC_XFER_EE(0x1c00, do_break) + EXC_XFER_STD(0x1c00, do_break) 11: mtcr r10 mfspr r10, SPRN_SPRG_SCRATCH0 @@ -569,10 +569,10 @@ InstructionBreakpoint: mfspr r10, SPRN_SPRG_SCRATCH0 rfi #else - EXCEPTION(0x1d00, Trap_1d, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x1d00, Trap_1d, unknown_exception, EXC_XFER_STD) #endif - EXCEPTION(0x1e00, Trap_1e, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x1f00, Trap_1f, unknown_exception, EXC_XFER_EE) + EXCEPTION(0x1e00, Trap_1e, unknown_exception, EXC_XFER_STD) + EXCEPTION(0x1f00, Trap_1f, unknown_exception, EXC_XFER_STD) . = 0x2000 diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 612f54ba11259..264976c43f344 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -255,14 +255,6 @@ END_BTB_FLUSH_SECTION EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, NOCOPY, transfer_to_handler, \ ret_from_except) -#define EXC_XFER_EE(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, COPY_EE, transfer_to_handler_full, \ - ret_from_except_full) - -#define EXC_XFER_EE_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, COPY_EE, transfer_to_handler, \ - ret_from_except) - /* Check for a single step debug exception while in an exception * handler before state has been saved. 
This is to catch the case * where an instruction that we are trying to single step causes @@ -405,7 +397,7 @@ END_BTB_FLUSH_SECTION mfspr r4,SPRN_DEAR; /* Grab the DEAR and save it */ \ stw r4,_DEAR(r11); \ addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_EE(0x0600, alignment_exception) + EXC_XFER_STD(0x0600, alignment_exception) #define PROGRAM_EXCEPTION \ START_EXCEPTION(Program) \ @@ -430,7 +422,7 @@ END_BTB_FLUSH_SECTION bl load_up_fpu; /* if from user, just load it up */ \ b fast_exception_return; \ 1: addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_EE_LITE(0x800, kernel_fp_unavailable_exception) + EXC_XFER_STD(0x800, kernel_fp_unavailable_exception) #ifndef __ASSEMBLY__ struct exception_regs { diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index a7bebb9963930..df980ad0f95f5 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -383,7 +383,7 @@ interrupt_base: EXC_XFER_LITE(0x0300, handle_page_fault) 1: addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_EE_LITE(0x0300, CacheLockingException) + EXC_XFER_LITE(0x0300, CacheLockingException) /* Instruction Storage Interrupt */ INSTRUCTION_STORAGE_EXCEPTION @@ -404,10 +404,10 @@ interrupt_base: #ifdef CONFIG_E200 /* E200 treats 'normal' floating point instructions as FP Unavail exception */ EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \ - program_check_exception, EXC_XFER_EE) + program_check_exception, EXC_XFER_STD) #else EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \ - unknown_exception, EXC_XFER_EE) + unknown_exception, EXC_XFER_STD) #endif #endif @@ -418,7 +418,7 @@ interrupt_base: /* Auxiliary Processor Unavailable Interrupt */ EXCEPTION(0x2900, AP_UNAVAIL, AuxillaryProcessorUnavailable, \ - unknown_exception, EXC_XFER_EE) + unknown_exception, EXC_XFER_STD) /* Decrementer Interrupt */ DECREMENTER_EXCEPTION @@ -426,7 +426,7 @@ interrupt_base: /* Fixed Internal Timer Interrupt */ /* TODO: Add FIT support */ EXCEPTION(0x3100, FIT, FixedIntervalTimer, \ - unknown_exception, EXC_XFER_EE) + unknown_exception, EXC_XFER_STD) /* Watchdog Timer Interrupt */ #ifdef CONFIG_BOOKE_WDT @@ -636,25 +636,25 @@ END_BTB_FLUSH_SECTION bl load_up_spe b fast_exception_return 1: addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_EE_LITE(0x2010, KernelSPE) + EXC_XFER_LITE(0x2010, KernelSPE) #elif defined(CONFIG_SPE_POSSIBLE) EXCEPTION(0x2020, SPE_UNAVAIL, SPEUnavailable, \ - unknown_exception, EXC_XFER_EE) + unknown_exception, EXC_XFER_STD) #endif /* CONFIG_SPE_POSSIBLE */ /* SPE Floating Point Data */ #ifdef CONFIG_SPE EXCEPTION(0x2030, SPE_FP_DATA, SPEFloatingPointData, - SPEFloatingPointException, EXC_XFER_EE) + SPEFloatingPointException, EXC_XFER_STD) /* SPE Floating Point Round */ EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \ - SPEFloatingPointRoundException, EXC_XFER_EE) + SPEFloatingPointRoundException, EXC_XFER_STD) #elif defined(CONFIG_SPE_POSSIBLE) EXCEPTION(0x2040, SPE_FP_DATA, SPEFloatingPointData, - unknown_exception, EXC_XFER_EE) + unknown_exception, EXC_XFER_STD) EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \ - unknown_exception, EXC_XFER_EE) + unknown_exception, EXC_XFER_STD) #endif /* CONFIG_SPE_POSSIBLE */ @@ -677,10 +677,10 @@ END_BTB_FLUSH_SECTION unknown_exception) /* Hypercall */ - EXCEPTION(0, HV_SYSCALL, Hypercall, unknown_exception, EXC_XFER_EE) + EXCEPTION(0, HV_SYSCALL, Hypercall, unknown_exception, EXC_XFER_STD) /* Embedded Hypervisor Privilege */ - EXCEPTION(0, HV_PRIV, Ehvpriv, unknown_exception, EXC_XFER_EE) + EXCEPTION(0, 
HV_PRIV, Ehvpriv, unknown_exception, EXC_XFER_STD) interrupt_end: From 1ae99b4b924ab10452da653baed29d3883705519 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:39:00 +0000 Subject: [PATCH 147/205] powerpc/32: get rid of COPY_EE in exception entry EXC_XFER_TEMPLATE() is not called with COPY_EE anymore so we can get rid of copyee parameters and related COPY_EE and NOCOPY macros. Suggested-by: Benjamin Herrenschmidt [splited out from benh RFC patch] Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_32.h | 12 ++++-------- arch/powerpc/kernel/head_40x.S | 8 +++----- arch/powerpc/kernel/head_booke.h | 22 ++++++++-------------- 3 files changed, 15 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 8881b6887841f..14cb0af2f4946 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -103,28 +103,24 @@ addi r3,r1,STACK_FRAME_OVERHEAD; \ xfer(n, hdlr) -#define EXC_XFER_TEMPLATE(hdlr, trap, msr, copyee, tfer, ret) \ +#define EXC_XFER_TEMPLATE(hdlr, trap, msr, tfer, ret) \ li r10,trap; \ stw r10,_TRAP(r11); \ LOAD_MSR_KERNEL(r10, msr); \ - copyee(r10, r9); \ bl tfer; \ .long hdlr; \ .long ret -#define COPY_EE(d, s) rlwimi d,s,0,MSR_EE -#define NOCOPY(d, s) - #define EXC_XFER_STD(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, NOCOPY, transfer_to_handler_full, \ + EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, transfer_to_handler_full, \ ret_from_except_full) #define EXC_XFER_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, NOCOPY, transfer_to_handler, \ + EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, transfer_to_handler, \ ret_from_except) #define EXC_XFER_SYS(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, NOCOPY, transfer_to_handler, \ + EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, transfer_to_handler, \ ret_from_except) #endif /* __HEAD_32_H__ */ diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 0463f56fc7ab8..fc332d33112e5 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -166,8 +166,7 @@ _ENTRY(saved_ksp_limit) CRITICAL_EXCEPTION_PROLOG; \ addi r3,r1,STACK_FRAME_OVERHEAD; \ EXC_XFER_TEMPLATE(hdlr, n+2, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ - NOCOPY, crit_transfer_to_handler, \ - ret_from_crit_exc) + crit_transfer_to_handler, ret_from_crit_exc) /* * 0x0100 - Critical Interrupt Exception @@ -651,7 +650,7 @@ _ENTRY(saved_ksp_limit) addi r3,r1,STACK_FRAME_OVERHEAD EXC_XFER_TEMPLATE(DebugException, 0x2002, \ (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ - NOCOPY, crit_transfer_to_handler, ret_from_crit_exc) + crit_transfer_to_handler, ret_from_crit_exc) /* Programmable Interval Timer (PIT) Exception. 
(from 0x1000) */ Decrementer: @@ -673,8 +672,7 @@ WDTException: addi r3,r1,STACK_FRAME_OVERHEAD; EXC_XFER_TEMPLATE(WatchdogException, 0x1020+2, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), - NOCOPY, crit_transfer_to_handler, - ret_from_crit_exc) + crit_transfer_to_handler, ret_from_crit_exc) /* * The other Data TLB exceptions bail out to this point diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 264976c43f344..56dd1341eb3df 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -217,8 +217,7 @@ END_BTB_FLUSH_SECTION CRITICAL_EXCEPTION_PROLOG(intno); \ addi r3,r1,STACK_FRAME_OVERHEAD; \ EXC_XFER_TEMPLATE(hdlr, n+2, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ - NOCOPY, crit_transfer_to_handler, \ - ret_from_crit_exc) + crit_transfer_to_handler, ret_from_crit_exc) #define MCHECK_EXCEPTION(n, label, hdlr) \ START_EXCEPTION(label); \ @@ -227,32 +226,27 @@ END_BTB_FLUSH_SECTION stw r5,_ESR(r11); \ addi r3,r1,STACK_FRAME_OVERHEAD; \ EXC_XFER_TEMPLATE(hdlr, n+4, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \ - NOCOPY, mcheck_transfer_to_handler, \ - ret_from_mcheck_exc) + mcheck_transfer_to_handler, ret_from_mcheck_exc) -#define EXC_XFER_TEMPLATE(hdlr, trap, msr, copyee, tfer, ret) \ +#define EXC_XFER_TEMPLATE(hdlr, trap, msr, tfer, ret) \ li r10,trap; \ stw r10,_TRAP(r11); \ lis r10,msr@h; \ ori r10,r10,msr@l; \ - copyee(r10, r9); \ bl tfer; \ .long hdlr; \ .long ret -#define COPY_EE(d, s) rlwimi d,s,0,16,16 -#define NOCOPY(d, s) - #define EXC_XFER_STD(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, NOCOPY, transfer_to_handler_full, \ + EXC_XFER_TEMPLATE(hdlr, n, MSR_KERNEL, transfer_to_handler_full, \ ret_from_except_full) #define EXC_XFER_LITE(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, NOCOPY, transfer_to_handler, \ + EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, transfer_to_handler, \ ret_from_except) #define EXC_XFER_SYS(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, NOCOPY, transfer_to_handler, \ + EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, transfer_to_handler, \ ret_from_except) /* Check for a single step debug exception while in an exception @@ -319,7 +313,7 @@ END_BTB_FLUSH_SECTION /* continue normal handling for a debug exception... */ \ 2: mfspr r4,SPRN_DBSR; \ addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_TEMPLATE(DebugException, 0x2008, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), NOCOPY, debug_transfer_to_handler, ret_from_debug_exc) + EXC_XFER_TEMPLATE(DebugException, 0x2008, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), debug_transfer_to_handler, ret_from_debug_exc) #define DEBUG_CRIT_EXCEPTION \ START_EXCEPTION(DebugCrit); \ @@ -372,7 +366,7 @@ END_BTB_FLUSH_SECTION /* continue normal handling for a critical exception... */ \ 2: mfspr r4,SPRN_DBSR; \ addi r3,r1,STACK_FRAME_OVERHEAD; \ - EXC_XFER_TEMPLATE(DebugException, 0x2002, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), NOCOPY, crit_transfer_to_handler, ret_from_crit_exc) + EXC_XFER_TEMPLATE(DebugException, 0x2002, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), crit_transfer_to_handler, ret_from_crit_exc) #define DATA_STORAGE_EXCEPTION \ START_EXCEPTION(DataStorage) \ From 40530db7c656119b1671aae5bc27811f66f5f424 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:39:01 +0000 Subject: [PATCH 148/205] powerpc: Fix 32-bit handling of MSR_EE on exceptions [text mostly copied from benh's RFC/WIP] ppc32 are still doing something rather gothic and wrong on 32-bit which we stopped doing on 64-bit a while ago. 
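In short (the full explanation follows): exception handlers used to
inherit the EE bit from the interrupted context. After this patch they
enter with interrupts hard-disabled and re-enable them explicitly once
it is safe, following the pattern below. This is a sketch with a
hypothetical handler name; arch_irq_disabled_regs() and
local_irq_enable() are the interfaces the series itself uses:

	void some_exception_handler(struct pt_regs *regs)
	{
		/* restore the interrupted context's interrupt state
		 * before doing any real work */
		if (!arch_irq_disabled_regs(regs))
			local_irq_enable();

		/* ... actual handling ... */
	}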
We have that thing where some handlers "copy" the EE value from the original stack frame into the new MSR before transferring to the handler. Thus for a number of exceptions, we enter the handlers with interrupts enabled. This is rather fishy, some of the stuff that handlers might do early on such as irq_enter/exit or user_exit, context tracking, etc... should be run with interrupts off afaik. Generally our handlers know when to re-enable interrupts if needed. The problem we were having is that we assumed these interrupts would return with interrupts enabled. However that isn't the case. Instead, this patch changes things so that we always enter exception handlers with interrupts *off* with the notable exception of syscalls which are special (and get a fast path). Suggested-by: Benjamin Herrenschmidt Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_32.S | 116 +++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 49 deletions(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index d0cea3deb86c0..0c555f9f15435 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -37,6 +37,7 @@ #include #include #include +#include #include "head_32.h" @@ -206,19 +207,42 @@ transfer_to_handler_cont: mtspr SPRN_NRI, r0 #endif #ifdef CONFIG_TRACE_IRQFLAGS + /* + * When tracing IRQ state (lockdep) we enable the MMU before we call + * the IRQ tracing functions as they might access vmalloc space or + * perform IOs for console output. + * + * To speed up the syscall path where interrupts stay on, let's check + * first if we are changing the MSR value at all. + */ + tophys(r12, r1) + lwz r12,_MSR(r12) + xor r12,r10,r12 + andi. r12,r12,MSR_EE + bne 1f + + /* MSR isn't changing, just transition directly */ +#endif + mtspr SPRN_SRR0,r11 + mtspr SPRN_SRR1,r10 + mtlr r9 + SYNC + RFI /* jump to handler, enable MMU */ + +#ifdef CONFIG_TRACE_IRQFLAGS +1: /* MSR is changing, re-enable MMU so we can notify lockdep. We need to + * keep interrupts disabled at this point otherwise we might risk + * taking an interrupt before we tell lockdep they are enabled. + */ lis r12,reenable_mmu@h ori r12,r12,reenable_mmu@l + LOAD_MSR_KERNEL(r0, MSR_KERNEL) mtspr SPRN_SRR0,r12 - mtspr SPRN_SRR1,r10 + mtspr SPRN_SRR1,r0 SYNC RFI -reenable_mmu: /* re-enable mmu so we can */ - mfmsr r10 - lwz r12,_MSR(r1) - xor r10,r10,r12 - andi. r10,r10,MSR_EE /* Did EE change? */ - beq 1f +reenable_mmu: /* * The trace_hardirqs_off will use CALLER_ADDR0 and CALLER_ADDR1. * If from user mode there is only one stack frame on the stack, and @@ -233,14 +257,24 @@ reenable_mmu: /* re-enable mmu so we can */ * they aren't useful past this point (aren't syscall arguments), * the rest is restored from the exception frame. */ + + /* Are we enabling or disabling interrupts ? */ + andi. 
r0,r10,MSR_EE + stwu r1,-32(r1) stw r9,8(r1) stw r11,12(r1) stw r3,16(r1) stw r4,20(r1) stw r5,24(r1) - bl trace_hardirqs_off - lwz r5,24(r1) + + bne- 0f + + /* If we are disabling interrupts (normal case), simply log it with + * lockdep + */ +1: bl trace_hardirqs_off +2: lwz r5,24(r1) lwz r4,20(r1) lwz r3,16(r1) lwz r11,12(r1) @@ -250,15 +284,22 @@ reenable_mmu: /* re-enable mmu so we can */ lwz r6,GPR6(r1) lwz r7,GPR7(r1) lwz r8,GPR8(r1) -1: mtctr r11 + mtctr r11 mtlr r9 bctr /* jump to handler */ -#else /* CONFIG_TRACE_IRQFLAGS */ - mtspr SPRN_SRR0,r11 - mtspr SPRN_SRR1,r10 - mtlr r9 - SYNC - RFI /* jump to handler, enable MMU */ + + /* If we are enabling interrupt, this is a syscall. They shouldn't + * happen while interrupts are disabled, so let's do a warning here. + */ +0: trap + EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING + bl trace_hardirqs_on + + /* Now enable for real */ + mfmsr r10 + ori r10,r10,MSR_EE + mtmsr r10 + b 2b #endif /* CONFIG_TRACE_IRQFLAGS */ #if defined (CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500) @@ -316,29 +357,13 @@ _GLOBAL(DoSyscall) rlwinm r11,r11,0,4,2 stw r11,_CCR(r1) #ifdef CONFIG_TRACE_IRQFLAGS - /* Return from syscalls can (and generally will) hard enable - * interrupts. You aren't supposed to call a syscall with - * interrupts disabled in the first place. However, to ensure - * that we get it right vs. lockdep if it happens, we force - * that hard enable here with appropriate tracing if we see - * that we have been called with interrupts off - */ + /* Make sure interrupts are enabled */ mfmsr r11 andi. r12,r11,MSR_EE - bne+ 1f - /* We came in with interrupts disabled, we enable them now */ - bl trace_hardirqs_on - mfmsr r11 - lwz r0,GPR0(r1) - lwz r3,GPR3(r1) - lwz r4,GPR4(r1) - ori r11,r11,MSR_EE - lwz r5,GPR5(r1) - lwz r6,GPR6(r1) - lwz r7,GPR7(r1) - lwz r8,GPR8(r1) - mtmsr r11 -1: + /* We came in with interrupts disabled, we WARN and mark them enabled + * for lockdep now */ +0: tweqi r12, 0 + EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING #endif /* CONFIG_TRACE_IRQFLAGS */ lwz r11,TI_FLAGS(r2) andi. r11,r11,_TIF_SYSCALL_DOTRACE @@ -392,8 +417,7 @@ syscall_exit_cont: lwz r8,_MSR(r1) #ifdef CONFIG_TRACE_IRQFLAGS /* If we are going to return from the syscall with interrupts - * off, we trace that here. It shouldn't happen though but we - * want to catch the bugger if it does right ? + * off, we trace that here. It shouldn't normally happen. */ andi. r10,r8,MSR_EE bne+ 1f @@ -918,13 +942,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x) * off in this assembly code while peeking at TI_FLAGS() and such. However * we need to inform it if the exception turned interrupts off, and we * are about to trun them back on. - * - * The problem here sadly is that we don't know whether the exceptions was - * one that turned interrupts off or not. So we always tell lockdep about - * turning them on here when we go back to wherever we came from with EE - * on, even if that may meen some redudant calls being tracked. Maybe later - * we could encode what the exception did somewhere or test the exception - * type in the pt_regs but that sounds overkill */ andi. r10,r9,MSR_EE beq 1f @@ -1212,9 +1229,10 @@ do_work: /* r10 contains MSR_KERNEL here */ beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ - /* Note: We don't need to inform lockdep that we are enabling - * interrupts here. 
As far as it knows, they are already enabled - */ +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_on + mfmsr r10 +#endif ori r10,r10,MSR_EE SYNC MTMSRD(r10) /* hard-enable interrupts */ From b86fb88855ea7881314b935df1df6b1ef1bd0c32 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:39:02 +0000 Subject: [PATCH 149/205] powerpc/32: implement fast entry for syscalls on non BOOKE This patch implements a fast entry for syscalls. Syscalls don't have to preserve non-volatile registers except LR, so this entry lets the volatile registers get clobbered. As this entry is dedicated to syscalls, it always sets MSR_EE and warns in case MSR_EE was previously off. It also assumes that the call is always from user mode; system calls are unexpected from the kernel. The overall series improves the null_syscall selftest by 12.5% on an 83xx and by 17% on an 8xx. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_32.S | 32 +++++++++++++ arch/powerpc/kernel/head_32.S | 3 +- arch/powerpc/kernel/head_32.h | 85 ++++++++++++++++++++++++++++++++-- arch/powerpc/kernel/head_40x.S | 3 +- arch/powerpc/kernel/head_8xx.S | 3 +- 5 files changed, 116 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 0c555f9f15435..184cc1de2f37e 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -342,6 +342,35 @@ stack_ovf: SYNC RFI +#ifndef CONFIG_BOOKE /* to be removed once BOOKE uses fast syscall entry */ +#ifdef CONFIG_TRACE_IRQFLAGS +trace_syscall_entry_irq_off: + /* + * Syscall shouldn't happen while interrupts are disabled, + * so let's do a warning here. + */ +0: trap + EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING + bl trace_hardirqs_on + + /* Now enable for real */ + LOAD_MSR_KERNEL(r10, MSR_KERNEL | MSR_EE) + mtmsr r10 + + REST_GPR(0, r1) + REST_4GPRS(3, r1) + REST_2GPRS(7, r1) + b DoSyscall +#endif /* CONFIG_TRACE_IRQFLAGS */ + + .globl transfer_to_syscall +transfer_to_syscall: +#ifdef CONFIG_TRACE_IRQFLAGS + andi. r12,r9,MSR_EE + beq- trace_syscall_entry_irq_off +#endif /* CONFIG_TRACE_IRQFLAGS */ +#endif /* !CONFIG_BOOKE */ + /* * Handle a system call. */ @@ -353,9 +382,11 @@ _GLOBAL(DoSyscall) stw r3,ORIG_GPR3(r1) li r12,0 stw r12,RESULT(r1) +#ifdef CONFIG_BOOKE /* to be removed once BOOKE uses fast syscall entry */ lwz r11,_CCR(r1) /* Clear SO bit in CR */ rlwinm r11,r11,0,4,2 stw r11,_CCR(r1) +#endif #ifdef CONFIG_TRACE_IRQFLAGS /* Make sure interrupts are enabled */ mfmsr r11 @@ -1219,6 +1250,7 @@ load_dbcr0: .section .bss .align 4 + .global global_dbcr0 global_dbcr0: .space 8*NR_CPUS .previous diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 7f5555e362a1b..755fab9641d61 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -374,8 +374,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_FPU_UNAVAILABLE) .
= 0xc00 DO_KVM 0xc00 SystemCall: - EXCEPTION_PROLOG - EXC_XFER_SYS(0xc00, DoSyscall) + SYSCALL_ENTRY 0xc00 /* Single step - not used on 601 */ EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD) diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 14cb0af2f4946..4a692553651f0 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -73,6 +73,87 @@ SAVE_2GPRS(7, r11) .endm +.macro SYSCALL_ENTRY trapno + mfspr r12,SPRN_SPRG_THREAD + mfcr r10 + lwz r11,TASK_STACK-THREAD(r12) + mflr r9 + addi r11,r11,THREAD_SIZE - INT_FRAME_SIZE + rlwinm r10,r10,0,4,2 /* Clear SO bit in CR */ + tophys(r11,r11) + stw r10,_CCR(r11) /* save registers */ + mfspr r10,SPRN_SRR0 + stw r9,_LINK(r11) + mfspr r9,SPRN_SRR1 + stw r1,GPR1(r11) + stw r1,0(r11) + tovirt(r1,r11) /* set new kernel sp */ + stw r10,_NIP(r11) +#ifdef CONFIG_40x + rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ +#else + LOAD_MSR_KERNEL(r10, MSR_KERNEL & ~(MSR_IR|MSR_DR)) /* can take exceptions */ + MTMSRD(r10) /* (except for mach check in rtas) */ +#endif + lis r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ + stw r2,GPR2(r11) + addi r10,r10,STACK_FRAME_REGS_MARKER@l + stw r9,_MSR(r11) + li r2, \trapno + 1 + stw r10,8(r11) + stw r2,_TRAP(r11) + SAVE_GPR(0, r11) + SAVE_4GPRS(3, r11) + SAVE_2GPRS(7, r11) + addi r11,r1,STACK_FRAME_OVERHEAD + addi r2,r12,-THREAD + stw r11,PT_REGS(r12) +#if defined(CONFIG_40x) + /* Check to see if the dbcr0 register is set up to debug. Use the + internal debug mode bit to do this. */ + lwz r12,THREAD_DBCR0(r12) + andis. r12,r12,DBCR0_IDM@h +#endif + ACCOUNT_CPU_USER_ENTRY(r2, r11, r12) +#if defined(CONFIG_40x) + beq+ 3f + /* From user and task is ptraced - load up global dbcr0 */ + li r12,-1 /* clear all pending debug events */ + mtspr SPRN_DBSR,r12 + lis r11,global_dbcr0@ha + tophys(r11,r11) + addi r11,r11,global_dbcr0@l + lwz r12,0(r11) + mtspr SPRN_DBCR0,r12 + lwz r12,4(r11) + addi r12,r12,-1 + stw r12,4(r11) +#endif + +3: + tovirt(r2, r2) /* set r2 to current */ + lis r11, transfer_to_syscall@h + ori r11, r11, transfer_to_syscall@l +#ifdef CONFIG_TRACE_IRQFLAGS + /* + * If MSR is changing we need to keep interrupts disabled at this point + * otherwise we might risk taking an interrupt before we tell lockdep + * they are enabled. + */ + LOAD_MSR_KERNEL(r10, MSR_KERNEL) + rlwimi r10, r9, 0, MSR_EE +#else + LOAD_MSR_KERNEL(r10, MSR_KERNEL | MSR_EE) +#endif +#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) + mtspr SPRN_NRI, r0 +#endif + mtspr SPRN_SRR1,r10 + mtspr SPRN_SRR0,r11 + SYNC + RFI /* jump to handler, enable MMU */ +.endm + /* * Note: code which follows this uses cr0.eq (set if from kernel), * r11, r12 (SRR0), and r9 (SRR1). 
@@ -119,8 +200,4 @@ EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, transfer_to_handler, \ ret_from_except) -#define EXC_XFER_SYS(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, transfer_to_handler, \ - ret_from_except) - #endif /* __HEAD_32_H__ */ diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index fc332d33112e5..cf54b784100dc 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -348,8 +348,7 @@ _ENTRY(saved_ksp_limit) /* 0x0C00 - System Call Exception */ START_EXCEPTION(0x0C00, SystemCall) - EXCEPTION_PROLOG - EXC_XFER_SYS(0xc00, DoSyscall) + SYSCALL_ENTRY 0xc00 EXCEPTION(0x0D00, Trap_0D, unknown_exception, EXC_XFER_STD) EXCEPTION(0x0E00, Trap_0E, unknown_exception, EXC_XFER_STD) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 16b4791a2a9fa..885be7f3d29af 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -185,8 +185,7 @@ Alignment: /* System call */ . = 0xc00 SystemCall: - EXCEPTION_PROLOG - EXC_XFER_SYS(0xc00, DoSyscall) + SYSCALL_ENTRY 0xc00 /* Single step - not used on 601 */ EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD) From 1a4b739bbb4f8857d1b4feb46d6b3ec72269c111 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:39:03 +0000 Subject: [PATCH 150/205] powerpc/32: implement fast entry for syscalls on BOOKE This patch implements a fast entry for syscalls. Syscalls don't have to preserve non-volatile registers except LR, so this entry lets the volatile registers get clobbered. As this entry is dedicated to syscalls, it always sets MSR_EE and warns in case MSR_EE was previously off. It also assumes that the call is always from user mode; system calls are unexpected from the kernel. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_32.S | 7 -- arch/powerpc/kernel/head_44x.S | 3 +- arch/powerpc/kernel/head_booke.h | 103 +++++++++++++++++++++++++-- arch/powerpc/kernel/head_fsl_booke.S | 3 +- 4 files changed, 100 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 184cc1de2f37e..dc58fec51ed64 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -342,7 +342,6 @@ stack_ovf: SYNC RFI -#ifndef CONFIG_BOOKE /* to be removed once BOOKE uses fast syscall entry */ #ifdef CONFIG_TRACE_IRQFLAGS trace_syscall_entry_irq_off: /* @@ -369,7 +368,6 @@ transfer_to_syscall: andi. r12,r9,MSR_EE beq- trace_syscall_entry_irq_off #endif /* CONFIG_TRACE_IRQFLAGS */ -#endif /* !CONFIG_BOOKE */ /* * Handle a system call.
@@ -382,11 +380,6 @@ _GLOBAL(DoSyscall) stw r3,ORIG_GPR3(r1) li r12,0 stw r12,RESULT(r1) -#ifdef CONFIG_BOOKE /* to be removed once BOOKE uses fast syscall entry */ - lwz r11,_CCR(r1) /* Clear SO bit in CR */ - rlwinm r11,r11,0,4,2 - stw r11,_CCR(r1) -#endif #ifdef CONFIG_TRACE_IRQFLAGS /* Make sure interrupts are enabled */ mfmsr r11 diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index 0381fdb294a60..f15fba58c7440 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -285,8 +285,7 @@ interrupt_base: #endif /* System Call Interrupt */ START_EXCEPTION(SystemCall) - NORMAL_EXCEPTION_PROLOG(BOOKE_INTERRUPT_SYSCALL) - EXC_XFER_SYS(0x0c00, DoSyscall) + SYSCALL_ENTRY 0xc00 BOOKE_INTERRUPT_SYSCALL /* Auxiliary Processor Unavailable Interrupt */ EXCEPTION(0x2020, BOOKE_INTERRUPT_AP_UNAVAIL, \ diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 56dd1341eb3df..bfeb469e81060 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -6,6 +6,8 @@ #include #include +#ifdef __ASSEMBLY__ + /* * Macros used for common Book-e exception handling */ @@ -81,6 +83,101 @@ END_BTB_FLUSH_SECTION SAVE_4GPRS(3, r11); \ SAVE_2GPRS(7, r11) +.macro SYSCALL_ENTRY trapno intno + mfspr r10, SPRN_SPRG_THREAD +#ifdef CONFIG_KVM_BOOKE_HV +BEGIN_FTR_SECTION + mtspr SPRN_SPRG_WSCRATCH0, r10 + stw r11, THREAD_NORMSAVE(0)(r10) + stw r13, THREAD_NORMSAVE(2)(r10) + mfcr r13 /* save CR in r13 for now */ + mfspr r11, SPRN_SRR1 + mtocrf 0x80, r11 /* check MSR[GS] without clobbering reg */ + bf 3, 1975f + b kvmppc_handler_BOOKE_INTERRUPT_\intno\()_SPRN_SRR1 +1975: + mr r12, r13 + lwz r13, THREAD_NORMSAVE(2)(r10) +FTR_SECTION_ELSE +#endif + mfcr r12 +#ifdef CONFIG_KVM_BOOKE_HV +ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) +#endif + BOOKE_CLEAR_BTB(r11) + lwz r11, TASK_STACK - THREAD(r10) + rlwinm r12,r12,0,4,2 /* Clear SO bit in CR */ + ALLOC_STACK_FRAME(r11, THREAD_SIZE - INT_FRAME_SIZE) + stw r12, _CCR(r11) /* save various registers */ + mflr r12 + stw r12,_LINK(r11) + mfspr r12,SPRN_SRR0 + stw r1, GPR1(r11) + mfspr r9,SPRN_SRR1 + stw r1, 0(r11) + mr r1, r11 + stw r12,_NIP(r11) + rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ + lis r12, STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ + stw r2,GPR2(r11) + addi r12, r12, STACK_FRAME_REGS_MARKER@l + stw r9,_MSR(r11) + li r2, \trapno + 1 + stw r12, 8(r11) + stw r2,_TRAP(r11) + SAVE_GPR(0, r11) + SAVE_4GPRS(3, r11) + SAVE_2GPRS(7, r11) + + addi r11,r1,STACK_FRAME_OVERHEAD + addi r2,r10,-THREAD + stw r11,PT_REGS(r10) + /* Check to see if the dbcr0 register is set up to debug. Use the + internal debug mode bit to do this. */ + lwz r12,THREAD_DBCR0(r10) + andis. r12,r12,DBCR0_IDM@h + ACCOUNT_CPU_USER_ENTRY(r2, r11, r12) + beq+ 3f + /* From user and task is ptraced - load up global dbcr0 */ + li r12,-1 /* clear all pending debug events */ + mtspr SPRN_DBSR,r12 + lis r11,global_dbcr0@ha + tophys(r11,r11) + addi r11,r11,global_dbcr0@l +#ifdef CONFIG_SMP + lwz r9,TASK_CPU(r2) + slwi r9,r9,3 + add r11,r11,r9 +#endif + lwz r12,0(r11) + mtspr SPRN_DBCR0,r12 + lwz r12,4(r11) + addi r12,r12,-1 + stw r12,4(r11) + +3: + tovirt(r2, r2) /* set r2 to current */ + lis r11, transfer_to_syscall@h + ori r11, r11, transfer_to_syscall@l +#ifdef CONFIG_TRACE_IRQFLAGS + /* + * If MSR is changing we need to keep interrupts disabled at this point + * otherwise we might risk taking an interrupt before we tell lockdep + * they are enabled. 
+ */ + lis r10, MSR_KERNEL@h + ori r10, r10, MSR_KERNEL@l + rlwimi r10, r9, 0, MSR_EE +#else + lis r10, (MSR_KERNEL | MSR_EE)@h + ori r10, r10, (MSR_KERNEL | MSR_EE)@l +#endif + mtspr SPRN_SRR1,r10 + mtspr SPRN_SRR0,r11 + SYNC + RFI /* jump to handler, enable MMU */ +.endm + /* To handle the additional exception priority levels on 40x and Book-E * processors we allocate a stack per additional priority level. * @@ -245,10 +342,6 @@ END_BTB_FLUSH_SECTION EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, transfer_to_handler, \ ret_from_except) -#define EXC_XFER_SYS(n, hdlr) \ - EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL | MSR_EE, transfer_to_handler, \ - ret_from_except) - /* Check for a single step debug exception while in an exception * handler before state has been saved. This is to catch the case * where an instruction that we are trying to single step causes @@ -418,7 +511,7 @@ END_BTB_FLUSH_SECTION 1: addi r3,r1,STACK_FRAME_OVERHEAD; \ EXC_XFER_STD(0x800, kernel_fp_unavailable_exception) -#ifndef __ASSEMBLY__ +#else /* __ASSEMBLY__ */ struct exception_regs { unsigned long mas0; unsigned long mas1; diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index df980ad0f95f5..6621f230cc37b 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -413,8 +413,7 @@ interrupt_base: /* System Call Interrupt */ START_EXCEPTION(SystemCall) - NORMAL_EXCEPTION_PROLOG(SYSCALL) - EXC_XFER_SYS(0x0c00, DoSyscall) + SYSCALL_ENTRY 0xc00 SYSCALL /* Auxiliary Processor Unavailable Interrupt */ EXCEPTION(0x2900, AP_UNAVAIL, AuxillaryProcessorUnavailable, \ From 38b4564cf042ad3f5333692687023803c1ab1112 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:39:04 +0000 Subject: [PATCH 151/205] powerpc/32: don't do syscall stuff in transfer_to_handler As syscalls are now handled via a fast entry path, syscall related actions can be removed from the generic transfer_to_handler path. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_32.S | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index dc58fec51ed64..e65c3e70c6489 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -217,7 +217,6 @@ transfer_to_handler_cont: */ tophys(r12, r1) lwz r12,_MSR(r12) - xor r12,r10,r12 andi. r12,r12,MSR_EE bne 1f @@ -258,9 +257,6 @@ reenable_mmu: * the rest is restored from the exception frame. */ - /* Are we enabling or disabling interrupts ? */ - andi. r0,r10,MSR_EE - stwu r1,-32(r1) stw r9,8(r1) stw r11,12(r1) @@ -268,8 +264,6 @@ reenable_mmu: stw r4,20(r1) stw r5,24(r1) - bne- 0f - /* If we are disabling interrupts (normal case), simply log it with * lockdep */ @@ -287,19 +281,6 @@ reenable_mmu: mtctr r11 mtlr r9 bctr /* jump to handler */ - - /* If we are enabling interrupt, this is a syscall. They shouldn't - * happen while interrupts are disabled, so let's do a warning here. 
- */ -0: trap - EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING - bl trace_hardirqs_on - - /* Now enable for real */ - mfmsr r10 - ori r10,r10,MSR_EE - mtmsr r10 - b 2b #endif /* CONFIG_TRACE_IRQFLAGS */ #if defined (CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500) From d1865e71cdc9b75b6a6716a2983eb5d6004cfca9 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 30 Apr 2019 12:39:05 +0000 Subject: [PATCH 152/205] powerpc/32: Don't add dummy frames when calling trace_hardirqs_on/off No need to add dummy frames when calling trace_hardirqs_on or trace_hardirqs_off. GCC properly handles empty stacks. In addition, powerpc doesn't set CONFIG_FRAME_POINTER, therefore __builtin_return_address(1..) returns NULL at all times. So the dummy frames are definitely unneeded here. While at it, avoid reading memory for loading r1 with a value we already know. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_32.S | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index e65c3e70c6489..235a01d34b6d0 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -243,12 +243,7 @@ transfer_to_handler_cont: reenable_mmu: /* - * The trace_hardirqs_off will use CALLER_ADDR0 and CALLER_ADDR1. - * If from user mode there is only one stack frame on the stack, and - * accessing CALLER_ADDR1 will cause oops. So we need create a dummy - * stack frame to make trace_hardirqs_off happy. - * - * This is handy because we also need to save a bunch of GPRs, + * We save a bunch of GPRs, * r3 can be different from GPR3(r1) at this point, r9 and r11 * contains the old MSR and handler address respectively, * r4 & r5 can contain page fault arguments that need to be passed @@ -950,18 +945,11 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x) */ andi. r10,r9,MSR_EE beq 1f - /* - * Since the ftrace irqsoff latency trace checks CALLER_ADDR1, - * which is the stack frame here, we need to force a stack frame - * in case we came from user space.
- */ stwu r1,-32(r1) mflr r0 stw r0,4(r1) - stwu r1,-32(r1) bl trace_hardirqs_on - lwz r1,0(r1) - lwz r1,0(r1) + addi r1, r1, 32 lwz r9,_MSR(r1) 1: #endif /* CONFIG_TRACE_IRQFLAGS */ From 9c1d38b34e944cace44e0d2bea0beb5601a4d36d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 22 Mar 2019 08:08:39 +0000 Subject: [PATCH 153/205] powerpc/fadump: define an empty fadump_cleanup() To avoid #ifdefs, define a static inline fadump_cleanup() function for when CONFIG_FA_DUMP is not selected. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/fadump.h | 1 + arch/powerpc/kernel/setup-common.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 188776befaf9b..e2099c0a15c3a 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -219,5 +219,6 @@ extern void fadump_cleanup(void); static inline int is_fadump_active(void) { return 0; } static inline int should_fadump_crash(void) { return 0; } static inline void crash_fadump(struct pt_regs *regs, const char *str) { } +static inline void fadump_cleanup(void) { } #endif #endif diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 3f8805d3c0c99..13054980e11a2 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -134,13 +134,11 @@ int crashing_cpu = -1; /* also used by kexec */ void machine_shutdown(void) { -#ifdef CONFIG_FA_DUMP /* * if fadump is active, cleanup the fadump registration before we * shutdown. */ fadump_cleanup(); -#endif if (ppc_md.machine_shutdown) ppc_md.machine_shutdown(); From 93f2cd813797baf5590459fb0439c62e873b7748 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 22 Mar 2019 08:08:40 +0000 Subject: [PATCH 154/205] powerpc/mm: define an empty mm_iommu_init() To avoid ifdefs, define an empty static inline mm_iommu_init() function for when CONFIG_SPAPR_TCE_IOMMU is not selected. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mmu_context.h | 1 + arch/powerpc/kernel/setup-common.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 66a3805dc9352..611204e588b9c 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -52,6 +52,7 @@ static inline bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa, { return false; } +static inline void mm_iommu_init(struct mm_struct *mm) { } #endif extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm); extern void set_context(unsigned long id, pgd_t *pgd); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 13054980e11a2..d06d50fe1e7e6 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -931,9 +931,7 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = klimit; -#ifdef CONFIG_SPAPR_TCE_IOMMU mm_iommu_init(&init_mm); -#endif irqstack_early_init(); exc_lvl_early_init(); emergency_stack_init(); From e9e9b25a4c99eec0e678c78124ae79764d8f777a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 22 Mar 2019 08:08:42 +0000 Subject: [PATCH 155/205] powerpc/setup: Remove unnecessary #ifdef CONFIG_ALTIVEC CPU_FTR_ALTIVEC is only set when CONFIG_ALTIVEC is selected, so the ifdef is unnecessary.
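The same #ifdef-avoidance idea drives the two fadump/iommu cleanups above: give the helper a definition in both configurations so that callers need no guard. A minimal sketch of the pattern (hypothetical helper, not from the tree):

#ifdef CONFIG_FOO
void foo_cleanup(void);				/* real implementation lives in foo.c */
#else
static inline void foo_cleanup(void) { }	/* empty stub, compiles away to nothing */
#endif

Callers then invoke foo_cleanup() unconditionally; with CONFIG_FOO=n the compiler discards the empty inline entirely. The ALTIVEC case is the run-time analogue: cpu_has_feature(CPU_FTR_ALTIVEC) can only return true when CONFIG_ALTIVEC is selected, so the compile-time guard adds nothing.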
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index d06d50fe1e7e6..3cb3774f380a6 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -251,10 +251,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) else seq_printf(m, "unknown (%08x)", pvr); -#ifdef CONFIG_ALTIVEC if (cpu_has_feature(CPU_FTR_ALTIVEC)) seq_printf(m, ", altivec supported"); -#endif /* CONFIG_ALTIVEC */ seq_printf(m, "\n"); From b5064efee2211f83b98a6a69e7319257c8411221 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 22 Mar 2019 08:08:43 +0000 Subject: [PATCH 156/205] powerpc/setup: cleanup ifdef mess in check_cache_coherency() Use IS_ENABLED() instead of #ifdefs Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 3cb3774f380a6..6d3ebc40d21c0 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -737,23 +737,19 @@ void __init setup_panic(void) * BUG() in that case. */ -#ifdef CONFIG_NOT_COHERENT_CACHE -#define KERNEL_COHERENCY 0 -#else -#define KERNEL_COHERENCY 1 -#endif +#define KERNEL_COHERENCY (!IS_ENABLED(CONFIG_NOT_COHERENT_CACHE)) static int __init check_cache_coherency(void) { struct device_node *np; const void *prop; - int devtree_coherency; + bool devtree_coherency; np = of_find_node_by_path("/"); prop = of_get_property(np, "coherency-off", NULL); of_node_put(np); - devtree_coherency = prop ? 0 : 1; + devtree_coherency = prop ? 
false : true; if (devtree_coherency != KERNEL_COHERENCY) { printk(KERN_ERR From 48018e42e5c70c8ac4b222cc76af1a15ea2e09e7 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 22 Mar 2019 08:08:44 +0000 Subject: [PATCH 157/205] powerpc/setup: cleanup the #ifdef CONFIG_TAU block Use cpu_has_feature() instead of open-coding it. Use IS_ENABLED() instead of #ifdef for CONFIG_TAU_AVERAGE. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 6d3ebc40d21c0..c755fe6ec8ef5 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -257,18 +257,18 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "\n"); #ifdef CONFIG_TAU - if (cur_cpu_spec->cpu_features & CPU_FTR_TAU) { -#ifdef CONFIG_TAU_AVERAGE - /* more straightforward, but potentially misleading */ - seq_printf(m, "temperature \t: %u C (uncalibrated)\n", - cpu_temp(cpu_id)); -#else - /* show the actual temp sensor range */ - u32 temp; - temp = cpu_temp_both(cpu_id); - seq_printf(m, "temperature \t: %u-%u C (uncalibrated)\n", - temp & 0xff, temp >> 16); -#endif + if (cpu_has_feature(CPU_FTR_TAU)) { + if (IS_ENABLED(CONFIG_TAU_AVERAGE)) { + /* more straightforward, but potentially misleading */ + seq_printf(m, "temperature \t: %u C (uncalibrated)\n", + cpu_temp(cpu_id)); + } else { + /* show the actual temp sensor range */ + u32 temp; + temp = cpu_temp_both(cpu_id); + seq_printf(m, "temperature \t: %u-%u C (uncalibrated)\n", + temp & 0xff, temp >> 16); + } } #endif /* CONFIG_TAU */ From 65184f2f045abc0eb35f934f6cbf7e23b9875e7c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 22 Mar 2019 08:08:45 +0000 Subject: [PATCH 158/205] powerpc/setup: replace ifdefs by IS_ENABLED() wherever possible. Compared to ifdefs, IS_ENABLED() provides cleaner code and allows compilation failures to be detected regardless of the selected options.
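For readers unfamiliar with the macro: IS_ENABLED(CONFIG_FOO) evaluates to 1 when the option is y or m and to 0 otherwise, so the guarded code is parsed and type-checked in every configuration, and the dead branch is removed by the optimiser rather than hidden from the compiler. A minimal sketch (hypothetical option):

	if (IS_ENABLED(CONFIG_FOO))
		pr_info("foo enabled\n");	/* still compile-checked when CONFIG_FOO=n */

This is what lets the conversion detect compilation failures in all configurations.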
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 39 ++++++++++++++---------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index c755fe6ec8ef5..aad9f5df6ab6d 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -199,14 +199,15 @@ static void show_cpuinfo_summary(struct seq_file *m) { struct device_node *root; const char *model = NULL; -#if defined(CONFIG_SMP) && defined(CONFIG_PPC32) unsigned long bogosum = 0; int i; - for_each_online_cpu(i) - bogosum += loops_per_jiffy; - seq_printf(m, "total bogomips\t: %lu.%02lu\n", - bogosum/(500000/HZ), bogosum/(5000/HZ) % 100); -#endif /* CONFIG_SMP && CONFIG_PPC32 */ + + if (IS_ENABLED(CONFIG_SMP) && IS_ENABLED(CONFIG_PPC32)) { + for_each_online_cpu(i) + bogosum += loops_per_jiffy; + seq_printf(m, "total bogomips\t: %lu.%02lu\n", + bogosum / (500000 / HZ), bogosum / (5000 / HZ) % 100); + } seq_printf(m, "timebase\t: %lu\n", ppc_tb_freq); if (ppc_md.name) seq_printf(m, "platform\t: %s\n", ppc_md.name); @@ -220,11 +221,10 @@ static void show_cpuinfo_summary(struct seq_file *m) if (ppc_md.show_cpuinfo != NULL) ppc_md.show_cpuinfo(m); -#ifdef CONFIG_PPC32 /* Display the amount of memory */ - seq_printf(m, "Memory\t\t: %d MB\n", - (unsigned int)(total_memory / (1024 * 1024))); -#endif + if (IS_ENABLED(CONFIG_PPC32)) + seq_printf(m, "Memory\t\t: %d MB\n", + (unsigned int)(total_memory / (1024 * 1024))); } static int show_cpuinfo(struct seq_file *m, void *v) @@ -332,11 +332,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "revision\t: %hd.%hd (pvr %04x %04x)\n", maj, min, PVR_VER(pvr), PVR_REV(pvr)); -#ifdef CONFIG_PPC32 - seq_printf(m, "bogomips\t: %lu.%02lu\n", - loops_per_jiffy / (500000/HZ), - (loops_per_jiffy / (5000/HZ)) % 100); -#endif + if (IS_ENABLED(CONFIG_PPC32)) + seq_printf(m, "bogomips\t: %lu.%02lu\n", loops_per_jiffy / (500000 / HZ), + (loops_per_jiffy / (5000 / HZ)) % 100); + seq_printf(m, "\n"); /* If this is the last cpu, print the summary */ @@ -934,9 +933,9 @@ void __init setup_arch(char **cmdline_p) early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT); -#ifdef CONFIG_DUMMY_CONSOLE - conswitchp = &dummy_con; -#endif + if (IS_ENABLED(CONFIG_DUMMY_CONSOLE)) + conswitchp = &dummy_con; + if (ppc_md.setup_arch) ppc_md.setup_arch(); @@ -948,10 +947,8 @@ void __init setup_arch(char **cmdline_p) /* Initialize the MMU context management stuff. */ mmu_context_init(); -#ifdef CONFIG_PPC64 /* Interrupt code needs to be 64K-aligned. */ - if ((unsigned long)_stext & 0xffff) + if (IS_ENABLED(CONFIG_PPC64) && (unsigned long)_stext & 0xffff) panic("Kernelbase not 64K-aligned (0x%lx)!\n", (unsigned long)_stext); -#endif } From 502523fd1d2ac559b41d8302dc9f826f578ec54d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 9 Mar 2019 18:47:27 +0100 Subject: [PATCH 159/205] powerpc/irq: drop __irq_offset_value This patch drops __irq_offset_value, which has not been used since commit 9c4cb8251513 ("powerpc: Remove use of CONFIG_PPC_MERGE"). This removes a sparse warning.
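The warning in question is presumably sparse's usual complaint about a global defined without a declaration; the pattern reproduces as (hypothetical symbol):

/* foo.c: no header declares this, and nothing reads it */
int __foo_offset_value;		/* sparse: symbol '__foo_offset_value' was not declared. Should it be static? */
EXPORT_SYMBOL(__foo_offset_value);

Since the last user is long gone, deleting the definition, rather than adding a declaration or making it static, is the right fix.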
Fixes: 9c4cb8251513 ("powerpc: Remove use of CONFIG_PPC_MERGE") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/irq.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 8a936723c791b..6672fec75e2a7 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -81,10 +81,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); -int __irq_offset_value; - #ifdef CONFIG_PPC32 -EXPORT_SYMBOL(__irq_offset_value); atomic_t ppc_n_lost_interrupts; #ifdef CONFIG_TAU_INT From e2b36d591720d81741f37e047a6f0047e8c89369 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 2 May 2019 15:21:07 +1000 Subject: [PATCH 160/205] powerpc/64: Don't trace code that runs with the soft irq mask unreconciled "Reconciling", in terms of interrupt handling, means bringing the soft irq mask state into sync with the hardware after an interrupt causes MSR[EE] to be cleared (while the soft mask may be enabled, and hard irqs not marked disabled). General kernel code should not be called while unreconciled, because local_irq_disable, etc. manipulations can cause surprising irq traces, and it's fragile because the soft irq code does not really expect to be called in this situation. When exiting from an interrupt, MSR[EE] is cleared to prevent races, but soft irq state is enabled for the returned-to context, so this is now an unreconciled state. restore_math is called in this state, and that can be ftraced, and the ftrace subsystem disables local irqs. Mark restore_math and its callees as notrace. Restore a sanity check in the soft irq code that had to be disabled for this case, by commit 4da1f79227ad4 ("powerpc/64: Disable irq restore warning for now"). Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/fpu.S | 1 + arch/powerpc/kernel/irq.c | 13 +++---------- arch/powerpc/kernel/process.c | 18 +++++++++++++++--- arch/powerpc/kernel/vector.S | 1 + 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S index 529dcc21c3f91..cecd57e1d0468 100644 --- a/arch/powerpc/kernel/fpu.S +++ b/arch/powerpc/kernel/fpu.S @@ -63,6 +63,7 @@ _GLOBAL(load_fp_state) REST_32FPVSRS(0, R4, R3) blr EXPORT_SYMBOL(load_fp_state) +_ASM_NOKPROBE_SYMBOL(load_fp_state); /* used by restore_math */ /* * Store FP state into memory, including FPSCR diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 6672fec75e2a7..ada901af4950c 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -258,16 +258,9 @@ notrace void arch_local_irq_restore(unsigned long mask) */ irq_happened = get_irq_happened(); if (!irq_happened) { - /- * FIXME. Here we'd like to be able to do: - * - * #ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG - * WARN_ON(!(mfmsr() & MSR_EE)); - * #endif - * - * But currently it hits in a few paths, we should fix those and - * enable the warning.
- */ +#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG + WARN_ON(!(mfmsr() & MSR_EE)); +#endif return; } diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 0c20173570731..87da40129927c 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -134,7 +134,8 @@ static int __init enable_strict_msr_control(char *str) } early_param("ppc_strict_facility_enable", enable_strict_msr_control); -unsigned long msr_check_and_set(unsigned long bits) +/* notrace because it's called by restore_math */ +unsigned long notrace msr_check_and_set(unsigned long bits) { unsigned long oldmsr = mfmsr(); unsigned long newmsr; @@ -153,7 +154,8 @@ unsigned long msr_check_and_set(unsigned long bits) } EXPORT_SYMBOL_GPL(msr_check_and_set); -void __msr_check_and_clear(unsigned long bits) +/* notrace because it's called by restore_math */ +void notrace __msr_check_and_clear(unsigned long bits) { unsigned long oldmsr = mfmsr(); unsigned long newmsr; @@ -526,7 +528,17 @@ void giveup_all(struct task_struct *tsk) } EXPORT_SYMBOL(giveup_all); -void restore_math(struct pt_regs *regs) +/* + * The exception exit path calls restore_math() with interrupts hard disabled + * but the soft irq state not "reconciled". ftrace code that calls + * local_irq_save/restore causes warnings. + * + * Rather than complicate the exit path, just don't trace restore_math. This + * could be done by having ftrace entry code check for this un-reconciled + * condition where MSR[EE]=0 and PACA_IRQ_HARD_DIS is not set, and + * temporarily fix it up for the duration of the ftrace call. + */ +void notrace restore_math(struct pt_regs *regs) { unsigned long msr; diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S index 21165da0052d0..8eb867dbad5fd 100644 --- a/arch/powerpc/kernel/vector.S +++ b/arch/powerpc/kernel/vector.S @@ -21,6 +21,7 @@ _GLOBAL(load_vr_state) REST_32VRS(0,r4,r3) blr EXPORT_SYMBOL(load_vr_state) +_ASM_NOKPROBE_SYMBOL(load_vr_state); /* used by restore_math */ /* * Store VMX state into memory, including VSCR. From 4c1bd90477c60618eb6dfba2e92d3a287b0c6255 Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Thu, 2 May 2019 16:00:41 +1000 Subject: [PATCH 161/205] MAINTAINERS: Update cxl/ocxl email address Use my @linux.ibm.com email to avoid a layer of redirection. Signed-off-by: Andrew Donnellan Signed-off-by: Michael Ellerman --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index e9e3783fd3236..6e8885de90786 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4291,7 +4291,7 @@ F: drivers/net/ethernet/chelsio/cxgb4vf/ CXL (IBM Coherent Accelerator Processor Interface CAPI) DRIVER M: Frederic Barrat -M: Andrew Donnellan +M: Andrew Donnellan L: linuxppc-dev@lists.ozlabs.org S: Supported F: arch/powerpc/platforms/powernv/pci-cxl.c @@ -11170,7 +11170,7 @@ F: tools/objtool/ OCXL (Open Coherent Accelerator Processor Interface OpenCAPI) DRIVER M: Frederic Barrat -M: Andrew Donnellan +M: Andrew Donnellan L: linuxppc-dev@lists.ozlabs.org S: Supported F: arch/powerpc/platforms/powernv/ocxl.c From c9e0fc33b8be52a7134ed0ee79b6a1e332e1b9d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 30 Apr 2019 14:27:39 -0400 Subject: [PATCH 162/205] powerpc: remove the __kernel_io_end export This export was added in this merge window, but without any actual user, or justification for a modular user. 
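EXPORT_SYMBOL() only affects loadable modules; built-in code can reference any global symbol regardless. The distinction, shown with the two lines this patch touches:

unsigned long __kernel_io_end;	/* usable by built-in code via its header declaration */
EXPORT_SYMBOL(__kernel_io_end);	/* additionally usable by modules; this is the line being removed */

Dropping the export therefore cannot break any built-in user.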
Fixes: a35a3c6f6065 ("powerpc/mm/hash64: Add a variable to track the end of IO mapping") Signed-off-by: Christoph Hellwig Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable_64.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 95ed765194115..4c6a73782b19f 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -97,7 +97,6 @@ EXPORT_SYMBOL(__vmalloc_end); unsigned long __kernel_io_start; EXPORT_SYMBOL(__kernel_io_start); unsigned long __kernel_io_end; -EXPORT_SYMBOL(__kernel_io_end); struct page *vmemmap; EXPORT_SYMBOL(vmemmap); unsigned long __pte_frag_nr; From 5f18cbdbdd42b050c51eb9859f8ce43db3f51846 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Thu, 2 May 2019 17:39:46 +1000 Subject: [PATCH 163/205] powerpc/mm/ptdump: Wrap seq_printf() to handle NULL pointers Lovingly borrowed from the arch/arm64 ptdump code. This doesn't seem to be an issue in practice, but is necessary for my upcoming commit. Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/mm/ptdump/ptdump.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 48135ba6fa74d..e249a56c07bfb 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -108,6 +108,18 @@ static struct addr_marker address_markers[] = { { -1, NULL }, }; +#define pt_dump_seq_printf(m, fmt, args...) \ +({ \ + if (m) \ + seq_printf(m, fmt, ##args); \ +}) + +#define pt_dump_seq_putc(m, c) \ +({ \ + if (m) \ + seq_putc(m, c); \ +}) + static void dump_flag_info(struct pg_state *st, const struct flag_info *flag, u64 pte, int num) { @@ -125,19 +137,19 @@ static void dump_flag_info(struct pg_state *st, const struct flag_info val = pte & flag->val; if (flag->shift) val = val >> flag->shift; - seq_printf(st->seq, " %s:%llx", flag->set, val); + pt_dump_seq_printf(st->seq, " %s:%llx", flag->set, val); } else { if ((pte & flag->mask) == flag->val) s = flag->set; else s = flag->clear; if (s) - seq_printf(st->seq, " %s", s); + pt_dump_seq_printf(st->seq, " %s", s); } st->current_flags &= ~flag->mask; } if (st->current_flags != 0) - seq_printf(st->seq, " unknown flags:%llx", st->current_flags); + pt_dump_seq_printf(st->seq, " unknown flags:%llx", st->current_flags); } static void dump_addr(struct pg_state *st, unsigned long addr) @@ -152,12 +164,12 @@ static void dump_addr(struct pg_state *st, unsigned long addr) #define REG "0x%08lx" #endif - seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1); + pt_dump_seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1); if (st->start_pa == st->last_pa && st->start_address + PAGE_SIZE != addr) { - seq_printf(st->seq, "[" REG "]", st->start_pa); + pt_dump_seq_printf(st->seq, "[" REG "]", st->start_pa); delta = PAGE_SIZE >> 10; } else { - seq_printf(st->seq, " " REG " ", st->start_pa); + pt_dump_seq_printf(st->seq, " " REG " ", st->start_pa); delta = (addr - st->start_address) >> 10; } /* Work out what appropriate unit to use */ @@ -165,7 +177,7 @@ static void dump_addr(struct pg_state *st, unsigned long addr) delta >>= 10; unit++; } - seq_printf(st->seq, "%9lu%c", delta, *unit); + pt_dump_seq_printf(st->seq, "%9lu%c", delta, *unit); } @@ -182,7 +194,7 @@ static void note_page(struct pg_state *st, unsigned long addr, st->start_address = addr; st->start_pa = pa; st->last_pa = pa; - seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + 
pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); /* * Dump the section of virtual memory when: * - the PTE flags from one entry to the next differs. @@ -206,7 +218,7 @@ static void note_page(struct pg_state *st, unsigned long addr, st->current_flags, pg_level[st->level].num); - seq_putc(st->seq, '\n'); + pt_dump_seq_putc(st->seq, '\n'); } /* @@ -215,7 +227,7 @@ static void note_page(struct pg_state *st, unsigned long addr, */ while (addr >= st->marker[1].start_address) { st->marker++; - seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); } st->start_address = addr; st->start_pa = pa; From 453d87f6a8aed827f5ebb1708a4cea458fd68d23 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Thu, 2 May 2019 17:39:47 +1000 Subject: [PATCH 164/205] powerpc/mm: Warn if W+X pages found on boot Implement code to walk all pages and warn if any are found to be both writable and executable. Depends on STRICT_KERNEL_RWX enabled, and is behind the DEBUG_WX config option. This only runs on boot and has no runtime performance implications. Very heavily influenced (and in some cases copied verbatim) from the ARM64 code written by Laura Abbott (thanks!), since our ptdump infrastructure is similar. Signed-off-by: Russell Currey [mpe: Fixup build error when disabled] Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig.debug | 19 +++++++++++++ arch/powerpc/include/asm/pgtable.h | 6 +++++ arch/powerpc/mm/pgtable_32.c | 3 +++ arch/powerpc/mm/pgtable_64.c | 3 +++ arch/powerpc/mm/ptdump/ptdump.c | 43 +++++++++++++++++++++++++++++- 5 files changed, 73 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 61febbbdd02b3..e9ae650c8e938 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -361,6 +361,25 @@ config PPC_PTDUMP If you are unsure, say N. +config PPC_DEBUG_WX + bool "Warn on W+X mappings at boot" + depends on PPC_PTDUMP + help + Generate a warning if any W+X mappings are found at boot. + + This is useful for discovering cases where the kernel is leaving + W+X mappings after applying NX, as such mappings are a security risk. + + Note that even if the check fails, your kernel is possibly + still fine, as W+X mappings are not a security hole in + themselves, what they do is that they make the exploitation + of other unfixed kernel bugs easier. + + There is no runtime or memory usage effect of this option + once the kernel has booted up - it's a one time check. + + If in doubt, say "Y". + config PPC_FAST_ENDIAN_SWITCH bool "Deprecated fast endian-switch syscall" depends on DEBUG_KERNEL && PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index c51846da41a77..3f53be60fb014 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -105,6 +105,12 @@ void mark_initmem_nx(void); static inline void mark_initmem_nx(void) { } #endif +#ifdef CONFIG_PPC_DEBUG_WX +void ptdump_check_wx(void); +#else +static inline void ptdump_check_wx(void) { } +#endif + /* * When used, PTE_FRAG_NR is defined in subarch pgtable.h * so we are sure it is included when arriving here. 
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 2e67f9a1430b3..16ada373b32b5 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -396,6 +396,9 @@ void mark_rodata_ro(void) PFN_DOWN((unsigned long)__start_rodata); change_page_attr(page, numpages, PAGE_KERNEL_RO); + + // mark_initmem_nx() should have already run by now + ptdump_check_wx(); } #endif diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 4c6a73782b19f..d2d976ff8a0e2 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -332,6 +332,9 @@ void mark_rodata_ro(void) radix__mark_rodata_ro(); else hash__mark_rodata_ro(); + + // mark_initmem_nx() should have already run by now + ptdump_check_wx(); } void mark_initmem_nx(void) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index e249a56c07bfb..646876d9da640 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -31,7 +31,7 @@ #include "ptdump.h" #ifdef CONFIG_PPC32 -#define KERN_VIRT_START 0 +#define KERN_VIRT_START PAGE_OFFSET #endif /* @@ -68,6 +68,8 @@ struct pg_state { unsigned long last_pa; unsigned int level; u64 current_flags; + bool check_wx; + unsigned long wx_pages; }; struct addr_marker { @@ -181,6 +183,20 @@ static void dump_addr(struct pg_state *st, unsigned long addr) } +static void note_prot_wx(struct pg_state *st, unsigned long addr) +{ + if (!st->check_wx) + return; + + if (!((st->current_flags & pgprot_val(PAGE_KERNEL_X)) == pgprot_val(PAGE_KERNEL_X))) + return; + + WARN_ONCE(1, "powerpc/mm: Found insecure W+X mapping at address %p/%pS\n", + (void *)st->start_address, (void *)st->start_address); + + st->wx_pages += (addr - st->start_address) / PAGE_SIZE; +} + static void note_page(struct pg_state *st, unsigned long addr, unsigned int level, u64 val) { @@ -210,6 +226,7 @@ static void note_page(struct pg_state *st, unsigned long addr, /* Check the PTE flags */ if (st->current_flags) { + note_prot_wx(st, addr); dump_addr(st, addr); /* Dump all the flags */ @@ -387,6 +404,30 @@ static void build_pgtable_complete_mask(void) pg_level[i].mask |= pg_level[i].flag[j].mask; } +#ifdef CONFIG_PPC_DEBUG_WX +void ptdump_check_wx(void) +{ + struct pg_state st = { + .seq = NULL, + .marker = address_markers, + .check_wx = true, + }; + + if (radix_enabled()) + st.start_address = PAGE_OFFSET; + else + st.start_address = KERN_VIRT_START; + + walk_pagetables(&st); + + if (st.wx_pages) + pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", + st.wx_pages); + else + pr_info("Checked W+X mappings: passed, no W+X pages found\n"); +} +#endif + static int ptdump_init(void) { struct dentry *debugfs_file; From 398af571128fe75f07343f929975b26d57eafd18 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 9 Apr 2019 23:14:20 +1000 Subject: [PATCH 165/205] powerpc/security: Show powerpc_security_features in debugfs This can be helpful for debugging problems with the security feature flags, especially on guests where the flags come from the hypervisor via an hcall and so can't be observed in the device tree. 
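The helper used below creates a read-only debugfs file that prints a 64-bit value in hex, and the same pattern works for any flag word (a minimal sketch with a hypothetical variable):

static u64 my_feature_flags;	/* hypothetical flag word */

static __init int my_flags_debugfs_init(void)
{
	/* 0400: root-readable only; appears under the powerpc debugfs root */
	debugfs_create_x64("my_feature_flags", 0400, powerpc_debugfs_root,
			   &my_feature_flags);
	return 0;
}
device_initcall(my_flags_debugfs_init);

Reading the resulting file returns the current mask formatted as hex, which can then be decoded against the corresponding bit definitions.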
Signed-off-by: Michael Ellerman Reviewed-by: Joel Stanley Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/security.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index b33bafb8fcea1..d6ba696d0ed0b 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -104,6 +104,14 @@ static __init int barrier_nospec_debugfs_init(void) return 0; } device_initcall(barrier_nospec_debugfs_init); + +static __init int security_feature_debugfs_init(void) +{ + debugfs_create_x64("security_features", 0400, powerpc_debugfs_root, + (u64 *)&powerpc_security_features); + return 0; +} +device_initcall(security_feature_debugfs_init); #endif /* CONFIG_DEBUG_FS */ #ifdef CONFIG_PPC_FSL_BOOK3E From d7fbe2a0439ce6f20917a65990a78c9e747aad34 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 2 Apr 2019 09:08:38 +0000 Subject: [PATCH 166/205] powerpc/prom_init: get rid of PROM_SCRATCH_SIZE PROM_SCRATCH_SIZE is the same as sizeof(prom_scratch). Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom_init.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index d3b0d543d9243..523bb99d7676a 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -154,10 +154,8 @@ static struct prom_t __prombss prom; static unsigned long __prombss prom_entry; -#define PROM_SCRATCH_SIZE 256 - static char __prombss of_stdout_device[256]; -static char __prombss prom_scratch[PROM_SCRATCH_SIZE]; +static char __prombss prom_scratch[256]; static unsigned long __prombss dt_header_start; static unsigned long __prombss dt_struct_start, dt_struct_end; @@ -1619,8 +1617,8 @@ static void __init prom_init_mem(void) endp = p + (plen / sizeof(cell_t)); #ifdef DEBUG_PROM - memset(path, 0, PROM_SCRATCH_SIZE); - call_prom("package-to-path", 3, 1, node, path, PROM_SCRATCH_SIZE-1); + memset(path, 0, sizeof(prom_scratch)); + call_prom("package-to-path", 3, 1, node, path, sizeof(prom_scratch) - 1); prom_debug(" node %s :\n", path); #endif /* DEBUG_PROM */ @@ -1928,10 +1926,10 @@ static void __init prom_initialize_tce_table(void) local_alloc_bottom = base; /* It seems OF doesn't null-terminate the path :-( */ - memset(path, 0, PROM_SCRATCH_SIZE); + memset(path, 0, sizeof(prom_scratch)); /* Call OF to setup the TCE hardware */ if (call_prom("package-to-path", 3, 1, node, - path, PROM_SCRATCH_SIZE-1) == PROM_ERROR) { + path, sizeof(prom_scratch) - 1) == PROM_ERROR) { prom_printf("package-to-path failed\n"); } @@ -2292,14 +2290,14 @@ static void __init prom_check_displays(void) /* It seems OF doesn't null-terminate the path :-( */ path = prom_scratch; - memset(path, 0, PROM_SCRATCH_SIZE); + memset(path, 0, sizeof(prom_scratch)); /* * leave some room at the end of the path for appending extra * arguments */ if (call_prom("package-to-path", 3, 1, node, path, - PROM_SCRATCH_SIZE-10) == PROM_ERROR) + sizeof(prom_scratch) - 10) == PROM_ERROR) continue; prom_printf("found display : %s, opening...
", path); @@ -2495,8 +2493,8 @@ static void __init scan_dt_build_struct(phandle node, unsigned long *mem_start, /* get it again for debugging */ path = prom_scratch; - memset(path, 0, PROM_SCRATCH_SIZE); - call_prom("package-to-path", 3, 1, node, path, PROM_SCRATCH_SIZE-1); + memset(path, 0, sizeof(prom_scratch)); + call_prom("package-to-path", 3, 1, node, path, sizeof(prom_scratch) - 1); /* get and store all properties */ prev_name = ""; From 32eebf96669568014b79b83a03f7895f3ec8c95f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Horia=20Geant=C4=83?= Date: Wed, 20 Mar 2019 14:55:16 +0200 Subject: [PATCH 167/205] powerpc/dts/fsl: add crypto node alias for B4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit crypto node alias is needed by U-boot to identify the node and perform fix-ups, like adding "fsl,sec-era" property. Signed-off-by: Horia Geantă Signed-off-by: Michael Ellerman --- arch/powerpc/boot/dts/fsl/b4qds.dtsi | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/boot/dts/fsl/b4qds.dtsi b/arch/powerpc/boot/dts/fsl/b4qds.dtsi index 999efd3bc1674..05be919f35457 100644 --- a/arch/powerpc/boot/dts/fsl/b4qds.dtsi +++ b/arch/powerpc/boot/dts/fsl/b4qds.dtsi @@ -40,6 +40,7 @@ interrupt-parent = <&mpic>; aliases { + crypto = &crypto; phy_sgmii_10 = &phy_sgmii_10; phy_sgmii_11 = &phy_sgmii_11; phy_sgmii_1c = &phy_sgmii_1c; From 90437bffa5f9b1440ba03e023f4875d1814b9360 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 11 Mar 2019 22:47:46 +0000 Subject: [PATCH 168/205] powerpc/entry: Remove unneeded need_resched() loop Since the enabling and disabling of IRQs within preempt_schedule_irq() is contained in a need_resched() loop, we don't need the outer arch code loop. Signed-off-by: Valentin Schneider [mpe: Rebase since CURRENT_THREAD_INFO() removal] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_32.S | 5 +---- arch/powerpc/kernel/entry_64.S | 8 +------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 235a01d34b6d0..c18f3490a77ec 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -906,10 +906,7 @@ resume_kernel: */ bl trace_hardirqs_off #endif -1: bl preempt_schedule_irq - lwz r3,TI_FLAGS(r2) - andi. r0,r3,_TIF_NEED_RESCHED - bne- 1b + bl preempt_schedule_irq #ifdef CONFIG_TRACE_IRQFLAGS /* And now, to properly rebalance the above, we tell lockdep they * are being turned back on, which will happen when we return diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 7cc25389c6bd9..d978af78bf2af 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -865,13 +865,7 @@ resume_kernel: * sure we are soft-disabled first and reconcile irq state. */ RECONCILE_IRQ_STATE(r3,r4) -1: bl preempt_schedule_irq - - /* Re-test flags and eventually loop */ - ld r9, PACA_THREAD_INFO(r13) - ld r4,TI_FLAGS(r9) - andi. 
r0,r4,_TIF_NEED_RESCHED - bne 1b + bl preempt_schedule_irq /* * arch_local_irq_restore() from preempt_schedule_irq above may From 5d085ec04a000fefb5182d3b03ee46ca96d8389b Mon Sep 17 00:00:00 2001 From: Bo YU Date: Tue, 30 Oct 2018 09:21:55 -0400 Subject: [PATCH 169/205] powerpc/boot: Fix missing check of lseek() return value This is detected by Coverity scan: CID: 1440481 Signed-off-by: Bo YU Signed-off-by: Michael Ellerman --- arch/powerpc/boot/addnote.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/boot/addnote.c b/arch/powerpc/boot/addnote.c index 9d9f6f334d3cc..3da3e2b1b51bc 100644 --- a/arch/powerpc/boot/addnote.c +++ b/arch/powerpc/boot/addnote.c @@ -223,7 +223,11 @@ main(int ac, char **av) PUT_16(E_PHNUM, np + 2); /* write back */ - lseek(fd, (long) 0, SEEK_SET); + i = lseek(fd, (long) 0, SEEK_SET); + if (i < 0) { + perror("lseek"); + exit(1); + } i = write(fd, buf, n); if (i < 0) { perror("write"); From 0acb5f64560a052fd66ab37b212a72964847160f Mon Sep 17 00:00:00 2001 From: "Christopher M. Riedl" Date: Mon, 15 Apr 2019 22:26:38 -0500 Subject: [PATCH 170/205] powerpc/xmon: add read-only mode Operations which write to memory and special purpose registers should be restricted on systems with integrity guarantees (such as Secure Boot) and, optionally, to avoid self-destructive behaviors. Add a config option, XMON_DEFAULT_RO_MODE, to set default xmon behavior. The kernel cmdline options xmon=ro and xmon=rw override this default. The following xmon operations are affected: memops: disable memmove disable memset disable memzcan memex: no-op'd mwrite super_regs: no-op'd write_spr bpt_cmds: disable proc_call: disable Signed-off-by: Christopher M. Riedl Reviewed-by: Oliver O'Halloran Reviewed-by: Andrew Donnellan Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig.debug | 8 ++++++++ arch/powerpc/xmon/xmon.c | 42 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index e9ae650c8e938..c59920920ddc4 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -117,6 +117,14 @@ config XMON_DISASSEMBLY to say Y here, unless you're building for a memory-constrained system. +config XMON_DEFAULT_RO_MODE + bool "Restrict xmon to read-only operations by default" + depends on XMON + default y + help + Operate xmon in read-only mode. The cmdline options 'xmon=rw' and + 'xmon=ro' override this default. 
+ config DEBUGGER bool depends on KGDB || XMON diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index e583ed3f6b93a..3e7be19aa2083 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -80,6 +80,7 @@ static int set_indicator_token = RTAS_UNKNOWN_SERVICE; #endif static unsigned long in_xmon __read_mostly = 0; static int xmon_on = IS_ENABLED(CONFIG_XMON_DEFAULT); +static bool xmon_is_ro = IS_ENABLED(CONFIG_XMON_DEFAULT_RO_MODE); static unsigned long adrs; static int size = 1; @@ -202,6 +203,8 @@ static void dump_tlb_book3e(void); #define GETWORD(v) (((v)[0] << 24) + ((v)[1] << 16) + ((v)[2] << 8) + (v)[3]) #endif +static const char *xmon_ro_msg = "Operation disabled: xmon in read-only mode\n"; + static char *help_string = "\ Commands:\n\ b show breakpoints\n\ @@ -989,6 +992,10 @@ cmds(struct pt_regs *excp) memlocate(); break; case 'z': + if (xmon_is_ro) { + printf(xmon_ro_msg); + break; + } memzcan(); break; case 'i': @@ -1042,6 +1049,10 @@ cmds(struct pt_regs *excp) set_lpp_cmd(); break; case 'b': + if (xmon_is_ro) { + printf(xmon_ro_msg); + break; + } bpt_cmds(); break; case 'C': @@ -1055,6 +1066,10 @@ cmds(struct pt_regs *excp) bootcmds(); break; case 'p': + if (xmon_is_ro) { + printf(xmon_ro_msg); + break; + } proccall(); break; case 'P': @@ -1777,6 +1792,11 @@ read_spr(int n, unsigned long *vp) static void write_spr(int n, unsigned long val) { + if (xmon_is_ro) { + printf(xmon_ro_msg); + return; + } + if (setjmp(bus_error_jmp) == 0) { catch_spr_faults = 1; sync(); @@ -2016,6 +2036,12 @@ mwrite(unsigned long adrs, void *buf, int size) char *p, *q; n = 0; + + if (xmon_is_ro) { + printf(xmon_ro_msg); + return n; + } + if (setjmp(bus_error_jmp) == 0) { catch_memory_errors = 1; sync(); @@ -2880,9 +2906,17 @@ memops(int cmd) scanhex((void *)&mcount); switch( cmd ){ case 'm': + if (xmon_is_ro) { + printf(xmon_ro_msg); + break; + } memmove((void *)mdest, (void *)msrc, mcount); break; case 's': + if (xmon_is_ro) { + printf(xmon_ro_msg); + break; + } memset((void *)mdest, mval, mcount); break; case 'd': @@ -3792,6 +3826,14 @@ static int __init early_parse_xmon(char *p) } else if (strncmp(p, "on", 2) == 0) { xmon_init(1); xmon_on = 1; + } else if (strncmp(p, "rw", 2) == 0) { + xmon_init(1); + xmon_on = 1; + xmon_is_ro = false; + } else if (strncmp(p, "ro", 2) == 0) { + xmon_init(1); + xmon_on = 1; + xmon_is_ro = true; } else if (strncmp(p, "off", 3) == 0) xmon_on = 0; else From de269129a48a2d590ba1d20c719e19d86e3ddb3f Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Tue, 5 Mar 2019 01:12:19 +0530 Subject: [PATCH 171/205] powerpc/hmi: Fix kernel hang when TB is in error state. On TOD/TB errors the timebase register stops/freezes until HMI error recovery gets TOD/TB back into a running state. On successful recovery, TB starts running again and udelay(), which relies on the TB value, continues to function properly. But when HMI fails to recover from TOD/TB errors, the TB register stays frozen. With TB not running, the __delay() function keeps looping and never returns. If __delay() is called while in the panic path, the system hangs and never reboots after panic.
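The hang follows directly from the shape of the delay loop, simplified here from the existing __delay() implementation:

	start = get_tbl();
	while (get_tbl() - start < loops)
		;	/* never terminates once the timebase stops ticking */

hence the tb_invalid flag introduced below: the HMI handler sets it when recovery has given up on the timebase, and __delay() then returns immediately instead of spinning.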
Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/opal-api.h | 10 ++++++++++ arch/powerpc/include/asm/opal.h | 2 ++ arch/powerpc/include/asm/time.h | 2 ++ arch/powerpc/kernel/time.c | 9 +++++++++ arch/powerpc/platforms/powernv/opal-call.c | 1 + arch/powerpc/platforms/powernv/opal.c | 21 +++++++++++++++++++++ arch/powerpc/platforms/powernv/setup.c | 5 ++++- 7 files changed, 49 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index e1d118ac61dc8..234fde15b37cd 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -209,6 +209,7 @@ #define OPAL_SENSOR_GROUP_ENABLE 163 #define OPAL_PCI_GET_PBCQ_TUNNEL_BAR 164 #define OPAL_PCI_SET_PBCQ_TUNNEL_BAR 165 +#define OPAL_HANDLE_HMI2 166 #define OPAL_NX_COPROC_INIT 167 #define OPAL_XIVE_GET_VP_STATE 170 #define OPAL_LAST 170 @@ -635,6 +636,15 @@ struct OpalHMIEvent { } u; }; +/* OPAL_HANDLE_HMI2 out_flags */ +enum { + OPAL_HMI_FLAGS_TB_RESYNC = (1ull << 0), /* Timebase has been resynced */ + OPAL_HMI_FLAGS_DEC_LOST = (1ull << 1), /* DEC lost, needs to be reprogrammed */ + OPAL_HMI_FLAGS_HDEC_LOST = (1ull << 2), /* HDEC lost, needs to be reprogrammed */ + OPAL_HMI_FLAGS_TOD_TB_FAIL = (1ull << 3), /* TOD/TB recovery failed. */ + OPAL_HMI_FLAGS_NEW_EVENT = (1ull << 63), /* An event has been created */ +}; + enum { OPAL_P7IOC_DIAG_TYPE_NONE = 0, OPAL_P7IOC_DIAG_TYPE_RGC = 1, diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 4e978d4dea5ce..4cc37e708bc73 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -203,6 +203,7 @@ int64_t opal_set_param(uint64_t token, uint32_t param_id, uint64_t buffer, int64_t opal_sensor_read(uint32_t sensor_hndl, int token, __be32 *sensor_data); int64_t opal_sensor_read_u64(u32 sensor_hndl, int token, __be64 *sensor_data); int64_t opal_handle_hmi(void); +int64_t opal_handle_hmi2(__be64 *out_flags); int64_t opal_register_dump_region(uint32_t id, uint64_t start, uint64_t end); int64_t opal_unregister_dump_region(uint32_t id); int64_t opal_slw_set_reg(uint64_t cpu_pir, uint64_t sprn, uint64_t val); @@ -359,6 +360,7 @@ int opal_power_control_init(void); extern int opal_machine_check(struct pt_regs *regs); extern bool opal_mce_check_early_recovery(struct pt_regs *regs); extern int opal_hmi_exception_early(struct pt_regs *regs); +extern int opal_hmi_exception_early2(struct pt_regs *regs); extern int opal_handle_hmi_exception(struct pt_regs *regs); extern void opal_shutdown(void); diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 54bf7e68a7e19..57e968413d1e0 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -36,6 +36,8 @@ extern unsigned long ppc_proc_freq; extern unsigned long ppc_tb_freq; #define DEFAULT_TB_FREQ 125000000UL +extern bool tb_invalid; + struct div_result { u64 result_high; u64 result_low; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 6ef32472ee1d9..325d60633dfa4 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -150,6 +150,8 @@ EXPORT_SYMBOL_GPL(ppc_proc_freq); unsigned long ppc_tb_freq; EXPORT_SYMBOL_GPL(ppc_tb_freq); +bool tb_invalid; + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE /* * Factor for converting from cputime_t (timebase ticks) to @@ -459,6 +461,13 @@ void __delay(unsigned long loops) diff += 1000000000; spin_cpu_relax(); } while (diff < loops); + } else if (tb_invalid) { + 
/* + * TB is in error state and isn't ticking anymore. + * HMI handler was unable to recover from TB error. + * Return immediately, so that kernel won't get stuck here. + */ + spin_cpu_relax(); } else { start = get_tbl(); while (get_tbl() - start < loops) diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c index 7cba0d5da3ff7..36c8fa3647a2f 100644 --- a/arch/powerpc/platforms/powernv/opal-call.c +++ b/arch/powerpc/platforms/powernv/opal-call.c @@ -220,6 +220,7 @@ OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ); OPAL_CALL(opal_get_param, OPAL_GET_PARAM); OPAL_CALL(opal_set_param, OPAL_SET_PARAM); OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI); +OPAL_CALL(opal_handle_hmi2, OPAL_HANDLE_HMI2); OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE); OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG); OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 737c51d634801..f2b063b027f0a 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -614,6 +614,27 @@ int opal_hmi_exception_early(struct pt_regs *regs) return 0; } +int opal_hmi_exception_early2(struct pt_regs *regs) +{ + s64 rc; + __be64 out_flags; + + /* + * call opal hmi handler. + * Check 64-bit flag mask to find out if an event was generated, + * and whether TB is still valid or not etc. + */ + rc = opal_handle_hmi2(&out_flags); + if (rc != OPAL_SUCCESS) + return 0; + + if (be64_to_cpu(out_flags) & OPAL_HMI_FLAGS_NEW_EVENT) + local_paca->hmi_event_available = 1; + if (be64_to_cpu(out_flags) & OPAL_HMI_FLAGS_TOD_TB_FAIL) + tb_invalid = true; + return 1; +} + /* HMI exception handler called in virtual mode during check_irq_replay. */ int opal_handle_hmi_exception(struct pt_regs *regs) { diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 14befee4b3f1e..3cf40f689aace 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -401,7 +401,10 @@ static void __init pnv_setup_machdep_opal(void) /* ppc_md.system_reset_exception gets filled in by pnv_smp_init() */ ppc_md.machine_check_exception = opal_machine_check; ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery; - ppc_md.hmi_exception_early = opal_hmi_exception_early; + if (opal_check_token(OPAL_HANDLE_HMI2)) + ppc_md.hmi_exception_early = opal_hmi_exception_early2; + else + ppc_md.hmi_exception_early = opal_hmi_exception_early; ppc_md.handle_hmi_exception = opal_handle_hmi_exception; } From e1619e89c96c596d72e66f15a0794f5001c8576e Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Wed, 3 Apr 2019 11:19:26 +1030 Subject: [PATCH 172/205] powerpc/configs: Add (back) MLX5 ethernet support to skiroot_defconfig It turns out that some defconfig changes and kernel config option changes meant we accidentally dropped Ethernet support for Mellanox CX5 (ConnectX-5) cards.
Fixes: cbc39809a398 ("powerpc/configs: Update skiroot defconfig") Reported-by: Carol L Soto Suggested-by: Carol L Soto Signed-off-by: Stewart Smith Signed-off-by: Joel Stanley Signed-off-by: Michael Ellerman --- arch/powerpc/configs/skiroot_defconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index 5ba131c30f6bc..6038b9347d9e7 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -163,6 +163,8 @@ CONFIG_S2IO=m CONFIG_MLX4_EN=m # CONFIG_MLX4_CORE_GEN2 is not set CONFIG_MLX5_CORE=m +CONFIG_MLX5_CORE_EN=y +# CONFIG_MLX5_EN_RXNFC is not set # CONFIG_NET_VENDOR_MICREL is not set # CONFIG_NET_VENDOR_MICROSEMI is not set CONFIG_MYRI10GE=m From 29b861ea7742e571c1940366944eabc24d612e98 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Mon, 1 Apr 2019 16:41:56 +1030 Subject: [PATCH 173/205] Documentation: powerpc: Expand the DAWR acronym Those of us not drowning in POWER might not know what this means. Signed-off-by: Joel Stanley Acked-by: Michael Neuling Signed-off-by: Michael Ellerman --- Documentation/powerpc/DAWR-POWER9.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/powerpc/DAWR-POWER9.txt b/Documentation/powerpc/DAWR-POWER9.txt index bdec03650941e..ecdbb076438c4 100644 --- a/Documentation/powerpc/DAWR-POWER9.txt +++ b/Documentation/powerpc/DAWR-POWER9.txt @@ -1,10 +1,10 @@ DAWR issues on POWER9 ============================ -On POWER9 the DAWR can cause a checkstop if it points to cache -inhibited (CI) memory. Currently Linux has no way to disinguish CI -memory when configuring the DAWR, so (for now) the DAWR is disabled by -this commit: +On POWER9 the Data Address Watchpoint Register (DAWR) can cause a checkstop +if it points to cache inhibited (CI) memory. Currently Linux has no way to +disinguish CI memory when configuring the DAWR, so (for now) the DAWR is +disabled by this commit: commit 9654153158d3e0684a1bdb76dbababdb7111d5a0 Author: Michael Neuling From 1e496391a8452101308a23b7395cdd4983b6e5bd Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 30 Mar 2017 03:19:25 -0700 Subject: [PATCH 174/205] powerpc/powernv/ioda2: Add __printf format/argument verification Fix fallout too.
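For readers who haven't met the attribute: __printf(n, m) marks argument n as a printf-style format string whose parameters begin at argument m, so the compiler can type-check every call site with -Wformat. A condensed view of the annotation this patch adds, with an illustrative warning case (the "fallout" is exactly the %ld vs %lld mismatches converted in the diff below):

	/* Declaration gains the attribute, so format/argument mismatches
	 * become compile-time warnings at each pe_err/pe_warn call site.
	 */
	__printf(3, 4)
	extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
				    const char *fmt, ...);

	s64 rc = -1;
	pe_warn(pe, "OPAL error %ld\n", rc);	/* now warns: rc is 64-bit, use %lld */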
Signed-off-by: Joe Perches Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 31 ++++++++++++----------- arch/powerpc/platforms/powernv/pci.h | 2 ++ 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 9a9076a5686cd..126602b4e3997 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -847,11 +847,11 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number, pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); if (rc) - pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc); + pe_warn(pe, "OPAL error %lld remove self from PELTV\n", rc); rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, bcomp, dcomp, fcomp, OPAL_UNMAP_PE); if (rc) - pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); + pe_err(pe, "OPAL error %lld trying to setup PELT table\n", rc); pe->pbus = NULL; pe->pdev = NULL; @@ -1174,11 +1174,12 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) pe->rid = bus->busn_res.start << 8; if (all) - pe_info(pe, "Secondary bus %d..%d associated with PE#%x\n", - bus->busn_res.start, bus->busn_res.end, pe->pe_number); + pe_info(pe, "Secondary bus %pad..%pad associated with PE#%x\n", + &bus->busn_res.start, &bus->busn_res.end, + pe->pe_number); else - pe_info(pe, "Secondary bus %d associated with PE#%x\n", - bus->busn_res.start, pe->pe_number); + pe_info(pe, "Secondary bus %pad associated with PE#%x\n", + &bus->busn_res.start, pe->pe_number); if (pnv_ioda_configure_pe(phb, pe)) { /* XXX What do we do here ? */ @@ -1448,7 +1449,7 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe tbl = pe->table_group.tables[0]; rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0); if (rc) - pe_warn(pe, "OPAL error %ld release DMA window\n", rc); + pe_warn(pe, "OPAL error %lld release DMA window\n", rc); pnv_pci_ioda2_set_bypass(pe, false); if (pe->table_group.group) { @@ -2286,8 +2287,8 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, __pa(addr) + tce32_segsz * i, tce32_segsz, IOMMU_PAGE_SIZE_4K); if (rc) { - pe_err(pe, " Failed to configure 32-bit TCE table," - " err %ld\n", rc); + pe_err(pe, " Failed to configure 32-bit TCE table, err %lld\n", + rc); goto fail; } } @@ -2332,9 +2333,9 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, const __u64 start_addr = tbl->it_offset << tbl->it_page_shift; const __u64 win_size = tbl->it_size << tbl->it_page_shift; - pe_info(pe, "Setting up window#%d %llx..%llx pg=%x\n", num, - start_addr, start_addr + win_size - 1, - IOMMU_PAGE_SIZE(tbl)); + pe_info(pe, "Setting up window#%d %llx..%llx pg=%lx\n", + num, start_addr, start_addr + win_size - 1, + IOMMU_PAGE_SIZE(tbl)); /* * Map TCE table through TVT. 
The TVE index is the PE number @@ -2348,7 +2349,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, size << 3, IOMMU_PAGE_SIZE(tbl)); if (rc) { - pe_err(pe, "Failed to configure TCE table, err %ld\n", rc); + pe_err(pe, "Failed to configure TCE table, err %lld\n", rc); return rc; } @@ -3450,7 +3451,7 @@ static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe) #ifdef CONFIG_IOMMU_API rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0); if (rc) - pe_warn(pe, "OPAL error %ld release DMA window\n", rc); + pe_warn(pe, "OPAL error %lld release DMA window\n", rc); #endif pnv_pci_ioda2_set_bypass(pe, false); @@ -3484,7 +3485,7 @@ static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe, phb->ioda.reserved_pe_idx, win, 0, idx); if (rc != OPAL_SUCCESS) - pe_warn(pe, "Error %ld unmapping (%d) segment#%d\n", + pe_warn(pe, "Error %lld unmapping (%d) segment#%d\n", rc, win, idx); map[idx] = IODA_INVALID_PE; diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 8e36da379252a..be26ab3d99e01 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -2,6 +2,7 @@ #ifndef __POWERNV_PCI_H #define __POWERNV_PCI_H +#include <linux/compiler.h> /* for __printf */ #include #include #include @@ -204,6 +205,7 @@ extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, __u64 window_size, __u32 levels); extern int pnv_eeh_post_init(void); +__printf(3, 4) extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...); #define pe_err(pe, fmt, ...) \ From 708597daf23486ea6f889ca29cc88389ca9a409a Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Thu, 4 Apr 2019 17:24:49 +0530 Subject: [PATCH 175/205] powerpc/perf: init pmu from core-book3s Currently the pmu driver file for each ppc64 generation processor has an __init call in itself. Refactor the code by moving the __init call to core-book3s.c. This also cleans up compat mode pmu driver registration.
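To make the new flow concrete: each per-generation init routine keeps its CPU-type guard but is no longer an initcall itself, e.g. (condensed from the power5 hunk below; the mismatch return value is a sketch):

	int init_power5_pmu(void)
	{
		if (!cur_cpu_spec->oprofile_cpu_type ||
		    strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5"))
			return -ENODEV;	/* not this CPU, caller tries the next driver */

		return register_power_pmu(&power5_pmu);
	}

The single early_initcall(init_ppc64_pmu) in core-book3s.c then probes each generation in turn and stops at the first driver that registers successfully.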
Suggested-by: Michael Ellerman Signed-off-by: Madhavan Srinivasan [mpe: Use SPDX tag for license] Signed-off-by: Michael Ellerman --- arch/powerpc/perf/core-book3s.c | 28 ++++++++++++++++++++++++++++ arch/powerpc/perf/internal.h | 11 +++++++++++ arch/powerpc/perf/power5+-pmu.c | 4 +--- arch/powerpc/perf/power5-pmu.c | 4 +--- arch/powerpc/perf/power6-pmu.c | 4 +--- arch/powerpc/perf/power7-pmu.c | 4 +--- arch/powerpc/perf/power8-pmu.c | 3 +-- arch/powerpc/perf/power9-pmu.c | 3 +-- arch/powerpc/perf/ppc970-pmu.c | 4 +--- 9 files changed, 46 insertions(+), 19 deletions(-) create mode 100644 arch/powerpc/perf/internal.h diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index b0723002a3967..a96f9420139ca 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -22,6 +22,10 @@ #include #include +#ifdef CONFIG_PPC64 +#include "internal.h" +#endif + #define BHRB_MAX_ENTRIES 32 #define BHRB_TARGET 0x0000000000000002 #define BHRB_PREDICTION 0x0000000000000001 @@ -2294,3 +2298,27 @@ int register_power_pmu(struct power_pmu *pmu) power_pmu_prepare_cpu, NULL); return 0; } + +#ifdef CONFIG_PPC64 +static int __init init_ppc64_pmu(void) +{ + /* run through all the pmu drivers one at a time */ + if (!init_power5_pmu()) + return 0; + else if (!init_power5p_pmu()) + return 0; + else if (!init_power6_pmu()) + return 0; + else if (!init_power7_pmu()) + return 0; + else if (!init_power8_pmu()) + return 0; + else if (!init_power9_pmu()) + return 0; + else if (!init_ppc970_pmu()) + return 0; + else + return -ENODEV; +} +early_initcall(init_ppc64_pmu); +#endif diff --git a/arch/powerpc/perf/internal.h b/arch/powerpc/perf/internal.h new file mode 100644 index 0000000000000..683f48117132f --- /dev/null +++ b/arch/powerpc/perf/internal.h @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Copyright 2019 Madhavan Srinivasan, IBM Corporation. 
+ +extern int init_ppc970_pmu(void); +extern int init_power5_pmu(void); +extern int init_power5p_pmu(void); +extern int init_power6_pmu(void); +extern int init_power7_pmu(void); +extern int init_power8_pmu(void); +extern int init_power9_pmu(void); diff --git a/arch/powerpc/perf/power5+-pmu.c b/arch/powerpc/perf/power5+-pmu.c index 0526dac66007f..9aa803504cb28 100644 --- a/arch/powerpc/perf/power5+-pmu.c +++ b/arch/powerpc/perf/power5+-pmu.c @@ -677,7 +677,7 @@ static struct power_pmu power5p_pmu = { .cache_events = &power5p_cache_events, }; -static int __init init_power5p_pmu(void) +int init_power5p_pmu(void) { if (!cur_cpu_spec->oprofile_cpu_type || (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5+") @@ -686,5 +686,3 @@ static int __init init_power5p_pmu(void) return register_power_pmu(&power5p_pmu); } - -early_initcall(init_power5p_pmu); diff --git a/arch/powerpc/perf/power5-pmu.c b/arch/powerpc/perf/power5-pmu.c index 4dc99f9f79625..30cb13d081a9b 100644 --- a/arch/powerpc/perf/power5-pmu.c +++ b/arch/powerpc/perf/power5-pmu.c @@ -618,7 +618,7 @@ static struct power_pmu power5_pmu = { .flags = PPMU_HAS_SSLOT, }; -static int __init init_power5_pmu(void) +int init_power5_pmu(void) { if (!cur_cpu_spec->oprofile_cpu_type || strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5")) @@ -626,5 +626,3 @@ static int __init init_power5_pmu(void) return register_power_pmu(&power5_pmu); } - -early_initcall(init_power5_pmu); diff --git a/arch/powerpc/perf/power6-pmu.c b/arch/powerpc/perf/power6-pmu.c index 9c9d646b68a1e..80ec48632cfe0 100644 --- a/arch/powerpc/perf/power6-pmu.c +++ b/arch/powerpc/perf/power6-pmu.c @@ -540,7 +540,7 @@ static struct power_pmu power6_pmu = { .cache_events = &power6_cache_events, }; -static int __init init_power6_pmu(void) +int init_power6_pmu(void) { if (!cur_cpu_spec->oprofile_cpu_type || strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power6")) @@ -548,5 +548,3 @@ static int __init init_power6_pmu(void) return register_power_pmu(&power6_pmu); } - -early_initcall(init_power6_pmu); diff --git a/arch/powerpc/perf/power7-pmu.c b/arch/powerpc/perf/power7-pmu.c index 6dbae9884ec43..bb6efd5d25309 100644 --- a/arch/powerpc/perf/power7-pmu.c +++ b/arch/powerpc/perf/power7-pmu.c @@ -445,7 +445,7 @@ static struct power_pmu power7_pmu = { .cache_events = &power7_cache_events, }; -static int __init init_power7_pmu(void) +int init_power7_pmu(void) { if (!cur_cpu_spec->oprofile_cpu_type || strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power7")) @@ -456,5 +456,3 @@ static int __init init_power7_pmu(void) return register_power_pmu(&power7_pmu); } - -early_initcall(init_power7_pmu); diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c index d12a2db263535..bcc3409a06de4 100644 --- a/arch/powerpc/perf/power8-pmu.c +++ b/arch/powerpc/perf/power8-pmu.c @@ -379,7 +379,7 @@ static struct power_pmu power8_pmu = { .bhrb_nr = 32, }; -static int __init init_power8_pmu(void) +int init_power8_pmu(void) { int rc; @@ -399,4 +399,3 @@ static int __init init_power8_pmu(void) return 0; } -early_initcall(init_power8_pmu); diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c index 030544e35959f..3a31ac6f48051 100644 --- a/arch/powerpc/perf/power9-pmu.c +++ b/arch/powerpc/perf/power9-pmu.c @@ -437,7 +437,7 @@ static struct power_pmu power9_pmu = { .bhrb_nr = 32, }; -static int __init init_power9_pmu(void) +int init_power9_pmu(void) { int rc = 0; unsigned int pvr = mfspr(SPRN_PVR); @@ -467,4 +467,3 @@ static int __init init_power9_pmu(void) return 0; } 
-early_initcall(init_power9_pmu); diff --git a/arch/powerpc/perf/ppc970-pmu.c b/arch/powerpc/perf/ppc970-pmu.c index 8b6a8a36fa381..1d33709140222 100644 --- a/arch/powerpc/perf/ppc970-pmu.c +++ b/arch/powerpc/perf/ppc970-pmu.c @@ -490,7 +490,7 @@ static struct power_pmu ppc970_pmu = { .flags = PPMU_NO_SIPR | PPMU_NO_CONT_SAMPLING, }; -static int __init init_ppc970_pmu(void) +int init_ppc970_pmu(void) { if (!cur_cpu_spec->oprofile_cpu_type || (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/970") @@ -499,5 +499,3 @@ static int __init init_ppc970_pmu(void) return register_power_pmu(&ppc970_pmu); } - -early_initcall(init_ppc970_pmu); From be80e758d0c2ec87eceac7676f08c761b4235869 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Thu, 4 Apr 2019 17:24:50 +0530 Subject: [PATCH 176/205] powerpc/perf: Add generic compat mode pmu driver Most of the power processor generation performance monitoring unit (PMU) driver code is bundled in the kernel, and one of those drivers is enabled/registered based on the oprofile_cpu_type check at boot. But things get a little tricky in the case of a "compat" mode boot. IBM POWER System Server based processors have a compatibility mode feature: simply put, an Nth generation processor (let's say POWER8) will act and appear in a mode consistent with an earlier generation (N-1) processor (that is, POWER7). In this "compat" mode boot, the kernel modifies "oprofile_cpu_type" to be the Nth generation (POWER8). If the Nth generation pmu driver is bundled (POWER8), it gets registered. The key dependency here is having distro support for the latest processor's performance monitoring. This patch adds a generic "compat-mode" performance monitoring driver to be registered in the absence of a powernv platform-specific pmu driver. The driver supports only "cycles" and "instruction" events: "0x0001e" is used as the event code for "cycles" and "0x00002" as the event code for "instruction" events. A new file, "generic-compat-pmu.c", is created to contain the driver-specific code, with the base raw event code format modeled on PPMU_ARCH_207S.
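As a hedged usage illustration (not part of the patch): because the compat driver registers as the core PMU, a raw event with code 0x0001e should count cycles from userspace. A minimal sketch, assuming the compat PMU is the one bound to PERF_TYPE_RAW on such a system:

	#include <linux/perf_event.h>
	#include <sys/ioctl.h>
	#include <sys/syscall.h>
	#include <string.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		struct perf_event_attr attr;
		long long count;
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_RAW;
		attr.config = 0x0001e;	/* PM_CYC on the compat PMU */
		attr.disabled = 1;

		fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
		if (fd < 0)
			return 1;

		ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
		/* ... workload under measurement ... */
		ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
		read(fd, &count, sizeof(count));
		printf("cycles: %lld\n", count);
		close(fd);
		return 0;
	}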
Signed-off-by: Madhavan Srinivasan [mpe: Use SPDX tag for license] Signed-off-by: Michael Ellerman --- arch/powerpc/perf/Makefile | 3 +- arch/powerpc/perf/core-book3s.c | 2 +- arch/powerpc/perf/generic-compat-pmu.c | 234 +++++++++++++++++++++++++ arch/powerpc/perf/internal.h | 1 + 4 files changed, 238 insertions(+), 2 deletions(-) create mode 100644 arch/powerpc/perf/generic-compat-pmu.c diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile index ab26df5bacb9a..c155dcbb8691c 100644 --- a/arch/powerpc/perf/Makefile +++ b/arch/powerpc/perf/Makefile @@ -5,7 +5,8 @@ obj-$(CONFIG_PERF_EVENTS) += callchain.o perf_regs.o obj-$(CONFIG_PPC_PERF_CTRS) += core-book3s.o bhrb.o obj64-$(CONFIG_PPC_PERF_CTRS) += ppc970-pmu.o power5-pmu.o \ power5+-pmu.o power6-pmu.o power7-pmu.o \ - isa207-common.o power8-pmu.o power9-pmu.o + isa207-common.o power8-pmu.o power9-pmu.o \ + generic-compat-pmu.o obj32-$(CONFIG_PPC_PERF_CTRS) += mpc7450-pmu.o obj-$(CONFIG_PPC_POWERNV) += imc-pmu.o diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index a96f9420139ca..a66fb9c01c9ec 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2318,7 +2318,7 @@ static int __init init_ppc64_pmu(void) else if (!init_ppc970_pmu()) return 0; else - return -ENODEV; + return init_generic_compat_pmu(); } early_initcall(init_ppc64_pmu); #endif diff --git a/arch/powerpc/perf/generic-compat-pmu.c b/arch/powerpc/perf/generic-compat-pmu.c new file mode 100644 index 0000000000000..5e5a54d5588e9 --- /dev/null +++ b/arch/powerpc/perf/generic-compat-pmu.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Copyright 2019 Madhavan Srinivasan, IBM Corporation. + +#define pr_fmt(fmt) "generic-compat-pmu: " fmt + +#include "isa207-common.h" + +/* + * Raw event encoding: + * + * 60 56 52 48 44 40 36 32 + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | + * + * 28 24 20 16 12 8 4 0 + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | + * [ pmc ] [unit ] [ ] m [ pmcxsel ] + * | | + * | *- mark + * | + * | + * *- combine + * + * Below uses IBM bit numbering. + * + * MMCR1[x:y] = unit (PMCxUNIT) + * MMCR1[24] = pmc1combine[0] + * MMCR1[25] = pmc1combine[1] + * MMCR1[26] = pmc2combine[0] + * MMCR1[27] = pmc2combine[1] + * MMCR1[28] = pmc3combine[0] + * MMCR1[29] = pmc3combine[1] + * MMCR1[30] = pmc4combine[0] + * MMCR1[31] = pmc4combine[1] + * + */ + +/* + * Some power9 event codes. 
+ */ +#define EVENT(_name, _code) _name = _code, + +enum { +EVENT(PM_CYC, 0x0001e) +EVENT(PM_INST_CMPL, 0x00002) +}; + +#undef EVENT + +GENERIC_EVENT_ATTR(cpu-cycles, PM_CYC); +GENERIC_EVENT_ATTR(instructions, PM_INST_CMPL); + +static struct attribute *generic_compat_events_attr[] = { + GENERIC_EVENT_PTR(PM_CYC), + GENERIC_EVENT_PTR(PM_INST_CMPL), + NULL +}; + +static struct attribute_group generic_compat_pmu_events_group = { + .name = "events", + .attrs = generic_compat_events_attr, +}; + +PMU_FORMAT_ATTR(event, "config:0-19"); +PMU_FORMAT_ATTR(pmcxsel, "config:0-7"); +PMU_FORMAT_ATTR(mark, "config:8"); +PMU_FORMAT_ATTR(combine, "config:10-11"); +PMU_FORMAT_ATTR(unit, "config:12-15"); +PMU_FORMAT_ATTR(pmc, "config:16-19"); + +static struct attribute *generic_compat_pmu_format_attr[] = { + &format_attr_event.attr, + &format_attr_pmcxsel.attr, + &format_attr_mark.attr, + &format_attr_combine.attr, + &format_attr_unit.attr, + &format_attr_pmc.attr, + NULL, +}; + +static struct attribute_group generic_compat_pmu_format_group = { + .name = "format", + .attrs = generic_compat_pmu_format_attr, +}; + +static const struct attribute_group *generic_compat_pmu_attr_groups[] = { + &generic_compat_pmu_format_group, + &generic_compat_pmu_events_group, + NULL, +}; + +static int compat_generic_events[] = { + [PERF_COUNT_HW_CPU_CYCLES] = PM_CYC, + [PERF_COUNT_HW_INSTRUCTIONS] = PM_INST_CMPL, +}; + +#define C(x) PERF_COUNT_HW_CACHE_##x + +/* + * Table of generalized cache-related events. + * 0 means not supported, -1 means nonsensical, other values + * are event codes. + */ +static int generic_compat_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; 
+ +#undef C + +static struct power_pmu generic_compat_pmu = { + .name = "GENERIC_COMPAT", + .n_counter = MAX_PMU_COUNTERS, + .add_fields = ISA207_ADD_FIELDS, + .test_adder = ISA207_TEST_ADDER, + .compute_mmcr = isa207_compute_mmcr, + .get_constraint = isa207_get_constraint, + .disable_pmc = isa207_disable_pmc, + .flags = PPMU_HAS_SIER | PPMU_ARCH_207S, + .n_generic = ARRAY_SIZE(compat_generic_events), + .generic_events = compat_generic_events, + .cache_events = &generic_compat_cache_events, + .attr_groups = generic_compat_pmu_attr_groups, +}; + +int init_generic_compat_pmu(void) +{ + int rc = 0; + + rc = register_power_pmu(&generic_compat_pmu); + if (rc) + return rc; + + /* Tell userspace that EBB is supported */ + cur_cpu_spec->cpu_user_features2 |= PPC_FEATURE2_EBB; + + return 0; +} diff --git a/arch/powerpc/perf/internal.h b/arch/powerpc/perf/internal.h index 683f48117132f..f755c64da1377 100644 --- a/arch/powerpc/perf/internal.h +++ b/arch/powerpc/perf/internal.h @@ -9,3 +9,4 @@ extern int init_power6_pmu(void); extern int init_power7_pmu(void); extern int init_power8_pmu(void); extern int init_power9_pmu(void); +extern int init_generic_compat_pmu(void); From 659a6e38db0b422c63fd68ca7e78a8daadca061e Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Mon, 1 Apr 2019 11:50:39 +0530 Subject: [PATCH 177/205] powerpc/perf: Remove PM_BR_CMPL_ALT from power9 event list PM_BR_CMPL_ALT event is not supported, remove it from the power9 event list. Fixes: 24bedcb7c811 ("powerpc/perf: Fix branch event code for power9") Signed-off-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/perf/power9-events-list.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/perf/power9-events-list.h b/arch/powerpc/perf/power9-events-list.h index 063c9d9f25162..6b1dc9a83edef 100644 --- a/arch/powerpc/perf/power9-events-list.h +++ b/arch/powerpc/perf/power9-events-list.h @@ -63,8 +63,6 @@ EVENT(PM_RUN_CYC_ALT, 0x200f4) /* Instruction Dispatched */ EVENT(PM_INST_DISP, 0x200f2) EVENT(PM_INST_DISP_ALT, 0x300f2) -/* Alternate Branch event code */ -EVENT(PM_BR_CMPL_ALT, 0x10012) /* Branch event that are not strongly biased */ EVENT(PM_BR_2PATH, 0x20036) /* ALternate branch event that are not strongly biased */ From a913e5e8b43be1d3897a141ce61c1ec071cad89c Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Tue, 27 Nov 2018 13:54:52 +0530 Subject: [PATCH 178/205] powerpc/perf: Return accordingly on invalid chip-id in nest_imc_event_init() Nest hardware counter memory resides in per-chip reserved memory. During nest_imc_event_init(), the chip-id of the event-cpu is used to calculate the base memory address for that cpu. Return a proper error condition if the calculated chip_id is invalid. Reported-by: Dan Carpenter Fixes: 885dcd709ba91 ("powerpc/perf: Add nest IMC PMU support") Reviewed-by: Madhavan Srinivasan Signed-off-by: Anju T Sudhakar Signed-off-by: Michael Ellerman --- arch/powerpc/perf/imc-pmu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index b1c37cc3fa98b..6159e9edddfd0 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -487,6 +487,11 @@ static int nest_imc_event_init(struct perf_event *event) * Get the base memory addresss for this cpu.
*/ chip_id = cpu_to_chip_id(event->cpu); + + /* Return, if chip_id is not valid */ + if (chip_id < 0) + return -ENODEV; + pcni = pmu->mem_info; do { if (pcni->id == chip_id) { From 860b7d2286236170a36f94946d03ca9888d32571 Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Tue, 18 Dec 2018 11:50:41 +0530 Subject: [PATCH 179/205] powerpc/perf: Fix loop exit condition in nest_imc_event_init The data structure (i.e. struct imc_mem_info) that holds the memory address information for nest imc units is allocated based on the number of nodes in the system. nest_imc_event_init() traverses this struct array to calculate the memory base address for the event-cpu. If we fail to find a match for the event cpu's chip-id in the imc_mem_info struct array, the do-while loop will iterate until we crash. Fix this by basing the loop exit condition on the number of non-zero vbase elements in the array, since the allocation is done for nr_chips + 1. Reported-by: Dan Carpenter Fixes: 885dcd709ba91 ("powerpc/perf: Add nest IMC PMU support") Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/perf/imc-pmu.c | 2 +- arch/powerpc/platforms/powernv/opal-imc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 6159e9edddfd0..2d12f0037e3a5 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -499,7 +499,7 @@ static int nest_imc_event_init(struct perf_event *event) break; } pcni++; - } while (pcni); + } while (pcni->vbase != 0); if (!flag) return -ENODEV; diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 58a07948c76e7..3d27f02695e41 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -127,7 +127,7 @@ static int imc_get_mem_addr_nest(struct device_node *node, nr_chips)) goto error; - pmu_ptr->mem_info = kcalloc(nr_chips, sizeof(*pmu_ptr->mem_info), + pmu_ptr->mem_info = kcalloc(nr_chips + 1, sizeof(*pmu_ptr->mem_info), GFP_KERNEL); if (!pmu_ptr->mem_info) goto error; From d1720adff3783a2ba7c128e304a385d18962835b Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Tue, 16 Apr 2019 15:18:27 +0530 Subject: [PATCH 180/205] powerpc/include: Add data structures and macros for IMC trace mode Add the macros needed for IMC (In-Memory Collection Counters) trace-mode and a data structure to hold the trace-imc record data. Also, add the new type "OPAL_IMC_COUNTERS_TRACE" in 'opal-api.h', since a new switch case is added in the opal-calls for IMC. Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/imc-pmu.h | 39 +++++++++++++++++++++++++++++ arch/powerpc/include/asm/opal-api.h | 1 + 2 files changed, 40 insertions(+) diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h index 69f516ecb2fde..7c2ef0e426614 100644 --- a/arch/powerpc/include/asm/imc-pmu.h +++ b/arch/powerpc/include/asm/imc-pmu.h @@ -33,6 +33,7 @@ */ #define THREAD_IMC_LDBAR_MASK 0x0003ffffffffe000ULL #define THREAD_IMC_ENABLE 0x8000000000000000ULL +#define TRACE_IMC_ENABLE 0x4000000000000000ULL /* * For debugfs interface for imc-mode and imc-command @@ -59,6 +60,34 @@ struct imc_events { char *scale; }; +/* + * Trace IMC hardware updates a 64bytes record on + * Core Performance Monitoring Counter (CPMC) + * overflow.
Here is the layout for the trace imc record + * + * DW 0 : Timebase + * DW 1 : Program Counter + * DW 2 : PIDR information + * DW 3 : CPMC1 + * DW 4 : CPMC2 + * DW 5 : CPMC3 + * Dw 6 : CPMC4 + * DW 7 : Timebase + * ..... + * + * The following is the data structure to hold trace imc data. + */ +struct trace_imc_data { + u64 tb1; + u64 ip; + u64 val; + u64 cpmc1; + u64 cpmc2; + u64 cpmc3; + u64 cpmc4; + u64 tb2; +}; + /* Event attribute array index */ #define IMC_FORMAT_ATTR 0 #define IMC_EVENT_ATTR 1 @@ -68,6 +97,13 @@ struct imc_events { /* PMU Format attribute macros */ #define IMC_EVENT_OFFSET_MASK 0xffffffffULL +/* + * Macro to mask bits 0:21 of first double word(which is the timebase) to + * compare with 8th double word (timebase) of trace imc record data. + */ +#define IMC_TRACE_RECORD_TB1_MASK 0x3ffffffffffULL + + /* * Device tree parser code detects IMC pmu support and * registers new IMC pmus. This structure will hold the @@ -113,6 +149,7 @@ struct imc_pmu_ref { enum { IMC_TYPE_THREAD = 0x1, + IMC_TYPE_TRACE = 0x2, IMC_TYPE_CORE = 0x4, IMC_TYPE_CHIP = 0x10, }; @@ -123,6 +160,8 @@ enum { #define IMC_DOMAIN_NEST 1 #define IMC_DOMAIN_CORE 2 #define IMC_DOMAIN_THREAD 3 +/* For trace-imc the domain is still thread but it operates in trace-mode */ +#define IMC_DOMAIN_TRACE 4 extern int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id); diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 234fde15b37cd..e1577cfa71862 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -1129,6 +1129,7 @@ enum { enum { OPAL_IMC_COUNTERS_NEST = 1, OPAL_IMC_COUNTERS_CORE = 2, + OPAL_IMC_COUNTERS_TRACE = 3, }; From dd50cf7cbc7bdd86483b797ac3d27b37d5aeeaa4 Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Tue, 16 Apr 2019 15:18:28 +0530 Subject: [PATCH 181/205] powerpc/perf: Rearrange setting of ldbar for thread-imc LDBAR holds the memory address allocated for each cpu. For thread-imc the mode bit (i.e. bit 1) of LDBAR is set to accumulation. Currently, ldbar is loaded with the per-cpu memory address and the mode set to accumulation at boot time. To enable trace-imc, the mode bit of ldbar should be set to 'trace'. So, to accommodate the trace-mode of IMC, reposition the setting of ldbar for thread-imc to thread_imc_event_add(). Also reset ldbar at thread_imc_event_del(). Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/perf/imc-pmu.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 2d12f0037e3a5..23092a359ce07 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -793,8 +793,11 @@ static int core_imc_event_init(struct perf_event *event) } /* - * Allocates a page of memory for each of the online cpus, and write the - * physical base address of that page to the LDBAR for that cpu. + * Allocates a page of memory for each of the online cpus, and load + * LDBAR with 0. + * The physical base address of the page allocated for a cpu will be + * written to the LDBAR for that cpu, when the thread-imc event + * is added.
* * LDBAR Register Layout: * @@ -812,7 +815,7 @@ static int core_imc_event_init(struct perf_event *event) */ static int thread_imc_mem_alloc(int cpu_id, int size) { - u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id); + u64 *local_mem = per_cpu(thread_imc_mem, cpu_id); int nid = cpu_to_node(cpu_id); if (!local_mem) { @@ -829,9 +832,7 @@ static int thread_imc_mem_alloc(int cpu_id, int size) per_cpu(thread_imc_mem, cpu_id) = local_mem; } - ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE; - - mtspr(SPRN_LDBAR, ldbar_value); + mtspr(SPRN_LDBAR, 0); return 0; } @@ -982,6 +983,7 @@ static int thread_imc_event_add(struct perf_event *event, int flags) { int core_id; struct imc_pmu_ref *ref; + u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, smp_processor_id()); if (flags & PERF_EF_START) imc_event_start(event, flags); @@ -990,6 +992,9 @@ static int thread_imc_event_add(struct perf_event *event, int flags) return -EINVAL; core_id = smp_processor_id() / threads_per_core; + ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE; + mtspr(SPRN_LDBAR, ldbar_value); + /* * imc pmus are enabled only when it is used. * See if this is triggered for the first time. @@ -1021,11 +1026,7 @@ static void thread_imc_event_del(struct perf_event *event, int flags) int core_id; struct imc_pmu_ref *ref; - /* - * Take a snapshot and calculate the delta and update - * the event counter values. - */ - imc_event_update(event); + mtspr(SPRN_LDBAR, 0); core_id = smp_processor_id() / threads_per_core; ref = &core_imc_refc[core_id]; @@ -1044,6 +1045,11 @@ static void thread_imc_event_del(struct perf_event *event, int flags) ref->refc = 0; } mutex_unlock(&ref->lock); + /* + * Take a snapshot and calculate the delta and update + * the event counter values. + */ + imc_event_update(event); } /* update_pmu_ops : Populate the appropriate operations for "pmu" */ From 216c3087a346db8d7c8a064d2b8f0f49e4694934 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Tue, 16 Apr 2019 15:18:29 +0530 Subject: [PATCH 182/205] powerpc/perf: Add privileged access check for thread_imc Add code to restrict user access to thread_imc pmu since some event report privilege level information. Fixes: f74c89bd80fb3 ("powerpc/perf: Add thread IMC PMU support") Signed-off-by: Madhavan Srinivasan Signed-off-by: Anju T Sudhakar Signed-off-by: Michael Ellerman --- arch/powerpc/perf/imc-pmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 23092a359ce07..975837d85a80f 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -864,6 +864,9 @@ static int thread_imc_event_init(struct perf_event *event) if (event->attr.type != event->pmu->type) return -ENOENT; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + /* Sampling not supported */ if (event->hw.sample_period) return -EINVAL; From 72c69dcddce103338de558c5c6e9ef9e4f607ce1 Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Tue, 16 Apr 2019 15:18:30 +0530 Subject: [PATCH 183/205] powerpc/perf: Trace imc events detection and cpuhotplug Patch detects trace-imc events, does memory initilizations for each online cpu, and registers cpuhotplug call-backs. 
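A note on the hotplug wiring, since it is easy to miss: cpuhp_setup_state() not only registers callbacks for future hotplug events, it also invokes the online callback on every cpu that is already up, which is why trace_imc_mem_alloc() below guards against re-allocating an existing per-cpu buffer. The registration, condensed from the diff:

	return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
				 "perf/powerpc/imc_trace:online",
				 ppc_trace_imc_cpu_online,
				 ppc_trace_imc_cpu_offline);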
Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/perf/imc-pmu.c | 104 ++++++++++++++++++++++ arch/powerpc/platforms/powernv/opal-imc.c | 3 + include/linux/cpuhotplug.h | 1 + 3 files changed, 108 insertions(+) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 975837d85a80f..3cc5e1934f0c9 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -43,6 +43,11 @@ static DEFINE_PER_CPU(u64 *, thread_imc_mem); static struct imc_pmu *thread_imc_pmu; static int thread_imc_mem_size; +/* Trace IMC data structures */ +static DEFINE_PER_CPU(u64 *, trace_imc_mem); +static struct imc_pmu_ref *trace_imc_refc; +static int trace_imc_mem_size; + static struct imc_pmu *imc_event_to_pmu(struct perf_event *event) { return container_of(event->pmu, struct imc_pmu, pmu); @@ -1055,6 +1060,59 @@ static void thread_imc_event_del(struct perf_event *event, int flags) imc_event_update(event); } +/* + * Allocate a page of memory for each cpu, and load LDBAR with 0. + */ +static int trace_imc_mem_alloc(int cpu_id, int size) +{ + u64 *local_mem = per_cpu(trace_imc_mem, cpu_id); + int phys_id = cpu_to_node(cpu_id), rc = 0; + int core_id = (cpu_id / threads_per_core); + + if (!local_mem) { + local_mem = page_address(alloc_pages_node(phys_id, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE | + __GFP_NOWARN, get_order(size))); + if (!local_mem) + return -ENOMEM; + per_cpu(trace_imc_mem, cpu_id) = local_mem; + + /* Initialise the counters for trace mode */ + rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_TRACE, __pa((void *)local_mem), + get_hard_smp_processor_id(cpu_id)); + if (rc) { + pr_info("IMC:opal init failed for trace imc\n"); + return rc; + } + } + + /* Init the mutex, if not already */ + trace_imc_refc[core_id].id = core_id; + mutex_init(&trace_imc_refc[core_id].lock); + + mtspr(SPRN_LDBAR, 0); + return 0; +} + +static int ppc_trace_imc_cpu_online(unsigned int cpu) +{ + return trace_imc_mem_alloc(cpu, trace_imc_mem_size); +} + +static int ppc_trace_imc_cpu_offline(unsigned int cpu) +{ + mtspr(SPRN_LDBAR, 0); + return 0; +} + +static int trace_imc_cpu_init(void) +{ + return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE, + "perf/powerpc/imc_trace:online", + ppc_trace_imc_cpu_online, + ppc_trace_imc_cpu_offline); +} + /* update_pmu_ops : Populate the appropriate operations for "pmu" */ static int update_pmu_ops(struct imc_pmu *pmu) { @@ -1177,6 +1235,18 @@ static void cleanup_all_thread_imc_memory(void) } } +static void cleanup_all_trace_imc_memory(void) +{ + int i, order = get_order(trace_imc_mem_size); + + for_each_online_cpu(i) { + if (per_cpu(trace_imc_mem, i)) + free_pages((u64)per_cpu(trace_imc_mem, i), order); + + } + kfree(trace_imc_refc); +} + /* Function to free the attr_groups which are dynamically allocated */ static void imc_common_mem_free(struct imc_pmu *pmu_ptr) { @@ -1218,6 +1288,11 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE); cleanup_all_thread_imc_memory(); } + + if (pmu_ptr->domain == IMC_DOMAIN_TRACE) { + cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE); + cleanup_all_trace_imc_memory(); + } } /* @@ -1300,6 +1375,27 @@ static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, thread_imc_pmu = pmu_ptr; break; + case IMC_DOMAIN_TRACE: + /* Update the pmu name */ + pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc"); + if (!pmu_ptr->pmu.name) + return -ENOMEM; + + 
nr_cores = DIV_ROUND_UP(num_possible_cpus(), threads_per_core); + trace_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref), + GFP_KERNEL); + if (!trace_imc_refc) + return -ENOMEM; + + trace_imc_mem_size = pmu_ptr->counter_mem_size; + for_each_online_cpu(cpu) { + res = trace_imc_mem_alloc(cpu, trace_imc_mem_size); + if (res) { + cleanup_all_trace_imc_memory(); + goto err; + } + } + break; default: return -EINVAL; } @@ -1372,6 +1468,14 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id goto err_free_mem; } + break; + case IMC_DOMAIN_TRACE: + ret = trace_imc_cpu_init(); + if (ret) { + cleanup_all_trace_imc_memory(); + goto err_free_mem; + } + break; default: return -EINVAL; /* Unknown domain */ diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 3d27f02695e41..3e497b91d2100 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -284,6 +284,9 @@ static int opal_imc_counters_probe(struct platform_device *pdev) case IMC_TYPE_THREAD: domain = IMC_DOMAIN_THREAD; break; + case IMC_TYPE_TRACE: + domain = IMC_DOMAIN_TRACE; + break; default: pr_warn("IMC Unknown Device type \n"); domain = -1; diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index e78281d07b70b..c3413d9b8348a 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -170,6 +170,7 @@ enum cpuhp_state { CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE, + CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE, CPUHP_AP_WATCHDOG_ONLINE, CPUHP_AP_WORKQUEUE_ONLINE, CPUHP_AP_RCUTREE_ONLINE, From 012ae244845f19d5f6ca2a90426851bc5044a0dc Mon Sep 17 00:00:00 2001 From: Anju T Sudhakar Date: Tue, 16 Apr 2019 15:18:31 +0530 Subject: [PATCH 184/205] powerpc/perf: Trace imc PMU functions Add PMU functions to support trace-imc. 
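Since the description above is terse, a word on the sampling model: when a trace event is stopped, the driver walks the per-cpu buffer and converts each 64-byte hardware record into a perf sample, discarding records that fail a timebase sanity check. A condensed restatement of that check (a sketch of the logic in trace_imc_prepare_sample() below, not new driver code):

	static bool trace_record_valid(const struct trace_imc_data *rec, u64 prev_tb)
	{
		u64 tb1 = be64_to_cpu(READ_ONCE(rec->tb1));
		u64 tb2 = be64_to_cpu(READ_ONCE(rec->tb2));

		/* The hardware stamps the timebase at both ends of the record:
		 * tb1 must have advanced past the previous record, and the low
		 * bits of tb1 must agree with tb2, else the record is stale or torn.
		 */
		return tb1 > prev_tb &&
		       (tb1 & IMC_TRACE_RECORD_TB1_MASK) == tb2;
	}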
Signed-off-by: Anju T Sudhakar Reviewed-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman --- arch/powerpc/perf/imc-pmu.c | 205 +++++++++++++++++++++++++++++++++++- 1 file changed, 204 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 3cc5e1934f0c9..31fa753e2eb2c 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -53,7 +53,7 @@ static struct imc_pmu *imc_event_to_pmu(struct perf_event *event) return container_of(event->pmu, struct imc_pmu, pmu); } -PMU_FORMAT_ATTR(event, "config:0-40"); +PMU_FORMAT_ATTR(event, "config:0-61"); PMU_FORMAT_ATTR(offset, "config:0-31"); PMU_FORMAT_ATTR(rvalue, "config:32"); PMU_FORMAT_ATTR(mode, "config:33-40"); @@ -70,6 +70,25 @@ static struct attribute_group imc_format_group = { .attrs = imc_format_attrs, }; +/* Format attribute for imc trace-mode */ +PMU_FORMAT_ATTR(cpmc_reserved, "config:0-19"); +PMU_FORMAT_ATTR(cpmc_event, "config:20-27"); +PMU_FORMAT_ATTR(cpmc_samplesel, "config:28-29"); +PMU_FORMAT_ATTR(cpmc_load, "config:30-61"); +static struct attribute *trace_imc_format_attrs[] = { + &format_attr_event.attr, + &format_attr_cpmc_reserved.attr, + &format_attr_cpmc_event.attr, + &format_attr_cpmc_samplesel.attr, + &format_attr_cpmc_load.attr, + NULL, +}; + +static struct attribute_group trace_imc_format_group = { +.name = "format", +.attrs = trace_imc_format_attrs, +}; + /* Get the cpumask printed to a buffer "buf" */ static ssize_t imc_pmu_cpumask_get_attr(struct device *dev, struct device_attribute *attr, @@ -1113,6 +1132,182 @@ static int trace_imc_cpu_init(void) ppc_trace_imc_cpu_offline); } +static u64 get_trace_imc_event_base_addr(void) +{ + return (u64)per_cpu(trace_imc_mem, smp_processor_id()); +} + +/* + * Function to parse trace-imc data obtained + * and to prepare the perf sample. 
+ */ +static int trace_imc_prepare_sample(struct trace_imc_data *mem, + struct perf_sample_data *data, + u64 *prev_tb, + struct perf_event_header *header, + struct perf_event *event) +{ + /* Sanity checks for a valid record */ + if (be64_to_cpu(READ_ONCE(mem->tb1)) > *prev_tb) + *prev_tb = be64_to_cpu(READ_ONCE(mem->tb1)); + else + return -EINVAL; + + if ((be64_to_cpu(READ_ONCE(mem->tb1)) & IMC_TRACE_RECORD_TB1_MASK) != + be64_to_cpu(READ_ONCE(mem->tb2))) + return -EINVAL; + + /* Prepare perf sample */ + data->ip = be64_to_cpu(READ_ONCE(mem->ip)); + data->period = event->hw.last_period; + + header->type = PERF_RECORD_SAMPLE; + header->size = sizeof(*header) + event->header_size; + header->misc = 0; + + if (is_kernel_addr(data->ip)) + header->misc |= PERF_RECORD_MISC_KERNEL; + else + header->misc |= PERF_RECORD_MISC_USER; + + perf_event_header__init_id(header, data, event); + + return 0; +} + +static void dump_trace_imc_data(struct perf_event *event) +{ + struct trace_imc_data *mem; + int i, ret; + u64 prev_tb = 0; + + mem = (struct trace_imc_data *)get_trace_imc_event_base_addr(); + for (i = 0; i < (trace_imc_mem_size / sizeof(struct trace_imc_data)); + i++, mem++) { + struct perf_sample_data data; + struct perf_event_header header; + + ret = trace_imc_prepare_sample(mem, &data, &prev_tb, &header, event); + if (ret) /* Exit, if not a valid record */ + break; + else { + /* If this is a valid record, create the sample */ + struct perf_output_handle handle; + + if (perf_output_begin(&handle, event, header.size)) + return; + + perf_output_sample(&handle, &header, &data, event); + perf_output_end(&handle); + } + } +} + +static int trace_imc_event_add(struct perf_event *event, int flags) +{ + int core_id = smp_processor_id() / threads_per_core; + struct imc_pmu_ref *ref = NULL; + u64 local_mem, ldbar_value; + + /* Set trace-imc bit in ldbar and load ldbar with per-thread memory address */ + local_mem = get_trace_imc_event_base_addr(); + ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | TRACE_IMC_ENABLE; + + if (core_imc_refc) + ref = &core_imc_refc[core_id]; + if (!ref) { + /* If core-imc is not enabled, use trace-imc reference count */ + if (trace_imc_refc) + ref = &trace_imc_refc[core_id]; + if (!ref) + return -EINVAL; + } + mtspr(SPRN_LDBAR, ldbar_value); + mutex_lock(&ref->lock); + if (ref->refc == 0) { + if (opal_imc_counters_start(OPAL_IMC_COUNTERS_TRACE, + get_hard_smp_processor_id(smp_processor_id()))) { + mutex_unlock(&ref->lock); + pr_err("trace-imc: Unable to start the counters for core %d\n", core_id); + mtspr(SPRN_LDBAR, 0); + return -EINVAL; + } + } + ++ref->refc; + mutex_unlock(&ref->lock); + + return 0; +} + +static void trace_imc_event_read(struct perf_event *event) +{ + return; +} + +static void trace_imc_event_stop(struct perf_event *event, int flags) +{ + u64 local_mem = get_trace_imc_event_base_addr(); + dump_trace_imc_data(event); + memset((void *)local_mem, 0, sizeof(u64)); +} + +static void trace_imc_event_start(struct perf_event *event, int flags) +{ + return; +} + +static void trace_imc_event_del(struct perf_event *event, int flags) +{ + int core_id = smp_processor_id() / threads_per_core; + struct imc_pmu_ref *ref = NULL; + + if (core_imc_refc) + ref = &core_imc_refc[core_id]; + if (!ref) { + /* If core-imc is not enabled, use trace-imc reference count */ + if (trace_imc_refc) + ref = &trace_imc_refc[core_id]; + if (!ref) + return; + } + mtspr(SPRN_LDBAR, 0); + mutex_lock(&ref->lock); + ref->refc--; + if (ref->refc == 0) { + if 
(opal_imc_counters_stop(OPAL_IMC_COUNTERS_TRACE, + get_hard_smp_processor_id(smp_processor_id()))) { + mutex_unlock(&ref->lock); + pr_err("trace-imc: Unable to stop the counters for core %d\n", core_id); + return; + } + } else if (ref->refc < 0) { + ref->refc = 0; + } + mutex_unlock(&ref->lock); + trace_imc_event_stop(event, flags); +} + +static int trace_imc_event_init(struct perf_event *event) +{ + struct task_struct *target; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + /* Return if this is a couting event */ + if (event->attr.sample_period == 0) + return -ENOENT; + + event->hw.idx = -1; + target = event->hw.target; + + event->pmu->task_ctx_nr = perf_hw_context; + return 0; +} + /* update_pmu_ops : Populate the appropriate operations for "pmu" */ static int update_pmu_ops(struct imc_pmu *pmu) { @@ -1143,6 +1338,14 @@ static int update_pmu_ops(struct imc_pmu *pmu) pmu->pmu.cancel_txn = thread_imc_pmu_cancel_txn; pmu->pmu.commit_txn = thread_imc_pmu_commit_txn; break; + case IMC_DOMAIN_TRACE: + pmu->pmu.event_init = trace_imc_event_init; + pmu->pmu.add = trace_imc_event_add; + pmu->pmu.del = trace_imc_event_del; + pmu->pmu.start = trace_imc_event_start; + pmu->pmu.stop = trace_imc_event_stop; + pmu->pmu.read = trace_imc_event_read; + pmu->attr_groups[IMC_FORMAT_ATTR] = &trace_imc_format_group; default: break; } From 9c4ae0645682b97437072693f0edbee17214225b Mon Sep 17 00:00:00 2001 From: Alastair D'Silva Date: Mon, 25 Mar 2019 16:34:52 +1100 Subject: [PATCH 185/205] ocxl: Rename struct link to ocxl_link The term 'link' is ambiguous (especially when the struct is used for a list), so rename it for clarity. Signed-off-by: Alastair D'Silva Reviewed-by: Greg Kurz Acked-by: Frederic Barrat Acked-by: Andrew Donnellan Signed-off-by: Michael Ellerman --- drivers/misc/ocxl/file.c | 5 ++--- drivers/misc/ocxl/link.c | 36 ++++++++++++++++++------------------ 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c index e6a607488f8af..009e09b7ded5e 100644 --- a/drivers/misc/ocxl/file.c +++ b/drivers/misc/ocxl/file.c @@ -151,10 +151,9 @@ static long afu_ioctl_enable_p9_wait(struct ocxl_context *ctx, mutex_unlock(&ctx->status_mutex); if (status == ATTACHED) { - int rc; - struct link *link = ctx->afu->fn->link; + int rc = ocxl_link_update_pe(ctx->afu->fn->link, + ctx->pasid, ctx->tidr); - rc = ocxl_link_update_pe(link, ctx->pasid, ctx->tidr); if (rc) return rc; } diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c index c2eaa5d64176c..cce5b0d645059 100644 --- a/drivers/misc/ocxl/link.c +++ b/drivers/misc/ocxl/link.c @@ -76,7 +76,7 @@ struct spa { * limited number of opencapi slots on a system and lookup is only * done when the device is probed */ -struct link { +struct ocxl_link { struct list_head list; struct kref ref; int domain; @@ -179,7 +179,7 @@ static void xsl_fault_handler_bh(struct work_struct *fault_work) static irqreturn_t xsl_fault_handler(int irq, void *data) { - struct link *link = (struct link *) data; + struct ocxl_link *link = (struct ocxl_link *) data; struct spa *spa = link->spa; u64 dsisr, dar, pe_handle; struct pe_data *pe_data; @@ -254,7 +254,7 @@ static int map_irq_registers(struct pci_dev *dev, struct spa *spa) &spa->reg_tfc, &spa->reg_pe_handle); } -static int setup_xsl_irq(struct pci_dev *dev, struct link *link) +static int setup_xsl_irq(struct pci_dev *dev, struct ocxl_link *link) { struct spa *spa = link->spa; int rc; @@ -309,7 +309,7 
@@ static int setup_xsl_irq(struct pci_dev *dev, struct link *link) return rc; } -static void release_xsl_irq(struct link *link) +static void release_xsl_irq(struct ocxl_link *link) { struct spa *spa = link->spa; @@ -321,7 +321,7 @@ static void release_xsl_irq(struct link *link) unmap_irq_registers(spa); } -static int alloc_spa(struct pci_dev *dev, struct link *link) +static int alloc_spa(struct pci_dev *dev, struct ocxl_link *link) { struct spa *spa; @@ -348,7 +348,7 @@ static int alloc_spa(struct pci_dev *dev, struct link *link) return 0; } -static void free_spa(struct link *link) +static void free_spa(struct ocxl_link *link) { struct spa *spa = link->spa; @@ -362,12 +362,12 @@ static void free_spa(struct link *link) } } -static int alloc_link(struct pci_dev *dev, int PE_mask, struct link **out_link) +static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_link) { - struct link *link; + struct ocxl_link *link; int rc; - link = kzalloc(sizeof(struct link), GFP_KERNEL); + link = kzalloc(sizeof(struct ocxl_link), GFP_KERNEL); if (!link) return -ENOMEM; @@ -403,7 +403,7 @@ static int alloc_link(struct pci_dev *dev, int PE_mask, struct link **out_link) return rc; } -static void free_link(struct link *link) +static void free_link(struct ocxl_link *link) { release_xsl_irq(link); free_spa(link); @@ -413,7 +413,7 @@ static void free_link(struct link *link) int ocxl_link_setup(struct pci_dev *dev, int PE_mask, void **link_handle) { int rc = 0; - struct link *link; + struct ocxl_link *link; mutex_lock(&links_list_lock); list_for_each_entry(link, &links_list, list) { @@ -440,7 +440,7 @@ EXPORT_SYMBOL_GPL(ocxl_link_setup); static void release_xsl(struct kref *ref) { - struct link *link = container_of(ref, struct link, ref); + struct ocxl_link *link = container_of(ref, struct ocxl_link, ref); list_del(&link->list); /* call platform code before releasing data */ @@ -450,7 +450,7 @@ static void release_xsl(struct kref *ref) void ocxl_link_release(struct pci_dev *dev, void *link_handle) { - struct link *link = (struct link *) link_handle; + struct ocxl_link *link = (struct ocxl_link *) link_handle; mutex_lock(&links_list_lock); kref_put(&link->ref, release_xsl); @@ -486,7 +486,7 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr, void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr), void *xsl_err_data) { - struct link *link = (struct link *) link_handle; + struct ocxl_link *link = (struct ocxl_link *) link_handle; struct spa *spa = link->spa; struct ocxl_process_element *pe; int pe_handle, rc = 0; @@ -556,7 +556,7 @@ EXPORT_SYMBOL_GPL(ocxl_link_add_pe); int ocxl_link_update_pe(void *link_handle, int pasid, __u16 tid) { - struct link *link = (struct link *) link_handle; + struct ocxl_link *link = (struct ocxl_link *) link_handle; struct spa *spa = link->spa; struct ocxl_process_element *pe; int pe_handle, rc; @@ -592,7 +592,7 @@ int ocxl_link_update_pe(void *link_handle, int pasid, __u16 tid) int ocxl_link_remove_pe(void *link_handle, int pasid) { - struct link *link = (struct link *) link_handle; + struct ocxl_link *link = (struct ocxl_link *) link_handle; struct spa *spa = link->spa; struct ocxl_process_element *pe; struct pe_data *pe_data; @@ -664,7 +664,7 @@ EXPORT_SYMBOL_GPL(ocxl_link_remove_pe); int ocxl_link_irq_alloc(void *link_handle, int *hw_irq, u64 *trigger_addr) { - struct link *link = (struct link *) link_handle; + struct ocxl_link *link = (struct ocxl_link *) link_handle; int rc, irq; u64 addr; @@ -685,7 +685,7 @@ 
EXPORT_SYMBOL_GPL(ocxl_link_irq_alloc); void ocxl_link_free_irq(void *link_handle, int hw_irq) { - struct link *link = (struct link *) link_handle; + struct ocxl_link *link = (struct ocxl_link *) link_handle; pnv_ocxl_free_xive_irq(hw_irq); atomic_inc(&link->irq_available); From 32941494ff9a9f78fc967adf0e03044b62e09114 Mon Sep 17 00:00:00 2001 From: Alastair D'Silva Date: Mon, 25 Mar 2019 16:34:53 +1100 Subject: [PATCH 186/205] ocxl: read_pasid never returns an error, so make it void No need for a return value in read_pasid as it only returns 0. Signed-off-by: Alastair D'Silva Reviewed-by: Greg Kurz Acked-by: Frederic Barrat Acked-by: Andrew Donnellan Signed-off-by: Michael Ellerman --- drivers/misc/ocxl/config.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c index 8f2c5d8bd2eee..4dc11897237df 100644 --- a/drivers/misc/ocxl/config.c +++ b/drivers/misc/ocxl/config.c @@ -68,7 +68,7 @@ static int find_dvsec_afu_ctrl(struct pci_dev *dev, u8 afu_idx) return 0; } -static int read_pasid(struct pci_dev *dev, struct ocxl_fn_config *fn) +static void read_pasid(struct pci_dev *dev, struct ocxl_fn_config *fn) { u16 val; int pos; @@ -89,7 +89,6 @@ static int read_pasid(struct pci_dev *dev, struct ocxl_fn_config *fn) out: dev_dbg(&dev->dev, "PASID capability:\n"); dev_dbg(&dev->dev, " Max PASID log = %d\n", fn->max_pasid_log); - return 0; } static int read_dvsec_tl(struct pci_dev *dev, struct ocxl_fn_config *fn) @@ -205,11 +204,7 @@ int ocxl_config_read_function(struct pci_dev *dev, struct ocxl_fn_config *fn) { int rc; - rc = read_pasid(dev, fn); - if (rc) { - dev_err(&dev->dev, "Invalid PASID configuration: %d\n", rc); - return -ENODEV; - } + read_pasid(dev, fn); rc = read_dvsec_tl(dev, fn); if (rc) { From 53e3e74530626d708d4822a29a3edd16f7484b4b Mon Sep 17 00:00:00 2001 From: Alastair D'Silva Date: Mon, 25 Mar 2019 16:34:54 +1100 Subject: [PATCH 187/205] ocxl: Remove superfluous 'extern' from headers The 'extern' keyword adds no value here. 
Signed-off-by: Alastair D'Silva Reviewed-by: Greg Kurz Acked-by: Frederic Barrat Acked-by: Andrew Donnellan Signed-off-by: Michael Ellerman --- drivers/misc/ocxl/ocxl_internal.h | 54 +++++++++++++++---------------- include/misc/ocxl.h | 36 ++++++++++----------- 2 files changed, 44 insertions(+), 46 deletions(-) diff --git a/drivers/misc/ocxl/ocxl_internal.h b/drivers/misc/ocxl/ocxl_internal.h index a32f2151029f6..321b29e77f452 100644 --- a/drivers/misc/ocxl/ocxl_internal.h +++ b/drivers/misc/ocxl/ocxl_internal.h @@ -16,7 +16,6 @@ extern struct pci_driver ocxl_pci_driver; - struct ocxl_fn { struct device dev; int bar_used[3]; @@ -92,41 +91,40 @@ struct ocxl_process_element { __be32 software_state; }; +struct ocxl_afu *ocxl_afu_get(struct ocxl_afu *afu); +void ocxl_afu_put(struct ocxl_afu *afu); -extern struct ocxl_afu *ocxl_afu_get(struct ocxl_afu *afu); -extern void ocxl_afu_put(struct ocxl_afu *afu); - -extern int ocxl_create_cdev(struct ocxl_afu *afu); -extern void ocxl_destroy_cdev(struct ocxl_afu *afu); -extern int ocxl_register_afu(struct ocxl_afu *afu); -extern void ocxl_unregister_afu(struct ocxl_afu *afu); +int ocxl_create_cdev(struct ocxl_afu *afu); +void ocxl_destroy_cdev(struct ocxl_afu *afu); +int ocxl_register_afu(struct ocxl_afu *afu); +void ocxl_unregister_afu(struct ocxl_afu *afu); -extern int ocxl_file_init(void); -extern void ocxl_file_exit(void); +int ocxl_file_init(void); +void ocxl_file_exit(void); -extern int ocxl_pasid_afu_alloc(struct ocxl_fn *fn, u32 size); -extern void ocxl_pasid_afu_free(struct ocxl_fn *fn, u32 start, u32 size); -extern int ocxl_actag_afu_alloc(struct ocxl_fn *fn, u32 size); -extern void ocxl_actag_afu_free(struct ocxl_fn *fn, u32 start, u32 size); +int ocxl_pasid_afu_alloc(struct ocxl_fn *fn, u32 size); +void ocxl_pasid_afu_free(struct ocxl_fn *fn, u32 start, u32 size); +int ocxl_actag_afu_alloc(struct ocxl_fn *fn, u32 size); +void ocxl_actag_afu_free(struct ocxl_fn *fn, u32 start, u32 size); -extern struct ocxl_context *ocxl_context_alloc(void); -extern int ocxl_context_init(struct ocxl_context *ctx, struct ocxl_afu *afu, +struct ocxl_context *ocxl_context_alloc(void); +int ocxl_context_init(struct ocxl_context *ctx, struct ocxl_afu *afu, struct address_space *mapping); -extern int ocxl_context_attach(struct ocxl_context *ctx, u64 amr); -extern int ocxl_context_mmap(struct ocxl_context *ctx, +int ocxl_context_attach(struct ocxl_context *ctx, u64 amr); +int ocxl_context_mmap(struct ocxl_context *ctx, struct vm_area_struct *vma); -extern int ocxl_context_detach(struct ocxl_context *ctx); -extern void ocxl_context_detach_all(struct ocxl_afu *afu); -extern void ocxl_context_free(struct ocxl_context *ctx); +int ocxl_context_detach(struct ocxl_context *ctx); +void ocxl_context_detach_all(struct ocxl_afu *afu); +void ocxl_context_free(struct ocxl_context *ctx); -extern int ocxl_sysfs_add_afu(struct ocxl_afu *afu); -extern void ocxl_sysfs_remove_afu(struct ocxl_afu *afu); +int ocxl_sysfs_add_afu(struct ocxl_afu *afu); +void ocxl_sysfs_remove_afu(struct ocxl_afu *afu); -extern int ocxl_afu_irq_alloc(struct ocxl_context *ctx, u64 *irq_offset); -extern int ocxl_afu_irq_free(struct ocxl_context *ctx, u64 irq_offset); -extern void ocxl_afu_irq_free_all(struct ocxl_context *ctx); -extern int ocxl_afu_irq_set_fd(struct ocxl_context *ctx, u64 irq_offset, +int ocxl_afu_irq_alloc(struct ocxl_context *ctx, u64 *irq_offset); +int ocxl_afu_irq_free(struct ocxl_context *ctx, u64 irq_offset); +void ocxl_afu_irq_free_all(struct ocxl_context *ctx); +int 
ocxl_afu_irq_set_fd(struct ocxl_context *ctx, u64 irq_offset, int eventfd); -extern u64 ocxl_afu_irq_get_addr(struct ocxl_context *ctx, u64 irq_offset); +u64 ocxl_afu_irq_get_addr(struct ocxl_context *ctx, u64 irq_offset); #endif /* _OCXL_INTERNAL_H_ */ diff --git a/include/misc/ocxl.h b/include/misc/ocxl.h index 9ff6ddc28e221..4544573cc93c1 100644 --- a/include/misc/ocxl.h +++ b/include/misc/ocxl.h @@ -53,7 +53,7 @@ struct ocxl_fn_config { * Read the configuration space of a function and fill in a * ocxl_fn_config structure with all the function details */ -extern int ocxl_config_read_function(struct pci_dev *dev, +int ocxl_config_read_function(struct pci_dev *dev, struct ocxl_fn_config *fn); /* @@ -62,14 +62,14 @@ extern int ocxl_config_read_function(struct pci_dev *dev, * AFU indexes can be sparse, so a driver should check all indexes up * to the maximum found in the function description */ -extern int ocxl_config_check_afu_index(struct pci_dev *dev, +int ocxl_config_check_afu_index(struct pci_dev *dev, struct ocxl_fn_config *fn, int afu_idx); /* * Read the configuration space of a function for the AFU specified by * the index 'afu_idx'. Fills in a ocxl_afu_config structure */ -extern int ocxl_config_read_afu(struct pci_dev *dev, +int ocxl_config_read_afu(struct pci_dev *dev, struct ocxl_fn_config *fn, struct ocxl_afu_config *afu, u8 afu_idx); @@ -77,7 +77,7 @@ extern int ocxl_config_read_afu(struct pci_dev *dev, /* * Get the max PASID value that can be used by the function */ -extern int ocxl_config_get_pasid_info(struct pci_dev *dev, int *count); +int ocxl_config_get_pasid_info(struct pci_dev *dev, int *count); /* * Tell an AFU, by writing in the configuration space, the PASIDs that @@ -87,7 +87,7 @@ extern int ocxl_config_get_pasid_info(struct pci_dev *dev, int *count); * 'afu_control_offset' is the offset of the AFU control DVSEC which * can be found in the function configuration */ -extern void ocxl_config_set_afu_pasid(struct pci_dev *dev, +void ocxl_config_set_afu_pasid(struct pci_dev *dev, int afu_control_offset, int pasid_base, u32 pasid_count_log); @@ -98,7 +98,7 @@ extern void ocxl_config_set_afu_pasid(struct pci_dev *dev, * 'supported' is the total number of actags desired by all the AFUs * of the function. */ -extern int ocxl_config_get_actag_info(struct pci_dev *dev, +int ocxl_config_get_actag_info(struct pci_dev *dev, u16 *base, u16 *enabled, u16 *supported); /* @@ -108,7 +108,7 @@ extern int ocxl_config_get_actag_info(struct pci_dev *dev, * 'func_offset' is the offset of the Function DVSEC that can found in * the function configuration */ -extern void ocxl_config_set_actag(struct pci_dev *dev, int func_offset, +void ocxl_config_set_actag(struct pci_dev *dev, int func_offset, u32 actag_base, u32 actag_count); /* @@ -118,7 +118,7 @@ extern void ocxl_config_set_actag(struct pci_dev *dev, int func_offset, * 'afu_control_offset' is the offset of the AFU control DVSEC for the * desired AFU. It can be found in the AFU configuration */ -extern void ocxl_config_set_afu_actag(struct pci_dev *dev, +void ocxl_config_set_afu_actag(struct pci_dev *dev, int afu_control_offset, int actag_base, int actag_count); @@ -128,7 +128,7 @@ extern void ocxl_config_set_afu_actag(struct pci_dev *dev, * 'afu_control_offset' is the offset of the AFU control DVSEC for the * desired AFU. 
It can be found in the AFU configuration */ -extern void ocxl_config_set_afu_state(struct pci_dev *dev, +void ocxl_config_set_afu_state(struct pci_dev *dev, int afu_control_offset, int enable); /* @@ -139,7 +139,7 @@ extern void ocxl_config_set_afu_state(struct pci_dev *dev, * between the host and device, and set the Transaction Layer on both * accordingly. */ -extern int ocxl_config_set_TL(struct pci_dev *dev, int tl_dvsec); +int ocxl_config_set_TL(struct pci_dev *dev, int tl_dvsec); /* * Request an AFU to terminate a PASID. @@ -152,7 +152,7 @@ extern int ocxl_config_set_TL(struct pci_dev *dev, int tl_dvsec); * 'afu_control_offset' is the offset of the AFU control DVSEC for the * desired AFU. It can be found in the AFU configuration */ -extern int ocxl_config_terminate_pasid(struct pci_dev *dev, +int ocxl_config_terminate_pasid(struct pci_dev *dev, int afu_control_offset, int pasid); /* @@ -165,13 +165,13 @@ extern int ocxl_config_terminate_pasid(struct pci_dev *dev, * Returns a 'link handle' that should be used for further calls for * the link */ -extern int ocxl_link_setup(struct pci_dev *dev, int PE_mask, +int ocxl_link_setup(struct pci_dev *dev, int PE_mask, void **link_handle); /* * Remove the association between the function and its link. */ -extern void ocxl_link_release(struct pci_dev *dev, void *link_handle); +void ocxl_link_release(struct pci_dev *dev, void *link_handle); /* * Add a Process Element to the Shared Process Area for a link. @@ -183,7 +183,7 @@ extern void ocxl_link_release(struct pci_dev *dev, void *link_handle); * 'xsl_err_data' is an argument passed to the above callback, if * defined */ -extern int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr, +int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr, u64 amr, struct mm_struct *mm, void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr), void *xsl_err_data); @@ -195,12 +195,12 @@ extern int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr, * pasid: the PASID for the AFU context * tid: the new thread id for the process element */ -extern int ocxl_link_update_pe(void *link_handle, int pasid, __u16 tid); +int ocxl_link_update_pe(void *link_handle, int pasid, __u16 tid); /* * Remove a Process Element from the Shared Process Area for a link */ -extern int ocxl_link_remove_pe(void *link_handle, int pasid); +int ocxl_link_remove_pe(void *link_handle, int pasid); /* * Allocate an AFU interrupt associated to the link. @@ -212,12 +212,12 @@ extern int ocxl_link_remove_pe(void *link_handle, int pasid); * interrupt. It is an MMIO address which needs to be remapped (one * page). */ -extern int ocxl_link_irq_alloc(void *link_handle, int *hw_irq, +int ocxl_link_irq_alloc(void *link_handle, int *hw_irq, u64 *obj_handle); /* * Free a previously allocated AFU interrupt */ -extern void ocxl_link_free_irq(void *link_handle, int hw_irq); +void ocxl_link_free_irq(void *link_handle, int hw_irq); #endif /* _MISC_OCXL_H_ */ From b696d28283e2e030cdc9ece7a4eb0a93b4c474fd Mon Sep 17 00:00:00 2001 From: Alastair D'Silva Date: Mon, 25 Mar 2019 16:34:55 +1100 Subject: [PATCH 188/205] ocxl: Remove some unused exported symbols Remove some unused exported symbols. 
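For background: EXPORT_SYMBOL_GPL() is what makes a function linkable from other kernel modules. Before this patch, for example (see the config.c hunk below):

	int ocxl_config_get_pasid_info(struct pci_dev *dev, int *count)
	{
		return pnv_ocxl_get_pasid_count(dev, count);
	}
	EXPORT_SYMBOL_GPL(ocxl_config_get_pasid_info);	/* removed below */

After the patch the EXPORT_SYMBOL_GPL() line is gone and the prototype moves from the public include/misc/ocxl.h into the driver-private ocxl_internal.h, so only the ocxl module itself can call these helpers.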
Signed-off-by: Alastair D'Silva Reviewed-by: Greg Kurz Acked-by: Frederic Barrat Acked-by: Andrew Donnellan Signed-off-by: Michael Ellerman --- drivers/misc/ocxl/config.c | 4 +--- drivers/misc/ocxl/ocxl_internal.h | 23 +++++++++++++++++++++++ include/misc/ocxl.h | 23 ----------------------- 3 files changed, 24 insertions(+), 26 deletions(-) diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c index 4dc11897237df..5e65acb8e1347 100644 --- a/drivers/misc/ocxl/config.c +++ b/drivers/misc/ocxl/config.c @@ -2,8 +2,8 @@ // Copyright 2017 IBM Corp. #include #include -#include #include +#include "ocxl_internal.h" #define EXTRACT_BIT(val, bit) (!!(val & BIT(bit))) #define EXTRACT_BITS(val, s, e) ((val & GENMASK(e, s)) >> s) @@ -299,7 +299,6 @@ int ocxl_config_check_afu_index(struct pci_dev *dev, } return 1; } -EXPORT_SYMBOL_GPL(ocxl_config_check_afu_index); static int read_afu_name(struct pci_dev *dev, struct ocxl_fn_config *fn, struct ocxl_afu_config *afu) @@ -535,7 +534,6 @@ int ocxl_config_get_pasid_info(struct pci_dev *dev, int *count) { return pnv_ocxl_get_pasid_count(dev, count); } -EXPORT_SYMBOL_GPL(ocxl_config_get_pasid_info); void ocxl_config_set_afu_pasid(struct pci_dev *dev, int pos, int pasid_base, u32 pasid_count_log) diff --git a/drivers/misc/ocxl/ocxl_internal.h b/drivers/misc/ocxl/ocxl_internal.h index 321b29e77f452..06fd98c989c8d 100644 --- a/drivers/misc/ocxl/ocxl_internal.h +++ b/drivers/misc/ocxl/ocxl_internal.h @@ -107,6 +107,29 @@ void ocxl_pasid_afu_free(struct ocxl_fn *fn, u32 start, u32 size); int ocxl_actag_afu_alloc(struct ocxl_fn *fn, u32 size); void ocxl_actag_afu_free(struct ocxl_fn *fn, u32 start, u32 size); +/* + * Get the max PASID value that can be used by the function + */ +int ocxl_config_get_pasid_info(struct pci_dev *dev, int *count); + +/* + * Check if an AFU index is valid for the given function. + * + * AFU indexes can be sparse, so a driver should check all indexes up + * to the maximum found in the function description + */ +int ocxl_config_check_afu_index(struct pci_dev *dev, + struct ocxl_fn_config *fn, int afu_idx); + +/** + * Update values within a Process Element + * + * link_handle: the link handle associated with the process element + * pasid: the PASID for the AFU context + * tid: the new thread id for the process element + */ +int ocxl_link_update_pe(void *link_handle, int pasid, __u16 tid); + struct ocxl_context *ocxl_context_alloc(void); int ocxl_context_init(struct ocxl_context *ctx, struct ocxl_afu *afu, struct address_space *mapping); diff --git a/include/misc/ocxl.h b/include/misc/ocxl.h index 4544573cc93c1..9530d3be1b305 100644 --- a/include/misc/ocxl.h +++ b/include/misc/ocxl.h @@ -56,15 +56,6 @@ struct ocxl_fn_config { int ocxl_config_read_function(struct pci_dev *dev, struct ocxl_fn_config *fn); -/* - * Check if an AFU index is valid for the given function. - * - * AFU indexes can be sparse, so a driver should check all indexes up - * to the maximum found in the function description - */ -int ocxl_config_check_afu_index(struct pci_dev *dev, - struct ocxl_fn_config *fn, int afu_idx); - /* * Read the configuration space of a function for the AFU specified by * the index 'afu_idx'. 
Fills in a ocxl_afu_config structure @@ -74,11 +65,6 @@ int ocxl_config_read_afu(struct pci_dev *dev, struct ocxl_afu_config *afu, u8 afu_idx); -/* - * Get the max PASID value that can be used by the function - */ -int ocxl_config_get_pasid_info(struct pci_dev *dev, int *count); - /* * Tell an AFU, by writing in the configuration space, the PASIDs that * it can use. Range starts at 'pasid_base' and its size is a multiple @@ -188,15 +174,6 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr, void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr), void *xsl_err_data); -/** - * Update values within a Process Element - * - * link_handle: the link handle associated with the process element - * pasid: the PASID for the AFU context - * tid: the new thread id for the process element - */ -int ocxl_link_update_pe(void *link_handle, int pasid, __u16 tid); - /* * Remove a Process Element from the Shared Process Area for a link */ From 1ba2143606a10f1c2e7308bc7abd940a6381cffd Mon Sep 17 00:00:00 2001 From: Alastair D'Silva Date: Wed, 27 Mar 2019 16:31:30 +1100 Subject: [PATCH 189/205] ocxl: Split pci.c In preparation for making core code available for external drivers, move the core code out of pci.c and into core.c Signed-off-by: Alastair D'Silva Acked-by: Frederic Barrat Acked-by: Andrew Donnellan Signed-off-by: Michael Ellerman --- drivers/misc/ocxl/Makefile | 1 + drivers/misc/ocxl/core.c | 517 +++++++++++++++++++++++++++++ drivers/misc/ocxl/ocxl_internal.h | 5 + drivers/misc/ocxl/pci.c | 519 +----------------------------- 4 files changed, 524 insertions(+), 518 deletions(-) create mode 100644 drivers/misc/ocxl/core.c diff --git a/drivers/misc/ocxl/Makefile b/drivers/misc/ocxl/Makefile index 5229dcda82974..bc4e39bfda7bf 100644 --- a/drivers/misc/ocxl/Makefile +++ b/drivers/misc/ocxl/Makefile @@ -3,6 +3,7 @@ ccflags-$(CONFIG_PPC_WERROR) += -Werror ocxl-y += main.o pci.o config.o file.o pasid.o ocxl-y += link.o context.o afu_irq.o sysfs.o trace.o +ocxl-y += core.o obj-$(CONFIG_OCXL) += ocxl.o # For tracepoints to include our trace.h from tracepoint infrastructure: diff --git a/drivers/misc/ocxl/core.c b/drivers/misc/ocxl/core.c new file mode 100644 index 0000000000000..1a4411b72d35f --- /dev/null +++ b/drivers/misc/ocxl/core.c @@ -0,0 +1,517 @@ +// SPDX-License-Identifier: GPL-2.0+ +// Copyright 2019 IBM Corp. +#include +#include "ocxl_internal.h" + +static struct ocxl_fn *ocxl_fn_get(struct ocxl_fn *fn) +{ + return (get_device(&fn->dev) == NULL) ? NULL : fn; +} + +static void ocxl_fn_put(struct ocxl_fn *fn) +{ + put_device(&fn->dev); +} + +struct ocxl_afu *ocxl_afu_get(struct ocxl_afu *afu) +{ + return (get_device(&afu->dev) == NULL) ? 
NULL : afu; +} + +void ocxl_afu_put(struct ocxl_afu *afu) +{ + put_device(&afu->dev); +} + +static struct ocxl_afu *alloc_afu(struct ocxl_fn *fn) +{ + struct ocxl_afu *afu; + + afu = kzalloc(sizeof(struct ocxl_afu), GFP_KERNEL); + if (!afu) + return NULL; + + mutex_init(&afu->contexts_lock); + mutex_init(&afu->afu_control_lock); + idr_init(&afu->contexts_idr); + afu->fn = fn; + ocxl_fn_get(fn); + return afu; +} + +static void free_afu(struct ocxl_afu *afu) +{ + idr_destroy(&afu->contexts_idr); + ocxl_fn_put(afu->fn); + kfree(afu); +} + +static void free_afu_dev(struct device *dev) +{ + struct ocxl_afu *afu = to_ocxl_afu(dev); + + ocxl_unregister_afu(afu); + free_afu(afu); +} + +static int set_afu_device(struct ocxl_afu *afu, const char *location) +{ + struct ocxl_fn *fn = afu->fn; + int rc; + + afu->dev.parent = &fn->dev; + afu->dev.release = free_afu_dev; + rc = dev_set_name(&afu->dev, "%s.%s.%hhu", afu->config.name, location, + afu->config.idx); + return rc; +} + +static int assign_afu_actag(struct ocxl_afu *afu, struct pci_dev *dev) +{ + struct ocxl_fn *fn = afu->fn; + int actag_count, actag_offset; + + /* + * if there were not enough actags for the function, each afu + * reduces its count as well + */ + actag_count = afu->config.actag_supported * + fn->actag_enabled / fn->actag_supported; + actag_offset = ocxl_actag_afu_alloc(fn, actag_count); + if (actag_offset < 0) { + dev_err(&afu->dev, "Can't allocate %d actags for AFU: %d\n", + actag_count, actag_offset); + return actag_offset; + } + afu->actag_base = fn->actag_base + actag_offset; + afu->actag_enabled = actag_count; + + ocxl_config_set_afu_actag(dev, afu->config.dvsec_afu_control_pos, + afu->actag_base, afu->actag_enabled); + dev_dbg(&afu->dev, "actag base=%d enabled=%d\n", + afu->actag_base, afu->actag_enabled); + return 0; +} + +static void reclaim_afu_actag(struct ocxl_afu *afu) +{ + struct ocxl_fn *fn = afu->fn; + int start_offset, size; + + start_offset = afu->actag_base - fn->actag_base; + size = afu->actag_enabled; + ocxl_actag_afu_free(afu->fn, start_offset, size); +} + +static int assign_afu_pasid(struct ocxl_afu *afu, struct pci_dev *dev) +{ + struct ocxl_fn *fn = afu->fn; + int pasid_count, pasid_offset; + + /* + * We only support the case where the function configuration + * requested enough PASIDs to cover all AFUs. 
+ */ + pasid_count = 1 << afu->config.pasid_supported_log; + pasid_offset = ocxl_pasid_afu_alloc(fn, pasid_count); + if (pasid_offset < 0) { + dev_err(&afu->dev, "Can't allocate %d PASIDs for AFU: %d\n", + pasid_count, pasid_offset); + return pasid_offset; + } + afu->pasid_base = fn->pasid_base + pasid_offset; + afu->pasid_count = 0; + afu->pasid_max = pasid_count; + + ocxl_config_set_afu_pasid(dev, afu->config.dvsec_afu_control_pos, + afu->pasid_base, + afu->config.pasid_supported_log); + dev_dbg(&afu->dev, "PASID base=%d, enabled=%d\n", + afu->pasid_base, pasid_count); + return 0; +} + +static void reclaim_afu_pasid(struct ocxl_afu *afu) +{ + struct ocxl_fn *fn = afu->fn; + int start_offset, size; + + start_offset = afu->pasid_base - fn->pasid_base; + size = 1 << afu->config.pasid_supported_log; + ocxl_pasid_afu_free(afu->fn, start_offset, size); +} + +static int reserve_fn_bar(struct ocxl_fn *fn, int bar) +{ + struct pci_dev *dev = to_pci_dev(fn->dev.parent); + int rc, idx; + + if (bar != 0 && bar != 2 && bar != 4) + return -EINVAL; + + idx = bar >> 1; + if (fn->bar_used[idx]++ == 0) { + rc = pci_request_region(dev, bar, "ocxl"); + if (rc) + return rc; + } + return 0; +} + +static void release_fn_bar(struct ocxl_fn *fn, int bar) +{ + struct pci_dev *dev = to_pci_dev(fn->dev.parent); + int idx; + + if (bar != 0 && bar != 2 && bar != 4) + return; + + idx = bar >> 1; + if (--fn->bar_used[idx] == 0) + pci_release_region(dev, bar); + WARN_ON(fn->bar_used[idx] < 0); +} + +static int map_mmio_areas(struct ocxl_afu *afu, struct pci_dev *dev) +{ + int rc; + + rc = reserve_fn_bar(afu->fn, afu->config.global_mmio_bar); + if (rc) + return rc; + + rc = reserve_fn_bar(afu->fn, afu->config.pp_mmio_bar); + if (rc) { + release_fn_bar(afu->fn, afu->config.global_mmio_bar); + return rc; + } + + afu->global_mmio_start = + pci_resource_start(dev, afu->config.global_mmio_bar) + + afu->config.global_mmio_offset; + afu->pp_mmio_start = + pci_resource_start(dev, afu->config.pp_mmio_bar) + + afu->config.pp_mmio_offset; + + afu->global_mmio_ptr = ioremap(afu->global_mmio_start, + afu->config.global_mmio_size); + if (!afu->global_mmio_ptr) { + release_fn_bar(afu->fn, afu->config.pp_mmio_bar); + release_fn_bar(afu->fn, afu->config.global_mmio_bar); + dev_err(&dev->dev, "Error mapping global mmio area\n"); + return -ENOMEM; + } + + /* + * Leave an empty page between the per-process mmio area and + * the AFU interrupt mappings + */ + afu->irq_base_offset = afu->config.pp_mmio_stride + PAGE_SIZE; + return 0; +} + +static void unmap_mmio_areas(struct ocxl_afu *afu) +{ + if (afu->global_mmio_ptr) { + iounmap(afu->global_mmio_ptr); + afu->global_mmio_ptr = NULL; + } + afu->global_mmio_start = 0; + afu->pp_mmio_start = 0; + release_fn_bar(afu->fn, afu->config.pp_mmio_bar); + release_fn_bar(afu->fn, afu->config.global_mmio_bar); +} + +static int configure_afu(struct ocxl_afu *afu, u8 afu_idx, struct pci_dev *dev) +{ + int rc; + + rc = ocxl_config_read_afu(dev, &afu->fn->config, &afu->config, afu_idx); + if (rc) + return rc; + + rc = set_afu_device(afu, dev_name(&dev->dev)); + if (rc) + return rc; + + rc = assign_afu_actag(afu, dev); + if (rc) + return rc; + + rc = assign_afu_pasid(afu, dev); + if (rc) { + reclaim_afu_actag(afu); + return rc; + } + + rc = map_mmio_areas(afu, dev); + if (rc) { + reclaim_afu_pasid(afu); + reclaim_afu_actag(afu); + return rc; + } + return 0; +} + +static void deconfigure_afu(struct ocxl_afu *afu) +{ + unmap_mmio_areas(afu); + reclaim_afu_pasid(afu); + reclaim_afu_actag(afu); +} + +static int 
activate_afu(struct pci_dev *dev, struct ocxl_afu *afu) +{ + int rc; + + ocxl_config_set_afu_state(dev, afu->config.dvsec_afu_control_pos, 1); + /* + * Char device creation is the last step, as processes can + * call our driver immediately, so all our inits must be finished. + */ + rc = ocxl_create_cdev(afu); + if (rc) + return rc; + return 0; +} + +static void deactivate_afu(struct ocxl_afu *afu) +{ + struct pci_dev *dev = to_pci_dev(afu->fn->dev.parent); + + ocxl_destroy_cdev(afu); + ocxl_config_set_afu_state(dev, afu->config.dvsec_afu_control_pos, 0); +} + +int init_afu(struct pci_dev *dev, struct ocxl_fn *fn, u8 afu_idx) +{ + int rc; + struct ocxl_afu *afu; + + afu = alloc_afu(fn); + if (!afu) + return -ENOMEM; + + rc = configure_afu(afu, afu_idx, dev); + if (rc) { + free_afu(afu); + return rc; + } + + rc = ocxl_register_afu(afu); + if (rc) + goto err; + + rc = ocxl_sysfs_add_afu(afu); + if (rc) + goto err; + + rc = activate_afu(dev, afu); + if (rc) + goto err_sys; + + list_add_tail(&afu->list, &fn->afu_list); + return 0; + +err_sys: + ocxl_sysfs_remove_afu(afu); +err: + deconfigure_afu(afu); + device_unregister(&afu->dev); + return rc; +} + +void remove_afu(struct ocxl_afu *afu) +{ + list_del(&afu->list); + ocxl_context_detach_all(afu); + deactivate_afu(afu); + ocxl_sysfs_remove_afu(afu); + deconfigure_afu(afu); + device_unregister(&afu->dev); +} + +static struct ocxl_fn *alloc_function(struct pci_dev *dev) +{ + struct ocxl_fn *fn; + + fn = kzalloc(sizeof(struct ocxl_fn), GFP_KERNEL); + if (!fn) + return NULL; + + INIT_LIST_HEAD(&fn->afu_list); + INIT_LIST_HEAD(&fn->pasid_list); + INIT_LIST_HEAD(&fn->actag_list); + return fn; +} + +static void free_function(struct ocxl_fn *fn) +{ + WARN_ON(!list_empty(&fn->afu_list)); + WARN_ON(!list_empty(&fn->pasid_list)); + kfree(fn); +} + +static void free_function_dev(struct device *dev) +{ + struct ocxl_fn *fn = to_ocxl_function(dev); + + free_function(fn); +} + +static int set_function_device(struct ocxl_fn *fn, struct pci_dev *dev) +{ + int rc; + + fn->dev.parent = &dev->dev; + fn->dev.release = free_function_dev; + rc = dev_set_name(&fn->dev, "ocxlfn.%s", dev_name(&dev->dev)); + if (rc) + return rc; + pci_set_drvdata(dev, fn); + return 0; +} + +static int assign_function_actag(struct ocxl_fn *fn) +{ + struct pci_dev *dev = to_pci_dev(fn->dev.parent); + u16 base, enabled, supported; + int rc; + + rc = ocxl_config_get_actag_info(dev, &base, &enabled, &supported); + if (rc) + return rc; + + fn->actag_base = base; + fn->actag_enabled = enabled; + fn->actag_supported = supported; + + ocxl_config_set_actag(dev, fn->config.dvsec_function_pos, + fn->actag_base, fn->actag_enabled); + dev_dbg(&fn->dev, "actag range starting at %d, enabled %d\n", + fn->actag_base, fn->actag_enabled); + return 0; +} + +static int set_function_pasid(struct ocxl_fn *fn) +{ + struct pci_dev *dev = to_pci_dev(fn->dev.parent); + int rc, desired_count, max_count; + + /* A function may not require any PASID */ + if (fn->config.max_pasid_log < 0) + return 0; + + rc = ocxl_config_get_pasid_info(dev, &max_count); + if (rc) + return rc; + + desired_count = 1 << fn->config.max_pasid_log; + + if (desired_count > max_count) { + dev_err(&fn->dev, + "Function requires more PASIDs than is available (%d vs. 
%d)\n", + desired_count, max_count); + return -ENOSPC; + } + + fn->pasid_base = 0; + return 0; +} + +static int configure_function(struct ocxl_fn *fn, struct pci_dev *dev) +{ + int rc; + + rc = pci_enable_device(dev); + if (rc) { + dev_err(&dev->dev, "pci_enable_device failed: %d\n", rc); + return rc; + } + + /* + * Once it has been confirmed to work on our hardware, we + * should reset the function, to force the adapter to restart + * from scratch. + * A function reset would also reset all its AFUs. + * + * Some hints for implementation: + * + * - there's not status bit to know when the reset is done. We + * should try reading the config space to know when it's + * done. + * - probably something like: + * Reset + * wait 100ms + * issue config read + * allow device up to 1 sec to return success on config + * read before declaring it broken + * + * Some shared logic on the card (CFG, TLX) won't be reset, so + * there's no guarantee that it will be enough. + */ + rc = ocxl_config_read_function(dev, &fn->config); + if (rc) + return rc; + + rc = set_function_device(fn, dev); + if (rc) + return rc; + + rc = assign_function_actag(fn); + if (rc) + return rc; + + rc = set_function_pasid(fn); + if (rc) + return rc; + + rc = ocxl_link_setup(dev, 0, &fn->link); + if (rc) + return rc; + + rc = ocxl_config_set_TL(dev, fn->config.dvsec_tl_pos); + if (rc) { + ocxl_link_release(dev, fn->link); + return rc; + } + return 0; +} + +static void deconfigure_function(struct ocxl_fn *fn) +{ + struct pci_dev *dev = to_pci_dev(fn->dev.parent); + + ocxl_link_release(dev, fn->link); + pci_disable_device(dev); +} + +struct ocxl_fn *init_function(struct pci_dev *dev) +{ + struct ocxl_fn *fn; + int rc; + + fn = alloc_function(dev); + if (!fn) + return ERR_PTR(-ENOMEM); + + rc = configure_function(fn, dev); + if (rc) { + free_function(fn); + return ERR_PTR(rc); + } + + rc = device_register(&fn->dev); + if (rc) { + deconfigure_function(fn); + put_device(&fn->dev); + return ERR_PTR(rc); + } + return fn; +} + +void remove_function(struct ocxl_fn *fn) +{ + deconfigure_function(fn); + device_unregister(&fn->dev); +} diff --git a/drivers/misc/ocxl/ocxl_internal.h b/drivers/misc/ocxl/ocxl_internal.h index 06fd98c989c8d..81086534dab57 100644 --- a/drivers/misc/ocxl/ocxl_internal.h +++ b/drivers/misc/ocxl/ocxl_internal.h @@ -150,4 +150,9 @@ int ocxl_afu_irq_set_fd(struct ocxl_context *ctx, u64 irq_offset, int eventfd); u64 ocxl_afu_irq_get_addr(struct ocxl_context *ctx, u64 irq_offset); +struct ocxl_fn *init_function(struct pci_dev *dev); +void remove_function(struct ocxl_fn *fn); +int init_afu(struct pci_dev *dev, struct ocxl_fn *fn, u8 afu_idx); +void remove_afu(struct ocxl_afu *afu); + #endif /* _OCXL_INTERNAL_H_ */ diff --git a/drivers/misc/ocxl/pci.c b/drivers/misc/ocxl/pci.c index 21f425472a82f..4ed7cb1a667f5 100644 --- a/drivers/misc/ocxl/pci.c +++ b/drivers/misc/ocxl/pci.c @@ -1,9 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ -// Copyright 2017 IBM Corp. +// Copyright 2019 IBM Corp. #include -#include -#include -#include #include "ocxl_internal.h" /* @@ -17,520 +14,6 @@ static const struct pci_device_id ocxl_pci_tbl[] = { }; MODULE_DEVICE_TABLE(pci, ocxl_pci_tbl); - -static struct ocxl_fn *ocxl_fn_get(struct ocxl_fn *fn) -{ - return (get_device(&fn->dev) == NULL) ? NULL : fn; -} - -static void ocxl_fn_put(struct ocxl_fn *fn) -{ - put_device(&fn->dev); -} - -struct ocxl_afu *ocxl_afu_get(struct ocxl_afu *afu) -{ - return (get_device(&afu->dev) == NULL) ? 
NULL : afu; -} - -void ocxl_afu_put(struct ocxl_afu *afu) -{ - put_device(&afu->dev); -} - -static struct ocxl_afu *alloc_afu(struct ocxl_fn *fn) -{ - struct ocxl_afu *afu; - - afu = kzalloc(sizeof(struct ocxl_afu), GFP_KERNEL); - if (!afu) - return NULL; - - mutex_init(&afu->contexts_lock); - mutex_init(&afu->afu_control_lock); - idr_init(&afu->contexts_idr); - afu->fn = fn; - ocxl_fn_get(fn); - return afu; -} - -static void free_afu(struct ocxl_afu *afu) -{ - idr_destroy(&afu->contexts_idr); - ocxl_fn_put(afu->fn); - kfree(afu); -} - -static void free_afu_dev(struct device *dev) -{ - struct ocxl_afu *afu = to_ocxl_afu(dev); - - ocxl_unregister_afu(afu); - free_afu(afu); -} - -static int set_afu_device(struct ocxl_afu *afu, const char *location) -{ - struct ocxl_fn *fn = afu->fn; - int rc; - - afu->dev.parent = &fn->dev; - afu->dev.release = free_afu_dev; - rc = dev_set_name(&afu->dev, "%s.%s.%hhu", afu->config.name, location, - afu->config.idx); - return rc; -} - -static int assign_afu_actag(struct ocxl_afu *afu, struct pci_dev *dev) -{ - struct ocxl_fn *fn = afu->fn; - int actag_count, actag_offset; - - /* - * if there were not enough actags for the function, each afu - * reduces its count as well - */ - actag_count = afu->config.actag_supported * - fn->actag_enabled / fn->actag_supported; - actag_offset = ocxl_actag_afu_alloc(fn, actag_count); - if (actag_offset < 0) { - dev_err(&afu->dev, "Can't allocate %d actags for AFU: %d\n", - actag_count, actag_offset); - return actag_offset; - } - afu->actag_base = fn->actag_base + actag_offset; - afu->actag_enabled = actag_count; - - ocxl_config_set_afu_actag(dev, afu->config.dvsec_afu_control_pos, - afu->actag_base, afu->actag_enabled); - dev_dbg(&afu->dev, "actag base=%d enabled=%d\n", - afu->actag_base, afu->actag_enabled); - return 0; -} - -static void reclaim_afu_actag(struct ocxl_afu *afu) -{ - struct ocxl_fn *fn = afu->fn; - int start_offset, size; - - start_offset = afu->actag_base - fn->actag_base; - size = afu->actag_enabled; - ocxl_actag_afu_free(afu->fn, start_offset, size); -} - -static int assign_afu_pasid(struct ocxl_afu *afu, struct pci_dev *dev) -{ - struct ocxl_fn *fn = afu->fn; - int pasid_count, pasid_offset; - - /* - * We only support the case where the function configuration - * requested enough PASIDs to cover all AFUs. 
- */ - pasid_count = 1 << afu->config.pasid_supported_log; - pasid_offset = ocxl_pasid_afu_alloc(fn, pasid_count); - if (pasid_offset < 0) { - dev_err(&afu->dev, "Can't allocate %d PASIDs for AFU: %d\n", - pasid_count, pasid_offset); - return pasid_offset; - } - afu->pasid_base = fn->pasid_base + pasid_offset; - afu->pasid_count = 0; - afu->pasid_max = pasid_count; - - ocxl_config_set_afu_pasid(dev, afu->config.dvsec_afu_control_pos, - afu->pasid_base, - afu->config.pasid_supported_log); - dev_dbg(&afu->dev, "PASID base=%d, enabled=%d\n", - afu->pasid_base, pasid_count); - return 0; -} - -static void reclaim_afu_pasid(struct ocxl_afu *afu) -{ - struct ocxl_fn *fn = afu->fn; - int start_offset, size; - - start_offset = afu->pasid_base - fn->pasid_base; - size = 1 << afu->config.pasid_supported_log; - ocxl_pasid_afu_free(afu->fn, start_offset, size); -} - -static int reserve_fn_bar(struct ocxl_fn *fn, int bar) -{ - struct pci_dev *dev = to_pci_dev(fn->dev.parent); - int rc, idx; - - if (bar != 0 && bar != 2 && bar != 4) - return -EINVAL; - - idx = bar >> 1; - if (fn->bar_used[idx]++ == 0) { - rc = pci_request_region(dev, bar, "ocxl"); - if (rc) - return rc; - } - return 0; -} - -static void release_fn_bar(struct ocxl_fn *fn, int bar) -{ - struct pci_dev *dev = to_pci_dev(fn->dev.parent); - int idx; - - if (bar != 0 && bar != 2 && bar != 4) - return; - - idx = bar >> 1; - if (--fn->bar_used[idx] == 0) - pci_release_region(dev, bar); - WARN_ON(fn->bar_used[idx] < 0); -} - -static int map_mmio_areas(struct ocxl_afu *afu, struct pci_dev *dev) -{ - int rc; - - rc = reserve_fn_bar(afu->fn, afu->config.global_mmio_bar); - if (rc) - return rc; - - rc = reserve_fn_bar(afu->fn, afu->config.pp_mmio_bar); - if (rc) { - release_fn_bar(afu->fn, afu->config.global_mmio_bar); - return rc; - } - - afu->global_mmio_start = - pci_resource_start(dev, afu->config.global_mmio_bar) + - afu->config.global_mmio_offset; - afu->pp_mmio_start = - pci_resource_start(dev, afu->config.pp_mmio_bar) + - afu->config.pp_mmio_offset; - - afu->global_mmio_ptr = ioremap(afu->global_mmio_start, - afu->config.global_mmio_size); - if (!afu->global_mmio_ptr) { - release_fn_bar(afu->fn, afu->config.pp_mmio_bar); - release_fn_bar(afu->fn, afu->config.global_mmio_bar); - dev_err(&dev->dev, "Error mapping global mmio area\n"); - return -ENOMEM; - } - - /* - * Leave an empty page between the per-process mmio area and - * the AFU interrupt mappings - */ - afu->irq_base_offset = afu->config.pp_mmio_stride + PAGE_SIZE; - return 0; -} - -static void unmap_mmio_areas(struct ocxl_afu *afu) -{ - if (afu->global_mmio_ptr) { - iounmap(afu->global_mmio_ptr); - afu->global_mmio_ptr = NULL; - } - afu->global_mmio_start = 0; - afu->pp_mmio_start = 0; - release_fn_bar(afu->fn, afu->config.pp_mmio_bar); - release_fn_bar(afu->fn, afu->config.global_mmio_bar); -} - -static int configure_afu(struct ocxl_afu *afu, u8 afu_idx, struct pci_dev *dev) -{ - int rc; - - rc = ocxl_config_read_afu(dev, &afu->fn->config, &afu->config, afu_idx); - if (rc) - return rc; - - rc = set_afu_device(afu, dev_name(&dev->dev)); - if (rc) - return rc; - - rc = assign_afu_actag(afu, dev); - if (rc) - return rc; - - rc = assign_afu_pasid(afu, dev); - if (rc) { - reclaim_afu_actag(afu); - return rc; - } - - rc = map_mmio_areas(afu, dev); - if (rc) { - reclaim_afu_pasid(afu); - reclaim_afu_actag(afu); - return rc; - } - return 0; -} - -static void deconfigure_afu(struct ocxl_afu *afu) -{ - unmap_mmio_areas(afu); - reclaim_afu_pasid(afu); - reclaim_afu_actag(afu); -} - -static int 
activate_afu(struct pci_dev *dev, struct ocxl_afu *afu) -{ - int rc; - - ocxl_config_set_afu_state(dev, afu->config.dvsec_afu_control_pos, 1); - /* - * Char device creation is the last step, as processes can - * call our driver immediately, so all our inits must be finished. - */ - rc = ocxl_create_cdev(afu); - if (rc) - return rc; - return 0; -} - -static void deactivate_afu(struct ocxl_afu *afu) -{ - struct pci_dev *dev = to_pci_dev(afu->fn->dev.parent); - - ocxl_destroy_cdev(afu); - ocxl_config_set_afu_state(dev, afu->config.dvsec_afu_control_pos, 0); -} - -static int init_afu(struct pci_dev *dev, struct ocxl_fn *fn, u8 afu_idx) -{ - int rc; - struct ocxl_afu *afu; - - afu = alloc_afu(fn); - if (!afu) - return -ENOMEM; - - rc = configure_afu(afu, afu_idx, dev); - if (rc) { - free_afu(afu); - return rc; - } - - rc = ocxl_register_afu(afu); - if (rc) - goto err; - - rc = ocxl_sysfs_add_afu(afu); - if (rc) - goto err; - - rc = activate_afu(dev, afu); - if (rc) - goto err_sys; - - list_add_tail(&afu->list, &fn->afu_list); - return 0; - -err_sys: - ocxl_sysfs_remove_afu(afu); -err: - deconfigure_afu(afu); - device_unregister(&afu->dev); - return rc; -} - -static void remove_afu(struct ocxl_afu *afu) -{ - list_del(&afu->list); - ocxl_context_detach_all(afu); - deactivate_afu(afu); - ocxl_sysfs_remove_afu(afu); - deconfigure_afu(afu); - device_unregister(&afu->dev); -} - -static struct ocxl_fn *alloc_function(struct pci_dev *dev) -{ - struct ocxl_fn *fn; - - fn = kzalloc(sizeof(struct ocxl_fn), GFP_KERNEL); - if (!fn) - return NULL; - - INIT_LIST_HEAD(&fn->afu_list); - INIT_LIST_HEAD(&fn->pasid_list); - INIT_LIST_HEAD(&fn->actag_list); - return fn; -} - -static void free_function(struct ocxl_fn *fn) -{ - WARN_ON(!list_empty(&fn->afu_list)); - WARN_ON(!list_empty(&fn->pasid_list)); - kfree(fn); -} - -static void free_function_dev(struct device *dev) -{ - struct ocxl_fn *fn = to_ocxl_function(dev); - - free_function(fn); -} - -static int set_function_device(struct ocxl_fn *fn, struct pci_dev *dev) -{ - int rc; - - fn->dev.parent = &dev->dev; - fn->dev.release = free_function_dev; - rc = dev_set_name(&fn->dev, "ocxlfn.%s", dev_name(&dev->dev)); - if (rc) - return rc; - pci_set_drvdata(dev, fn); - return 0; -} - -static int assign_function_actag(struct ocxl_fn *fn) -{ - struct pci_dev *dev = to_pci_dev(fn->dev.parent); - u16 base, enabled, supported; - int rc; - - rc = ocxl_config_get_actag_info(dev, &base, &enabled, &supported); - if (rc) - return rc; - - fn->actag_base = base; - fn->actag_enabled = enabled; - fn->actag_supported = supported; - - ocxl_config_set_actag(dev, fn->config.dvsec_function_pos, - fn->actag_base, fn->actag_enabled); - dev_dbg(&fn->dev, "actag range starting at %d, enabled %d\n", - fn->actag_base, fn->actag_enabled); - return 0; -} - -static int set_function_pasid(struct ocxl_fn *fn) -{ - struct pci_dev *dev = to_pci_dev(fn->dev.parent); - int rc, desired_count, max_count; - - /* A function may not require any PASID */ - if (fn->config.max_pasid_log < 0) - return 0; - - rc = ocxl_config_get_pasid_info(dev, &max_count); - if (rc) - return rc; - - desired_count = 1 << fn->config.max_pasid_log; - - if (desired_count > max_count) { - dev_err(&fn->dev, - "Function requires more PASIDs than is available (%d vs. 
%d)\n", - desired_count, max_count); - return -ENOSPC; - } - - fn->pasid_base = 0; - return 0; -} - -static int configure_function(struct ocxl_fn *fn, struct pci_dev *dev) -{ - int rc; - - rc = pci_enable_device(dev); - if (rc) { - dev_err(&dev->dev, "pci_enable_device failed: %d\n", rc); - return rc; - } - - /* - * Once it has been confirmed to work on our hardware, we - * should reset the function, to force the adapter to restart - * from scratch. - * A function reset would also reset all its AFUs. - * - * Some hints for implementation: - * - * - there's not status bit to know when the reset is done. We - * should try reading the config space to know when it's - * done. - * - probably something like: - * Reset - * wait 100ms - * issue config read - * allow device up to 1 sec to return success on config - * read before declaring it broken - * - * Some shared logic on the card (CFG, TLX) won't be reset, so - * there's no guarantee that it will be enough. - */ - rc = ocxl_config_read_function(dev, &fn->config); - if (rc) - return rc; - - rc = set_function_device(fn, dev); - if (rc) - return rc; - - rc = assign_function_actag(fn); - if (rc) - return rc; - - rc = set_function_pasid(fn); - if (rc) - return rc; - - rc = ocxl_link_setup(dev, 0, &fn->link); - if (rc) - return rc; - - rc = ocxl_config_set_TL(dev, fn->config.dvsec_tl_pos); - if (rc) { - ocxl_link_release(dev, fn->link); - return rc; - } - return 0; -} - -static void deconfigure_function(struct ocxl_fn *fn) -{ - struct pci_dev *dev = to_pci_dev(fn->dev.parent); - - ocxl_link_release(dev, fn->link); - pci_disable_device(dev); -} - -static struct ocxl_fn *init_function(struct pci_dev *dev) -{ - struct ocxl_fn *fn; - int rc; - - fn = alloc_function(dev); - if (!fn) - return ERR_PTR(-ENOMEM); - - rc = configure_function(fn, dev); - if (rc) { - free_function(fn); - return ERR_PTR(rc); - } - - rc = device_register(&fn->dev); - if (rc) { - deconfigure_function(fn); - put_device(&fn->dev); - return ERR_PTR(rc); - } - return fn; -} - -static void remove_function(struct ocxl_fn *fn) -{ - deconfigure_function(fn); - device_unregister(&fn->dev); -} - static int ocxl_probe(struct pci_dev *dev, const struct pci_device_id *id) { int rc, afu_count = 0; From 2f7d3d1453813cda13e5bace24093eac22cb5e61 Mon Sep 17 00:00:00 2001 From: Alastair D'Silva Date: Wed, 27 Mar 2019 16:31:31 +1100 Subject: [PATCH 190/205] ocxl: Don't pass pci_dev around This data is already available in a struct Signed-off-by: Alastair D'Silva Acked-by: Frederic Barrat Acked-by: Andrew Donnellan Signed-off-by: Michael Ellerman --- drivers/misc/ocxl/core.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/drivers/misc/ocxl/core.c b/drivers/misc/ocxl/core.c index 1a4411b72d35f..2f2fe12eac1ed 100644 --- a/drivers/misc/ocxl/core.c +++ b/drivers/misc/ocxl/core.c @@ -66,10 +66,11 @@ static int set_afu_device(struct ocxl_afu *afu, const char *location) return rc; } -static int assign_afu_actag(struct ocxl_afu *afu, struct pci_dev *dev) +static int assign_afu_actag(struct ocxl_afu *afu) { struct ocxl_fn *fn = afu->fn; int actag_count, actag_offset; + struct pci_dev *pci_dev = to_pci_dev(fn->dev.parent); /* * if there were not enough actags for the function, each afu @@ -79,16 +80,16 @@ static int assign_afu_actag(struct ocxl_afu *afu, struct pci_dev *dev) fn->actag_enabled / fn->actag_supported; actag_offset = ocxl_actag_afu_alloc(fn, actag_count); if (actag_offset < 0) { - dev_err(&afu->dev, "Can't allocate %d actags for AFU: 
%d\n", + dev_err(&pci_dev->dev, "Can't allocate %d actags for AFU: %d\n", actag_count, actag_offset); return actag_offset; } afu->actag_base = fn->actag_base + actag_offset; afu->actag_enabled = actag_count; - ocxl_config_set_afu_actag(dev, afu->config.dvsec_afu_control_pos, + ocxl_config_set_afu_actag(pci_dev, afu->config.dvsec_afu_control_pos, afu->actag_base, afu->actag_enabled); - dev_dbg(&afu->dev, "actag base=%d enabled=%d\n", + dev_dbg(&pci_dev->dev, "actag base=%d enabled=%d\n", afu->actag_base, afu->actag_enabled); return 0; } @@ -103,10 +104,11 @@ static void reclaim_afu_actag(struct ocxl_afu *afu) ocxl_actag_afu_free(afu->fn, start_offset, size); } -static int assign_afu_pasid(struct ocxl_afu *afu, struct pci_dev *dev) +static int assign_afu_pasid(struct ocxl_afu *afu) { struct ocxl_fn *fn = afu->fn; int pasid_count, pasid_offset; + struct pci_dev *pci_dev = to_pci_dev(fn->dev.parent); /* * We only support the case where the function configuration @@ -115,7 +117,7 @@ static int assign_afu_pasid(struct ocxl_afu *afu, struct pci_dev *dev) pasid_count = 1 << afu->config.pasid_supported_log; pasid_offset = ocxl_pasid_afu_alloc(fn, pasid_count); if (pasid_offset < 0) { - dev_err(&afu->dev, "Can't allocate %d PASIDs for AFU: %d\n", + dev_err(&pci_dev->dev, "Can't allocate %d PASIDs for AFU: %d\n", pasid_count, pasid_offset); return pasid_offset; } @@ -123,10 +125,10 @@ static int assign_afu_pasid(struct ocxl_afu *afu, struct pci_dev *dev) afu->pasid_count = 0; afu->pasid_max = pasid_count; - ocxl_config_set_afu_pasid(dev, afu->config.dvsec_afu_control_pos, + ocxl_config_set_afu_pasid(pci_dev, afu->config.dvsec_afu_control_pos, afu->pasid_base, afu->config.pasid_supported_log); - dev_dbg(&afu->dev, "PASID base=%d, enabled=%d\n", + dev_dbg(&pci_dev->dev, "PASID base=%d, enabled=%d\n", afu->pasid_base, pasid_count); return 0; } @@ -172,9 +174,10 @@ static void release_fn_bar(struct ocxl_fn *fn, int bar) WARN_ON(fn->bar_used[idx] < 0); } -static int map_mmio_areas(struct ocxl_afu *afu, struct pci_dev *dev) +static int map_mmio_areas(struct ocxl_afu *afu) { int rc; + struct pci_dev *pci_dev = to_pci_dev(afu->fn->dev.parent); rc = reserve_fn_bar(afu->fn, afu->config.global_mmio_bar); if (rc) @@ -187,10 +190,10 @@ static int map_mmio_areas(struct ocxl_afu *afu, struct pci_dev *dev) } afu->global_mmio_start = - pci_resource_start(dev, afu->config.global_mmio_bar) + + pci_resource_start(pci_dev, afu->config.global_mmio_bar) + afu->config.global_mmio_offset; afu->pp_mmio_start = - pci_resource_start(dev, afu->config.pp_mmio_bar) + + pci_resource_start(pci_dev, afu->config.pp_mmio_bar) + afu->config.pp_mmio_offset; afu->global_mmio_ptr = ioremap(afu->global_mmio_start, @@ -198,7 +201,7 @@ static int map_mmio_areas(struct ocxl_afu *afu, struct pci_dev *dev) if (!afu->global_mmio_ptr) { release_fn_bar(afu->fn, afu->config.pp_mmio_bar); release_fn_bar(afu->fn, afu->config.global_mmio_bar); - dev_err(&dev->dev, "Error mapping global mmio area\n"); + dev_err(&pci_dev->dev, "Error mapping global mmio area\n"); return -ENOMEM; } @@ -234,17 +237,17 @@ static int configure_afu(struct ocxl_afu *afu, u8 afu_idx, struct pci_dev *dev) if (rc) return rc; - rc = assign_afu_actag(afu, dev); + rc = assign_afu_actag(afu); if (rc) return rc; - rc = assign_afu_pasid(afu, dev); + rc = assign_afu_pasid(afu); if (rc) { reclaim_afu_actag(afu); return rc; } - rc = map_mmio_areas(afu, dev); + rc = map_mmio_areas(afu); if (rc) { reclaim_afu_pasid(afu); reclaim_afu_actag(afu); @@ -331,7 +334,7 @@ void remove_afu(struct 
ocxl_afu *afu) device_unregister(&afu->dev); } -static struct ocxl_fn *alloc_function(struct pci_dev *dev) +static struct ocxl_fn *alloc_function(void) { struct ocxl_fn *fn; @@ -342,6 +345,7 @@ static struct ocxl_fn *alloc_function(struct pci_dev *dev) INIT_LIST_HEAD(&fn->afu_list); INIT_LIST_HEAD(&fn->pasid_list); INIT_LIST_HEAD(&fn->actag_list); + + return fn; } @@ -491,7 +495,7 @@ struct ocxl_fn *init_function(struct pci_dev *dev) struct ocxl_fn *fn; int rc; - fn = alloc_function(dev); + fn = alloc_function(); if (!fn) return ERR_PTR(-ENOMEM); From 75ca758adbafc81804c39b2c200ecdc819a6c042 Mon Sep 17 00:00:00 2001 From: Alastair D'Silva Date: Wed, 27 Mar 2019 16:31:32 +1100 Subject: [PATCH 191/205] ocxl: Create a clear delineation between ocxl backend & frontend The OCXL driver contains both frontend code for interacting with userspace and backend code for interacting with the hardware. This patch separates the backend code from the frontend so that it can be used by other device drivers that communicate via OpenCAPI. Relocate dev, cdev & sysfs files to the frontend code to allow external drivers to maintain their own devices. Reference counting on the device in the backend is replaced with kref counting. Move file & sysfs layer initialisation from core.c (backend) to pci.c (frontend). Create an ocxl_function-oriented interface for initialising devices & enumerating AFUs. Signed-off-by: Alastair D'Silva Acked-by: Frederic Barrat Signed-off-by: Michael Ellerman --- drivers/misc/ocxl/context.c | 2 +- drivers/misc/ocxl/core.c | 201 +++++++++++++++++++----------- drivers/misc/ocxl/file.c | 142 ++++++++++++++------- drivers/misc/ocxl/ocxl_internal.h | 31 ++--- drivers/misc/ocxl/pci.c | 56 ++++----- drivers/misc/ocxl/sysfs.c | 54 ++++---- include/misc/ocxl.h | 119 ++++++++++++++++-- 7 files changed, 409 insertions(+), 196 deletions(-) diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c index c10a940e3b387..c73a859d2224f 100644 --- a/drivers/misc/ocxl/context.c +++ b/drivers/misc/ocxl/context.c @@ -238,7 +238,7 @@ int ocxl_context_detach(struct ocxl_context *ctx) } rc = ocxl_link_remove_pe(ctx->afu->fn->link, ctx->pasid); if (rc) { - dev_warn(&ctx->afu->dev, + dev_warn(&dev->dev, "Couldn't remove PE entry cleanly: %d\n", rc); } return 0; diff --git a/drivers/misc/ocxl/core.c b/drivers/misc/ocxl/core.c index 2f2fe12eac1ed..b7a09b21ab36f 100644 --- a/drivers/misc/ocxl/core.c +++ b/drivers/misc/ocxl/core.c @@ -13,16 +13,6 @@ static void ocxl_fn_put(struct ocxl_fn *fn) put_device(&fn->dev); } -struct ocxl_afu *ocxl_afu_get(struct ocxl_afu *afu) -{ - return (get_device(&afu->dev) == NULL) ?
NULL : afu; -} - -void ocxl_afu_put(struct ocxl_afu *afu) -{ - put_device(&afu->dev); -} - static struct ocxl_afu *alloc_afu(struct ocxl_fn *fn) { struct ocxl_afu *afu; @@ -31,6 +21,7 @@ static struct ocxl_afu *alloc_afu(struct ocxl_fn *fn) if (!afu) return NULL; + kref_init(&afu->kref); mutex_init(&afu->contexts_lock); mutex_init(&afu->afu_control_lock); idr_init(&afu->contexts_idr); @@ -39,32 +30,26 @@ static struct ocxl_afu *alloc_afu(struct ocxl_fn *fn) return afu; } -static void free_afu(struct ocxl_afu *afu) +static void free_afu(struct kref *kref) { + struct ocxl_afu *afu = container_of(kref, struct ocxl_afu, kref); + idr_destroy(&afu->contexts_idr); ocxl_fn_put(afu->fn); kfree(afu); } -static void free_afu_dev(struct device *dev) +void ocxl_afu_get(struct ocxl_afu *afu) { - struct ocxl_afu *afu = to_ocxl_afu(dev); - - ocxl_unregister_afu(afu); - free_afu(afu); + kref_get(&afu->kref); } +EXPORT_SYMBOL_GPL(ocxl_afu_get); -static int set_afu_device(struct ocxl_afu *afu, const char *location) +void ocxl_afu_put(struct ocxl_afu *afu) { - struct ocxl_fn *fn = afu->fn; - int rc; - - afu->dev.parent = &fn->dev; - afu->dev.release = free_afu_dev; - rc = dev_set_name(&afu->dev, "%s.%s.%hhu", afu->config.name, location, - afu->config.idx); - return rc; + kref_put(&afu->kref, free_afu); } +EXPORT_SYMBOL_GPL(ocxl_afu_put); static int assign_afu_actag(struct ocxl_afu *afu) { @@ -233,27 +218,25 @@ static int configure_afu(struct ocxl_afu *afu, u8 afu_idx, struct pci_dev *dev) if (rc) return rc; - rc = set_afu_device(afu, dev_name(&dev->dev)); - if (rc) - return rc; - rc = assign_afu_actag(afu); if (rc) return rc; rc = assign_afu_pasid(afu); - if (rc) { - reclaim_afu_actag(afu); - return rc; - } + if (rc) + goto err_free_actag; rc = map_mmio_areas(afu); - if (rc) { - reclaim_afu_pasid(afu); - reclaim_afu_actag(afu); - return rc; - } + if (rc) + goto err_free_pasid; + return 0; + +err_free_pasid: + reclaim_afu_pasid(afu); +err_free_actag: + reclaim_afu_actag(afu); + return rc; } static void deconfigure_afu(struct ocxl_afu *afu) @@ -265,16 +248,8 @@ static void deconfigure_afu(struct ocxl_afu *afu) static int activate_afu(struct pci_dev *dev, struct ocxl_afu *afu) { - int rc; - ocxl_config_set_afu_state(dev, afu->config.dvsec_afu_control_pos, 1); - /* - * Char device creation is the last step, as processes can - * call our driver immediately, so all our inits must be finished. 
- */ - rc = ocxl_create_cdev(afu); - if (rc) - return rc; + return 0; } @@ -282,11 +257,10 @@ static void deactivate_afu(struct ocxl_afu *afu) { struct pci_dev *dev = to_pci_dev(afu->fn->dev.parent); - ocxl_destroy_cdev(afu); ocxl_config_set_afu_state(dev, afu->config.dvsec_afu_control_pos, 0); } -int init_afu(struct pci_dev *dev, struct ocxl_fn *fn, u8 afu_idx) +static int init_afu(struct pci_dev *dev, struct ocxl_fn *fn, u8 afu_idx) { int rc; struct ocxl_afu *afu; @@ -297,41 +271,29 @@ int init_afu(struct pci_dev *dev, struct ocxl_fn *fn, u8 afu_idx) rc = configure_afu(afu, afu_idx, dev); if (rc) { - free_afu(afu); + ocxl_afu_put(afu); return rc; } - rc = ocxl_register_afu(afu); - if (rc) - goto err; - - rc = ocxl_sysfs_add_afu(afu); - if (rc) - goto err; - rc = activate_afu(dev, afu); - if (rc) - goto err_sys; + if (rc) { + deconfigure_afu(afu); + ocxl_afu_put(afu); + return rc; + } list_add_tail(&afu->list, &fn->afu_list); - return 0; -err_sys: - ocxl_sysfs_remove_afu(afu); -err: - deconfigure_afu(afu); - device_unregister(&afu->dev); - return rc; + return 0; } -void remove_afu(struct ocxl_afu *afu) +static void remove_afu(struct ocxl_afu *afu) { list_del(&afu->list); ocxl_context_detach_all(afu); deactivate_afu(afu); - ocxl_sysfs_remove_afu(afu); deconfigure_afu(afu); - device_unregister(&afu->dev); + ocxl_afu_put(afu); // matches the implicit get in alloc_afu } static struct ocxl_fn *alloc_function(void) @@ -358,7 +320,7 @@ static void free_function(struct ocxl_fn *fn) static void free_function_dev(struct device *dev) { - struct ocxl_fn *fn = to_ocxl_function(dev); + struct ocxl_fn *fn = container_of(dev, struct ocxl_fn, dev); free_function(fn); } @@ -372,7 +334,6 @@ static int set_function_device(struct ocxl_fn *fn, struct pci_dev *dev) rc = dev_set_name(&fn->dev, "ocxlfn.%s", dev_name(&dev->dev)); if (rc) return rc; - pci_set_drvdata(dev, fn); return 0; } @@ -490,7 +451,7 @@ static void deconfigure_function(struct ocxl_fn *fn) pci_disable_device(dev); } -struct ocxl_fn *init_function(struct pci_dev *dev) +static struct ocxl_fn *init_function(struct pci_dev *dev) { struct ocxl_fn *fn; int rc; @@ -514,8 +475,100 @@ struct ocxl_fn *init_function(struct pci_dev *dev) return fn; } -void remove_function(struct ocxl_fn *fn) +// Device detection & initialisation + +struct ocxl_fn *ocxl_function_open(struct pci_dev *dev) +{ + int rc, afu_count = 0; + u8 afu; + struct ocxl_fn *fn; + + if (!radix_enabled()) { + dev_err(&dev->dev, "Unsupported memory model (hash)\n"); + return ERR_PTR(-ENODEV); + } + + fn = init_function(dev); + if (IS_ERR(fn)) { + dev_err(&dev->dev, "function init failed: %li\n", + PTR_ERR(fn)); + return fn; + } + + for (afu = 0; afu <= fn->config.max_afu_index; afu++) { + rc = ocxl_config_check_afu_index(dev, &fn->config, afu); + if (rc > 0) { + rc = init_afu(dev, fn, afu); + if (rc) { + dev_err(&dev->dev, + "Can't initialize AFU index %d\n", afu); + continue; + } + afu_count++; + } + } + dev_info(&dev->dev, "%d AFU(s) configured\n", afu_count); + return fn; +} +EXPORT_SYMBOL_GPL(ocxl_function_open); + +struct list_head *ocxl_function_afu_list(struct ocxl_fn *fn) +{ + return &fn->afu_list; +} +EXPORT_SYMBOL_GPL(ocxl_function_afu_list); + +struct ocxl_afu *ocxl_function_fetch_afu(struct ocxl_fn *fn, u8 afu_idx) +{ + struct ocxl_afu *afu; + + list_for_each_entry(afu, &fn->afu_list, list) { + if (afu->config.idx == afu_idx) + return afu; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(ocxl_function_fetch_afu); + +const struct ocxl_fn_config *ocxl_function_config(struct ocxl_fn *fn) 
{ + return &fn->config; +} +EXPORT_SYMBOL_GPL(ocxl_function_config); + +void ocxl_function_close(struct ocxl_fn *fn) +{ + struct ocxl_afu *afu, *tmp; + + list_for_each_entry_safe(afu, tmp, &fn->afu_list, list) { + remove_afu(afu); + } + deconfigure_function(fn); device_unregister(&fn->dev); } +EXPORT_SYMBOL_GPL(ocxl_function_close); + +// AFU Metadata + +struct ocxl_afu_config *ocxl_afu_config(struct ocxl_afu *afu) +{ + return &afu->config; +} +EXPORT_SYMBOL_GPL(ocxl_afu_config); + +void ocxl_afu_set_private(struct ocxl_afu *afu, void *private) +{ + afu->private = private; +} +EXPORT_SYMBOL_GPL(ocxl_afu_set_private); + +void *ocxl_afu_get_private(struct ocxl_afu *afu) +{ + if (afu) + return afu->private; + + return NULL; +} +EXPORT_SYMBOL_GPL(ocxl_afu_get_private); diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c index 009e09b7ded5e..7a38ea5af9db1 100644 --- a/drivers/misc/ocxl/file.c +++ b/drivers/misc/ocxl/file.c @@ -17,70 +17,60 @@ static struct class *ocxl_class; static struct mutex minors_idr_lock; static struct idr minors_idr; -static struct ocxl_afu *find_and_get_afu(dev_t devno) +static struct ocxl_file_info *find_file_info(dev_t devno) { - struct ocxl_afu *afu; - int afu_minor; + struct ocxl_file_info *info; - afu_minor = MINOR(devno); /* * We don't declare an RCU critical section here, as our AFU * is protected by a reference counter on the device. By the time the - * minor number of a device is removed from the idr, the ref count of + * info reference is removed from the idr, the ref count of * the device is already at 0, so no user API will access that AFU and * this function can't return it. */ - afu = idr_find(&minors_idr, afu_minor); - if (afu) - ocxl_afu_get(afu); - return afu; + info = idr_find(&minors_idr, MINOR(devno)); + return info; } -static int allocate_afu_minor(struct ocxl_afu *afu) +static int allocate_minor(struct ocxl_file_info *info) { int minor; mutex_lock(&minors_idr_lock); - minor = idr_alloc(&minors_idr, afu, 0, OCXL_NUM_MINORS, GFP_KERNEL); + minor = idr_alloc(&minors_idr, info, 0, OCXL_NUM_MINORS, GFP_KERNEL); mutex_unlock(&minors_idr_lock); return minor; } -static void free_afu_minor(struct ocxl_afu *afu) +static void free_minor(struct ocxl_file_info *info) { mutex_lock(&minors_idr_lock); - idr_remove(&minors_idr, MINOR(afu->dev.devt)); + idr_remove(&minors_idr, MINOR(info->dev.devt)); mutex_unlock(&minors_idr_lock); } static int afu_open(struct inode *inode, struct file *file) { - struct ocxl_afu *afu; + struct ocxl_file_info *info; struct ocxl_context *ctx; int rc; pr_debug("%s for device %x\n", __func__, inode->i_rdev); - afu = find_and_get_afu(inode->i_rdev); - if (!afu) + info = find_file_info(inode->i_rdev); + if (!info) return -ENODEV; ctx = ocxl_context_alloc(); - if (!ctx) { - rc = -ENOMEM; - goto put_afu; - } + if (!ctx) + return -ENOMEM; - rc = ocxl_context_init(ctx, afu, inode->i_mapping); + rc = ocxl_context_init(ctx, info->afu, inode->i_mapping); if (rc) - goto put_afu; + return rc; + file->private_data = ctx; - ocxl_afu_put(afu); return 0; - -put_afu: - ocxl_afu_put(afu); - return rc; } static long afu_ioctl_attach(struct ocxl_context *ctx, @@ -204,11 +194,16 @@ static long afu_ioctl(struct file *file, unsigned int cmd, struct ocxl_ioctl_irq_fd irq_fd; u64 irq_offset; long rc; + bool closed; pr_debug("%s for context %d, command %s\n", __func__, ctx->pasid, CMD_STR(cmd)); - if (ctx->status == CLOSED) + mutex_lock(&ctx->status_mutex); + closed = (ctx->status == CLOSED); + mutex_unlock(&ctx->status_mutex); + + if (closed) 
return -EIO; switch (cmd) { @@ -468,39 +463,102 @@ static const struct file_operations ocxl_afu_fops = { .release = afu_release, }; -int ocxl_create_cdev(struct ocxl_afu *afu) +// Free the info struct +static void info_release(struct device *dev) +{ + struct ocxl_file_info *info = container_of(dev, struct ocxl_file_info, dev); + + free_minor(info); + ocxl_afu_put(info->afu); + kfree(info); +} + +static int ocxl_file_make_visible(struct ocxl_file_info *info) { int rc; - cdev_init(&afu->cdev, &ocxl_afu_fops); - rc = cdev_add(&afu->cdev, afu->dev.devt, 1); + cdev_init(&info->cdev, &ocxl_afu_fops); + rc = cdev_add(&info->cdev, info->dev.devt, 1); if (rc) { - dev_err(&afu->dev, "Unable to add afu char device: %d\n", rc); + dev_err(&info->dev, "Unable to add afu char device: %d\n", rc); return rc; } + return 0; } -void ocxl_destroy_cdev(struct ocxl_afu *afu) +static void ocxl_file_make_invisible(struct ocxl_file_info *info) { - cdev_del(&afu->cdev); + cdev_del(&info->cdev); } -int ocxl_register_afu(struct ocxl_afu *afu) +int ocxl_file_register_afu(struct ocxl_afu *afu) { int minor; + int rc; + struct ocxl_file_info *info; + struct ocxl_fn *fn = afu->fn; + struct pci_dev *pci_dev = to_pci_dev(fn->dev.parent); + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (info == NULL) + return -ENOMEM; - minor = allocate_afu_minor(afu); - if (minor < 0) + minor = allocate_minor(info); + if (minor < 0) { + kfree(info); return minor; - afu->dev.devt = MKDEV(MAJOR(ocxl_dev), minor); - afu->dev.class = ocxl_class; - return device_register(&afu->dev); + } + + info->dev.parent = &fn->dev; + info->dev.devt = MKDEV(MAJOR(ocxl_dev), minor); + info->dev.class = ocxl_class; + info->dev.release = info_release; + + info->afu = afu; + ocxl_afu_get(afu); + + rc = dev_set_name(&info->dev, "%s.%s.%hhu", + afu->config.name, dev_name(&pci_dev->dev), afu->config.idx); + if (rc) + goto err_put; + + rc = device_register(&info->dev); + if (rc) + goto err_put; + + rc = ocxl_sysfs_register_afu(info); + if (rc) + goto err_unregister; + + rc = ocxl_file_make_visible(info); + if (rc) + goto err_unregister; + + ocxl_afu_set_private(afu, info); + + return 0; + +err_unregister: + ocxl_sysfs_unregister_afu(info); // safe to call even if register failed + device_unregister(&info->dev); +err_put: + ocxl_afu_put(afu); + free_minor(info); + kfree(info); + return rc; } -void ocxl_unregister_afu(struct ocxl_afu *afu) +void ocxl_file_unregister_afu(struct ocxl_afu *afu) { - free_afu_minor(afu); + struct ocxl_file_info *info = ocxl_afu_get_private(afu); + + if (!info) + return; + + ocxl_file_make_invisible(info); + ocxl_sysfs_unregister_afu(info); + device_unregister(&info->dev); } static char *ocxl_devnode(struct device *dev, umode_t *mode) diff --git a/drivers/misc/ocxl/ocxl_internal.h b/drivers/misc/ocxl/ocxl_internal.h index 81086534dab57..53b6c64a1bf04 100644 --- a/drivers/misc/ocxl/ocxl_internal.h +++ b/drivers/misc/ocxl/ocxl_internal.h @@ -11,9 +11,6 @@ #define MAX_IRQ_PER_LINK 2000 #define MAX_IRQ_PER_CONTEXT MAX_IRQ_PER_LINK -#define to_ocxl_function(d) container_of(d, struct ocxl_fn, dev) -#define to_ocxl_afu(d) container_of(d, struct ocxl_afu, dev) - extern struct pci_driver ocxl_pci_driver; struct ocxl_fn { @@ -30,11 +27,17 @@ struct ocxl_fn { void *link; }; +struct ocxl_file_info { + struct ocxl_afu *afu; + struct device dev; + struct cdev cdev; + struct bin_attribute attr_global_mmio; +}; + struct ocxl_afu { + struct kref kref; struct ocxl_fn *fn; struct list_head list; - struct device dev; - struct cdev cdev; struct 
ocxl_afu_config config; int pasid_base; int pasid_count; /* opened contexts */ @@ -48,7 +51,7 @@ struct ocxl_afu { u64 irq_base_offset; void __iomem *global_mmio_ptr; u64 pp_mmio_start; - struct bin_attribute attr_global_mmio; + void *private; }; enum ocxl_context_status { @@ -91,13 +94,10 @@ struct ocxl_process_element { __be32 software_state; }; -struct ocxl_afu *ocxl_afu_get(struct ocxl_afu *afu); -void ocxl_afu_put(struct ocxl_afu *afu); - int ocxl_create_cdev(struct ocxl_afu *afu); void ocxl_destroy_cdev(struct ocxl_afu *afu); -int ocxl_register_afu(struct ocxl_afu *afu); -void ocxl_unregister_afu(struct ocxl_afu *afu); +int ocxl_file_register_afu(struct ocxl_afu *afu); +void ocxl_file_unregister_afu(struct ocxl_afu *afu); int ocxl_file_init(void); void ocxl_file_exit(void); @@ -140,8 +140,8 @@ int ocxl_context_detach(struct ocxl_context *ctx); void ocxl_context_detach_all(struct ocxl_afu *afu); void ocxl_context_free(struct ocxl_context *ctx); -int ocxl_sysfs_add_afu(struct ocxl_afu *afu); -void ocxl_sysfs_remove_afu(struct ocxl_afu *afu); +int ocxl_sysfs_register_afu(struct ocxl_file_info *info); +void ocxl_sysfs_unregister_afu(struct ocxl_file_info *info); int ocxl_afu_irq_alloc(struct ocxl_context *ctx, u64 *irq_offset); int ocxl_afu_irq_free(struct ocxl_context *ctx, u64 irq_offset); @@ -150,9 +150,4 @@ int ocxl_afu_irq_set_fd(struct ocxl_context *ctx, u64 irq_offset, int eventfd); u64 ocxl_afu_irq_get_addr(struct ocxl_context *ctx, u64 irq_offset); -struct ocxl_fn *init_function(struct pci_dev *dev); -void remove_function(struct ocxl_fn *fn); -int init_afu(struct pci_dev *dev, struct ocxl_fn *fn, u8 afu_idx); -void remove_afu(struct ocxl_afu *afu); - #endif /* _OCXL_INTERNAL_H_ */ diff --git a/drivers/misc/ocxl/pci.c b/drivers/misc/ocxl/pci.c index 4ed7cb1a667f5..f2a3ef4b9bdde 100644 --- a/drivers/misc/ocxl/pci.c +++ b/drivers/misc/ocxl/pci.c @@ -16,47 +16,45 @@ MODULE_DEVICE_TABLE(pci, ocxl_pci_tbl); static int ocxl_probe(struct pci_dev *dev, const struct pci_device_id *id) { - int rc, afu_count = 0; - u8 afu; + int rc; + struct ocxl_afu *afu, *tmp; struct ocxl_fn *fn; + struct list_head *afu_list; - if (!radix_enabled()) { - dev_err(&dev->dev, "Unsupported memory model (hash)\n"); - return -ENODEV; - } - - fn = init_function(dev); - if (IS_ERR(fn)) { - dev_err(&dev->dev, "function init failed: %li\n", - PTR_ERR(fn)); + fn = ocxl_function_open(dev); + if (IS_ERR(fn)) return PTR_ERR(fn); - } - for (afu = 0; afu <= fn->config.max_afu_index; afu++) { - rc = ocxl_config_check_afu_index(dev, &fn->config, afu); - if (rc > 0) { - rc = init_afu(dev, fn, afu); - if (rc) { - dev_err(&dev->dev, - "Can't initialize AFU index %d\n", afu); - continue; - } - afu_count++; + pci_set_drvdata(dev, fn); + + afu_list = ocxl_function_afu_list(fn); + + list_for_each_entry_safe(afu, tmp, afu_list, list) { + // Cleanup handled within ocxl_file_register_afu() + rc = ocxl_file_register_afu(afu); + if (rc) { + dev_err(&dev->dev, "Failed to register AFU '%s' index %d", + afu->config.name, afu->config.idx); } } - dev_info(&dev->dev, "%d AFU(s) configured\n", afu_count); + return 0; } -static void ocxl_remove(struct pci_dev *dev) +void ocxl_remove(struct pci_dev *dev) { - struct ocxl_afu *afu, *tmp; - struct ocxl_fn *fn = pci_get_drvdata(dev); + struct ocxl_fn *fn; + struct ocxl_afu *afu; + struct list_head *afu_list; - list_for_each_entry_safe(afu, tmp, &fn->afu_list, list) { - remove_afu(afu); + fn = pci_get_drvdata(dev); + afu_list = ocxl_function_afu_list(fn); + + list_for_each_entry(afu, afu_list, list) 
{ + ocxl_file_unregister_afu(afu); } - remove_function(fn); + + ocxl_function_close(fn); } struct pci_driver ocxl_pci_driver = { diff --git a/drivers/misc/ocxl/sysfs.c b/drivers/misc/ocxl/sysfs.c index 0ab1fd1b26826..58f1ba2642068 100644 --- a/drivers/misc/ocxl/sysfs.c +++ b/drivers/misc/ocxl/sysfs.c @@ -3,11 +3,18 @@ #include #include "ocxl_internal.h" +static inline struct ocxl_afu *to_afu(struct device *device) +{ + struct ocxl_file_info *info = container_of(device, struct ocxl_file_info, dev); + + return info->afu; +} + static ssize_t global_mmio_size_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ocxl_afu *afu = to_ocxl_afu(device); + struct ocxl_afu *afu = to_afu(device); return scnprintf(buf, PAGE_SIZE, "%d\n", afu->config.global_mmio_size); @@ -17,7 +24,7 @@ static ssize_t pp_mmio_size_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ocxl_afu *afu = to_ocxl_afu(device); + struct ocxl_afu *afu = to_afu(device); return scnprintf(buf, PAGE_SIZE, "%d\n", afu->config.pp_mmio_stride); @@ -27,7 +34,7 @@ static ssize_t afu_version_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ocxl_afu *afu = to_ocxl_afu(device); + struct ocxl_afu *afu = to_afu(device); return scnprintf(buf, PAGE_SIZE, "%hhu:%hhu\n", afu->config.version_major, @@ -38,7 +45,7 @@ static ssize_t contexts_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ocxl_afu *afu = to_ocxl_afu(device); + struct ocxl_afu *afu = to_afu(device); return scnprintf(buf, PAGE_SIZE, "%d/%d\n", afu->pasid_count, afu->pasid_max); @@ -55,7 +62,7 @@ static ssize_t global_mmio_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { - struct ocxl_afu *afu = to_ocxl_afu(kobj_to_dev(kobj)); + struct ocxl_afu *afu = to_afu(kobj_to_dev(kobj)); if (count == 0 || off < 0 || off >= afu->config.global_mmio_size) @@ -86,7 +93,7 @@ static int global_mmio_mmap(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, struct vm_area_struct *vma) { - struct ocxl_afu *afu = to_ocxl_afu(kobj_to_dev(kobj)); + struct ocxl_afu *afu = to_afu(kobj_to_dev(kobj)); if ((vma_pages(vma) + vma->vm_pgoff) > (afu->config.global_mmio_size >> PAGE_SHIFT)) @@ -99,27 +106,25 @@ static int global_mmio_mmap(struct file *filp, struct kobject *kobj, return 0; } -int ocxl_sysfs_add_afu(struct ocxl_afu *afu) +int ocxl_sysfs_register_afu(struct ocxl_file_info *info) { int i, rc; for (i = 0; i < ARRAY_SIZE(afu_attrs); i++) { - rc = device_create_file(&afu->dev, &afu_attrs[i]); + rc = device_create_file(&info->dev, &afu_attrs[i]); if (rc) goto err; } - sysfs_attr_init(&afu->attr_global_mmio.attr); - afu->attr_global_mmio.attr.name = "global_mmio_area"; - afu->attr_global_mmio.attr.mode = 0600; - afu->attr_global_mmio.size = afu->config.global_mmio_size; - afu->attr_global_mmio.read = global_mmio_read; - afu->attr_global_mmio.mmap = global_mmio_mmap; - rc = device_create_bin_file(&afu->dev, &afu->attr_global_mmio); + sysfs_attr_init(&info->attr_global_mmio.attr); + info->attr_global_mmio.attr.name = "global_mmio_area"; + info->attr_global_mmio.attr.mode = 0600; + info->attr_global_mmio.size = info->afu->config.global_mmio_size; + info->attr_global_mmio.read = global_mmio_read; + info->attr_global_mmio.mmap = global_mmio_mmap; + rc = device_create_bin_file(&info->dev, &info->attr_global_mmio); if (rc) { - dev_err(&afu->dev, - "Unable to create global mmio attr for afu: %d\n", - rc); + 
dev_err(&info->dev, "Unable to create global mmio attr for afu: %d\n", rc); goto err; } @@ -127,15 +132,20 @@ int ocxl_sysfs_add_afu(struct ocxl_afu *afu) err: for (i--; i >= 0; i--) - device_remove_file(&afu->dev, &afu_attrs[i]); + device_remove_file(&info->dev, &afu_attrs[i]); + return rc; } -void ocxl_sysfs_remove_afu(struct ocxl_afu *afu) +void ocxl_sysfs_unregister_afu(struct ocxl_file_info *info) { int i; + /* + * device_remove_bin_file is safe to call even if the file was never + * added: files are removed by name, and removal exits early if the + * name is not found + */ for (i = 0; i < ARRAY_SIZE(afu_attrs); i++) - device_remove_file(&afu->dev, &afu_attrs[i]); - device_remove_bin_file(&afu->dev, &afu->attr_global_mmio); + device_remove_file(&info->dev, &afu_attrs[i]); + device_remove_bin_file(&info->dev, &info->attr_global_mmio); } diff --git a/include/misc/ocxl.h b/include/misc/ocxl.h index 9530d3be1b305..8bafd748e380c 100644 --- a/include/misc/ocxl.h +++ b/include/misc/ocxl.h @@ -16,11 +16,7 @@ #define OCXL_AFU_NAME_SZ (24+1) /* add 1 for NULL termination */ -/* - * The following 2 structures are a fairly generic way of representing - * the configuration data for a function and AFU, as read from the - * configuration space. - */ + struct ocxl_afu_config { u8 idx; int dvsec_afu_control_pos; /* offset of AFU control DVSEC */ @@ -49,12 +45,108 @@ struct ocxl_fn_config { s8 max_afu_index; }; -/* - * Read the configuration space of a function and fill in a - * ocxl_fn_config structure with all the function details +// These are opaque outside the ocxl driver +struct ocxl_afu; +struct ocxl_fn; + +// Device detection & initialisation + +/** + * Open an OpenCAPI function on an OpenCAPI device + * + * @dev: The PCI device that contains the function + * + * Returns an opaque pointer to the function, or an error pointer (check with IS_ERR) */ -int ocxl_config_read_function(struct pci_dev *dev, - struct ocxl_fn_config *fn); +struct ocxl_fn *ocxl_function_open(struct pci_dev *dev); + +/** + * Get the list of AFUs associated with a PCI function device + * + * Returns a list of struct ocxl_afu * + * + * @fn: The OpenCAPI function containing the AFUs + */ +struct list_head *ocxl_function_afu_list(struct ocxl_fn *fn); + +/** + * Fetch an AFU instance from an OpenCAPI function + * + * @fn: The OpenCAPI function to get the AFU from + * @afu_idx: The index of the AFU to get + * + * If successful, the AFU should be released with ocxl_afu_put() + * + * Returns a pointer to the AFU, or NULL on error + */ +struct ocxl_afu *ocxl_function_fetch_afu(struct ocxl_fn *fn, u8 afu_idx); + +/** + * Take a reference to an AFU + * + * @afu: The AFU to increment the reference count on + */ +void ocxl_afu_get(struct ocxl_afu *afu); + +/** + * Release a reference to an AFU + * + * @afu: The AFU to decrement the reference count on + */ +void ocxl_afu_put(struct ocxl_afu *afu); + + +/** + * Get the configuration information for an OpenCAPI function + * + * @fn: The OpenCAPI function to get the config for + * + * Returns the function config, or NULL on error + */ +const struct ocxl_fn_config *ocxl_function_config(struct ocxl_fn *fn); + +/** + * Close an OpenCAPI function + * + * This will free any AFUs previously retrieved from the function, and + * detach any associated contexts. The contexts must be freed by the caller.
+ * + * @fn: The OpenCAPI function to close + * + */ +void ocxl_function_close(struct ocxl_fn *fn); + +// AFU Metadata + +/** + * Get a pointer to the config for an AFU + * + * @afu: a pointer to the AFU to get the config for + * + * Returns a pointer to the AFU config + */ +struct ocxl_afu_config *ocxl_afu_config(struct ocxl_afu *afu); + +/** + * Assign opaque hardware specific information to an OpenCAPI AFU. + * + * @afu: The OpenCAPI AFU to assign the information to + * @private: the opaque hardware specific information to assign to the driver + */ +void ocxl_afu_set_private(struct ocxl_afu *afu, void *private); + +/** + * Fetch the hardware specific information associated with an external OpenCAPI + * AFU. This may be consumed by an external OpenCAPI driver. + * + * @afu: The AFU + * + * Returns the opaque pointer associated with the device, or NULL if not set + */ +void *ocxl_afu_get_private(struct ocxl_afu *dev); + + +// Functions left here are for compatibility with the cxlflash driver /* * Read the configuration space of a function for the AFU specified by @@ -141,6 +233,13 @@ int ocxl_config_set_TL(struct pci_dev *dev, int tl_dvsec); int ocxl_config_terminate_pasid(struct pci_dev *dev, int afu_control_offset, int pasid); +/* + * Read the configuration space of a function and fill in an + * ocxl_fn_config structure with all the function details + */ +int ocxl_config_read_function(struct pci_dev *dev, + struct ocxl_fn_config *fn); + /* * Set up the opencapi link for the function. * From b9721d275cc2c5e6c07371239c827e0faf05a6b9 Mon Sep 17 00:00:00 2001 From: Alastair D'Silva Date: Wed, 27 Mar 2019 16:31:33 +1100 Subject: [PATCH 192/205] ocxl: Allow external drivers to use OpenCAPI contexts Most OpenCAPI operations require a valid context, so exposing these functions to external drivers is necessary.
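For illustration only, an external driver would typically combine the exported calls roughly as follows (a minimal sketch, not part of this series; it assumes "afu" was obtained via ocxl_function_fetch_afu() and trims error handling):

	struct ocxl_context *ctx;
	int rc;

	/* allocate a context on the AFU (no file mapping for kernel users) */
	rc = ocxl_context_alloc(&ctx, afu, NULL);
	if (rc)
		return rc;

	/* attach an address space; amr == 0 applies no access restrictions */
	rc = ocxl_context_attach(ctx, 0, current->mm);
	if (rc) {
		ocxl_context_free(ctx);
		return rc;
	}

	/* ... submit work to the AFU ... */

	ocxl_context_detach(ctx);
	ocxl_context_free(ctx);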
Signed-off-by: Alastair D'Silva Reviewed-by: Greg Kurz Acked-by: Frederic Barrat Signed-off-by: Michael Ellerman --- drivers/misc/ocxl/context.c | 22 ++++++++++------- drivers/misc/ocxl/file.c | 8 ++----- drivers/misc/ocxl/ocxl_internal.h | 6 ----- include/misc/ocxl.h | 39 +++++++++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 20 deletions(-) diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c index c73a859d2224f..b60e674ac019a 100644 --- a/drivers/misc/ocxl/context.c +++ b/drivers/misc/ocxl/context.c @@ -4,15 +4,17 @@ #include "trace.h" #include "ocxl_internal.h" -struct ocxl_context *ocxl_context_alloc(void) -{ - return kzalloc(sizeof(struct ocxl_context), GFP_KERNEL); -} - -int ocxl_context_init(struct ocxl_context *ctx, struct ocxl_afu *afu, +int ocxl_context_alloc(struct ocxl_context **context, struct ocxl_afu *afu, struct address_space *mapping) { int pasid; + struct ocxl_context *ctx; + + *context = kzalloc(sizeof(struct ocxl_context), GFP_KERNEL); + if (!*context) + return -ENOMEM; + + ctx = *context; ctx->afu = afu; mutex_lock(&afu->contexts_lock); @@ -43,6 +45,7 @@ int ocxl_context_init(struct ocxl_context *ctx, struct ocxl_afu *afu, ocxl_afu_get(afu); return 0; } +EXPORT_SYMBOL_GPL(ocxl_context_alloc); /* * Callback for when a translation fault triggers an error @@ -63,7 +66,7 @@ static void xsl_fault_error(void *data, u64 addr, u64 dsisr) wake_up_all(&ctx->events_wq); } -int ocxl_context_attach(struct ocxl_context *ctx, u64 amr) +int ocxl_context_attach(struct ocxl_context *ctx, u64 amr, struct mm_struct *mm) { int rc; @@ -75,7 +78,7 @@ int ocxl_context_attach(struct ocxl_context *ctx, u64 amr) } rc = ocxl_link_add_pe(ctx->afu->fn->link, ctx->pasid, - current->mm->context.id, ctx->tidr, amr, current->mm, + mm->context.id, ctx->tidr, amr, mm, xsl_fault_error, ctx); if (rc) goto out; @@ -85,6 +88,7 @@ int ocxl_context_attach(struct ocxl_context *ctx, u64 amr) mutex_unlock(&ctx->status_mutex); return rc; } +EXPORT_SYMBOL_GPL(ocxl_context_attach); static vm_fault_t map_afu_irq(struct vm_area_struct *vma, unsigned long address, u64 offset, struct ocxl_context *ctx) @@ -243,6 +247,7 @@ int ocxl_context_detach(struct ocxl_context *ctx) } return 0; } +EXPORT_SYMBOL_GPL(ocxl_context_detach); void ocxl_context_detach_all(struct ocxl_afu *afu) { @@ -280,3 +285,4 @@ void ocxl_context_free(struct ocxl_context *ctx) ocxl_afu_put(ctx->afu); kfree(ctx); } +EXPORT_SYMBOL_GPL(ocxl_context_free); diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c index 7a38ea5af9db1..8225892a5d770 100644 --- a/drivers/misc/ocxl/file.c +++ b/drivers/misc/ocxl/file.c @@ -61,11 +61,7 @@ static int afu_open(struct inode *inode, struct file *file) if (!info) return -ENODEV; - ctx = ocxl_context_alloc(); - if (!ctx) - return -ENOMEM; - - rc = ocxl_context_init(ctx, info->afu, inode->i_mapping); + rc = ocxl_context_alloc(&ctx, info->afu, inode->i_mapping); if (rc) return rc; @@ -90,7 +86,7 @@ static long afu_ioctl_attach(struct ocxl_context *ctx, return -EINVAL; amr = arg.amr & mfspr(SPRN_UAMOR); - rc = ocxl_context_attach(ctx, amr); + rc = ocxl_context_attach(ctx, amr, current->mm); return rc; } diff --git a/drivers/misc/ocxl/ocxl_internal.h b/drivers/misc/ocxl/ocxl_internal.h index 53b6c64a1bf04..de6c162377424 100644 --- a/drivers/misc/ocxl/ocxl_internal.h +++ b/drivers/misc/ocxl/ocxl_internal.h @@ -130,15 +130,9 @@ int ocxl_config_check_afu_index(struct pci_dev *dev, */ int ocxl_link_update_pe(void *link_handle, int pasid, __u16 tid); -struct ocxl_context 
*ocxl_context_alloc(void); -int ocxl_context_init(struct ocxl_context *ctx, struct ocxl_afu *afu, - struct address_space *mapping); -int ocxl_context_attach(struct ocxl_context *ctx, u64 amr); int ocxl_context_mmap(struct ocxl_context *ctx, struct vm_area_struct *vma); -int ocxl_context_detach(struct ocxl_context *ctx); void ocxl_context_detach_all(struct ocxl_afu *afu); -void ocxl_context_free(struct ocxl_context *ctx); int ocxl_sysfs_register_afu(struct ocxl_file_info *info); void ocxl_sysfs_unregister_afu(struct ocxl_file_info *info); diff --git a/include/misc/ocxl.h b/include/misc/ocxl.h index 8bafd748e380c..e4704632eac5f 100644 --- a/include/misc/ocxl.h +++ b/include/misc/ocxl.h @@ -48,6 +48,7 @@ struct ocxl_fn_config { // These are opaque outside the ocxl driver struct ocxl_afu; struct ocxl_fn; +struct ocxl_context; // Device detection & initialisation @@ -116,6 +117,44 @@ const struct ocxl_fn_config *ocxl_function_config(struct ocxl_fn *fn); */ void ocxl_function_close(struct ocxl_fn *fn); +// Context allocation + +/** + * Allocate an OpenCAPI context + * + * @context: The OpenCAPI context to allocate, must be freed with ocxl_context_free + * @afu: The AFU the context belongs to + * @mapping: The mapping to unmap when the context is closed (may be NULL) + */ +int ocxl_context_alloc(struct ocxl_context **context, struct ocxl_afu *afu, + struct address_space *mapping); + +/** + * Free an OpenCAPI context + * + * @ctx: The OpenCAPI context to free + */ +void ocxl_context_free(struct ocxl_context *ctx); + +/** + * Grant access to an MM to an OpenCAPI context + * @ctx: The OpenCAPI context to attach + * @amr: The value of the AMR register to restrict access + * @mm: The mm to attach to the context + * + * Returns 0 on success, negative on failure + */ +int ocxl_context_attach(struct ocxl_context *ctx, u64 amr, + struct mm_struct *mm); + +/** + * Detach an MM from an OpenCAPI context + * @ctx: The OpenCAPI context to detach + * + * Returns 0 on success, negative on failure + */ +int ocxl_context_detach(struct ocxl_context *ctx); + // AFU Metadata /** From 2ec3b7ed2ab8e07fdb5f800ff420e94dec75435f Mon Sep 17 00:00:00 2001 From: Alastair D'Silva Date: Wed, 27 Mar 2019 16:31:34 +1100 Subject: [PATCH 193/205] ocxl: afu_irq only deals with IRQ IDs, not offsets The use of offsets is required only in the frontend, so alter the IRQ API to only work with IRQ IDs in the backend.
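As an illustration of the resulting split (hypothetical fragment; the conversion helpers are the ocxl_irq_id_to_offset()/ocxl_irq_offset_to_id() pair shown in the diff below):

	int irq_id;
	u64 irq_offset;

	/* the backend allocates an IRQ and returns an opaque ID */
	rc = ocxl_afu_irq_alloc(ctx, &irq_id);
	if (rc)
		return rc;

	/* only the frontend converts the ID to an mmap offset for userspace */
	irq_offset = ocxl_irq_id_to_offset(ctx, irq_id);

	/* ... and converts an offset from userspace back into an ID */
	irq_id = ocxl_irq_offset_to_id(ctx, irq_offset);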
Signed-off-by: Alastair D'Silva Acked-by: Frederic Barrat Signed-off-by: Michael Ellerman --- drivers/misc/ocxl/afu_irq.c | 34 +++++++++++++++---------------- drivers/misc/ocxl/context.c | 7 +++++-- drivers/misc/ocxl/file.c | 13 +++++++----- drivers/misc/ocxl/ocxl_internal.h | 10 +++++---- drivers/misc/ocxl/trace.h | 12 ++++------- 5 files changed, 39 insertions(+), 37 deletions(-) diff --git a/drivers/misc/ocxl/afu_irq.c b/drivers/misc/ocxl/afu_irq.c index 11ab996657a2d..2d410cd6f817f 100644 --- a/drivers/misc/ocxl/afu_irq.c +++ b/drivers/misc/ocxl/afu_irq.c @@ -14,14 +14,14 @@ struct afu_irq { struct eventfd_ctx *ev_ctx; }; -static int irq_offset_to_id(struct ocxl_context *ctx, u64 offset) +int ocxl_irq_offset_to_id(struct ocxl_context *ctx, u64 offset) { return (offset - ctx->afu->irq_base_offset) >> PAGE_SHIFT; } -static u64 irq_id_to_offset(struct ocxl_context *ctx, int id) +u64 ocxl_irq_id_to_offset(struct ocxl_context *ctx, int irq_id) { - return ctx->afu->irq_base_offset + (id << PAGE_SHIFT); + return ctx->afu->irq_base_offset + (irq_id << PAGE_SHIFT); } static irqreturn_t afu_irq_handler(int virq, void *data) @@ -69,7 +69,7 @@ static void release_afu_irq(struct afu_irq *irq) kfree(irq->name); } -int ocxl_afu_irq_alloc(struct ocxl_context *ctx, u64 *irq_offset) +int ocxl_afu_irq_alloc(struct ocxl_context *ctx, int *irq_id) { struct afu_irq *irq; int rc; @@ -101,11 +101,11 @@ int ocxl_afu_irq_alloc(struct ocxl_context *ctx, u64 *irq_offset) if (rc) goto err_alloc; - *irq_offset = irq_id_to_offset(ctx, irq->id); - - trace_ocxl_afu_irq_alloc(ctx->pasid, irq->id, irq->virq, irq->hw_irq, - *irq_offset); + trace_ocxl_afu_irq_alloc(ctx->pasid, irq->id, irq->virq, irq->hw_irq); mutex_unlock(&ctx->irq_lock); + + *irq_id = irq->id; + return 0; err_alloc: @@ -123,7 +123,7 @@ static void afu_irq_free(struct afu_irq *irq, struct ocxl_context *ctx) trace_ocxl_afu_irq_free(ctx->pasid, irq->id); if (ctx->mapping) unmap_mapping_range(ctx->mapping, - irq_id_to_offset(ctx, irq->id), + ocxl_irq_id_to_offset(ctx, irq->id), 1 << PAGE_SHIFT, 1); release_afu_irq(irq); if (irq->ev_ctx) @@ -132,14 +132,13 @@ static void afu_irq_free(struct afu_irq *irq, struct ocxl_context *ctx) kfree(irq); } -int ocxl_afu_irq_free(struct ocxl_context *ctx, u64 irq_offset) +int ocxl_afu_irq_free(struct ocxl_context *ctx, int irq_id) { struct afu_irq *irq; - int id = irq_offset_to_id(ctx, irq_offset); mutex_lock(&ctx->irq_lock); - irq = idr_find(&ctx->irq_idr, id); + irq = idr_find(&ctx->irq_idr, irq_id); if (!irq) { mutex_unlock(&ctx->irq_lock); return -EINVAL; @@ -161,14 +160,14 @@ void ocxl_afu_irq_free_all(struct ocxl_context *ctx) mutex_unlock(&ctx->irq_lock); } -int ocxl_afu_irq_set_fd(struct ocxl_context *ctx, u64 irq_offset, int eventfd) +int ocxl_afu_irq_set_fd(struct ocxl_context *ctx, int irq_id, int eventfd) { struct afu_irq *irq; struct eventfd_ctx *ev_ctx; - int rc = 0, id = irq_offset_to_id(ctx, irq_offset); + int rc = 0; mutex_lock(&ctx->irq_lock); - irq = idr_find(&ctx->irq_idr, id); + irq = idr_find(&ctx->irq_idr, irq_id); if (!irq) { rc = -EINVAL; goto unlock; @@ -186,14 +185,13 @@ int ocxl_afu_irq_set_fd(struct ocxl_context *ctx, u64 irq_offset, int eventfd) return rc; } -u64 ocxl_afu_irq_get_addr(struct ocxl_context *ctx, u64 irq_offset) +u64 ocxl_afu_irq_get_addr(struct ocxl_context *ctx, int irq_id) { struct afu_irq *irq; - int id = irq_offset_to_id(ctx, irq_offset); u64 addr = 0; mutex_lock(&ctx->irq_lock); - irq = idr_find(&ctx->irq_idr, id); + irq = idr_find(&ctx->irq_idr, irq_id); if (irq) addr = 
irq->trigger_page; mutex_unlock(&ctx->irq_lock); diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c index b60e674ac019a..bab9c9364184e 100644 --- a/drivers/misc/ocxl/context.c +++ b/drivers/misc/ocxl/context.c @@ -94,8 +94,9 @@ static vm_fault_t map_afu_irq(struct vm_area_struct *vma, unsigned long address, u64 offset, struct ocxl_context *ctx) { u64 trigger_addr; + int irq_id = ocxl_irq_offset_to_id(ctx, offset); - trigger_addr = ocxl_afu_irq_get_addr(ctx, offset); + trigger_addr = ocxl_afu_irq_get_addr(ctx, irq_id); if (!trigger_addr) return VM_FAULT_SIGBUS; @@ -155,12 +156,14 @@ static const struct vm_operations_struct ocxl_vmops = { static int check_mmap_afu_irq(struct ocxl_context *ctx, struct vm_area_struct *vma) { + int irq_id = ocxl_irq_offset_to_id(ctx, vma->vm_pgoff << PAGE_SHIFT); + /* only one page */ if (vma_pages(vma) != 1) return -EINVAL; /* check offset validty */ - if (!ocxl_afu_irq_get_addr(ctx, vma->vm_pgoff << PAGE_SHIFT)) + if (!ocxl_afu_irq_get_addr(ctx, irq_id)) return -EINVAL; /* diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c index 8225892a5d770..7f5282a3af3a2 100644 --- a/drivers/misc/ocxl/file.c +++ b/drivers/misc/ocxl/file.c @@ -188,6 +188,7 @@ static long afu_ioctl(struct file *file, unsigned int cmd, { struct ocxl_context *ctx = file->private_data; struct ocxl_ioctl_irq_fd irq_fd; + int irq_id; u64 irq_offset; long rc; bool closed; @@ -209,12 +210,13 @@ static long afu_ioctl(struct file *file, unsigned int cmd, break; case OCXL_IOCTL_IRQ_ALLOC: - rc = ocxl_afu_irq_alloc(ctx, &irq_offset); + rc = ocxl_afu_irq_alloc(ctx, &irq_id); if (!rc) { + irq_offset = ocxl_irq_id_to_offset(ctx, irq_id); rc = copy_to_user((u64 __user *) args, &irq_offset, sizeof(irq_offset)); if (rc) { - ocxl_afu_irq_free(ctx, irq_offset); + ocxl_afu_irq_free(ctx, irq_id); return -EFAULT; } } @@ -225,7 +227,8 @@ static long afu_ioctl(struct file *file, unsigned int cmd, sizeof(irq_offset)); if (rc) return -EFAULT; - rc = ocxl_afu_irq_free(ctx, irq_offset); + irq_id = ocxl_irq_offset_to_id(ctx, irq_offset); + rc = ocxl_afu_irq_free(ctx, irq_id); break; case OCXL_IOCTL_IRQ_SET_FD: @@ -235,8 +238,8 @@ static long afu_ioctl(struct file *file, unsigned int cmd, return -EFAULT; if (irq_fd.reserved) return -EINVAL; - rc = ocxl_afu_irq_set_fd(ctx, irq_fd.irq_offset, - irq_fd.eventfd); + irq_id = ocxl_irq_offset_to_id(ctx, irq_fd.irq_offset); + rc = ocxl_afu_irq_set_fd(ctx, irq_id, irq_fd.eventfd); break; case OCXL_IOCTL_GET_METADATA: diff --git a/drivers/misc/ocxl/ocxl_internal.h b/drivers/misc/ocxl/ocxl_internal.h index de6c162377424..58969467bd5c0 100644 --- a/drivers/misc/ocxl/ocxl_internal.h +++ b/drivers/misc/ocxl/ocxl_internal.h @@ -137,11 +137,13 @@ void ocxl_context_detach_all(struct ocxl_afu *afu); int ocxl_sysfs_register_afu(struct ocxl_file_info *info); void ocxl_sysfs_unregister_afu(struct ocxl_file_info *info); -int ocxl_afu_irq_alloc(struct ocxl_context *ctx, u64 *irq_offset); -int ocxl_afu_irq_free(struct ocxl_context *ctx, u64 irq_offset); +int ocxl_irq_offset_to_id(struct ocxl_context *ctx, u64 offset); +u64 ocxl_irq_id_to_offset(struct ocxl_context *ctx, int irq_id); +int ocxl_afu_irq_alloc(struct ocxl_context *ctx, int *irq_id); +int ocxl_afu_irq_free(struct ocxl_context *ctx, int irq_id); void ocxl_afu_irq_free_all(struct ocxl_context *ctx); -int ocxl_afu_irq_set_fd(struct ocxl_context *ctx, u64 irq_offset, +int ocxl_afu_irq_set_fd(struct ocxl_context *ctx, int irq_id, int eventfd); -u64 ocxl_afu_irq_get_addr(struct ocxl_context *ctx, u64 
irq_offset); +u64 ocxl_afu_irq_get_addr(struct ocxl_context *ctx, int irq_id); #endif /* _OCXL_INTERNAL_H_ */ diff --git a/drivers/misc/ocxl/trace.h b/drivers/misc/ocxl/trace.h index bcb7ff330c1ee..024f417e7e013 100644 --- a/drivers/misc/ocxl/trace.h +++ b/drivers/misc/ocxl/trace.h @@ -107,16 +107,14 @@ DEFINE_EVENT(ocxl_fault_handler, ocxl_fault_ack, ); TRACE_EVENT(ocxl_afu_irq_alloc, - TP_PROTO(int pasid, int irq_id, unsigned int virq, int hw_irq, - u64 irq_offset), - TP_ARGS(pasid, irq_id, virq, hw_irq, irq_offset), + TP_PROTO(int pasid, int irq_id, unsigned int virq, int hw_irq), + TP_ARGS(pasid, irq_id, virq, hw_irq), TP_STRUCT__entry( __field(int, pasid) __field(int, irq_id) __field(unsigned int, virq) __field(int, hw_irq) - __field(u64, irq_offset) ), TP_fast_assign( @@ -124,15 +122,13 @@ TRACE_EVENT(ocxl_afu_irq_alloc, __entry->irq_id = irq_id; __entry->virq = virq; __entry->hw_irq = hw_irq; - __entry->irq_offset = irq_offset; ), - TP_printk("pasid=0x%x irq_id=%d virq=%u hw_irq=%d irq_offset=0x%llx", + TP_printk("pasid=0x%x irq_id=%d virq=%u hw_irq=%d", __entry->pasid, __entry->irq_id, __entry->virq, - __entry->hw_irq, - __entry->irq_offset + __entry->hw_irq ) ); From 060146614643ddc5978c73ffac0329762b4651c9 Mon Sep 17 00:00:00 2001 From: Alastair D'Silva Date: Wed, 27 Mar 2019 16:31:35 +1100 Subject: [PATCH 194/205] ocxl: move event_fd handling to frontend Event_fd is only used in the driver frontend, so it does not need to exist in the backend code. Relocate it to the frontend and provide an opaque mechanism for consumers instead. Signed-off-by: Alastair D'Silva Acked-by: Frederic Barrat Signed-off-by: Michael Ellerman --- drivers/misc/ocxl/afu_irq.c | 74 ++++++++++++++++++------------- drivers/misc/ocxl/file.c | 22 ++++++++- drivers/misc/ocxl/ocxl_internal.h | 5 --- include/misc/ocxl.h | 46 +++++++++++++++++++ 4 files changed, 109 insertions(+), 38 deletions(-) diff --git a/drivers/misc/ocxl/afu_irq.c b/drivers/misc/ocxl/afu_irq.c index 2d410cd6f817f..70f8f1c3929dc 100644 --- a/drivers/misc/ocxl/afu_irq.c +++ b/drivers/misc/ocxl/afu_irq.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0+ // Copyright 2017 IBM Corp. 
#include -#include +#include #include "ocxl_internal.h" #include "trace.h" @@ -11,7 +11,9 @@ struct afu_irq { unsigned int virq; char *name; u64 trigger_page; - struct eventfd_ctx *ev_ctx; + irqreturn_t (*handler)(void *private); + void (*free_private)(void *private); + void *private; }; int ocxl_irq_offset_to_id(struct ocxl_context *ctx, u64 offset) @@ -24,14 +26,44 @@ u64 ocxl_irq_id_to_offset(struct ocxl_context *ctx, int irq_id) return ctx->afu->irq_base_offset + (irq_id << PAGE_SHIFT); } +int ocxl_irq_set_handler(struct ocxl_context *ctx, int irq_id, + irqreturn_t (*handler)(void *private), + void (*free_private)(void *private), + void *private) +{ + struct afu_irq *irq; + int rc; + + mutex_lock(&ctx->irq_lock); + irq = idr_find(&ctx->irq_idr, irq_id); + if (!irq) { + rc = -EINVAL; + goto unlock; + } + + irq->handler = handler; + irq->private = private; + irq->free_private = free_private; + + rc = 0; + // Fall through to unlock + +unlock: + mutex_unlock(&ctx->irq_lock); + return rc; +} +EXPORT_SYMBOL_GPL(ocxl_irq_set_handler); + static irqreturn_t afu_irq_handler(int virq, void *data) { struct afu_irq *irq = (struct afu_irq *) data; trace_ocxl_afu_irq_receive(virq); - if (irq->ev_ctx) - eventfd_signal(irq->ev_ctx, 1); - return IRQ_HANDLED; + + if (irq->handler) + return irq->handler(irq->private); + + return IRQ_HANDLED; // Just drop it on the ground } static int setup_afu_irq(struct ocxl_context *ctx, struct afu_irq *irq) @@ -117,6 +149,7 @@ int ocxl_afu_irq_alloc(struct ocxl_context *ctx, int *irq_id) kfree(irq); return rc; } +EXPORT_SYMBOL_GPL(ocxl_afu_irq_alloc); static void afu_irq_free(struct afu_irq *irq, struct ocxl_context *ctx) { @@ -126,8 +159,8 @@ static void afu_irq_free(struct afu_irq *irq, struct ocxl_context *ctx) ocxl_irq_id_to_offset(ctx, irq->id), 1 << PAGE_SHIFT, 1); release_afu_irq(irq); - if (irq->ev_ctx) - eventfd_ctx_put(irq->ev_ctx); + if (irq->free_private) + irq->free_private(irq->private); ocxl_link_free_irq(ctx->afu->fn->link, irq->hw_irq); kfree(irq); } @@ -148,6 +181,7 @@ int ocxl_afu_irq_free(struct ocxl_context *ctx, int irq_id) mutex_unlock(&ctx->irq_lock); return 0; } +EXPORT_SYMBOL_GPL(ocxl_afu_irq_free); void ocxl_afu_irq_free_all(struct ocxl_context *ctx) { @@ -160,31 +194,6 @@ void ocxl_afu_irq_free_all(struct ocxl_context *ctx) mutex_unlock(&ctx->irq_lock); } -int ocxl_afu_irq_set_fd(struct ocxl_context *ctx, int irq_id, int eventfd) -{ - struct afu_irq *irq; - struct eventfd_ctx *ev_ctx; - int rc = 0; - - mutex_lock(&ctx->irq_lock); - irq = idr_find(&ctx->irq_idr, irq_id); - if (!irq) { - rc = -EINVAL; - goto unlock; - } - - ev_ctx = eventfd_ctx_fdget(eventfd); - if (IS_ERR(ev_ctx)) { - rc = -EINVAL; - goto unlock; - } - - irq->ev_ctx = ev_ctx; -unlock: - mutex_unlock(&ctx->irq_lock); - return rc; -} - u64 ocxl_afu_irq_get_addr(struct ocxl_context *ctx, int irq_id) { struct afu_irq *irq; @@ -197,3 +206,4 @@ u64 ocxl_afu_irq_get_addr(struct ocxl_context *ctx, int irq_id) mutex_unlock(&ctx->irq_lock); return addr; } +EXPORT_SYMBOL_GPL(ocxl_afu_irq_get_addr); diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c index 7f5282a3af3a2..8aa22893ed768 100644 --- a/drivers/misc/ocxl/file.c +++ b/drivers/misc/ocxl/file.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -183,11 +184,27 @@ static long afu_ioctl_get_features(struct ocxl_context *ctx, x == OCXL_IOCTL_GET_FEATURES ? 
"GET_FEATURES" : \ "UNKNOWN") +static irqreturn_t irq_handler(void *private) +{ + struct eventfd_ctx *ev_ctx = private; + + eventfd_signal(ev_ctx, 1); + return IRQ_HANDLED; +} + +static void irq_free(void *private) +{ + struct eventfd_ctx *ev_ctx = private; + + eventfd_ctx_put(ev_ctx); +} + static long afu_ioctl(struct file *file, unsigned int cmd, unsigned long args) { struct ocxl_context *ctx = file->private_data; struct ocxl_ioctl_irq_fd irq_fd; + struct eventfd_ctx *ev_ctx; int irq_id; u64 irq_offset; long rc; @@ -239,7 +256,10 @@ static long afu_ioctl(struct file *file, unsigned int cmd, if (irq_fd.reserved) return -EINVAL; irq_id = ocxl_irq_offset_to_id(ctx, irq_fd.irq_offset); - rc = ocxl_afu_irq_set_fd(ctx, irq_id, irq_fd.eventfd); + ev_ctx = eventfd_ctx_fdget(irq_fd.eventfd); + if (!ev_ctx) + return -EFAULT; + rc = ocxl_irq_set_handler(ctx, irq_id, irq_handler, irq_free, ev_ctx); break; case OCXL_IOCTL_GET_METADATA: diff --git a/drivers/misc/ocxl/ocxl_internal.h b/drivers/misc/ocxl/ocxl_internal.h index 58969467bd5c0..97415afd79f3b 100644 --- a/drivers/misc/ocxl/ocxl_internal.h +++ b/drivers/misc/ocxl/ocxl_internal.h @@ -139,11 +139,6 @@ void ocxl_sysfs_unregister_afu(struct ocxl_file_info *info); int ocxl_irq_offset_to_id(struct ocxl_context *ctx, u64 offset); u64 ocxl_irq_id_to_offset(struct ocxl_context *ctx, int irq_id); -int ocxl_afu_irq_alloc(struct ocxl_context *ctx, int *irq_id); -int ocxl_afu_irq_free(struct ocxl_context *ctx, int irq_id); void ocxl_afu_irq_free_all(struct ocxl_context *ctx); -int ocxl_afu_irq_set_fd(struct ocxl_context *ctx, int irq_id, - int eventfd); -u64 ocxl_afu_irq_get_addr(struct ocxl_context *ctx, int irq_id); #endif /* _OCXL_INTERNAL_H_ */ diff --git a/include/misc/ocxl.h b/include/misc/ocxl.h index e4704632eac5f..dea93885a839c 100644 --- a/include/misc/ocxl.h +++ b/include/misc/ocxl.h @@ -155,6 +155,52 @@ int ocxl_context_attach(struct ocxl_context *ctx, u64 amr, */ int ocxl_context_detach(struct ocxl_context *ctx); +// AFU IRQs + +/** + * Allocate an IRQ associated with an AFU context + * @ctx: the AFU context + * @irq_id: out, the IRQ ID + * + * Returns 0 on success, negative on failure + */ +extern int ocxl_afu_irq_alloc(struct ocxl_context *ctx, int *irq_id); + +/** + * Frees an IRQ associated with an AFU context + * @ctx: the AFU context + * @irq_id: the IRQ ID + * + * Returns 0 on success, negative on failure + */ +extern int ocxl_afu_irq_free(struct ocxl_context *ctx, int irq_id); + +/** + * Gets the address of the trigger page for an IRQ + * This can then be provided to an AFU which will write to that + * page to trigger the IRQ. 
+ * @ctx: The AFU context that the IRQ is associated with + * @irq_id: The IRQ ID + * + * returns the trigger page address, or 0 if the IRQ is not valid + */ +extern u64 ocxl_afu_irq_get_addr(struct ocxl_context *ctx, int irq_id); + +/** + * Provide a callback to be called when an IRQ is triggered + * @ctx: The AFU context that the IRQ is associated with + * @irq_id: The IRQ ID + * @handler: the callback to be called when the IRQ is triggered + * @free_private: the callback to be called when the IRQ is freed (may be NULL) + * @private: Private data to be passed to the callbacks + * + * Returns 0 on success, negative on failure + */ +int ocxl_irq_set_handler(struct ocxl_context *ctx, int irq_id, + irqreturn_t (*handler)(void *private), + void (*free_private)(void *private), + void *private); + // AFU Metadata /** From 7e462c2a8a6d00d3c240cac9f5626eff96d8e641 Mon Sep 17 00:00:00 2001 From: Alastair D'Silva Date: Wed, 27 Mar 2019 16:31:36 +1100 Subject: [PATCH 195/205] ocxl: Provide global MMIO accessors for external drivers External drivers that communicate via OpenCAPI will need to make MMIO calls to interact with the devices. Signed-off-by: Alastair D'Silva Reviewed-by: Greg Kurz Acked-by: Frederic Barrat Signed-off-by: Michael Ellerman --- drivers/misc/ocxl/Makefile | 2 +- drivers/misc/ocxl/mmio.c | 234 +++++++++++++++++++++++++++++++++++++ include/misc/ocxl.h | 110 +++++++++++++++++ 3 files changed, 345 insertions(+), 1 deletion(-) create mode 100644 drivers/misc/ocxl/mmio.c diff --git a/drivers/misc/ocxl/Makefile b/drivers/misc/ocxl/Makefile index bc4e39bfda7bf..d07d1bb8e8d47 100644 --- a/drivers/misc/ocxl/Makefile +++ b/drivers/misc/ocxl/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0+ ccflags-$(CONFIG_PPC_WERROR) += -Werror -ocxl-y += main.o pci.o config.o file.o pasid.o +ocxl-y += main.o pci.o config.o file.o pasid.o mmio.o ocxl-y += link.o context.o afu_irq.o sysfs.o trace.o ocxl-y += core.o obj-$(CONFIG_OCXL) += ocxl.o diff --git a/drivers/misc/ocxl/mmio.c b/drivers/misc/ocxl/mmio.c new file mode 100644 index 0000000000000..aae713db4ebe8 --- /dev/null +++ b/drivers/misc/ocxl/mmio.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0+ +// Copyright 2019 IBM Corp. 
+#include +#include "trace.h" +#include "ocxl_internal.h" + +int ocxl_global_mmio_read32(struct ocxl_afu *afu, size_t offset, + enum ocxl_endian endian, u32 *val) +{ + if (offset > afu->config.global_mmio_size - 4) + return -EINVAL; + +#ifdef __BIG_ENDIAN__ + if (endian == OCXL_HOST_ENDIAN) + endian = OCXL_BIG_ENDIAN; +#endif + + switch (endian) { + case OCXL_BIG_ENDIAN: + *val = readl_be((char *)afu->global_mmio_ptr + offset); + break; + + default: + *val = readl((char *)afu->global_mmio_ptr + offset); + break; + } + + return 0; +} +EXPORT_SYMBOL_GPL(ocxl_global_mmio_read32); + +int ocxl_global_mmio_read64(struct ocxl_afu *afu, size_t offset, + enum ocxl_endian endian, u64 *val) +{ + if (offset > afu->config.global_mmio_size - 8) + return -EINVAL; + +#ifdef __BIG_ENDIAN__ + if (endian == OCXL_HOST_ENDIAN) + endian = OCXL_BIG_ENDIAN; +#endif + + switch (endian) { + case OCXL_BIG_ENDIAN: + *val = readq_be((char *)afu->global_mmio_ptr + offset); + break; + + default: + *val = readq((char *)afu->global_mmio_ptr + offset); + break; + } + + return 0; +} +EXPORT_SYMBOL_GPL(ocxl_global_mmio_read64); + +int ocxl_global_mmio_write32(struct ocxl_afu *afu, size_t offset, + enum ocxl_endian endian, u32 val) +{ + if (offset > afu->config.global_mmio_size - 4) + return -EINVAL; + +#ifdef __BIG_ENDIAN__ + if (endian == OCXL_HOST_ENDIAN) + endian = OCXL_BIG_ENDIAN; +#endif + + switch (endian) { + case OCXL_BIG_ENDIAN: + writel_be(val, (char *)afu->global_mmio_ptr + offset); + break; + + default: + writel(val, (char *)afu->global_mmio_ptr + offset); + break; + } + + + return 0; +} +EXPORT_SYMBOL_GPL(ocxl_global_mmio_write32); + +int ocxl_global_mmio_write64(struct ocxl_afu *afu, size_t offset, + enum ocxl_endian endian, u64 val) +{ + if (offset > afu->config.global_mmio_size - 8) + return -EINVAL; + +#ifdef __BIG_ENDIAN__ + if (endian == OCXL_HOST_ENDIAN) + endian = OCXL_BIG_ENDIAN; +#endif + + switch (endian) { + case OCXL_BIG_ENDIAN: + writeq_be(val, (char *)afu->global_mmio_ptr + offset); + break; + + default: + writeq(val, (char *)afu->global_mmio_ptr + offset); + break; + } + + + return 0; +} +EXPORT_SYMBOL_GPL(ocxl_global_mmio_write64); + +int ocxl_global_mmio_set32(struct ocxl_afu *afu, size_t offset, + enum ocxl_endian endian, u32 mask) +{ + u32 tmp; + + if (offset > afu->config.global_mmio_size - 4) + return -EINVAL; + +#ifdef __BIG_ENDIAN__ + if (endian == OCXL_HOST_ENDIAN) + endian = OCXL_BIG_ENDIAN; +#endif + + switch (endian) { + case OCXL_BIG_ENDIAN: + tmp = readl_be((char *)afu->global_mmio_ptr + offset); + tmp |= mask; + writel_be(tmp, (char *)afu->global_mmio_ptr + offset); + break; + + default: + tmp = readl((char *)afu->global_mmio_ptr + offset); + tmp |= mask; + writel(tmp, (char *)afu->global_mmio_ptr + offset); + break; + } + + return 0; +} +EXPORT_SYMBOL_GPL(ocxl_global_mmio_set32); + +int ocxl_global_mmio_set64(struct ocxl_afu *afu, size_t offset, + enum ocxl_endian endian, u64 mask) +{ + u64 tmp; + + if (offset > afu->config.global_mmio_size - 8) + return -EINVAL; + +#ifdef __BIG_ENDIAN__ + if (endian == OCXL_HOST_ENDIAN) + endian = OCXL_BIG_ENDIAN; +#endif + + switch (endian) { + case OCXL_BIG_ENDIAN: + tmp = readq_be((char *)afu->global_mmio_ptr + offset); + tmp |= mask; + writeq_be(tmp, (char *)afu->global_mmio_ptr + offset); + break; + + default: + tmp = readq((char *)afu->global_mmio_ptr + offset); + tmp |= mask; + writeq(tmp, (char *)afu->global_mmio_ptr + offset); + break; + } + + return 0; +} +EXPORT_SYMBOL_GPL(ocxl_global_mmio_set64); + +int ocxl_global_mmio_clear32(struct 
ocxl_afu *afu, size_t offset, + enum ocxl_endian endian, u32 mask) +{ + u32 tmp; + + if (offset > afu->config.global_mmio_size - 4) + return -EINVAL; + +#ifdef __BIG_ENDIAN__ + if (endian == OCXL_HOST_ENDIAN) + endian = OCXL_BIG_ENDIAN; +#endif + + switch (endian) { + case OCXL_BIG_ENDIAN: + tmp = readl_be((char *)afu->global_mmio_ptr + offset); + tmp &= ~mask; + writel_be(tmp, (char *)afu->global_mmio_ptr + offset); + break; + + default: + tmp = readl((char *)afu->global_mmio_ptr + offset); + tmp &= ~mask; + writel(tmp, (char *)afu->global_mmio_ptr + offset); + break; + } + + return 0; +} +EXPORT_SYMBOL_GPL(ocxl_global_mmio_clear32); + +int ocxl_global_mmio_clear64(struct ocxl_afu *afu, size_t offset, + enum ocxl_endian endian, u64 mask) +{ + u64 tmp; + + if (offset > afu->config.global_mmio_size - 8) + return -EINVAL; + +#ifdef __BIG_ENDIAN__ + if (endian == OCXL_HOST_ENDIAN) + endian = OCXL_BIG_ENDIAN; +#endif + + switch (endian) { + case OCXL_BIG_ENDIAN: + tmp = readq_be((char *)afu->global_mmio_ptr + offset); + tmp &= ~mask; + writeq_be(tmp, (char *)afu->global_mmio_ptr + offset); + break; + + default: + tmp = readq((char *)afu->global_mmio_ptr + offset); + tmp &= ~mask; + writeq(tmp, (char *)afu->global_mmio_ptr + offset); + break; + } + + return 0; +} +EXPORT_SYMBOL_GPL(ocxl_global_mmio_clear64); diff --git a/include/misc/ocxl.h b/include/misc/ocxl.h index dea93885a839c..5c4b4916e6be0 100644 --- a/include/misc/ocxl.h +++ b/include/misc/ocxl.h @@ -45,6 +45,12 @@ struct ocxl_fn_config { s8 max_afu_index; }; +enum ocxl_endian { + OCXL_BIG_ENDIAN = 0, /**< AFU data is big-endian */ + OCXL_LITTLE_ENDIAN = 1, /**< AFU data is little-endian */ + OCXL_HOST_ENDIAN = 2, /**< AFU data is the same endianness as the host */ +}; + // These are opaque outside the ocxl driver struct ocxl_afu; struct ocxl_fn; @@ -230,6 +236,110 @@ void ocxl_afu_set_private(struct ocxl_afu *afu, void *private); */ void *ocxl_afu_get_private(struct ocxl_afu *dev); +// Global MMIO +/** + * Read a 32 bit value from global MMIO + * + * @afu: The AFU + * @offset: The offset from the start of MMIO + * @endian: the endianness that the MMIO data is in + * @val: returns the value + * + * Returns 0 for success, negative on error + */ +int ocxl_global_mmio_read32(struct ocxl_afu *afu, size_t offset, + enum ocxl_endian endian, u32 *val); + +/** + * Read a 64 bit value from global MMIO + * + * @afu: The AFU + * @offset: The offset from the start of MMIO + * @endian: the endianness that the MMIO data is in + * @val: returns the value + * + * Returns 0 for success, negative on error + */ +int ocxl_global_mmio_read64(struct ocxl_afu *afu, size_t offset, + enum ocxl_endian endian, u64 *val); + +/** + * Write a 32 bit value to global MMIO + * + * @afu: The AFU + * @offset: The offset from the start of MMIO + * @endian: the endianness that the MMIO data is in + * @val: The value to write + * + * Returns 0 for success, negative on error + */ +int ocxl_global_mmio_write32(struct ocxl_afu *afu, size_t offset, + enum ocxl_endian endian, u32 val); + +/** + * Write a 64 bit value to global MMIO + * + * @afu: The AFU + * @offset: The offset from the start of MMIO + * @endian: the endianness that the MMIO data is in + * @val: The value to write + * + * Returns 0 for success, negative on error + */ +int ocxl_global_mmio_write64(struct ocxl_afu *afu, size_t offset, + enum ocxl_endian endian, u64 val); + +/** + * Set bits in a 32 bit global MMIO register + * + * @afu: The AFU + * @offset: The offset from the start of MMIO
+ * @endian: the endianness that the MMIO data is in + * @mask: a mask of the bits to set + * + * Returns 0 for success, negative on error + */ +int ocxl_global_mmio_set32(struct ocxl_afu *afu, size_t offset, + enum ocxl_endian endian, u32 mask); + +/** + * Set bits in a 64 bit global MMIO register + * + * @afu: The AFU + * @offset: The offset from the start of MMIO + * @endian: the endianness that the MMIO data is in + * @mask: a mask of the bits to set + * + * Returns 0 for success, negative on error + */ +int ocxl_global_mmio_set64(struct ocxl_afu *afu, size_t offset, + enum ocxl_endian endian, u64 mask); + +/** + * Clear bits in a 32 bit global MMIO register + * + * @afu: The AFU + * @offset: The offset from the start of MMIO + * @endian: the endianness that the MMIO data is in + * @mask: a mask of the bits to clear + * + * Returns 0 for success, negative on error + */ +int ocxl_global_mmio_clear32(struct ocxl_afu *afu, size_t offset, + enum ocxl_endian endian, u32 mask); + +/** + * Clear bits in a 64 bit global MMIO register + * + * @afu: The AFU + * @offset: The offset from the start of MMIO + * @endian: the endianness that the MMIO data is in + * @mask: a mask of the bits to clear + * + * Returns 0 for success, negative on error + */ +int ocxl_global_mmio_clear64(struct ocxl_afu *afu, size_t offset, + enum ocxl_endian endian, u64 mask); // Functions left here are for compatibility with the cxlflash driver From 5266e58d6cd90ac85c187d673093ad9cb649e16d Mon Sep 17 00:00:00 2001 From: Laurentiu Tudor Date: Mon, 15 Apr 2019 14:52:11 +0300 Subject: [PATCH 196/205] powerpc/booke64: set RI in default MSR Set RI in the default kernel's MSR so that the architected way of detecting unrecoverable machine check interrupts has a chance to work. This is in line with the MSR setup of the rest of the booke powerpc architectures configured here. Signed-off-by: Laurentiu Tudor Cc: stable@vger.kernel.org Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/reg_booke.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index eb2a33d5df26b..e382bd6ede845 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -41,7 +41,7 @@ #if defined(CONFIG_PPC_BOOK3E_64) #define MSR_64BIT MSR_CM -#define MSR_ (MSR_ME | MSR_CE) +#define MSR_ (MSR_ME | MSR_RI | MSR_CE) #define MSR_KERNEL (MSR_ | MSR_64BIT) #define MSR_USER32 (MSR_ | MSR_PR | MSR_EE) #define MSR_USER64 (MSR_USER32 | MSR_64BIT) From 83e367f9ad18d42a1883ee29f20608a2b93e1071 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 17 Jan 2019 15:01:54 -0200 Subject: [PATCH 197/205] selftests/powerpc: Add a signal fuzzer selftest This is a new selftest that raises SIGUSR1 signals and handles them in a set of different ways, trying to create different scenarios for testing purposes. This test works by raising a signal and calling sigreturn interleaved with TM operations, such as starting, suspending and terminating a transaction. The test depends on random numbers, and, based on them, it sets different TM states. Other than that, the test fills out the user context struct that is passed to the sigreturn system call with random data, in order to make sure that the signal handler syscall can handle different and invalid states properly.
This selftest has command line parameters to control what kind of tests the user wants to run, for example whether a transaction should be started prior to the signal being raised, or after the signal is raised and before the sigreturn. If no parameter is given, the default is to enable all options. This test does not check if the user context is being read and set properly by the kernel. Its purpose, at this time, is basically guaranteeing that the kernel does not crash on invalid scenarios. Signed-off-by: Breno Leitao Signed-off-by: Michael Ellerman --- tools/testing/selftests/powerpc/harness.c | 6 +- tools/testing/selftests/powerpc/include/reg.h | 2 + .../selftests/powerpc/signal/.gitignore | 1 + .../testing/selftests/powerpc/signal/Makefile | 3 +- .../testing/selftests/powerpc/signal/sigfuz.c | 325 ++++++++++++++++++ 5 files changed, 334 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/powerpc/signal/sigfuz.c diff --git a/tools/testing/selftests/powerpc/harness.c b/tools/testing/selftests/powerpc/harness.c index 9d7166dfad1ea..ba89353abfccb 100644 --- a/tools/testing/selftests/powerpc/harness.c +++ b/tools/testing/selftests/powerpc/harness.c @@ -21,6 +21,7 @@ #define KILL_TIMEOUT 5 +/* Setting timeout to -1 disables the alarm */ static uint64_t timeout = 120; int run_test(int (test_function)(void), char *name) @@ -43,8 +44,9 @@ int run_test(int (test_function)(void), char *name) setpgid(pid, pid); - /* Wake us up in timeout seconds */ - alarm(timeout); + if (timeout != -1) + /* Wake us up in timeout seconds */ + alarm(timeout); terminated = false; wait: diff --git a/tools/testing/selftests/powerpc/include/reg.h b/tools/testing/selftests/powerpc/include/reg.h index 96043b9b9829e..1e797ae396ee9 100644 --- a/tools/testing/selftests/powerpc/include/reg.h +++ b/tools/testing/selftests/powerpc/include/reg.h @@ -79,11 +79,13 @@ /* MSR register bits */ #define MSR_TS_S_LG 33 /* Trans Mem state: Suspended */ +#define MSR_TS_T_LG 34 /* Trans Mem state: Active */ #define __MASK(X) (1UL<<(X)) /* macro to check TM MSR bits */ #define MSR_TS_S __MASK(MSR_TS_S_LG) /* Transaction Suspended */ +#define MSR_TS_T __MASK(MSR_TS_T_LG) /* Transaction Transactional */ /* Vector Instructions */ #define VSX_XX1(xs, ra, rb) (((xs) & 0x1f) << 21 | ((ra) << 16) | \ diff --git a/tools/testing/selftests/powerpc/signal/.gitignore b/tools/testing/selftests/powerpc/signal/.gitignore index 1b89224a8aab0..dca5852a15468 100644 --- a/tools/testing/selftests/powerpc/signal/.gitignore +++ b/tools/testing/selftests/powerpc/signal/.gitignore @@ -1,2 +1,3 @@ signal signal_tm +sigfuz diff --git a/tools/testing/selftests/powerpc/signal/Makefile b/tools/testing/selftests/powerpc/signal/Makefile index 209a958dca127..113838fbbe7f7 100644 --- a/tools/testing/selftests/powerpc/signal/Makefile +++ b/tools/testing/selftests/powerpc/signal/Makefile @@ -1,8 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 -TEST_GEN_PROGS := signal signal_tm +TEST_GEN_PROGS := signal signal_tm sigfuz CFLAGS += -maltivec $(OUTPUT)/signal_tm: CFLAGS += -mhtm +$(OUTPUT)/sigfuz: CFLAGS += -pthread -m64 top_srcdir = ../../../../.. include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/signal/sigfuz.c b/tools/testing/selftests/powerpc/signal/sigfuz.c new file mode 100644 index 0000000000000..dade00c698c28 --- /dev/null +++ b/tools/testing/selftests/powerpc/signal/sigfuz.c @@ -0,0 +1,325 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2018, Breno Leitao, IBM Corp. + * Licensed under GPLv2.
+ * + * Sigfuz(tm): A PowerPC TM-aware signal fuzzer. + * + * This is a new selftest that raises SIGUSR1 signals and handles it in a set + * of different ways, trying to create different scenario for testing + * purpose. + * + * This test works raising a signal and calling sigreturn interleaved with + * TM operations, as starting, suspending and terminating a transaction. The + * test depends on random numbers, and, based on them, it sets different TM + * states. + * + * Other than that, the test fills out the user context struct that is passed + * to the sigreturn system call with random data, in order to make sure that + * the signal handler syscall can handle different and invalid states + * properly. + * + * This selftest has command line parameters to control what kind of tests the + * user wants to run, as for example, if a transaction should be started prior + * to signal being raised, or, after the signal being raised and before the + * sigreturn. If no parameter is given, the default is enabling all options. + * + * This test does not check if the user context is being read and set + * properly by the kernel. Its purpose, at this time, is basically + * guaranteeing that the kernel does not crash on invalid scenarios. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "utils.h" + +/* Selftest defaults */ +#define COUNT_MAX 4000 /* Number of interactions */ +#define THREADS 16 /* Number of threads */ + +/* Arguments options */ +#define ARG_MESS_WITH_TM_AT 0x1 +#define ARG_MESS_WITH_TM_BEFORE 0x2 +#define ARG_MESS_WITH_MSR_AT 0x4 +#define ARG_FOREVER 0x10 +#define ARG_COMPLETE (ARG_MESS_WITH_TM_AT | \ + ARG_MESS_WITH_TM_BEFORE | \ + ARG_MESS_WITH_MSR_AT) + +static int args; +static int nthread = THREADS; +static int count_max = COUNT_MAX; + +/* checkpoint context */ +static ucontext_t *tmp_uc; + +/* Return true with 1/x probability */ +static int one_in_chance(int x) +{ + return rand() % x == 0; +} + +/* Change TM states */ +static void mess_with_tm(void) +{ + /* Starts a transaction 33% of the time */ + if (one_in_chance(3)) { + asm ("tbegin. ;" + "beq 8 ;"); + + /* And suspended half of them */ + if (one_in_chance(2)) + asm("tsuspend. ;"); + } + + /* Call 'tend' in 5% of the runs */ + if (one_in_chance(20)) + asm("tend. 
;"); +} + +/* Signal handler that will be invoked with raise() */ +static void trap_signal_handler(int signo, siginfo_t *si, void *uc) +{ + ucontext_t *ucp = uc; + + ucp->uc_link = tmp_uc; + + /* + * Set uc_link in three possible ways: + * - Setting a single 'int' in the whole chunk + * - Cloning ucp into uc_link + * - Allocating a new memory chunk + */ + if (one_in_chance(3)) { + memset(ucp->uc_link, rand(), sizeof(ucontext_t)); + } else if (one_in_chance(2)) { + memcpy(ucp->uc_link, uc, sizeof(ucontext_t)); + } else if (one_in_chance(2)) { + if (tmp_uc) { + free(tmp_uc); + tmp_uc = NULL; + } + tmp_uc = malloc(sizeof(ucontext_t)); + ucp->uc_link = tmp_uc; + /* Trying to cause a major page fault at Kernel level */ + madvise(ucp->uc_link, sizeof(ucontext_t), MADV_DONTNEED); + } + + if (args & ARG_MESS_WITH_MSR_AT) { + /* Changing the checkpointed registers */ + if (one_in_chance(4)) { + ucp->uc_link->uc_mcontext.gp_regs[PT_MSR] |= MSR_TS_S; + } else { + if (one_in_chance(2)) { + ucp->uc_link->uc_mcontext.gp_regs[PT_MSR] |= + MSR_TS_T; + } else if (one_in_chance(2)) { + ucp->uc_link->uc_mcontext.gp_regs[PT_MSR] |= + MSR_TS_T | MSR_TS_S; + } + } + + /* Checking the current register context */ + if (one_in_chance(2)) { + ucp->uc_mcontext.gp_regs[PT_MSR] |= MSR_TS_S; + } else if (one_in_chance(2)) { + if (one_in_chance(2)) + ucp->uc_mcontext.gp_regs[PT_MSR] |= + MSR_TS_T; + else if (one_in_chance(2)) + ucp->uc_mcontext.gp_regs[PT_MSR] |= + MSR_TS_T | MSR_TS_S; + } + } + + if (one_in_chance(20)) { + /* Nested transaction start */ + if (one_in_chance(5)) + mess_with_tm(); + + /* Return without changing any other context info */ + return; + } + + if (one_in_chance(10)) + ucp->uc_mcontext.gp_regs[PT_MSR] = random(); + if (one_in_chance(10)) + ucp->uc_mcontext.gp_regs[PT_NIP] = random(); + if (one_in_chance(10)) + ucp->uc_link->uc_mcontext.gp_regs[PT_MSR] = random(); + if (one_in_chance(10)) + ucp->uc_link->uc_mcontext.gp_regs[PT_NIP] = random(); + + ucp->uc_mcontext.gp_regs[PT_TRAP] = random(); + ucp->uc_mcontext.gp_regs[PT_DSISR] = random(); + ucp->uc_mcontext.gp_regs[PT_DAR] = random(); + ucp->uc_mcontext.gp_regs[PT_ORIG_R3] = random(); + ucp->uc_mcontext.gp_regs[PT_XER] = random(); + ucp->uc_mcontext.gp_regs[PT_RESULT] = random(); + ucp->uc_mcontext.gp_regs[PT_SOFTE] = random(); + ucp->uc_mcontext.gp_regs[PT_DSCR] = random(); + ucp->uc_mcontext.gp_regs[PT_CTR] = random(); + ucp->uc_mcontext.gp_regs[PT_LNK] = random(); + ucp->uc_mcontext.gp_regs[PT_CCR] = random(); + ucp->uc_mcontext.gp_regs[PT_REGS_COUNT] = random(); + + ucp->uc_link->uc_mcontext.gp_regs[PT_TRAP] = random(); + ucp->uc_link->uc_mcontext.gp_regs[PT_DSISR] = random(); + ucp->uc_link->uc_mcontext.gp_regs[PT_DAR] = random(); + ucp->uc_link->uc_mcontext.gp_regs[PT_ORIG_R3] = random(); + ucp->uc_link->uc_mcontext.gp_regs[PT_XER] = random(); + ucp->uc_link->uc_mcontext.gp_regs[PT_RESULT] = random(); + ucp->uc_link->uc_mcontext.gp_regs[PT_SOFTE] = random(); + ucp->uc_link->uc_mcontext.gp_regs[PT_DSCR] = random(); + ucp->uc_link->uc_mcontext.gp_regs[PT_CTR] = random(); + ucp->uc_link->uc_mcontext.gp_regs[PT_LNK] = random(); + ucp->uc_link->uc_mcontext.gp_regs[PT_CCR] = random(); + ucp->uc_link->uc_mcontext.gp_regs[PT_REGS_COUNT] = random(); + + if (args & ARG_MESS_WITH_TM_BEFORE) { + if (one_in_chance(2)) + mess_with_tm(); + } +} + +static void seg_signal_handler(int signo, siginfo_t *si, void *uc) +{ + /* Clear exit for process that segfaults */ + exit(0); +} + +static void *sigfuz_test(void *thrid) +{ + struct sigaction trap_sa, seg_sa; 
+static void *sigfuz_test(void *thrid)
+{
+	struct sigaction trap_sa, seg_sa;
+	int ret, i = 0;
+	pid_t t;
+
+	tmp_uc = malloc(sizeof(ucontext_t));
+
+	/* Main signal handler */
+	trap_sa.sa_flags = SA_SIGINFO;
+	trap_sa.sa_sigaction = trap_signal_handler;
+
+	/* SIGSEGV signal handler */
+	seg_sa.sa_flags = SA_SIGINFO;
+	seg_sa.sa_sigaction = seg_signal_handler;
+
+	/* The signal handler will enable MSR_TS */
+	sigaction(SIGUSR1, &trap_sa, NULL);
+
+	/* If a child does not crash, it will segfault; catch that so we can retest */
+	sigaction(SIGSEGV, &seg_sa, NULL);
+
+	while (i < count_max) {
+		t = fork();
+
+		if (t == 0) {
+			/* One seed per process */
+			srand(time(NULL) + getpid());
+			if (args & ARG_MESS_WITH_TM_AT) {
+				if (one_in_chance(2))
+					mess_with_tm();
+			}
+			raise(SIGUSR1);
+			exit(0);
+		} else {
+			waitpid(t, &ret, 0);
+		}
+		if (!(args & ARG_FOREVER))
+			i++;
+	}
+
+	/* If not freed already, free now */
+	if (tmp_uc) {
+		free(tmp_uc);
+		tmp_uc = NULL;
+	}
+
+	return NULL;
+}
+
+static int signal_fuzzer(void)
+{
+	int t, rc;
+	pthread_t *threads;
+
+	threads = malloc(nthread * sizeof(pthread_t));
+
+	for (t = 0; t < nthread; t++) {
+		rc = pthread_create(&threads[t], NULL, sigfuz_test,
+				    (void *)&t);
+		if (rc)
+			perror("Thread creation error");
+	}
+
+	for (t = 0; t < nthread; t++) {
+		rc = pthread_join(threads[t], NULL);
+		if (rc)
+			perror("Thread join error");
+	}
+
+	free(threads);
+
+	return EXIT_SUCCESS;
+}
+
+static void show_help(char *name)
+{
+	printf("%s: Signal fuzzer for powerpc\n", name);
+	printf("Usage:\n");
+	printf("\t-b\t Mess with TM before raising a SIGUSR1 signal\n");
+	printf("\t-a\t Mess with TM after raising a SIGUSR1 signal\n");
+	printf("\t-m\t Mess with MSR[TS] bits at mcontext\n");
+	printf("\t-x\t Mess with everything above\n");
+	printf("\t-f\t Run forever (press ^C to quit)\n");
+	printf("\t-i\t Number of iterations. (Default = %d)\n", COUNT_MAX);
+	printf("\t-t\t Number of threads. (Default = %d)\n", THREADS);
+	exit(-1);
+}
+
+int main(int argc, char **argv)
+{
+	int opt;
+
+	while ((opt = getopt(argc, argv, "bamxt:fi:h")) != -1) {
+		if (opt == 'b') {
+			printf("Mess with TM before signal\n");
+			args |= ARG_MESS_WITH_TM_BEFORE;
+		} else if (opt == 'a') {
+			printf("Mess with TM at signal handler\n");
+			args |= ARG_MESS_WITH_TM_AT;
+		} else if (opt == 'm') {
+			printf("Mess with MSR[TS] bits in mcontext\n");
+			args |= ARG_MESS_WITH_MSR_AT;
+		} else if (opt == 'x') {
+			printf("Running with all options enabled\n");
+			args |= ARG_COMPLETE;
+		} else if (opt == 't') {
+			nthread = atoi(optarg);
+			printf("Threads = %d\n", nthread);
+		} else if (opt == 'f') {
+			args |= ARG_FOREVER;
+			printf("Press ^C to stop\n");
+			test_harness_set_timeout(-1);
+		} else if (opt == 'i') {
+			count_max = atoi(optarg);
+			printf("Running for %d iterations\n", count_max);
+		} else if (opt == 'h') {
+			show_help(argv[0]);
+		}
+	}
+
+	/* Default test suite */
+	if (!args)
+		args = ARG_COMPLETE;
+
+	return test_harness(signal_fuzzer, "signal_fuzzer");
+}
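The fuzzer above drives everything through chained one_in_chance() tests, so
its branches are not equally weighted. Assuming the helper behaves as its
name suggests (true with probability 1/x, as defined earlier in sigfuz.c),
the following is a standalone sketch of the selection pattern used in
trap_signal_handler(); the names are illustrative only, not part of the
selftest.

	#include <stdio.h>
	#include <stdlib.h>

	/* Assumed behaviour of the sigfuz.c helper: true with probability 1/x */
	static int one_in_chance(int x)
	{
		return rand() % x == 0;
	}

	int main(void)
	{
		int hits[4] = { 0, 0, 0, 0 };
		int i;

		srand(1);
		for (i = 0; i < 1000000; i++) {
			if (one_in_chance(3))
				hits[0]++;	/* memset path,  ~1/3         */
			else if (one_in_chance(2))
				hits[1]++;	/* memcpy path,  2/3 * 1/2 = 1/3 */
			else if (one_in_chance(2))
				hits[2]++;	/* malloc path,  1/3 * 1/2 = 1/6 */
			else
				hits[3]++;	/* untouched,    remaining  1/6 */
		}
		printf("memset %d, memcpy %d, malloc %d, untouched %d\n",
		       hits[0], hits[1], hits[2], hits[3]);
		return 0;
	}

So roughly one run in six leaves the uc_link chunk untouched, which is itself
a useful case for the fuzzer.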
From 305d60012304684bd59ea1f67703e51662e4906a Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Mon, 6 May 2019 06:21:00 +0000
Subject: [PATCH 198/205] powerpc/kasan: add missing/lost Makefile

For an unknown reason (aka. mpe is a doofus), the new Makefile added via
the KASAN support patch didn't land in arch/powerpc/mm/kasan/.

This patch restores it.

Fixes: 2edb16efc899 ("powerpc/32: Add KASAN support")
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/kasan/Makefile | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 arch/powerpc/mm/kasan/Makefile

diff --git a/arch/powerpc/mm/kasan/Makefile b/arch/powerpc/mm/kasan/Makefile
new file mode 100644
index 0000000000000..6577897673dda
--- /dev/null
+++ b/arch/powerpc/mm/kasan/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+KASAN_SANITIZE := n
+
+obj-$(CONFIG_PPC32) += kasan_init_32.o

From 471e475c69a1689e059b5e57e893a7da75d2831a Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Mon, 6 May 2019 06:21:01 +0000
Subject: [PATCH 199/205] powerpc/mm: Fix makefile for KASAN

In commit 17312f258cf6 ("powerpc/mm: Move book3s32 specifics in
subdirectory mm/book3s64"), ppc_mmu_32.c was moved and renamed.

This patch fixes the Makefiles so that KASAN instrumentation stays
disabled for the file under its new name and location.

Fixes: f072015c7b74 ("powerpc: disable KASAN instrumentation on early/critical files.")
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/Makefile          | 6 ------
 arch/powerpc/mm/book3s32/Makefile | 6 ++++++
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index d8c0ce9b25576..7a7527116c3ae 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -5,12 +5,6 @@

 ccflags-$(CONFIG_PPC64)	:= $(NO_MINIMAL_TOC)

-KASAN_SANITIZE_ppc_mmu_32.o := n
-
-ifdef CONFIG_KASAN
-CFLAGS_ppc_mmu_32.o += -DDISABLE_BRANCH_PROFILING
-endif
-
 obj-y				:= fault.o mem.o pgtable.o mmap.o \
 				   init_$(BITS).o pgtable_$(BITS).o \
 				   pgtable-frag.o \
diff --git a/arch/powerpc/mm/book3s32/Makefile b/arch/powerpc/mm/book3s32/Makefile
index a4e217d0f3b73..1732eaa740a99 100644
--- a/arch/powerpc/mm/book3s32/Makefile
+++ b/arch/powerpc/mm/book3s32/Makefile
@@ -1,3 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0

+KASAN_SANITIZE_mmu.o := n
+
+ifdef CONFIG_KASAN
+CFLAGS_mmu.o += -DDISABLE_BRANCH_PROFILING
+endif
+
 obj-y += mmu.o hash_low.o mmu_context.o tlb.o

From c4e31847a5490d52ddd44440a524e8355be11ec1 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Mon, 6 May 2019 06:47:55 +0000
Subject: [PATCH 200/205] powerpc/mm: fix redundant inclusion of
 pgtable-frag.o in Makefile

The patch identified below added pgtable-frag.o to obj-y, but some
merge witchery kept it in obj-$(CONFIG_PPC_BOOK3S_64) as well.

This patch removes the duplication.

Fixes: 737b434d3d55 ("powerpc/mm: convert Book3E 64 to pte_fragment")
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/Makefile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 7a7527116c3ae..0f499db315d60 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -12,7 +12,6 @@ obj-y				:= fault.o mem.o pgtable.o mmap.o \
 obj-$(CONFIG_PPC_MMU_NOHASH)	+= nohash/
 obj-$(CONFIG_PPC_BOOK3S_32)	+= book3s32/
 obj-$(CONFIG_PPC_BOOK3S_64)	+= book3s64/
-obj-$(CONFIG_PPC_BOOK3S_64)	+= pgtable-frag.o
 obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
 obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
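A word on the -DDISABLE_BRANCH_PROFILING flag the Makefile fixes above
preserve: with CONFIG_TRACE_BRANCH_PROFILING, likely()/unlikely() expand to
wrappers that update a per-call-site counter on every branch, and KASAN
likewise instruments every memory access, so neither may run in code that
executes before the MMU and the KASAN shadow are set up. The sketch below is
a deliberately simplified, hypothetical model of the profiling side, not the
kernel's actual macros (those live in include/linux/compiler.h).

	#include <stdio.h>

	/* Simplified stand-in for the kernel's annotated-branch profiling */
	struct branch_stat {
		const char *func;
		unsigned long hit, miss;
	};

	static struct branch_stat stat_demo = { "demo", 0, 0 };

	/*
	 * The counter update touches global data on every branch; in early
	 * boot, or in KASAN-instrumented form, that is exactly what must
	 * not run, hence the per-file opt-outs in the Makefiles.
	 */
	#define profiled_likely(cond, st) \
		({ int _c = !!(cond); if (_c) (st)->hit++; else (st)->miss++; _c; })

	static int demo(int x)
	{
		if (profiled_likely(x > 0, &stat_demo))
			return 1;
		return 0;
	}

	int main(void)
	{
		demo(5);
		demo(-1);
		printf("%s: hit=%lu miss=%lu\n", stat_demo.func,
		       stat_demo.hit, stat_demo.miss);
		return 0;
	}

Disabling instrumentation per object file, as these Makefiles do, is the
kernel's standard way of keeping such early code free of these hooks.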
From 67d53f30e23ec66aa7bbdd1592d5e64d46876190 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Mon, 6 May 2019 08:10:43 +0000
Subject: [PATCH 201/205] powerpc/mm: fix section mismatch for setup_kup()

commit b28c97505eb1 ("powerpc/64: Setup KUP on secondary CPUs") moved
setup_kup() out of the __init section. As stated in that commit, "this
is only for 64-bit". But the function is also used on PPC32, where the
two functions it calls are in the __init section, so setup_kup() has to
either be kept in the __init section on PPC32 or be marked __ref.

This patch marks it __ref, which fixes the build warnings below.

  MODPOST vmlinux.o
WARNING: vmlinux.o(.text+0x169ec): Section mismatch in reference from the function setup_kup() to the function .init.text:setup_kuep()
The function setup_kup() references
the function __init setup_kuep().
This is often because setup_kup lacks a __init
annotation or the annotation of setup_kuep is wrong.

WARNING: vmlinux.o(.text+0x16a04): Section mismatch in reference from the function setup_kup() to the function .init.text:setup_kuap()
The function setup_kup() references
the function __init setup_kuap().
This is often because setup_kup lacks a __init
annotation or the annotation of setup_kuap is wrong.

Fixes: b28c97505eb1 ("powerpc/64: Setup KUP on secondary CPUs")
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/init-common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index 6ea5607fc5646..3bcae9e5e9545 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -45,7 +45,7 @@ static int __init parse_nosmap(char *p)
 }
 early_param("nosmap", parse_nosmap);

-void setup_kup(void)
+void __ref setup_kup(void)
 {
 	setup_kuep(disable_kuep);
 	setup_kuap(disable_kuap);

From 6be6a8de1b55e719e3f95894910743719065d6a1 Mon Sep 17 00:00:00 2001
From: Wei Yongjun
Date: Sat, 4 May 2019 07:04:30 +0000
Subject: [PATCH 202/205] ocxl: Fix return value check in afu_ioctl()

In case of error, eventfd_ctx_fdget() returns an ERR_PTR() value and
never NULL. The NULL test in the return-value check should therefore be
replaced with IS_ERR().

This issue was detected with Coccinelle.

Fixes: 060146614643 ("ocxl: move event_fd handling to frontend")
Signed-off-by: Wei Yongjun
Acked-by: Alastair D'Silva
Acked-by: Andrew Donnellan
Signed-off-by: Michael Ellerman
---
 drivers/misc/ocxl/file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c
index 8aa22893ed768..2870c25da166f 100644
--- a/drivers/misc/ocxl/file.c
+++ b/drivers/misc/ocxl/file.c
@@ -257,8 +257,8 @@ static long afu_ioctl(struct file *file, unsigned int cmd,
 			return -EINVAL;
 		irq_id = ocxl_irq_offset_to_id(ctx, irq_fd.irq_offset);
 		ev_ctx = eventfd_ctx_fdget(irq_fd.eventfd);
-		if (!ev_ctx)
-			return -EFAULT;
+		if (IS_ERR(ev_ctx))
+			return PTR_ERR(ev_ctx);
 		rc = ocxl_irq_set_handler(ctx, irq_id, irq_handler, irq_free, ev_ctx);
 		break;
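The fix above is an instance of a general kernel convention: functions like
eventfd_ctx_fdget() report failure through an ERR_PTR()-encoded errno rather
than NULL, so callers must test with IS_ERR() and propagate PTR_ERR(). Below
is a minimal user-space model of the convention, simplified from
include/linux/err.h, with a hypothetical ctx_fdget() standing in for the
real call.

	#include <stdio.h>
	#include <errno.h>

	/* Simplified user-space model of include/linux/err.h */
	#define MAX_ERRNO	4095

	static inline void *ERR_PTR(long error)
	{
		return (void *)error;
	}

	static inline long PTR_ERR(const void *ptr)
	{
		return (long)ptr;
	}

	static inline int IS_ERR(const void *ptr)
	{
		/* Errors live in the top 4095 values of the address space */
		return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
	}

	static int dummy_ctx;

	/* Hypothetical lookup that fails like eventfd_ctx_fdget():
	 * with an encoded errno, never with NULL. */
	static void *ctx_fdget(int fd)
	{
		if (fd < 0)
			return ERR_PTR(-EBADF);
		return &dummy_ctx;	/* stand-in for a real context */
	}

	int main(void)
	{
		void *ctx = ctx_fdget(-1);

		if (IS_ERR(ctx))	/* a NULL check here would miss the error */
			printf("failed: %ld\n", PTR_ERR(ctx));
		return 0;
	}

Testing such a return value against NULL, as the old ocxl code did, always
succeeds and lets an error pointer escape into later dereferences.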
From 04a1942933ced67d2b73c156017bf13476b7146b Mon Sep 17 00:00:00 2001
From: Sachin Sant
Date: Mon, 6 May 2019 17:33:33 +0530
Subject: [PATCH 203/205] powerpc/mm: Fix hugetlb page initialization

This patch fixes a regression by using the correct kernel config symbol
for HUGETLB_PAGE_SIZE_VARIABLE. Without this, huge pages are disabled
during kernel boot:

[0.309496] hugetlbfs: disabling because there are no supported hugepage sizes

Fixes: c5710cd20735 ("powerpc/mm: cleanup HPAGE_SHIFT setup")
Reported-by: Sachin Sant
Signed-off-by: Michael Ellerman
Tested-by: Sachin Sant
Reviewed-by: Christophe Leroy
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/hugetlbpage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 98db5ec6a1ddf..c5c9ff2d7afcb 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -640,7 +640,7 @@ static int __init hugetlbpage_init(void)
 			pgtable_cache_add(PTE_T_ORDER);
 	}

-	if (IS_ENABLED(HUGETLB_PAGE_SIZE_VARIABLE))
+	if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE))
 		hugetlbpage_init_default();

 	return 0;
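The bug fixed above is easy to make because IS_ENABLED() fails silently:
handed a token that is not defined at all (here, the symbol missing its
CONFIG_ prefix), it evaluates to 0 with no compile error. Below is a
trimmed-down model of the macro trick; the kernel's real version, in
include/linux/kconfig.h, also handles the =m (module) case.

	#include <stdio.h>

	/*
	 * Trimmed-down model of include/linux/kconfig.h. An option defined
	 * as 1 matches the placeholder and yields 1; any undefined token
	 * falls through and yields 0.
	 */
	#define __ARG_PLACEHOLDER_1 0,
	#define __take_second_arg(__ignored, val, ...) val
	#define __is_defined(x)			___is_defined(x)
	#define ___is_defined(val)		____is_defined(__ARG_PLACEHOLDER_##val)
	#define ____is_defined(arg1_or_junk)	__take_second_arg(arg1_or_junk 1, 0)
	#define IS_ENABLED(option)		__is_defined(option)

	#define CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 1

	int main(void)
	{
		/* Correct spelling: expands through the placeholder to 1 */
		printf("%d\n", IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE));
		/* Missing CONFIG_ prefix: undefined token, silently 0 */
		printf("%d\n", IS_ENABLED(HUGETLB_PAGE_SIZE_VARIABLE));
		return 0;
	}

That silence is by design, since options disabled in Kconfig are simply not
defined, but it also means a typo in the option name compiles cleanly and
quietly disables the feature, exactly as happened here.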
From f39356261c265a0689d7ee568132d516e8b6cecc Mon Sep 17 00:00:00 2001
From: Rick Lindsley
Date: Sun, 5 May 2019 17:20:43 -0700
Subject: [PATCH 204/205] powerpc/book3s/64: check for NULL pointer in
 pgd_alloc()

When the memset code was added to pgd_alloc(), it failed to consider
that kmem_cache_alloc() can return NULL. That is uncommon, but not
impossible under heavy memory contention. Example oops:

  Unable to handle kernel paging request for data at address 0x00000000
  Faulting instruction address: 0xc0000000000a4000
  Oops: Kernel access of bad area, sig: 11 [#1]
  LE SMP NR_CPUS=2048 NUMA pSeries
  CPU: 70 PID: 48471 Comm: entrypoint.sh Kdump: loaded Not tainted 4.14.0-115.6.1.el7a.ppc64le #1
  task: c000000334a00000 task.stack: c000000331c00000
  NIP: c0000000000a4000 LR: c00000000012f43c CTR: 0000000000000020
  REGS: c000000331c039c0 TRAP: 0300 Not tainted (4.14.0-115.6.1.el7a.ppc64le)
  MSR: 800000010280b033 CR: 44022840 XER: 20040000
  CFAR: c000000000008874 DAR: 0000000000000000 DSISR: 42000000 SOFTE: 1
  ...
  NIP [c0000000000a4000] memset+0x68/0x104
  LR [c00000000012f43c] mm_init+0x27c/0x2f0
  Call Trace:
    mm_init+0x260/0x2f0 (unreliable)
    copy_mm+0x11c/0x638
    copy_process.isra.28.part.29+0x6fc/0x1080
    _do_fork+0xdc/0x4c0
    ppc_clone+0x8/0xc
  Instruction dump:
  409e000c b0860000 38c60002 409d000c 90860000 38c60004 78a0d183 78a506a0
  7c0903a6 41820034 60000000 60420000 f8860008 f8860010 f8860018

Fixes: fc5c2f4a55a2 ("powerpc/mm/hash64: Zero PGD pages on allocation")
Cc: stable@vger.kernel.org # v4.16+
Signed-off-by: Rick Lindsley
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/book3s/64/pgalloc.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
index 053a7940504ef..d45e4449619f5 100644
--- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
@@ -59,6 +59,9 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 	pgd = kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE),
 			       pgtable_gfp_flags(mm, GFP_KERNEL));
+	if (unlikely(!pgd))
+		return pgd;
+
 	/*
 	 * Don't scan the PGD for pointers, it contains references to PUDs but
 	 * those references are not full pointers and so can't be recognised by

From 8150a153c013aa2dd1ffae43370b89ac1347a7fb Mon Sep 17 00:00:00 2001
From: Michael Ellerman
Date: Wed, 8 May 2019 13:06:42 +1000
Subject: [PATCH 205/205] powerpc/64s: Use early_mmu_has_feature() in
 set_kuap()

When implementing the KUAP support on Radix we fixed one case where
mmu_has_feature() was being called too early in boot via
__put_user_size(). However, since then some new code in linux-next has
created a new path via which we can end up calling mmu_has_feature()
too early.

On P9 this leads to crashes early in boot if we have both
CONFIG_PPC_KUAP and CONFIG_JUMP_LABEL_FEATURE_CHECK_DEBUG enabled.

Our early boot code calls printk(), which calls probe_kernel_read();
that does a __copy_from_user_inatomic(), which calls into set_kuap(),
and that uses mmu_has_feature(). At that point in boot we haven't
patched MMU features yet, so the debug code in mmu_has_feature()
complains, and calls printk(). At that point we recurse, eg:

  ...
  dump_stack+0xdc
  probe_kernel_read+0x1a4
  check_pointer+0x58
  ...
  printk+0x40
  dump_stack_print_info+0xbc
  dump_stack+0x8
  probe_kernel_read+0x1a4
  probe_kernel_read+0x19c
  check_pointer+0x58
  ...
  printk+0x40
  cpufeatures_process_feature+0xc8
  scan_cpufeatures_subnodes+0x380
  of_scan_flat_dt_subnodes+0xb4
  dt_cpu_ftrs_scan_callback+0x158
  of_scan_flat_dt+0xf0
  dt_cpu_ftrs_scan+0x3c
  early_init_devtree+0x360
  early_setup+0x9c

And so on for infinity; the symptom is a dead system.

Even more fun is what happens when using the hash MMU (i.e. P8, or P9
with Radix disabled) and CONFIG_JUMP_LABEL_FEATURE_CHECK_DEBUG is not
enabled. With the debug disabled we don't check if static keys have
been initialised; we just rely on the jump label. But the jump label
defaults to true, so we just whack the AMR even though Radix is not
enabled.

Clearing the AMR is fine, but after we've done the user copy we write
(0b11 << 62) into AMR. When using hash, that makes all pages with key
zero no longer readable or writable. All kernel pages implicitly have
key zero, so all of a sudden the kernel can't read or write any of its
memory. Again, a dead system.

In the medium term we have several options for fixing this.
probe_kernel_read() doesn't need to touch the AMR at all, since it's
not doing a user access; it only uses __copy_from_user_inatomic()
because that's easy. We could fix that. It would also be safe to
default to not writing to the AMR during early boot, until we've
detected features. But it's not clear that flipping all the MMU
features to static_key_false won't introduce other bugs.

But for now, just switch to early_mmu_has_feature() in set_kuap();
that avoids all the problems with jump labels. It adds the overhead of
a global lookup and test, but that's probably trivial compared to the
writes to the AMR anyway.

Fixes: 890274c2dc4c ("powerpc/64s: Implement KUAP for Radix MMU")
Signed-off-by: Michael Ellerman
Reviewed-by: Russell Currey
---
 arch/powerpc/include/asm/book3s/64/kup-radix.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h
index 7679bd0c5af0d..f254de956d6ac 100644
--- a/arch/powerpc/include/asm/book3s/64/kup-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h
@@ -65,7 +65,7 @@
 static inline void set_kuap(unsigned long value)
 {
-	if (!mmu_has_feature(MMU_FTR_RADIX_KUAP))
+	if (!early_mmu_has_feature(MMU_FTR_RADIX_KUAP))
 		return;

 	/*