diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index ff9cbb6312128..4306136ef329d 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -4,5 +4,4 @@ generic-y += local64.h generic-y += mcs_spinlock.h generic-y += qrwlock.h generic-y += qspinlock.h -generic-y += set_memory.h generic-y += user.h diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 45217f21f1fe3..b1bdf83a73db9 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -136,12 +136,6 @@ static __always_inline void __flush_icache_all(void) dsb(ish); } -int set_memory_valid(unsigned long addr, int numpages, int enable); - -int set_direct_map_invalid_noflush(struct page *page); -int set_direct_map_default_noflush(struct page *page); -bool kernel_page_present(struct page *page); - #include #endif /* __ASM_CACHEFLUSH_H */ diff --git a/arch/arm64/include/asm/kfence.h b/arch/arm64/include/asm/kfence.h index 6c0afeeab6356..c44bb368a810e 100644 --- a/arch/arm64/include/asm/kfence.h +++ b/arch/arm64/include/asm/kfence.h @@ -3,7 +3,7 @@ #ifndef __ASM_KFENCE_H #define __ASM_KFENCE_H -#include +#include static inline bool arch_kfence_init_pool(void) { return true; } diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h new file mode 100644 index 0000000000000..ecb6b0f449ab0 --- /dev/null +++ b/arch/arm64/include/asm/set_memory.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _ASM_ARM64_SET_MEMORY_H +#define _ASM_ARM64_SET_MEMORY_H + +#include + +bool can_set_direct_map(void); +#define can_set_direct_map can_set_direct_map + +int set_memory_valid(unsigned long addr, int numpages, int enable); + +int set_direct_map_invalid_noflush(struct page *page, int numpages); +int set_direct_map_default_noflush(struct page *page, int numpages); +bool kernel_page_present(struct page *page); + +#endif /* _ASM_ARM64_SET_MEMORY_H */ diff --git a/arch/arm64/include/uapi/asm/unistd.h b/arch/arm64/include/uapi/asm/unistd.h index f83a70e07df85..ce2ee8f1e3610 100644 --- a/arch/arm64/include/uapi/asm/unistd.h +++ b/arch/arm64/include/uapi/asm/unistd.h @@ -20,5 +20,6 @@ #define __ARCH_WANT_SET_GET_RLIMIT #define __ARCH_WANT_TIME32_SYSCALLS #define __ARCH_WANT_SYS_CLONE3 +#define __ARCH_WANT_MEMFD_SECRET #include diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index a0b144cfaea71..0cbc50c4fa5a4 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 30c6dd02e7061..79604049fff5f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -492,7 +493,7 @@ static void __init map_mem(pgd_t *pgdp) int flags = 0; u64 i; - if (rodata_full || crash_mem_map || debug_pagealloc_enabled()) + if (can_set_direct_map() || crash_mem_map) flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; /* @@ -1468,8 +1469,7 @@ int arch_add_memory(int nid, u64 start, u64 size, * KFENCE requires linear map to be mapped at page granularity, so that * it is possible to protect/unprotect single pages in the KFENCE pool. 
*/ - if (rodata_full || debug_pagealloc_enabled() || - IS_ENABLED(CONFIG_KFENCE)) + if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE)) flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 92eccaf595c8e..d505172265b09 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -19,6 +19,11 @@ struct page_change_data { bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED); +bool can_set_direct_map(void) +{ + return rodata_full || debug_pagealloc_enabled(); +} + static int change_page_range(pte_t *ptep, unsigned long addr, void *data) { struct page_change_data *cdata = data; @@ -148,40 +153,42 @@ int set_memory_valid(unsigned long addr, int numpages, int enable) __pgprot(PTE_VALID)); } -int set_direct_map_invalid_noflush(struct page *page) +int set_direct_map_invalid_noflush(struct page *page, int numpages) { struct page_change_data data = { .set_mask = __pgprot(0), .clear_mask = __pgprot(PTE_VALID), }; + unsigned long size = PAGE_SIZE * numpages; - if (!debug_pagealloc_enabled() && !rodata_full) + if (!can_set_direct_map()) return 0; return apply_to_page_range(&init_mm, (unsigned long)page_address(page), - PAGE_SIZE, change_page_range, &data); + size, change_page_range, &data); } -int set_direct_map_default_noflush(struct page *page) +int set_direct_map_default_noflush(struct page *page, int numpages) { struct page_change_data data = { .set_mask = __pgprot(PTE_VALID | PTE_WRITE), .clear_mask = __pgprot(PTE_RDONLY), }; + unsigned long size = PAGE_SIZE * numpages; - if (!debug_pagealloc_enabled() && !rodata_full) + if (!can_set_direct_map()) return 0; return apply_to_page_range(&init_mm, (unsigned long)page_address(page), - PAGE_SIZE, change_page_range, &data); + size, change_page_range, &data); } #ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { - if (!debug_pagealloc_enabled() && !rodata_full) + if (!can_set_direct_map()) return; set_memory_valid((unsigned long)page_address(page), numpages, enable); @@ -206,7 +213,7 @@ bool kernel_page_present(struct page *page) pte_t *ptep; unsigned long addr = (unsigned long)page_address(page); - if (!debug_pagealloc_enabled() && !rodata_full) + if (!can_set_direct_map()) return true; pgdp = pgd_offset_k(addr); diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h index 211eb8244a454..1aaf2720b8f6b 100644 --- a/arch/riscv/include/asm/set_memory.h +++ b/arch/riscv/include/asm/set_memory.h @@ -26,8 +26,8 @@ static inline void protect_kernel_text_data(void) {}; static inline int set_memory_rw_nx(unsigned long addr, int numpages) { return 0; } #endif -int set_direct_map_invalid_noflush(struct page *page); -int set_direct_map_default_noflush(struct page *page); +int set_direct_map_invalid_noflush(struct page *page, int numpages); +int set_direct_map_default_noflush(struct page *page, int numpages); bool kernel_page_present(struct page *page); #endif /* __ASSEMBLY__ */ diff --git a/arch/riscv/include/asm/unistd.h b/arch/riscv/include/asm/unistd.h index 977ee6181dabf..6c316093a1e59 100644 --- a/arch/riscv/include/asm/unistd.h +++ b/arch/riscv/include/asm/unistd.h @@ -9,6 +9,7 @@ */ #define __ARCH_WANT_SYS_CLONE +#define __ARCH_WANT_MEMFD_SECRET #include diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c index 5e49e4b4a4ccc..9618181b70be4 100644 --- a/arch/riscv/mm/pageattr.c +++ 
b/arch/riscv/mm/pageattr.c @@ -156,11 +156,11 @@ int set_memory_nx(unsigned long addr, int numpages) return __set_memory(addr, numpages, __pgprot(0), __pgprot(_PAGE_EXEC)); } -int set_direct_map_invalid_noflush(struct page *page) +int set_direct_map_invalid_noflush(struct page *page, int numpages) { int ret; unsigned long start = (unsigned long)page_address(page); - unsigned long end = start + PAGE_SIZE; + unsigned long end = start + PAGE_SIZE * numpages; struct pageattr_masks masks = { .set_mask = __pgprot(0), .clear_mask = __pgprot(_PAGE_PRESENT) @@ -173,11 +173,11 @@ int set_direct_map_invalid_noflush(struct page *page) return ret; } -int set_direct_map_default_noflush(struct page *page) +int set_direct_map_default_noflush(struct page *page, int numpages) { int ret; unsigned long start = (unsigned long)page_address(page); - unsigned long end = start + PAGE_SIZE; + unsigned long end = start + PAGE_SIZE * numpages; struct pageattr_masks masks = { .set_mask = PAGE_KERNEL, .clear_mask = __pgprot(0) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6ea13f4ad07e9..42d84d86f1f46 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -42,7 +42,7 @@ config FORCE_DYNAMIC_FTRACE in order to test the non static function tracing in the generic code, as other architectures still use it. But we only need to keep it around for x86_64. No need to keep it - for x86_32. For x86_32, force DYNAMIC_FTRACE. + for x86_32. For x86_32, force DYNAMIC_FTRACE. # # Arch settings # diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 02a349afaf9c3..a1578cdf6d91b 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -447,3 +447,4 @@ 440 i386 process_madvise sys_process_madvise 441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 i386 watch_mount sys_watch_mount +443 i386 memfd_secret sys_memfd_secret diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index d9bcc4e025889..d8ecd9df0942c 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -364,6 +364,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 442 common watch_mount sys_watch_mount +443 common memfd_secret sys_memfd_secret # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 4352f08bfbb54..6224cb291f6cd 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -80,8 +80,8 @@ int set_pages_wb(struct page *page, int numpages); int set_pages_ro(struct page *page, int numpages); int set_pages_rw(struct page *page, int numpages); -int set_direct_map_invalid_noflush(struct page *page); -int set_direct_map_default_noflush(struct page *page); +int set_direct_map_invalid_noflush(struct page *page, int numpages); +int set_direct_map_default_noflush(struct page *page, int numpages); bool kernel_page_present(struct page *page); extern int kernel_set_to_readonly; diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 16f878c266679..d157fd617c999 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2184,14 +2184,14 @@ static int __set_pages_np(struct page *page, int numpages) return __change_page_attr_set_clr(&cpa, 0); } -int set_direct_map_invalid_noflush(struct page *page) +int set_direct_map_invalid_noflush(struct page *page, int numpages) 
{ - return __set_pages_np(page, 1); + return __set_pages_np(page, numpages); } -int set_direct_map_default_noflush(struct page *page) +int set_direct_map_default_noflush(struct page *page, int numpages) { - return __set_pages_p(page, 1); + return __set_pages_p(page, numpages); } #ifdef CONFIG_DEBUG_PAGEALLOC diff --git a/fs/dax.c b/fs/dax.c index 26d5dcd2d69e5..0f109eb161963 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -49,9 +49,6 @@ static inline unsigned int pe_order(enum page_entry_size pe_size) #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) -/* The order of a PMD entry */ -#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) - static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; static int __init init_dax_wait_table(void) @@ -98,7 +95,7 @@ static bool dax_is_locked(void *entry) static unsigned int dax_entry_order(void *entry) { if (xa_to_value(entry) & DAX_PMD) - return PMD_ORDER; + return PMD_PAGE_ORDER; return 0; } @@ -1470,7 +1467,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; - XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER); + XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_PAGE_ORDER); unsigned long pmd_addr = vmf->address & PMD_MASK; bool write = vmf->flags & FAULT_FLAG_WRITE; bool sync; @@ -1529,7 +1526,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * entry is already in the array, for instance), it will return * VM_FAULT_FALLBACK. */ - entry = grab_mapping_entry(&xas, mapping, PMD_ORDER); + entry = grab_mapping_entry(&xas, mapping, PMD_PAGE_ORDER); if (xa_is_internal(entry)) { result = xa_to_internal(entry); goto fallback; @@ -1695,7 +1692,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) if (order == 0) ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); #ifdef CONFIG_FS_DAX_PMD - else if (order == PMD_ORDER) + else if (order == PMD_PAGE_ORDER) ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE); #endif else diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d827bd7f3bfe3..3febf64d1b80c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -475,19 +475,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } -/* - * set_page_objcgs - associate a page with a object cgroups vector - * @page: a pointer to the page struct - * @objcgs: a pointer to the object cgroups vector - * - * Atomically associates a page with a vector of object cgroups. 
- */ -static inline bool set_page_objcgs(struct page *page, - struct obj_cgroup **objcgs) -{ - return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs | - MEMCG_DATA_OBJCGS); -} #else static inline struct obj_cgroup **page_objcgs(struct page *page) { @@ -498,12 +485,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) { return NULL; } - -static inline bool set_page_objcgs(struct page *page, - struct obj_cgroup **objcgs) -{ - return true; -} #endif static __always_inline bool memcg_stat_item_in_bytes(int idx) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 8fcdfa52eb4be..ea5c4102c23e2 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -28,6 +28,9 @@ #define USER_PGTABLES_CEILING 0UL #endif +/* Number of base pages in a second level leaf page */ +#define PMD_PAGE_ORDER (PMD_SHIFT - PAGE_SHIFT) + /* * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD] * diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h new file mode 100644 index 0000000000000..907a6734059c9 --- /dev/null +++ b/include/linux/secretmem.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_SECRETMEM_H +#define _LINUX_SECRETMEM_H + +#ifdef CONFIG_SECRETMEM + +bool vma_is_secretmem(struct vm_area_struct *vma); +bool page_is_secretmem(struct page *page); +bool secretmem_active(void); + +#else + +static inline bool vma_is_secretmem(struct vm_area_struct *vma) +{ + return false; +} + +static inline bool page_is_secretmem(struct page *page) +{ + return false; +} + +static inline bool secretmem_active(void) +{ + return false; +} + +#endif /* CONFIG_SECRETMEM */ + +#endif /* _LINUX_SECRETMEM_H */ diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index fe1aa4e54680d..7b4b6626032dd 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -15,11 +15,11 @@ static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } #endif #ifndef CONFIG_ARCH_HAS_SET_DIRECT_MAP -static inline int set_direct_map_invalid_noflush(struct page *page) +static inline int set_direct_map_invalid_noflush(struct page *page, int numpages) { return 0; } -static inline int set_direct_map_default_noflush(struct page *page) +static inline int set_direct_map_default_noflush(struct page *page, int numpages) { return 0; } @@ -28,7 +28,19 @@ static inline bool kernel_page_present(struct page *page) { return true; } +#else /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */ +/* + * Some architectures, e.g. ARM64 can disable direct map modifications at + * boot time. Let them overrive this query. 
+ */ +#ifndef can_set_direct_map +static inline bool can_set_direct_map(void) +{ + return true; +} +#define can_set_direct_map can_set_direct_map #endif +#endif /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */ #ifndef set_mce_nospec static inline int set_mce_nospec(unsigned long pfn, bool unmap) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index e2828e03fd98d..fb040fc0f9408 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1015,6 +1015,7 @@ asmlinkage long sys_pidfd_send_signal(int pidfd, int sig, asmlinkage long sys_pidfd_getfd(int pidfd, int fd, unsigned int flags); asmlinkage long sys_watch_mount(int dfd, const char __user *path, unsigned int at_flags, int watch_fd, int watch_id); +asmlinkage long sys_memfd_secret(unsigned long flags); /* * Architecture-specific system calls diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index ad58f661f4aae..26125974a8a2a 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -863,9 +863,13 @@ __SYSCALL(__NR_process_madvise, sys_process_madvise) __SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2) #define __NR_watch_mount 442 __SYSCALL(__NR_watch_mount, sys_watch_mount) +#ifdef __ARCH_WANT_MEMFD_SECRET +#define __NR_memfd_secret 443 +__SYSCALL(__NR_memfd_secret, sys_memfd_secret) +#endif #undef __NR_syscalls -#define __NR_syscalls 443 +#define __NR_syscalls 444 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index f3956fc11de68..35687dcb1a429 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -97,5 +97,6 @@ #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ #define Z3FOLD_MAGIC 0x33 #define PPC_CMM_MAGIC 0xc7571590 +#define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ #endif /* __LINUX_MAGIC_H__ */ diff --git a/kernel/fork.c b/kernel/fork.c index 37720a6d04eaa..5a4fbd75e9b22 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -77,6 +77,7 @@ #include #include #include +#include #include #include #include @@ -932,7 +933,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) account_kernel_stack(tsk, 1); kcov_task_init(tsk); - kmap_local_fork(tsk); +// kmap_local_fork(tsk); #ifdef CONFIG_FAULT_INJECTION tsk->fail_nth = 0; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index da0b419141779..559acef3fddb8 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include "power.h" @@ -81,7 +82,9 @@ void hibernate_release(void) bool hibernation_available(void) { - return nohibernate == 0 && !security_locked_down(LOCKDOWN_HIBERNATION); + return nohibernate == 0 && + !security_locked_down(LOCKDOWN_HIBERNATION) && + !secretmem_active(); } /** diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d63560e1cf871..64b7aab9aee43 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -86,7 +86,7 @@ static inline void hibernate_restore_unprotect_page(void *page_address) {} static inline void hibernate_map_page(struct page *page) { if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { - int ret = set_direct_map_default_noflush(page); + int ret = set_direct_map_default_noflush(page, 1); if (ret) pr_warn_once("Failed to remap page\n"); @@ -99,7 +99,7 @@ static inline void hibernate_unmap_page(struct page *page) { if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { unsigned long addr = (unsigned long)page_address(page); - int ret = 
set_direct_map_invalid_noflush(page); + int ret = set_direct_map_invalid_noflush(page, 1); if (ret) pr_warn_once("Failed to remap page\n"); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 21b548b694556..fca7d323dde49 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4094,16 +4094,16 @@ static inline void finish_lock_switch(struct rq *rq) static inline void kmap_local_sched_out(void) { #ifdef CONFIG_KMAP_LOCAL - if (unlikely(current->kmap_ctrl.idx)) - __kmap_local_sched_out(); +// if (unlikely(current->kmap_ctrl.idx)) +// __kmap_local_sched_out(); #endif } static inline void kmap_local_sched_in(void) { #ifdef CONFIG_KMAP_LOCAL - if (unlikely(current->kmap_ctrl.idx)) - __kmap_local_sched_in(); +// if (unlikely(current->kmap_ctrl.idx)) +// __kmap_local_sched_in(); #endif } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 769ad6225ab14..869aa6b5bf345 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -355,6 +355,8 @@ COND_SYSCALL(pkey_mprotect); COND_SYSCALL(pkey_alloc); COND_SYSCALL(pkey_free); +/* memfd_secret */ +COND_SYSCALL(memfd_secret); /* * Architecture specific weak syscall entries. diff --git a/mm/Kconfig b/mm/Kconfig index f730605b8dcf4..585d440a554a8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -875,4 +875,9 @@ config MAPPING_DIRTY_HELPERS config KMAP_LOCAL bool +config SECRETMEM + def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED + select GENERIC_ALLOCATOR + select CMA + endmenu diff --git a/mm/Makefile b/mm/Makefile index a1af02ba8f3f1..6b581f8337e83 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -121,3 +121,4 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o +obj-$(CONFIG_SECRETMEM) += secretmem.o diff --git a/mm/filemap.c b/mm/filemap.c index 9b608b69ab7c8..78305fe28208d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "internal.h" #define CREATE_TRACE_POINTS @@ -844,7 +845,7 @@ noinline int __add_to_page_cache_locked(struct page *page, page->mapping = mapping; page->index = offset; - if (!huge) { + if (!huge && !page_is_secretmem(page)) { error = mem_cgroup_charge(page, current->mm, gfp); if (error) goto error; diff --git a/mm/gup.c b/mm/gup.c index e4c224cd9661f..3e086b073624e 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -759,6 +760,9 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, struct follow_page_context ctx = { NULL }; struct page *page; + if (vma_is_secretmem(vma)) + return NULL; + page = follow_page_mask(vma, address, foll_flags, &ctx); if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); @@ -892,6 +896,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) return -EOPNOTSUPP; + if (vma_is_secretmem(vma)) + return -EFAULT; + if (write) { if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) @@ -2031,6 +2038,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); + if (page_is_secretmem(page)) + goto pte_unmap; + head = try_grab_compound_head(page, 1, flags); if (!head) goto pte_unmap; diff --git a/mm/internal.h b/mm/internal.h index 9902648f2206f..8e9c660f33ca1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -353,6 +353,9 @@ static inline void munlock_vma_pages_all(struct 
vm_area_struct *vma) extern void mlock_vma_page(struct page *page); extern unsigned int munlock_vma_page(struct page *page); +extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, + unsigned long len); + /* * Clear the page's PageMlocked(). This can be useful in a situation where * we want to unconditionally remove a page from the pagecache -- e.g., diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 605f671203efb..afaa3195ed6f4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2935,9 +2935,10 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg) #ifdef CONFIG_MEMCG_KMEM int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, - gfp_t gfp) + gfp_t gfp, bool new_page) { unsigned int objects = objs_per_slab_page(s, page); + unsigned long memcg_data; void *vec; vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, @@ -2945,11 +2946,25 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, if (!vec) return -ENOMEM; - if (!set_page_objcgs(page, vec)) + memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS; + if (new_page) { + /* + * If the slab page is brand new and nobody can yet access + * it's memcg_data, no synchronization is required and + * memcg_data can be simply assigned. + */ + page->memcg_data = memcg_data; + } else if (cmpxchg(&page->memcg_data, 0, memcg_data)) { + /* + * If the slab page is already in use, somebody can allocate + * and assign obj_cgroups in parallel. In this case the existing + * objcg vector should be reused. + */ kfree(vec); - else - kmemleak_not_leak(vec); + return 0; + } + kmemleak_not_leak(vec); return 0; } diff --git a/mm/mmap.c b/mm/mmap.c index bb62a59ed92a7..8144fc3c5a78e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1348,9 +1348,8 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return hint; } -static inline int mlock_future_check(struct mm_struct *mm, - unsigned long flags, - unsigned long len) +int mlock_future_check(struct mm_struct *mm, unsigned long flags, + unsigned long len) { unsigned long locked, lock_limit; diff --git a/mm/secretmem.c b/mm/secretmem.c new file mode 100644 index 0000000000000..b8a32954ac68c --- /dev/null +++ b/mm/secretmem.c @@ -0,0 +1,439 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corporation, 2020 + * + * Author: Mike Rapoport + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "internal.h" + +#undef pr_fmt +#define pr_fmt(fmt) "secretmem: " fmt + +/* + * Define mode and flag masks to allow validation of the system call + * parameters. 
+ */ +#define SECRETMEM_MODE_MASK (0x0) +#define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK + +struct secretmem_ctx { + struct gen_pool *pool; + unsigned int mode; +}; + +static struct cma *secretmem_cma; + +static atomic_t secretmem_users; + +bool secretmem_active(void) +{ + return !!atomic_read(&secretmem_users); +} + +static int secretmem_account_pages(struct page *page, gfp_t gfp, int order) +{ + int err; + + err = memcg_kmem_charge_page(page, gfp, order); + if (err) + return err; + + /* + * seceremem caches are unreclaimable kernel allocations, so treat + * them as unreclaimable slab memory for VM statistics purposes + */ + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, + PAGE_SIZE << order); + + return 0; +} + +static void secretmem_unaccount_pages(struct page *page, int order) +{ + + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, + -PAGE_SIZE << order); + memcg_kmem_uncharge_page(page, order); +} + +static int secretmem_pool_increase(struct secretmem_ctx *ctx, gfp_t gfp) +{ + unsigned long nr_pages = (1 << PMD_PAGE_ORDER); + struct gen_pool *pool = ctx->pool; + unsigned long addr; + struct page *page; + int err; + + page = cma_alloc(secretmem_cma, nr_pages, PMD_SIZE, gfp & __GFP_NOWARN); + if (!page) + return -ENOMEM; + + err = secretmem_account_pages(page, gfp, PMD_PAGE_ORDER); + if (err) + goto err_cma_release; + + err = set_direct_map_invalid_noflush(page, nr_pages); + if (err) + goto err_memcg_uncharge; + + addr = (unsigned long)page_address(page); + err = gen_pool_add(pool, addr, PMD_SIZE, NUMA_NO_NODE); + if (err) + goto err_set_direct_map; + + flush_tlb_kernel_range(addr, addr + PMD_SIZE); + + return 0; + +err_set_direct_map: + /* + * If a split of PUD-size page was required, it already happened + * when we marked the pages invalid which guarantees that this call + * won't fail + */ + set_direct_map_default_noflush(page, nr_pages); +err_memcg_uncharge: + secretmem_unaccount_pages(page, PMD_PAGE_ORDER); +err_cma_release: + cma_release(secretmem_cma, page, nr_pages); + return err; +} + +static struct page *secretmem_alloc_page(struct secretmem_ctx *ctx, + gfp_t gfp) +{ + struct gen_pool *pool = ctx->pool; + unsigned long addr; + struct page *page; + int err; + + if (gen_pool_avail(pool) < PAGE_SIZE) { + err = secretmem_pool_increase(ctx, gfp); + if (err) + return NULL; + } + + addr = gen_pool_alloc(pool, PAGE_SIZE); + if (!addr) + return NULL; + + page = virt_to_page(addr); + get_page(page); + + return page; +} + +static vm_fault_t secretmem_fault(struct vm_fault *vmf) +{ + struct secretmem_ctx *ctx = vmf->vma->vm_file->private_data; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + struct inode *inode = file_inode(vmf->vma->vm_file); + pgoff_t offset = vmf->pgoff; + vm_fault_t ret = 0; + struct page *page; + int err; + + if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) + return vmf_error(-EINVAL); + + page = find_get_page(mapping, offset); + if (!page) { + page = secretmem_alloc_page(ctx, vmf->gfp_mask); + if (!page) + return vmf_error(-ENOMEM); + + err = add_to_page_cache(page, mapping, offset, vmf->gfp_mask); + if (unlikely(err)) + goto err_put_page; + + __SetPageUptodate(page); + set_page_private(page, (unsigned long)ctx); + + ret = VM_FAULT_LOCKED; + } + + vmf->page = page; + return ret; + +err_put_page: + put_page(page); + return vmf_error(err); +} + +static const struct vm_operations_struct secretmem_vm_ops = { + .fault = secretmem_fault, +}; + +static int secretmem_release(struct inode *inode, struct file *file) +{ + 
atomic_dec(&secretmem_users); + return 0; +} + +static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + unsigned long len = vma->vm_end - vma->vm_start; + + if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) + return -EINVAL; + + if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) + return -EAGAIN; + + vma->vm_ops = &secretmem_vm_ops; + vma->vm_flags |= VM_LOCKED; + + return 0; +} + +bool vma_is_secretmem(struct vm_area_struct *vma) +{ + return vma->vm_ops == &secretmem_vm_ops; +} + +static const struct file_operations secretmem_fops = { + .release = secretmem_release, + .mmap = secretmem_mmap, +}; + +static bool secretmem_isolate_page(struct page *page, isolate_mode_t mode) +{ + return false; +} + +static int secretmem_migratepage(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode) +{ + return -EBUSY; +} + +static void secretmem_freepage(struct page *page) +{ + unsigned long addr = (unsigned long)page_address(page); + struct secretmem_ctx *ctx = (struct secretmem_ctx *)page_private(page); + struct gen_pool *pool = ctx->pool; + + gen_pool_free(pool, addr, PAGE_SIZE); +} + +static const struct address_space_operations secretmem_aops = { + .freepage = secretmem_freepage, + .migratepage = secretmem_migratepage, + .isolate_page = secretmem_isolate_page, +}; + +bool page_is_secretmem(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (!mapping) + return false; + + return mapping->a_ops == &secretmem_aops; +} + +static struct vfsmount *secretmem_mnt; + +static struct file *secretmem_file_create(unsigned long flags) +{ + struct file *file = ERR_PTR(-ENOMEM); + struct secretmem_ctx *ctx; + struct inode *inode; + + inode = alloc_anon_inode(secretmem_mnt->mnt_sb); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + goto err_free_inode; + + ctx->pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE); + if (!ctx->pool) + goto err_free_ctx; + + file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", + O_RDWR, &secretmem_fops); + if (IS_ERR(file)) + goto err_free_pool; + + mapping_set_unevictable(inode->i_mapping); + + inode->i_private = ctx; + inode->i_mapping->private_data = ctx; + inode->i_mapping->a_ops = &secretmem_aops; + + /* pretend we are a normal file with zero size */ + inode->i_mode |= S_IFREG; + inode->i_size = 0; + + file->private_data = ctx; + + ctx->mode = flags & SECRETMEM_MODE_MASK; + + return file; + +err_free_pool: + gen_pool_destroy(ctx->pool); +err_free_ctx: + kfree(ctx); +err_free_inode: + iput(inode); + return file; +} + +SYSCALL_DEFINE1(memfd_secret, unsigned long, flags) +{ + struct file *file; + int fd, err; + + /* make sure local flags do not confict with global fcntl.h */ + BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); + + if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC)) + return -EINVAL; + + if (!secretmem_cma) + return -ENOMEM; + + fd = get_unused_fd_flags(flags & O_CLOEXEC); + if (fd < 0) + return fd; + + file = secretmem_file_create(flags); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto err_put_fd; + } + + file->f_flags |= O_LARGEFILE; + + fd_install(fd, file); + atomic_inc(&secretmem_users); + return fd; + +err_put_fd: + put_unused_fd(fd); + return err; +} + +static void secretmem_cleanup_chunk(struct gen_pool *pool, + struct gen_pool_chunk *chunk, void *data) +{ + unsigned long start = chunk->start_addr; + unsigned long end = chunk->end_addr; + struct page *page = virt_to_page(start); + 
unsigned long nr_pages = (end - start + 1) / PAGE_SIZE; + int i; + + set_direct_map_default_noflush(page, nr_pages); + secretmem_unaccount_pages(page, PMD_PAGE_ORDER); + + for (i = 0; i < nr_pages; i++) + clear_highpage(page + i); + + cma_release(secretmem_cma, page, nr_pages); +} + +static void secretmem_cleanup_pool(struct secretmem_ctx *ctx) +{ + struct gen_pool *pool = ctx->pool; + + gen_pool_for_each_chunk(pool, secretmem_cleanup_chunk, ctx); + gen_pool_destroy(pool); +} + +static void secretmem_evict_inode(struct inode *inode) +{ + struct secretmem_ctx *ctx = inode->i_private; + + truncate_inode_pages_final(&inode->i_data); + secretmem_cleanup_pool(ctx); + clear_inode(inode); + kfree(ctx); +} + +static const struct super_operations secretmem_super_ops = { + .evict_inode = secretmem_evict_inode, +}; + +static int secretmem_init_fs_context(struct fs_context *fc) +{ + struct pseudo_fs_context *ctx = init_pseudo(fc, SECRETMEM_MAGIC); + + if (!ctx) + return -ENOMEM; + ctx->ops = &secretmem_super_ops; + + return 0; +} + +static struct file_system_type secretmem_fs = { + .name = "secretmem", + .init_fs_context = secretmem_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static int secretmem_init(void) +{ + int ret = 0; + + secretmem_mnt = kern_mount(&secretmem_fs); + if (IS_ERR(secretmem_mnt)) + ret = PTR_ERR(secretmem_mnt); + + return ret; +} +fs_initcall(secretmem_init); + +static int __init secretmem_setup(char *str) +{ + phys_addr_t align = PMD_SIZE; + unsigned long reserved_size; + int err; + + if (!can_set_direct_map()) + return 0; + + reserved_size = memparse(str, NULL); + if (!reserved_size) + return 0; + + if (reserved_size * 2 > PUD_SIZE) + align = PUD_SIZE; + + err = cma_declare_contiguous(0, reserved_size, 0, align, 0, false, + "secretmem", &secretmem_cma); + if (err) { + pr_err("failed to create CMA: %d\n", err); + return err; + } + + pr_info("reserved %luM\n", reserved_size >> 20); + + return 0; +} +__setup("secretmem=", secretmem_setup); diff --git a/mm/slab.c b/mm/slab.c index eb521b7971aa4..eaf7c44598f68 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1380,7 +1380,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, return NULL; } - account_slab_page(page, cachep->gfporder, cachep); + account_slab_page(page, cachep->gfporder, cachep, flags); __SetPageSlab(page); /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ if (sk_memalloc_socks() && page_is_pfmemalloc(page)) diff --git a/mm/slab.h b/mm/slab.h index 1a756a359fa8b..1e10bfd75659f 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -240,7 +240,7 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla #ifdef CONFIG_MEMCG_KMEM int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, - gfp_t gfp); + gfp_t gfp, bool new_page); static inline void memcg_free_page_obj_cgroups(struct page *page) { @@ -317,7 +317,8 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, page = virt_to_head_page(p[i]); if (!page_objcgs(page) && - memcg_alloc_page_obj_cgroups(page, s, flags)) { + memcg_alloc_page_obj_cgroups(page, s, flags, + false)) { obj_cgroup_uncharge(objcg, obj_full_size(s)); continue; } @@ -381,7 +382,8 @@ static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) } static inline int memcg_alloc_page_obj_cgroups(struct page *page, - struct kmem_cache *s, gfp_t gfp) + struct kmem_cache *s, gfp_t gfp, + bool new_page) { return 0; } @@ -422,8 +424,12 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) } static 
__always_inline void account_slab_page(struct page *page, int order, - struct kmem_cache *s) + struct kmem_cache *s, + gfp_t gfp) { + if (memcg_kmem_enabled() && (s->flags & SLAB_ACCOUNT)) + memcg_alloc_page_obj_cgroups(page, s, gfp, true); + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), PAGE_SIZE << order); } diff --git a/mm/slub.c b/mm/slub.c index 97d3822561750..7abc5971ccacd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1625,9 +1625,6 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, else page = __alloc_pages_node(node, flags, order); - if (page) - account_slab_page(page, order, s); - return page; } @@ -1780,6 +1777,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) page->objects = oo_objects(oo); + account_slab_page(page, oo_order(oo), s, flags); + page->slab_cache = s; __SetPageSlab(page); if (page_is_pfmemalloc(page)) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4d88fe5a277ac..d5b262d53622e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2195,13 +2195,14 @@ struct vm_struct *remove_vm_area(const void *addr) } static inline void set_area_direct_map(const struct vm_struct *area, - int (*set_direct_map)(struct page *page)) + int (*set_direct_map)(struct page *page, + int numpages)) { int i; for (i = 0; i < area->nr_pages; i++) if (page_address(area->pages[i])) - set_direct_map(area->pages[i]); + set_direct_map(area->pages[i], 1); } /* Handle removing and resetting vm mappings related to the vm_struct. */ diff --git a/scripts/checksyscalls.sh b/scripts/checksyscalls.sh index a18b47695f55f..b7609958ee364 100755 --- a/scripts/checksyscalls.sh +++ b/scripts/checksyscalls.sh @@ -40,6 +40,10 @@ cat << EOF #define __IGNORE_setrlimit /* setrlimit */ #endif +#ifndef __ARCH_WANT_MEMFD_SECRET +#define __IGNORE_memfd_secret +#endif + /* Missing flags argument */ #define __IGNORE_renameat /* renameat2 */ diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 9a35c3f6a5573..c8deddc81e7ae 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -21,4 +21,5 @@ va_128TBswitch map_fixed_noreplace write_to_hugetlbfs hmm-tests +memfd_secret local_config.* diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index d42115e4284d7..0200fb61646c2 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -34,6 +34,7 @@ TEST_GEN_FILES += khugepaged TEST_GEN_FILES += map_fixed_noreplace TEST_GEN_FILES += map_hugetlb TEST_GEN_FILES += map_populate +TEST_GEN_FILES += memfd_secret TEST_GEN_FILES += mlock-random-test TEST_GEN_FILES += mlock2-tests TEST_GEN_FILES += mremap_dontunmap @@ -133,7 +134,7 @@ warn_32bit_failure: endif endif -$(OUTPUT)/mlock-random-test: LDLIBS += -lcap +$(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap $(OUTPUT)/gup_test: ../../../../mm/gup_test.h diff --git a/tools/testing/selftests/vm/memfd_secret.c b/tools/testing/selftests/vm/memfd_secret.c new file mode 100644 index 0000000000000..c878c2b841fc3 --- /dev/null +++ b/tools/testing/selftests/vm/memfd_secret.c @@ -0,0 +1,296 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corporation, 2020 + * + * Author: Mike Rapoport + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +#define fail(fmt, ...) ksft_test_result_fail(fmt, ##__VA_ARGS__) +#define pass(fmt, ...) 
ksft_test_result_pass(fmt, ##__VA_ARGS__) +#define skip(fmt, ...) ksft_test_result_skip(fmt, ##__VA_ARGS__) + +#ifdef __NR_memfd_secret + +#define PATTERN 0x55 + +static const int prot = PROT_READ | PROT_WRITE; +static const int mode = MAP_SHARED; + +static unsigned long page_size; +static unsigned long mlock_limit_cur; +static unsigned long mlock_limit_max; + +static int memfd_secret(unsigned long flags) +{ + return syscall(__NR_memfd_secret, flags); +} + +static void test_file_apis(int fd) +{ + char buf[64]; + + if ((read(fd, buf, sizeof(buf)) >= 0) || + (write(fd, buf, sizeof(buf)) >= 0) || + (pread(fd, buf, sizeof(buf), 0) >= 0) || + (pwrite(fd, buf, sizeof(buf), 0) >= 0)) + fail("unexpected file IO\n"); + else + pass("file IO is blocked as expected\n"); +} + +static void test_mlock_limit(int fd) +{ + size_t len; + char *mem; + + len = mlock_limit_cur; + mem = mmap(NULL, len, prot, mode, fd, 0); + if (mem == MAP_FAILED) { + fail("unable to mmap secret memory\n"); + return; + } + munmap(mem, len); + + len = mlock_limit_max * 2; + mem = mmap(NULL, len, prot, mode, fd, 0); + if (mem != MAP_FAILED) { + fail("unexpected mlock limit violation\n"); + munmap(mem, len); + return; + } + + pass("mlock limit is respected\n"); +} + +static void try_process_vm_read(int fd, int pipefd[2]) +{ + struct iovec liov, riov; + char buf[64]; + char *mem; + + if (read(pipefd[0], &mem, sizeof(mem)) < 0) { + fail("pipe write: %s\n", strerror(errno)); + exit(KSFT_FAIL); + } + + liov.iov_len = riov.iov_len = sizeof(buf); + liov.iov_base = buf; + riov.iov_base = mem; + + if (process_vm_readv(getppid(), &liov, 1, &riov, 1, 0) < 0) { + if (errno == ENOSYS) + exit(KSFT_SKIP); + exit(KSFT_PASS); + } + + exit(KSFT_FAIL); +} + +static void try_ptrace(int fd, int pipefd[2]) +{ + pid_t ppid = getppid(); + int status; + char *mem; + long ret; + + if (read(pipefd[0], &mem, sizeof(mem)) < 0) { + perror("pipe write"); + exit(KSFT_FAIL); + } + + ret = ptrace(PTRACE_ATTACH, ppid, 0, 0); + if (ret) { + perror("ptrace_attach"); + exit(KSFT_FAIL); + } + + ret = waitpid(ppid, &status, WUNTRACED); + if ((ret != ppid) || !(WIFSTOPPED(status))) { + fprintf(stderr, "weird waitppid result %ld stat %x\n", + ret, status); + exit(KSFT_FAIL); + } + + if (ptrace(PTRACE_PEEKDATA, ppid, mem, 0)) + exit(KSFT_PASS); + + exit(KSFT_FAIL); +} + +static void check_child_status(pid_t pid, const char *name) +{ + int status; + + waitpid(pid, &status, 0); + + if (WIFEXITED(status) && WEXITSTATUS(status) == KSFT_SKIP) { + skip("%s is not supported\n", name); + return; + } + + if ((WIFEXITED(status) && WEXITSTATUS(status) == KSFT_PASS) || + WIFSIGNALED(status)) { + pass("%s is blocked as expected\n", name); + return; + } + + fail("%s: unexpected memory access\n", name); +} + +static void test_remote_access(int fd, const char *name, + void (*func)(int fd, int pipefd[2])) +{ + int pipefd[2]; + pid_t pid; + char *mem; + + if (pipe(pipefd)) { + fail("pipe failed: %s\n", strerror(errno)); + return; + } + + pid = fork(); + if (pid < 0) { + fail("fork failed: %s\n", strerror(errno)); + return; + } + + if (pid == 0) { + func(fd, pipefd); + return; + } + + mem = mmap(NULL, page_size, prot, mode, fd, 0); + if (mem == MAP_FAILED) { + fail("Unable to mmap secret memory\n"); + return; + } + + ftruncate(fd, page_size); + memset(mem, PATTERN, page_size); + + if (write(pipefd[1], &mem, sizeof(mem)) < 0) { + fail("pipe write: %s\n", strerror(errno)); + return; + } + + check_child_status(pid, name); +} + +static void test_process_vm_read(int fd) +{ + test_remote_access(fd, 
"process_vm_read", try_process_vm_read); +} + +static void test_ptrace(int fd) +{ + test_remote_access(fd, "ptrace", try_ptrace); +} + +static int set_cap_limits(rlim_t max) +{ + struct rlimit new; + cap_t cap = cap_init(); + + new.rlim_cur = max; + new.rlim_max = max; + if (setrlimit(RLIMIT_MEMLOCK, &new)) { + perror("setrlimit() returns error"); + return -1; + } + + /* drop capabilities including CAP_IPC_LOCK */ + if (cap_set_proc(cap)) { + perror("cap_set_proc() returns error"); + return -2; + } + + return 0; +} + +static void prepare(void) +{ + struct rlimit rlim; + + page_size = sysconf(_SC_PAGE_SIZE); + if (!page_size) + ksft_exit_fail_msg("Failed to get page size %s\n", + strerror(errno)); + + if (getrlimit(RLIMIT_MEMLOCK, &rlim)) + ksft_exit_fail_msg("Unable to detect mlock limit: %s\n", + strerror(errno)); + + mlock_limit_cur = rlim.rlim_cur; + mlock_limit_max = rlim.rlim_max; + + printf("page_size: %ld, mlock.soft: %ld, mlock.hard: %ld\n", + page_size, mlock_limit_cur, mlock_limit_max); + + if (page_size > mlock_limit_cur) + mlock_limit_cur = page_size; + if (page_size > mlock_limit_max) + mlock_limit_max = page_size; + + if (set_cap_limits(mlock_limit_max)) + ksft_exit_fail_msg("Unable to set mlock limit: %s\n", + strerror(errno)); +} + +#define NUM_TESTS 4 + +int main(int argc, char *argv[]) +{ + int fd; + + prepare(); + + ksft_print_header(); + ksft_set_plan(NUM_TESTS); + + fd = memfd_secret(0); + if (fd < 0) { + if (errno == ENOSYS) + ksft_exit_skip("memfd_secret is not supported\n"); + else + ksft_exit_fail_msg("memfd_secret failed: %s\n", + strerror(errno)); + } + + test_mlock_limit(fd); + test_file_apis(fd); + test_process_vm_read(fd); + test_ptrace(fd); + + close(fd); + + ksft_exit(!ksft_get_fail_cnt()); +} + +#else /* __NR_memfd_secret */ + +int main(int argc, char *argv[]) +{ + printf("skip: skipping memfd_secret test (missing __NR_memfd_secret)\n"); + return KSFT_SKIP; +} + +#endif /* __NR_memfd_secret */ diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index e953f3cd9664e..95a67382f132c 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -346,4 +346,21 @@ else exitcode=1 fi +echo "running memfd_secret test" +echo "------------------------------------" +./memfd_secret +ret_val=$? + +if [ $ret_val -eq 0 ]; then + echo "[PASS]" +elif [ $ret_val -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip +else + echo "[FAIL]" + exitcode=1 +fi + +exit $exitcode + exit $exitcode
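
For reference, below is a minimal userspace sketch (not part of the patch) of how the interface added above is expected to be used, mirroring the flow of the memfd_secret selftest: create the descriptor, size it with ftruncate(), and map it MAP_SHARED before touching the memory. The syscall number 443 and the "O_CLOEXEC only" flag policy are taken from the tables and checks in this patch; everything else (file name, error handling) is illustrative. It assumes a kernel built with CONFIG_SECRETMEM and, in this version of the series, a `secretmem=` CMA reservation on the kernel command line (the syscall returns ENOMEM without one).

```c
/* secret_demo.c - minimal memfd_secret usage sketch; build: cc -o secret_demo secret_demo.c */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>

#ifndef __NR_memfd_secret
#define __NR_memfd_secret 443	/* number assigned by this patch */
#endif

int main(void)
{
	long page_size = sysconf(_SC_PAGE_SIZE);
	char *mem;
	int fd;

	/* no flags other than O_CLOEXEC are accepted in this version */
	fd = syscall(__NR_memfd_secret, 0);
	if (fd < 0) {
		perror("memfd_secret");	/* ENOSYS, or ENOMEM without a secretmem= pool */
		return 1;
	}

	/* size the backing file before faulting pages in */
	if (ftruncate(fd, page_size) < 0) {
		perror("ftruncate");
		return 1;
	}

	/* only shared mappings are allowed; the pages count against RLIMIT_MEMLOCK */
	mem = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (mem == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* backing pages are removed from the kernel direct map when allocated */
	strcpy(mem, "not reachable via the linear map");
	printf("secret mapping at %p: %s\n", (void *)mem, mem);

	munmap(mem, page_size);
	close(fd);
	return 0;
}
```

Note that read()/write()/pread()/pwrite() on the descriptor fail (secretmem_fops provides no file I/O operations, which test_file_apis() checks), and the mapping is charged as locked memory via mlock_future_check(), so an unprivileged process is bounded by its RLIMIT_MEMLOCK.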