diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 5aa368d165dab..69d7a6983f781 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1301,6 +1301,11 @@ PAGE_SIZE multiple when read back. Amount of memory used to cache filesystem data, including tmpfs and shared memory. + kernel (npn) + Amount of total kernel memory, including + (kernel_stack, pagetables, percpu, vmalloc, slab) in + addition to other kernel memory use cases. + kernel_stack Amount of memory allocated to kernel stacks. diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index cb30ca3df27c9..a748e7eb4429b 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -146,9 +146,9 @@ System kernel config options CONFIG_SYSFS=y Note that "sysfs file system support" might not appear in the "Pseudo - filesystems" menu if "Configure standard kernel features (for small - systems)" is not enabled in "General Setup." In this case, check the - .config file itself to ensure that sysfs is turned on, as follows:: + filesystems" menu if "Configure standard kernel features (expert users)" + is not enabled in "General Setup." In this case, check the .config file + itself to ensure that sysfs is turned on, as follows:: grep 'CONFIG_SYSFS' .config @@ -533,6 +533,10 @@ the following command:: cp /proc/vmcore +or use scp to write out the dump file between hosts on a network, e.g:: + + scp /proc/vmcore remote_username@remote_ip: + You can also use makedumpfile utility to write out the dump file with specified options to filter out unwanted contents, e.g:: diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 4681419b7857b..3c2b3e24e8f50 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1649,7 +1649,7 @@ [KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP enabled. Allows heavy hugetlb users to free up some more - memory (6 * PAGE_SIZE for each 2MB hugetlb page). + memory (7 * PAGE_SIZE for each 2MB hugetlb page). Format: { on | off (default) } on: enable the feature @@ -3765,6 +3765,7 @@ bit 3: print locks info if CONFIG_LOCKDEP is on bit 4: print ftrace buffer bit 5: print all printk messages in buffer + bit 6: print all CPUs backtrace (if available in the arch) panic_on_taint= Bitmask for conditionally calling panic() in add_taint() Format: [,nousertaint] diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 59b84904a8543..1e06435b8ff67 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -108,19 +108,23 @@ In such cases, users can explicitly set the initial monitoring target regions as they want, by writing proper values to the ``init_regions`` file. Each line of the input should represent one region in below form.:: - + -The ``target id`` should already in ``target_ids`` file, and the regions should -be passed in address order. For example, below commands will set a couple of -address ranges, ``1-100`` and ``100-200`` as the initial monitoring target -region of process 42, and another couple of address ranges, ``20-40`` and -``50-100`` as that of process 4242.:: +The ``target idx`` should be the index of the target in ``target_ids`` file, +starting from ``0``, and the regions should be passed in address order. For +example, below commands will set a couple of address ranges, ``1-100`` and +``100-200`` as the initial monitoring target region of pid 42, which is the +first one (index ``0``) in ``target_ids``, and another couple of address +ranges, ``20-40`` and ``50-100`` as that of pid 4242, which is the second one +(index ``1``) in ``target_ids``.:: # cd /damon - # echo "42 1 100 - 42 100 200 - 4242 20 40 - 4242 50 100" > init_regions + # cat target_ids + 42 4242 + # echo "0 1 100 + 0 100 200 + 1 20 40 + 1 50 100" > init_regions Note that this sets the initial monitoring target regions only. In case of virtual memory monitoring, DAMON will automatically updates the boundary of the diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index 8edb8d578caf7..6e6f7b0d6562b 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -130,9 +130,25 @@ attribute, e.g.:: echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled When zswap same-filled page identification is disabled at runtime, it will stop -checking for the same-value filled pages during store operation. However, the -existing pages which are marked as same-value filled pages remain stored -unchanged in zswap until they are either loaded or invalidated. +checking for the same-value filled pages during store operation. +In other words, every page will be then considered non-same-value filled. +However, the existing pages which are marked as same-value filled pages remain +stored unchanged in zswap until they are either loaded or invalidated. + +In some circumstances it might be advantageous to make use of just the zswap +ability to efficiently store same-filled pages without enabling the whole +compressed page storage. +In this case the handling of non-same-value pages by zswap (enabled by default) +can be disabled by setting the ``non_same_filled_pages_enabled`` attribute +to 0, e.g. ``zswap.non_same_filled_pages_enabled=0``. +It can also be enabled and disabled at runtime using the sysfs +``non_same_filled_pages_enabled`` attribute, e.g.:: + + echo 1 > /sys/module/zswap/parameters/non_same_filled_pages_enabled + +Disabling both ``zswap.same_filled_pages_enabled`` and +``zswap.non_same_filled_pages_enabled`` effectively disables accepting any new +pages by zswap. To prevent zswap from shrinking pool when zswap is full and there's a high pressure on swap (this will result in flipping pages in and out zswap pool diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index d359bcfadd39a..078e1aff4adcf 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -595,16 +595,23 @@ Documentation/admin-guide/kernel-parameters.rst). numa_balancing ============== -Enables/disables automatic page fault based NUMA memory -balancing. Memory is moved automatically to nodes -that access it often. +Enables/disables and configures automatic page fault based NUMA memory +balancing. Memory is moved automatically to nodes that access it often. +The value to set can be the result of ORing the following:: -Enables/disables automatic NUMA memory balancing. On NUMA machines, there -is a performance penalty if remote memory is accessed by a CPU. When this -feature is enabled the kernel samples what task thread is accessing memory -by periodically unmapping pages and later trapping a page fault. At the -time of the page fault, it is determined if the data being accessed should -be migrated to a local memory node. += ================================= +0 NUMA_BALANCING_DISABLED +1 NUMA_BALANCING_NORMAL +2 NUMA_BALANCING_MEMORY_TIERING += ================================= + +Or NUMA_BALANCING_NORMAL to optimize page placement among different +NUMA nodes to reduce remote accessing. On NUMA machines, there is a +performance penalty if remote memory is accessed by a CPU. When this +feature is enabled the kernel samples what task thread is accessing +memory by periodically unmapping pages and later trapping a page +fault. At the time of the page fault, it is determined if the data +being accessed should be migrated to a local memory node. The unmapping of pages and trapping faults incur additional overhead that ideally is offset by improved memory locality but there is no universal @@ -615,6 +622,10 @@ faults may be controlled by the `numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb`_, and numa_balancing_settle_count sysctls. +Or NUMA_BALANCING_MEMORY_TIERING to optimize page placement among +different types of memory (represented as different NUMA nodes) to +place the hot pages in the fast memory. This is implemented based on +unmapping and page fault too. numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb =============================================================================================================================== @@ -795,6 +806,8 @@ bit 1 print system memory info bit 2 print timer info bit 3 print locks info if ``CONFIG_LOCKDEP`` is on bit 4 print ftrace buffer +bit 5 print all printk messages in buffer +bit 6 print all CPUs backtrace (if available in the arch) ===== ============================================ So for example to print tasks and memory info on panic, user can:: diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 8089c559d339c..7614a1fc30fac 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -30,7 +30,7 @@ Software tag-based KASAN mode is only supported in Clang. The hardware KASAN mode (#3) relies on hardware to perform the checks but still requires a compiler version that supports memory tagging instructions. -This mode is supported in GCC 10+ and Clang 11+. +This mode is supported in GCC 10+ and Clang 12+. Both software KASAN modes work with SLUB and SLAB memory allocators, while the hardware tag-based KASAN currently only supports SLUB. @@ -206,6 +206,9 @@ additional boot parameters that allow disabling KASAN or controlling features: Asymmetric mode: a bad access is detected synchronously on reads and asynchronously on writes. +- ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc + allocations (default: ``on``). + - ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack traces collection (default: ``on``). @@ -279,8 +282,8 @@ Software tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently reserved to tag freed memory regions. -Software tag-based KASAN currently only supports tagging of slab and page_alloc -memory. +Software tag-based KASAN currently only supports tagging of slab, page_alloc, +and vmalloc memory. Hardware tag-based KASAN ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -303,8 +306,8 @@ Hardware tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently reserved to tag freed memory regions. -Hardware tag-based KASAN currently only supports tagging of slab and page_alloc -memory. +Hardware tag-based KASAN currently only supports tagging of slab, page_alloc, +and VM_ALLOC-based vmalloc memory. If the hardware does not support MTE (pre ARMv8.5), hardware tag-based KASAN will not be enabled. In this case, all KASAN boot parameters are ignored. @@ -319,6 +322,8 @@ checking gets disabled. Shadow memory ------------- +The contents of this section are only applicable to software KASAN modes. + The kernel maps memory in several different parts of the address space. The range of kernel virtual addresses is large: there is not enough real memory to support a real shadow region for every address that could be @@ -349,7 +354,7 @@ CONFIG_KASAN_VMALLOC With ``CONFIG_KASAN_VMALLOC``, KASAN can cover vmalloc space at the cost of greater memory usage. Currently, this is supported on x86, -riscv, s390, and powerpc. +arm64, riscv, s390, and powerpc. This works by hooking into vmalloc and vmap and dynamically allocating real shadow memory to back the mappings. diff --git a/Documentation/vm/balance.rst b/Documentation/vm/balance.rst index 6a1fadf3e1735..e38e9d83c1c72 100644 --- a/Documentation/vm/balance.rst +++ b/Documentation/vm/balance.rst @@ -6,7 +6,7 @@ Memory Balancing Started Jan 2000 by Kanoj Sarcar -Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as +Memory balancing is needed for !__GFP_HIGH and !__GFP_KSWAPD_RECLAIM as well as for non __GFP_IO allocations. The first reason why a caller may avoid reclaim is that the caller can not diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst index 9837fc8147dd6..2b54e82b9fe15 100644 --- a/Documentation/vm/page_owner.rst +++ b/Documentation/vm/page_owner.rst @@ -89,22 +89,41 @@ Usage Page allocated via order XXX, ... PFN XXX ... - // Detailed stack + // Detailed stack Page allocated via order XXX, ... PFN XXX ... - // Detailed stack + // Detailed stack The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows in buf, uses regexp to extract the page order value, counts the times - and pages of buf, and finally sorts them according to the times. + and pages of buf, and finally sorts them according to the parameter(s). See the result about who allocated each page in the ``sorted_page_owner.txt``. General output: XXX times, XXX pages: Page allocated via order XXX, ... - // Detailed stack + // Detailed stack By default, ``page_owner_sort`` is sorted according to the times of buf. - If you want to sort by the pages nums of buf, use the ``-m`` parameter. + If you want to sort by the page nums of buf, use the ``-m`` parameter. + The detailed parameters are: + + fundamental function: + + Sort: + -a Sort by memory allocation time. + -m Sort by total memory. + -p Sort by pid. + -r Sort by memory release time. + -s Sort by stack trace. + -t Sort by times (default). + + additional function: + + Cull: + -c Cull by comparing stacktrace instead of total block. + + Filter: + -f Filter out the information of blocks whose memory has not been released. diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 72150cb8db028..4d94ffacb8680 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -37,6 +37,7 @@ config ARM select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_MEMTEST select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU + select ARCH_WANT_GENERAL_HUGETLB select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WANT_LD_ORPHAN_WARN select BINFMT_FLAT_ARGVP_ENVP_ON_STACK @@ -1514,9 +1515,6 @@ config HW_PERF_EVENTS def_bool y depends on ARM_PMU -config ARCH_WANT_GENERAL_HUGETLB - def_bool y - config ARM_MODULE_PLTS bool "Use PLTs to allow module memory to spill over into vmalloc area" depends on MODULES diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7c3081e566699..8719051b45274 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -23,6 +23,7 @@ config ARM64 select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_FAST_MULTIPLIER + select ARCH_HAS_FILTER_PGPROT select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE @@ -205,7 +206,7 @@ config ARM64 select IOMMU_DMA if IOMMU_SUPPORT select IRQ_DOMAIN select IRQ_FORCED_THREADING - select KASAN_VMALLOC if KASAN_GENERIC + select KASAN_VMALLOC if KASAN select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE select NEED_SG_DMA_LENGTH @@ -1252,9 +1253,6 @@ config HW_PERF_EVENTS def_bool y depends on ARM_PMU -config ARCH_HAS_FILTER_PGPROT - def_bool y - # Supported by clang >= 7.0 config CC_HAVE_SHADOW_CALL_STACK def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18) diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h index b9185503feae2..38fafffe699f7 100644 --- a/arch/arm64/include/asm/vmalloc.h +++ b/arch/arm64/include/asm/vmalloc.h @@ -25,4 +25,10 @@ static inline bool arch_vmap_pmd_supported(pgprot_t prot) #endif +#define arch_vmap_pgprot_tagged arch_vmap_pgprot_tagged +static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot) +{ + return pgprot_tagged(prot); +} + #endif /* _ASM_ARM64_VMALLOC_H */ diff --git a/arch/arm64/include/asm/vmap_stack.h b/arch/arm64/include/asm/vmap_stack.h index 894e031b28d28..20873099c035c 100644 --- a/arch/arm64/include/asm/vmap_stack.h +++ b/arch/arm64/include/asm/vmap_stack.h @@ -17,10 +17,13 @@ */ static inline unsigned long *arch_alloc_vmap_stack(size_t stack_size, int node) { + void *p; + BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK)); - return __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node, + p = __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node, __builtin_return_address(0)); + return kasan_reset_tag(p); } #endif /* __ASM_VMAP_STACK_H */ diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 309a27553c875..f2d4bb14bfabe 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -58,12 +58,13 @@ void *module_alloc(unsigned long size) PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } - return p; + /* Memory is intended to be executable, reset the pointer tag. */ + return kasan_reset_tag(p); } enum aarch64_reloc_op { diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index f70573928f1bf..3505789cf4bd9 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -406,9 +406,6 @@ static int __init topology_init(void) { int i; - for_each_online_node(i) - register_one_node(i); - for_each_possible_cpu(i) { struct cpu *cpu = &per_cpu(cpu_data.cpu, i); cpu->hotpluggable = cpu_can_disable(i); diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index ffb9c229610ab..228226c5fa809 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -347,6 +347,7 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { size_t pagesize = 1UL << shift; + entry = pte_mkhuge(entry); if (pagesize == CONT_PTE_SIZE) { entry = pte_mkcont(entry); } else if (pagesize == CONT_PMD_SIZE) { diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index db63cc885771a..3973e305adc89 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -64,7 +64,6 @@ EXPORT_SYMBOL(memstart_addr); */ phys_addr_t arm64_dma_phys_limit __ro_after_init; -#ifdef CONFIG_KEXEC_CORE /* * reserve_crashkernel() - reserves memory for crash kernel * @@ -78,6 +77,9 @@ static void __init reserve_crashkernel(void) unsigned long long crash_max = arm64_dma_phys_limit; int ret; + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + return; + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base); /* no crashkernel= or invalid value specified */ @@ -110,11 +112,6 @@ static void __init reserve_crashkernel(void) crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; } -#else -static void __init reserve_crashkernel(void) -{ -} -#endif /* CONFIG_KEXEC_CORE */ /* * Return the maximum physical address for a zone accessible by the given bits diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index a3bacd79507a4..64e985eaa52d8 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -85,7 +85,7 @@ static int change_memory_common(unsigned long addr, int numpages, */ area = find_vm_area((void *)addr); if (!area || - end > (unsigned long)area->addr + area->size || + end > (unsigned long)kasan_reset_tag(area->addr) + area->size || !(area->flags & VM_ALLOC)) return -EINVAL; diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 74f9a9b6a0530..9ea348149a69f 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1155,7 +1155,8 @@ u64 bpf_jit_alloc_exec_limit(void) void *bpf_jit_alloc_exec(unsigned long size) { - return vmalloc(size); + /* Memory is intended to be executable, reset the pointer tag. */ + return kasan_reset_tag(vmalloc(size)); } void bpf_jit_free_exec(void *addr) diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index f01e91e10d95d..3167a3b5c97b0 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -29,8 +29,6 @@ int max_kernel_seg = 0x303; /* indicate pfn's of high memory */ unsigned long highstart_pfn, highend_pfn; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - /* Default cache attribute for newly created page tables */ unsigned long _dflt_cache_att = CACHEDEF; diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index e4992917a24b7..94a848b06f15a 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -70,16 +70,6 @@ static int __init topology_init(void) { int i, err = 0; -#ifdef CONFIG_NUMA - /* - * MCD - Do we want to register all ONLINE nodes, or all POSSIBLE nodes? - */ - for_each_online_node(i) { - if ((err = register_one_node(i))) - goto out; - } -#endif - sysfs_cpus = kcalloc(NR_CPUS, sizeof(struct ia64_cpu), GFP_KERNEL); if (!sysfs_cpus) panic("kzalloc in topology_init failed - NR_CPUS too big?"); diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 791d4176e4a6b..73d0db36edb60 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -608,17 +608,11 @@ void __init paging_init(void) zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } -#ifdef CONFIG_MEMORY_HOTPLUG -pg_data_t *arch_alloc_nodedata(int nid) +pg_data_t * __init arch_alloc_nodedata(int nid) { unsigned long size = compute_pernodesize(nid); - return kzalloc(size, GFP_KERNEL); -} - -void arch_free_nodedata(pg_data_t *pgdat) -{ - kfree(pgdat); + return memblock_alloc(size, SMP_CACHE_BYTES); } void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat) @@ -626,7 +620,6 @@ void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat) pgdat_list[update_node] = update_pgdat; scatter_node_data(); } -#endif #ifdef CONFIG_SPARSEMEM_VMEMMAP int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, diff --git a/arch/mips/kernel/topology.c b/arch/mips/kernel/topology.c index 08ad6371fbe08..9429d85a4703c 100644 --- a/arch/mips/kernel/topology.c +++ b/arch/mips/kernel/topology.c @@ -12,11 +12,6 @@ static int __init topology_init(void) { int i, ret; -#ifdef CONFIG_NUMA - for_each_online_node(i) - register_one_node(i); -#endif /* CONFIG_NUMA */ - for_each_present_cpu(i) { struct cpu *c = &per_cpu(cpu_devices, i); diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c index f63f839738c46..825c85cab1a1d 100644 --- a/arch/nds32/mm/init.c +++ b/arch/nds32/mm/init.c @@ -18,7 +18,6 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); DEFINE_SPINLOCK(anon_alias_lock); extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 97305bde1b169..3a021ab6f1aef 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -38,8 +38,6 @@ int mem_init_done; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - static void __init zone_sizes_init(void) { unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h index 64b6c608eca43..de092b04ee1a1 100644 --- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h @@ -71,9 +71,9 @@ static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags size_t size = 1UL << shift; if (size == SZ_16K) - return __pte(pte_val(entry) & ~_PAGE_HUGE); + return __pte(pte_val(entry) | _PAGE_SPS); else - return entry; + return __pte(pte_val(entry) | _PAGE_SPS | _PAGE_HUGE); } #define arch_make_huge_pte arch_make_huge_pte #endif diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index d03e488cfe9ca..d0ad86b67e665 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -112,6 +112,12 @@ static int __init fadump_cma_init(void) return 1; } + /* + * If CMA activation fails, keep the pages reserved, instead of + * exposing them to buddy allocator. Same as 'fadump=nocma' case. + */ + cma_reserve_pages_on_error(fadump_cma); + /* * So we now have successfully initialized cma area for fadump. */ diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index d45a415d5374b..2069bbb90a9a3 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -1110,14 +1110,6 @@ EXPORT_SYMBOL_GPL(cpu_remove_dev_attr_group); /* NUMA stuff */ #ifdef CONFIG_NUMA -static void __init register_nodes(void) -{ - int i; - - for (i = 0; i < MAX_NUMNODES; i++) - register_one_node(i); -} - int sysfs_add_device_to_node(struct device *dev, int nid) { struct node *node = node_devices[nid]; @@ -1132,13 +1124,6 @@ void sysfs_remove_device_from_node(struct device *dev, int nid) sysfs_remove_link(&node->dev.kobj, kobject_name(&dev->kobj)); } EXPORT_SYMBOL_GPL(sysfs_remove_device_from_node); - -#else -static void __init register_nodes(void) -{ - return; -} - #endif /* Only valid if CPU is present. */ @@ -1155,8 +1140,6 @@ static int __init topology_init(void) { int cpu, r; - register_nodes(); - for_each_possible_cpu(cpu) { struct cpu *c = &per_cpu(cpu_devices, cpu); diff --git a/arch/powerpc/mm/book3s64/trace.c b/arch/powerpc/mm/book3s64/trace.c index b86e7b9062571..ccd64b5e6cac7 100644 --- a/arch/powerpc/mm/book3s64/trace.c +++ b/arch/powerpc/mm/book3s64/trace.c @@ -3,6 +3,5 @@ * This file is for defining trace points and trace related helpers. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define CREATE_TRACE_POINTS #include #endif diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 5adcbd9b5e886..0804b9a11934d 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -40,6 +40,7 @@ config RISCV select ARCH_USE_MEMTEST select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_FRAME_POINTERS + select ARCH_WANT_GENERAL_HUGETLB select ARCH_WANT_HUGE_PMD_SHARE if 64BIT select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU select BUILDTIME_TABLE_SORT if MMU @@ -171,9 +172,6 @@ config ARCH_SPARSEMEM_ENABLE config ARCH_SELECT_MEMORY_MODEL def_bool ARCH_SPARSEMEM_ENABLE -config ARCH_WANT_GENERAL_HUGETLB - def_bool y - config ARCH_SUPPORTS_UPROBES def_bool y diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index b42bfdc674823..834eb652a7b9d 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -301,9 +301,6 @@ static int __init topology_init(void) { int i, ret; - for_each_online_node(i) - register_one_node(i); - for_each_possible_cpu(i) { struct cpu *cpu = &per_cpu(cpu_devices, i); diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index c27294128e182..ff2f41b3b558d 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -957,7 +957,6 @@ static inline void setup_vm_final(void) } #endif /* CONFIG_MMU */ -#ifdef CONFIG_KEXEC_CORE /* * reserve_crashkernel() - reserves memory for crash kernel * @@ -974,6 +973,8 @@ static void __init reserve_crashkernel(void) int ret = 0; + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + return; /* * Don't reserve a region for a crash kernel on a crash kernel * since it doesn't make much sense and we have limited memory @@ -1023,7 +1024,6 @@ static void __init reserve_crashkernel(void) crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; } -#endif /* CONFIG_KEXEC_CORE */ void __init paging_init(void) { @@ -1037,9 +1037,7 @@ void __init misc_mem_init(void) arch_numa_init(); sparse_init(); zone_sizes_init(); -#ifdef CONFIG_KEXEC_CORE reserve_crashkernel(); -#endif memblock_dump_all(); } diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index b032e556eeb71..a7aefc278909b 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -45,7 +45,7 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/arch/s390/kernel/numa.c b/arch/s390/kernel/numa.c index 51c5a9f6e5257..23ab9f02f2787 100644 --- a/arch/s390/kernel/numa.c +++ b/arch/s390/kernel/numa.c @@ -33,10 +33,3 @@ void __init numa_setup(void) NODE_DATA(0)->node_spanned_pages = memblock_end_of_DRAM() >> PAGE_SHIFT; NODE_DATA(0)->node_id = 0; } - -static int __init numa_init_late(void) -{ - register_one_node(0); - return 0; -} -arch_initcall(numa_init_late); diff --git a/arch/sh/kernel/topology.c b/arch/sh/kernel/topology.c index 76af6db9daa23..2d2a7509b565a 100644 --- a/arch/sh/kernel/topology.c +++ b/arch/sh/kernel/topology.c @@ -46,11 +46,6 @@ static int __init topology_init(void) { int i, ret; -#ifdef CONFIG_NUMA - for_each_online_node(i) - register_one_node(i); -#endif - for_each_present_cpu(i) { struct cpu *c = &per_cpu(cpu_devices, i); diff --git a/arch/sparc/kernel/sysfs.c b/arch/sparc/kernel/sysfs.c index 6d60d416f0dd7..f19487e4cc71e 100644 --- a/arch/sparc/kernel/sysfs.c +++ b/arch/sparc/kernel/sysfs.c @@ -244,22 +244,10 @@ static void __init check_mmu_stats(void) mmu_stats_supported = 1; } -static void register_nodes(void) -{ -#ifdef CONFIG_NUMA - int i; - - for (i = 0; i < MAX_NUMNODES; i++) - register_one_node(i); -#endif -} - static int __init topology_init(void) { int cpu, ret; - register_nodes(); - check_mmu_stats(); for_each_possible_cpu(cpu) { diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 0f49fada20938..d8e0e3c7038d0 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -181,6 +181,7 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { pte_t pte; + entry = pte_mkhuge(entry); pte = hugepage_shift_to_tte(entry, shift); #ifdef CONFIG_SPARC64 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 391c4cac8958f..1c7e772a7403c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -118,6 +118,7 @@ config X86 select ARCH_WANT_DEFAULT_BPF_JIT if X86_64 select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANTS_NO_INSTR + select ARCH_WANT_GENERAL_HUGETLB select ARCH_WANT_HUGE_PMD_SHARE select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 @@ -333,9 +334,6 @@ config GENERIC_CALIBRATE_DELAY config ARCH_HAS_CPU_RELAX def_bool y -config ARCH_HAS_FILTER_PGPROT - def_bool y - config ARCH_HIBERNATION_POSSIBLE def_bool y @@ -347,9 +345,6 @@ config ARCH_NR_GPIO config ARCH_SUSPEND_POSSIBLE def_bool y -config ARCH_WANT_GENERAL_HUGETLB - def_bool y - config AUDIT_ARCH def_bool y if X86_64 diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 95fa745e310a5..c9eb8aa3b7b8a 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -78,7 +78,7 @@ void *module_alloc(unsigned long size) MODULES_END, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f7a132eb794d8..af2d2dc438a20 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -391,8 +391,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) * --------- Crashkernel reservation ------------------------------ */ -#ifdef CONFIG_KEXEC_CORE - /* 16M alignment for crash kernel regions */ #define CRASH_ALIGN SZ_16M @@ -470,6 +468,9 @@ static void __init reserve_crashkernel(void) bool high = false; int ret; + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + return; + total_mem = memblock_phys_mem_size(); /* crashkernel=XM */ @@ -535,11 +536,6 @@ static void __init reserve_crashkernel(void) crashk_res.end = crash_base + crash_size - 1; insert_resource(&iomem_resource, &crashk_res); } -#else -static void __init reserve_crashkernel(void) -{ -} -#endif static struct resource standard_io_resources[] = { { .name = "dma1", .start = 0x00, .end = 0x1f, diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index bd83748e2bde3..8617d1ed9d31b 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -154,11 +154,6 @@ static int __init topology_init(void) { int i; -#ifdef CONFIG_NUMA - for_each_online_node(i) - register_one_node(i); -#endif - for_each_present_cpu(i) arch_register_cpu(i); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 4ba024d5b63ae..d8cfce221275e 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -31,7 +31,6 @@ * We need to define the tracepoints somewhere, and tlb.c * is only compiled when SMP=y. */ -#define CREATE_TRACE_POINTS #include #include "mm_internal.h" diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 0c612a9116967..4e645ae1e0665 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5448,7 +5448,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) bfqq = bic_to_bfqq(bic, false); if (bfqq) { bfq_release_process_ref(bfqd, bfqq); - bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, true); + bfqq = bfq_get_queue(bfqd, bio, false, bic, true); bic_set_bfqq(bic, bfqq, false); } diff --git a/drivers/base/init.c b/drivers/base/init.c index a9f57c22fb9e2..d8d0fe687111a 100644 --- a/drivers/base/init.c +++ b/drivers/base/init.c @@ -35,5 +35,6 @@ void __init driver_init(void) auxiliary_bus_init(); cpu_dev_init(); memory_dev_init(); + node_dev_init(); container_dev_init(); } diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 365cd4a7f2397..60c38f9cf1a75 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -663,14 +663,16 @@ static int init_memory_block(unsigned long block_id, unsigned long state, mem->nr_vmemmap_pages = nr_vmemmap_pages; INIT_LIST_HEAD(&mem->group_next); + ret = register_memory(mem); + if (ret) + return ret; + if (group) { mem->group = group; list_add(&mem->group_next, &group->memory_blocks); } - ret = register_memory(mem); - - return ret; + return 0; } static int add_memory_block(unsigned long base_section_nr) diff --git a/drivers/base/node.c b/drivers/base/node.c index 87acc47e89515..a133981a12fc6 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -1065,26 +1065,30 @@ static const struct attribute_group *cpu_root_attr_groups[] = { }; #define NODE_CALLBACK_PRI 2 /* lower than SLAB */ -static int __init register_node_type(void) +void __init node_dev_init(void) { - int ret; + static struct notifier_block node_memory_callback_nb = { + .notifier_call = node_memory_callback, + .priority = NODE_CALLBACK_PRI, + }; + int ret, i; BUILD_BUG_ON(ARRAY_SIZE(node_state_attr) != NR_NODE_STATES); BUILD_BUG_ON(ARRAY_SIZE(node_state_attrs)-1 != NR_NODE_STATES); ret = subsys_system_register(&node_subsys, cpu_root_attr_groups); - if (!ret) { - static struct notifier_block node_memory_callback_nb = { - .notifier_call = node_memory_callback, - .priority = NODE_CALLBACK_PRI, - }; - register_hotmemory_notifier(&node_memory_callback_nb); - } + if (ret) + panic("%s() failed to register subsystem: %d\n", __func__, ret); + + register_hotmemory_notifier(&node_memory_callback_nb); /* - * Note: we're not going to unregister the node class if we fail - * to register the node state class attribute files. + * Create all node devices, which will properly link the node + * to applicable memory block devices and already created cpu devices. */ - return ret; + for_each_online_node(i) { + ret = register_one_node(i); + if (ret) + panic("%s() failed to add node: %d\n", __func__, ret); + } } -postcore_initcall(register_node_type); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index acb1ad3c06035..4b55e864a0a34 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -637,9 +637,6 @@ enum { STATE_SENT, /* Do not change state/UUIDs while this is set */ CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC) * pending, from drbd worker context. - * If set, bdi_write_congested() returns true, - * so shrink_page_list() would not recurse into, - * and potentially deadlock on, this drbd worker. */ DISCONNECT_SENT, diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index c00ae8619519e..2f608525a8790 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -910,8 +910,7 @@ static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t se switch (rbm) { case RB_CONGESTED_REMOTE: - return bdi_read_congested( - device->ldev->backing_bdev->bd_disk->bdi); + return 0; case RB_LEAST_PENDING: return atomic_read(&device->local_cnt) > atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index d986f9ee0e1f4..8430f64757233 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -656,9 +656,12 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, migrate.vma = vma; migrate.start = start; migrate.end = end; - migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev); + if (adev->gmc.xgmi.connected_to_cpu) + migrate.flags = MIGRATE_VMA_SELECT_DEVICE_COHERENT; + else + migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; size = 2 * sizeof(*migrate.src) + sizeof(uint64_t) + sizeof(dma_addr_t); size *= npages; buf = kvmalloc(size, GFP_KERNEL | __GFP_ZERO); @@ -930,7 +933,7 @@ int svm_migrate_init(struct amdgpu_device *adev) { struct kfd_dev *kfddev = adev->kfd.dev; struct dev_pagemap *pgmap; - struct resource *res; + struct resource *res = NULL; unsigned long size; void *r; @@ -945,28 +948,34 @@ int svm_migrate_init(struct amdgpu_device *adev) * should remove reserved size */ size = ALIGN(adev->gmc.real_vram_size, 2ULL << 20); - res = devm_request_free_mem_region(adev->dev, &iomem_resource, size); - if (IS_ERR(res)) - return -ENOMEM; + if (adev->gmc.xgmi.connected_to_cpu) { + pgmap->range.start = adev->gmc.aper_base; + pgmap->range.end = adev->gmc.aper_base + adev->gmc.aper_size - 1; + pgmap->type = MEMORY_DEVICE_COHERENT; + } else { + res = devm_request_free_mem_region(adev->dev, &iomem_resource, size); + if (IS_ERR(res)) + return -ENOMEM; + pgmap->range.start = res->start; + pgmap->range.end = res->end; + pgmap->type = MEMORY_DEVICE_PRIVATE; + } - pgmap->type = MEMORY_DEVICE_PRIVATE; pgmap->nr_range = 1; - pgmap->range.start = res->start; - pgmap->range.end = res->end; pgmap->ops = &svm_migrate_pgmap_ops; pgmap->owner = SVM_ADEV_PGMAP_OWNER(adev); - pgmap->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; - + pgmap->flags = 0; /* Device manager releases device-specific resources, memory region and * pgmap when driver disconnects from device. */ r = devm_memremap_pages(adev->dev, pgmap); if (IS_ERR(r)) { pr_err("failed to register HMM device memory\n"); - /* Disable SVM support capability */ pgmap->type = 0; - devm_release_mem_region(adev->dev, res->start, resource_size(res)); + if (pgmap->type == MEMORY_DEVICE_PRIVATE) + devm_release_mem_region(adev->dev, res->start, + res->end - res->start + 1); return PTR_ERR(r); } @@ -979,3 +988,4 @@ int svm_migrate_init(struct amdgpu_device *adev) return 0; } + diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index e900e3c46903b..c5fa8b8673b6a 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -676,12 +676,12 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as, * allocate page in a sleeping context if GFP flags permit. Hence * spinlock needs to be unlocked and re-locked after allocation. */ - if (!(gfp & __GFP_ATOMIC)) + if (gfp & __GFP_DIRECT_RECLAIM) spin_unlock_irqrestore(&as->lock, *flags); page = alloc_page(gfp | __GFP_DMA | __GFP_ZERO); - if (!(gfp & __GFP_ATOMIC)) + if (gfp & __GFP_DIRECT_RECLAIM) spin_lock_irqsave(&as->lock, *flags); /* diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 605017eb9349e..e297682e2c713 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -93,7 +93,7 @@ static int elf_core_dump(struct coredump_params *cprm); #define ELF_CORE_EFLAGS 0 #endif -#define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1)) +#define ELF_PAGESTART(_v) ((_v) & ~(int)(ELF_MIN_ALIGN-1)) #define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1)) #define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) @@ -170,8 +170,8 @@ static int padzero(unsigned long elf_bss) static int create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, - unsigned long load_addr, unsigned long interp_load_addr, - unsigned long e_entry) + unsigned long interp_load_addr, + unsigned long e_entry, unsigned long phdr_addr) { struct mm_struct *mm = current->mm; unsigned long p = bprm->p; @@ -257,7 +257,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE); NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); - NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff); + NEW_AUX_ENT(AT_PHDR, phdr_addr); NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr)); NEW_AUX_ENT(AT_PHNUM, exec->e_phnum); NEW_AUX_ENT(AT_BASE, interp_load_addr); @@ -399,22 +399,21 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, return(map_addr); } -static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr) +static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr) { - int i, first_idx = -1, last_idx = -1; + elf_addr_t min_addr = -1; + elf_addr_t max_addr = 0; + bool pt_load = false; + int i; for (i = 0; i < nr; i++) { - if (cmds[i].p_type == PT_LOAD) { - last_idx = i; - if (first_idx == -1) - first_idx = i; + if (phdr[i].p_type == PT_LOAD) { + min_addr = min(min_addr, ELF_PAGESTART(phdr[i].p_vaddr)); + max_addr = max(max_addr, phdr[i].p_vaddr + phdr[i].p_memsz); + pt_load = true; } } - if (first_idx == -1) - return 0; - - return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz - - ELF_PAGESTART(cmds[first_idx].p_vaddr); + return pt_load ? (max_addr - min_addr) : 0; } static int elf_read(struct file *file, void *buf, size_t len, loff_t pos) @@ -823,8 +822,8 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ - unsigned long load_addr = 0, load_bias = 0; - int load_addr_set = 0; + unsigned long load_bias = 0, phdr_addr = 0; + int first_pt_load = 1; unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; struct elf_phdr *elf_property_phdata = NULL; @@ -1074,12 +1073,12 @@ static int load_elf_binary(struct linux_binprm *bprm) vaddr = elf_ppnt->p_vaddr; /* - * The first time through the loop, load_addr_set is false: + * The first time through the loop, first_pt_load is true: * layout will be calculated. Once set, use MAP_FIXED since * we know we've already safely mapped the entire region with * MAP_FIXED_NOREPLACE in the once-per-binary logic following. */ - if (load_addr_set) { + if (!first_pt_load) { elf_flags |= MAP_FIXED; } else if (elf_ex->e_type == ET_EXEC) { /* @@ -1117,7 +1116,7 @@ static int load_elf_binary(struct linux_binprm *bprm) * without MAP_FIXED nor MAP_FIXED_NOREPLACE). */ alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); - if (alignment > ELF_MIN_ALIGN) { + if (interpreter || alignment > ELF_MIN_ALIGN) { load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); @@ -1139,10 +1138,10 @@ static int load_elf_binary(struct linux_binprm *bprm) /* * Calculate the entire size of the ELF mapping (total_size). - * (Note that load_addr_set is set to true later once the + * (Note that first_pt_load is set to false later once the * initial mapping is performed.) */ - if (!load_addr_set) { + if (first_pt_load) { total_size = total_mapping_size(elf_phdata, elf_ex->e_phnum); if (!total_size) { @@ -1159,16 +1158,25 @@ static int load_elf_binary(struct linux_binprm *bprm) goto out_free_dentry; } - if (!load_addr_set) { - load_addr_set = 1; - load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset); + if (first_pt_load) { + first_pt_load = 0; if (elf_ex->e_type == ET_DYN) { load_bias += error - ELF_PAGESTART(load_bias + vaddr); - load_addr += load_bias; reloc_func_desc = load_bias; } } + + /* + * Figure out which segment in the file contains the Program + * Header table, and map to the associated memory address. + */ + if (elf_ppnt->p_offset <= elf_ex->e_phoff && + elf_ex->e_phoff < elf_ppnt->p_offset + elf_ppnt->p_filesz) { + phdr_addr = elf_ex->e_phoff - elf_ppnt->p_offset + + elf_ppnt->p_vaddr; + } + k = elf_ppnt->p_vaddr; if ((elf_ppnt->p_flags & PF_X) && k < start_code) start_code = k; @@ -1204,6 +1212,7 @@ static int load_elf_binary(struct linux_binprm *bprm) } e_entry = elf_ex->e_entry + load_bias; + phdr_addr += load_bias; elf_bss += load_bias; elf_brk += load_bias; start_code += load_bias; @@ -1267,8 +1276,8 @@ static int load_elf_binary(struct linux_binprm *bprm) goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ - retval = create_elf_tables(bprm, elf_ex, - load_addr, interp_load_addr, e_entry); + retval = create_elf_tables(bprm, elf_ex, interp_load_addr, + e_entry, phdr_addr); if (retval < 0) goto out; diff --git a/fs/buffer.c b/fs/buffer.c index a17c386a142c7..571785c80f688 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -956,10 +956,20 @@ grow_dev_page(struct block_device *bdev, sector_t block, end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits, size); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x01; +#endif goto done; } - if (!try_to_free_buffers(page)) + if (!try_to_free_buffers(page)) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x02; +#endif goto failed; + } +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x04; +#endif } /* @@ -979,6 +989,9 @@ grow_dev_page(struct block_device *bdev, sector_t block, spin_unlock(&inode->i_mapping->private_lock); done: ret = (block < end_block) ? 1 : -ENXIO; +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x08; +#endif failed: unlock_page(page); put_page(page); @@ -1030,6 +1043,12 @@ __getblk_slow(struct block_device *bdev, sector_t block, return NULL; } +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_stamp = jiffies; + current->getblk_executed = 0; + current->getblk_bh_count = 0; + current->getblk_bh_state = 0; +#endif for (;;) { struct buffer_head *bh; int ret; @@ -1041,6 +1060,24 @@ __getblk_slow(struct block_device *bdev, sector_t block, ret = grow_buffers(bdev, block, size, gfp); if (ret < 0) return NULL; + +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + if (!time_after(jiffies, current->getblk_stamp + 3 * HZ)) + continue; + printk(KERN_ERR "%s(%u): getblk(): executed=%x bh_count=%d bh_state=%lx bdev_super_blocksize=%ld size=%u bdev_super_blocksize_bits=%d bdev_inode_blkbits=%d\n", + current->comm, current->pid, current->getblk_executed, + current->getblk_bh_count, current->getblk_bh_state, + IS_ERR_OR_NULL(bdev->bd_super) ? -1L : + bdev->bd_super->s_blocksize, size, + IS_ERR_OR_NULL(bdev->bd_super) ? -1 : + bdev->bd_super->s_blocksize_bits, + IS_ERR_OR_NULL(bdev->bd_inode) ? -1 : + bdev->bd_inode->i_blkbits); + current->getblk_executed = 0; + current->getblk_bh_count = 0; + current->getblk_bh_state = 0; + current->getblk_stamp = jiffies; +#endif } } @@ -3190,6 +3227,11 @@ EXPORT_SYMBOL(sync_dirty_buffer); */ static inline int buffer_busy(struct buffer_head *bh) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x80; + current->getblk_bh_count = atomic_read(&bh->b_count); + current->getblk_bh_state = bh->b_state; +#endif return atomic_read(&bh->b_count) | (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); } @@ -3228,11 +3270,18 @@ int try_to_free_buffers(struct page *page) int ret = 0; BUG_ON(!PageLocked(page)); - if (PageWriteback(page)) + if (PageWriteback(page)) { +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x10; +#endif return 0; + } if (mapping == NULL) { /* can this still happen? */ ret = drop_buffers(page, &buffers_to_free); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x20; +#endif goto out; } @@ -3256,6 +3305,9 @@ int try_to_free_buffers(struct page *page) if (ret) cancel_dirty_page(page); spin_unlock(&mapping->private_lock); +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + current->getblk_executed |= 0x40; +#endif out: if (buffers_to_free) { struct buffer_head *bh = buffers_to_free; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c98e5238a1b6a..9147667f8cd55 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -57,11 +57,6 @@ * accounting is preserved. */ -#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10)) -#define CONGESTION_OFF_THRESH(congestion_kb) \ - (CONGESTION_ON_THRESH(congestion_kb) - \ - (CONGESTION_ON_THRESH(congestion_kb) >> 2)) - static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, struct folio *folio, void **_fsdata); @@ -561,10 +556,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n", inode, page, page->index, page_off, len, snapc, snapc->seq); - if (atomic_long_inc_return(&fsc->writeback_count) > - CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) - set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); - req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, ceph_wbc.truncate_seq, ceph_wbc.truncate_size, @@ -621,10 +612,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); /* page's reference */ - if (atomic_long_dec_return(&fsc->writeback_count) < - CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) - clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); - return err; } @@ -704,12 +691,6 @@ static void writepages_finish(struct ceph_osd_request *req) BUG_ON(!page); WARN_ON(!PageUptodate(page)); - if (atomic_long_dec_return(&fsc->writeback_count) < - CONGESTION_OFF_THRESH( - fsc->mount_options->congestion_kb)) - clear_bdi_congested(inode_to_bdi(inode), - BLK_RW_ASYNC); - ceph_put_snap_context(detach_page_private(page)); end_page_writeback(page); dout("unlocking %p\n", page); @@ -952,14 +933,6 @@ static int ceph_writepages_start(struct address_space *mapping, dout("%p will write page %p idx %lu\n", inode, page, page->index); - if (atomic_long_inc_return(&fsc->writeback_count) > - CONGESTION_ON_THRESH( - fsc->mount_options->congestion_kb)) { - set_bdi_congested(inode_to_bdi(inode), - BLK_RW_ASYNC); - } - - pages[locked_pages++] = page; pvec.pages[i] = NULL; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index bf79f369aec68..b2f38af9fca83 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -801,8 +801,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, fsc->filp_gen = 1; fsc->have_copy_from2 = true; - atomic_long_set(&fsc->writeback_count, 0); - err = -ENOMEM; /* * The number of concurrent works can be high but they don't need diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 67f145e1ae7a3..fc58adf1d36ae 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -120,8 +120,6 @@ struct ceph_fs_client { struct ceph_mds_client *mdsc; - atomic_long_t writeback_count; - struct workqueue_struct *inode_wq; struct workqueue_struct *cap_wq; diff --git a/fs/coredump.c b/fs/coredump.c index 1c060c0a2d72f..b73817712dd25 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -980,6 +981,8 @@ static bool always_dump_vma(struct vm_area_struct *vma) return false; } +#define DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER 1 + /* * Decide how much of @vma's contents should be included in a core dump. */ @@ -1039,9 +1042,20 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, * dump the first page to aid in determining what was mapped here. */ if (FILTER(ELF_HEADERS) && - vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ) && - (READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0) - return PAGE_SIZE; + vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) { + if ((READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0) + return PAGE_SIZE; + + /* + * ELF libraries aren't always executable. + * We'll want to check whether the mapping starts with the ELF + * magic, but not now - we're holding the mmap lock, + * so copy_from_user() doesn't work here. + * Use a placeholder instead, and fix it up later in + * dump_vma_snapshot(). + */ + return DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER; + } #undef FILTER @@ -1116,8 +1130,6 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, m->end = vma->vm_end; m->flags = vma->vm_flags; m->dump_size = vma_dump_size(vma, cprm->mm_flags); - - vma_data_size += m->dump_size; } mmap_write_unlock(mm); @@ -1127,6 +1139,23 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, return -EFAULT; } + for (i = 0; i < *vma_count; i++) { + struct core_vma_metadata *m = (*vma_meta) + i; + + if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) { + char elfmag[SELFMAG]; + + if (copy_from_user(elfmag, (void __user *)m->start, SELFMAG) || + memcmp(elfmag, ELFMAG, SELFMAG) != 0) { + m->dump_size = 0; + } else { + m->dump_size = PAGE_SIZE; + } + } + + vma_data_size += m->dump_size; + } + *vma_data_size_ptr = vma_data_size; return 0; } diff --git a/fs/exec.c b/fs/exec.c index 79f2c9483302d..40b1008fb0f79 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -495,8 +495,14 @@ static int bprm_stack_limits(struct linux_binprm *bprm) * the stack. They aren't stored until much later when we can't * signal to the parent that the child has run out of stack space. * Instead, calculate it here so it's possible to fail gracefully. + * + * In the case of argc = 0, make sure there is space for adding a + * empty string (which will bump argc to 1), to ensure confused + * userspace programs don't start processing from argv[1], thinking + * argc can never be 0, to keep them from walking envp by accident. + * See do_execveat_common(). */ - ptr_size = (bprm->argc + bprm->envc) * sizeof(void *); + ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *); if (limit <= ptr_size) return -E2BIG; limit -= ptr_size; @@ -1897,6 +1903,9 @@ static int do_execveat_common(int fd, struct filename *filename, } retval = count(argv, MAX_ARG_STRINGS); + if (retval == 0) + pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", + current->comm, bprm->filename); if (retval < 0) goto out_free; bprm->argc = retval; @@ -1923,6 +1932,19 @@ static int do_execveat_common(int fd, struct filename *filename, if (retval < 0) goto out_free; + /* + * When argv is empty, add an empty string ("") as argv[0] to + * ensure confused userspace programs that start processing + * from argv[1] won't end up walking envp. See also + * bprm_stack_limits(). + */ + if (bprm->argc == 0) { + retval = copy_string_kernel("", bprm); + if (retval < 0) + goto out_free; + bprm->argc = 1; + } + retval = bprm_execve(bprm, fd, filename, flags); out_free: free_bprm(bprm); @@ -1951,6 +1973,8 @@ int kernel_execve(const char *kernel_filename, } retval = count_strings_kernel(argv); + if (WARN_ON_ONCE(retval == 0)) + retval = -EINVAL; if (retval < 0) goto out_free; bprm->argc = retval; diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index df14e750e9fe3..998dd2ac80089 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -170,11 +170,6 @@ static void ext2_preread_inode(struct inode *inode) unsigned long offset; unsigned long block; struct ext2_group_desc * gdp; - struct backing_dev_info *bdi; - - bdi = inode_to_bdi(inode); - if (bdi_rw_congested(bdi)) - return; block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); gdp = ext2_get_group_desc(inode->i_sb, block_group, NULL); diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 67bac2792e571..54481b1d569a0 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1505,9 +1505,9 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, if (IS_NOQUOTA(cc->inode)) return 0; ret = 0; - cond_resched(); - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + /* Wait until we can get the lock, then try again. */ + f2fs_lock_op(F2FS_I_SB(cc->inode)); + f2fs_unlock_op(F2FS_I_SB(cc->inode)); goto retry_write; } return ret; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 55263dfa3504c..45037353fee34 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3052,9 +3052,12 @@ static int f2fs_write_cache_pages(struct address_space *mapping, } else if (ret == -EAGAIN) { ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + /* Wait until we can get the + * lock, then try again. + */ + f2fs_lock_op(F2FS_I_SB(mapping->host)); + f2fs_unlock_op(F2FS_I_SB(mapping->host)); + goto retry_write; } goto next; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 56211e201d51b..78acaac9433d5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -313,8 +313,8 @@ void f2fs_drop_inmem_pages_all(struct f2fs_sb_info *sbi, bool gc_failure) skip: iput(inode); } - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); - cond_resched(); + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(DEFAULT_IO_TIMEOUT); if (gc_failure) { if (++looped >= count) return; @@ -802,9 +802,10 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) do { ret = __submit_flush_wait(sbi, FDEV(i).bdev); - if (ret) - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + if (ret) { + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(DEFAULT_IO_TIMEOUT); + } } while (ret && --count); if (ret) { @@ -3137,7 +3138,8 @@ static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi, blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); trimmed += __wait_all_discard_cmd(sbi, NULL); - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto next; } skip: diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b9c22648a6365..56b78ae38a3d5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2136,8 +2136,8 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) /* we should flush all the data to keep data consistency */ do { sync_inodes_sb(sbi->sb); - cond_resched(); - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(DEFAULT_IO_TIMEOUT); } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--); if (unlikely(retry < 0)) @@ -2505,8 +2505,8 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, &page, &fsdata); if (unlikely(err)) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 33d54c9fbefc0..01843bc5fef18 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -893,43 +893,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, } EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner); -/** - * inode_congested - test whether an inode is congested - * @inode: inode to test for congestion (may be NULL) - * @cong_bits: mask of WB_[a]sync_congested bits to test - * - * Tests whether @inode is congested. @cong_bits is the mask of congestion - * bits to test and the return value is the mask of set bits. - * - * If cgroup writeback is enabled for @inode, the congestion state is - * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg - * associated with @inode is congested; otherwise, the root wb's congestion - * state is used. - * - * @inode is allowed to be NULL as this function is often called on - * mapping->host which is NULL for the swapper space. - */ -int inode_congested(struct inode *inode, int cong_bits) -{ - /* - * Once set, ->i_wb never becomes NULL while the inode is alive. - * Start transaction iff ->i_wb is visible. - */ - if (inode && inode_to_wb_is_valid(inode)) { - struct bdi_writeback *wb; - struct wb_lock_cookie lock_cookie = {}; - bool congested; - - wb = unlocked_inode_to_wb_begin(inode, &lock_cookie); - congested = wb_congested(wb, cong_bits); - unlocked_inode_to_wb_end(inode, &lock_cookie); - return congested; - } - - return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); -} -EXPORT_SYMBOL_GPL(inode_congested); - /** * wb_split_bdi_pages - split nr_pages to write according to bandwidth * @wb: target bdi_writeback to split @nr_pages to diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 000d2e5627e99..7cede9a3bc962 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -164,7 +164,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, { unsigned val; struct fuse_conn *fc; - struct fuse_mount *fm; ssize_t ret; ret = fuse_conn_limit_write(file, buf, count, ppos, &val, @@ -178,22 +177,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, down_read(&fc->killsb); spin_lock(&fc->bg_lock); fc->congestion_threshold = val; - - /* - * Get any fuse_mount belonging to this fuse_conn; s_bdi is - * shared between all of them - */ - - if (!list_empty(&fc->mounts)) { - fm = list_first_entry(&fc->mounts, struct fuse_mount, fc_entry); - if (fc->num_background < fc->congestion_threshold) { - clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); - clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); - } else { - set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); - } - } spin_unlock(&fc->bg_lock); up_read(&fc->killsb); fuse_conn_put(fc); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index cd54a529460da..e1b4a846c90d1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -315,10 +315,6 @@ void fuse_request_end(struct fuse_req *req) wake_up(&fc->blocked_waitq); } - if (fc->num_background == fc->congestion_threshold && fm->sb) { - clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); - clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); - } fc->num_background--; fc->active_background--; flush_bg_queue(fc); @@ -540,10 +536,6 @@ static bool fuse_request_queue_background(struct fuse_req *req) fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; - if (fc->num_background == fc->congestion_threshold && fm->sb) { - set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); - } list_add_tail(&req->list, &fc->bg_queue); flush_bg_queue(fc); queued = true; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index a71f1cf894b9f..d4bd94234ef73 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -447,7 +447,8 @@ static const struct address_space_operations minix_aops = { .writepage = minix_writepage, .write_begin = minix_write_begin, .write_end = generic_write_end, - .bmap = minix_bmap + .bmap = minix_bmap, + .direct_IO = noop_direct_IO }; static const struct inode_operations minix_symlink_inode_operations = { diff --git a/fs/namei.c b/fs/namei.c index 3f1829b3ab5b7..e596aabd6dc5c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1031,7 +1031,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -1040,7 +1040,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_hardlinks", .data = &sysctl_protected_hardlinks, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -1049,7 +1049,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_fifos", .data = &sysctl_protected_fifos, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, @@ -1058,7 +1058,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_regular", .data = &sysctl_protected_regular, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, diff --git a/fs/namespace.c b/fs/namespace.c index c6feb92209a6f..fec0f79aa2eb7 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2583,6 +2583,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * struct super_block *sb = mnt->mnt_sb; if (!__mnt_is_readonly(mnt) && + (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) && (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { char *buf = (char *)__get_free_page(GFP_KERNEL); char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM); @@ -2597,6 +2598,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * tm.tm_year+1900, (unsigned long long)sb->s_time_max); free_page((unsigned long)buf); + sb->s_iflags |= SB_I_TS_EXPIRY_WARNED; } } diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index 7aea195ddb353..18f3ff77fd0c2 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -22,13 +22,6 @@ static struct ctl_table nfs_cb_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .procname = "nfs_congestion_kb", - .data = &nfs_congestion_kb, - .maxlen = sizeof(nfs_congestion_kb), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { } }; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 987a187bd39aa..1c22ea6f23c3a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -397,33 +397,8 @@ static int wb_priority(struct writeback_control *wbc) return ret; } -/* - * NFS congestion control - */ - -int nfs_congestion_kb; - -#define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10)) -#define NFS_CONGESTION_OFF_THRESH \ - (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) - -static void nfs_set_page_writeback(struct page *page) -{ - struct inode *inode = page_file_mapping(page)->host; - struct nfs_server *nfss = NFS_SERVER(inode); - int ret = test_set_page_writeback(page); - - WARN_ON_ONCE(ret != 0); - - if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) - set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); -} - static void nfs_end_page_writeback(struct nfs_page *req) { - struct inode *inode = page_file_mapping(req->wb_page)->host; - struct nfs_server *nfss = NFS_SERVER(inode); bool is_done; is_done = nfs_page_group_sync_on_bit(req, PG_WB_END); @@ -432,8 +407,6 @@ static void nfs_end_page_writeback(struct nfs_page *req) return; end_page_writeback(req->wb_page); - if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); } /* @@ -617,7 +590,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, if (IS_ERR(req)) goto out; - nfs_set_page_writeback(page); + set_page_writeback(page); WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); /* If there is a fatal error that covers this write, just exit */ @@ -1850,7 +1823,6 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) struct nfs_page *req; int status = data->task.tk_status; struct nfs_commit_info cinfo; - struct nfs_server *nfss; while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); @@ -1891,9 +1863,6 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* Latency breaker */ cond_resched(); } - nfss = NFS_SERVER(data->inode); - if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(inode_to_bdi(data->inode), BLK_RW_ASYNC); nfs_init_cinfo(&cinfo, data->inode, data->dreq); nfs_commit_end(cinfo.mds); @@ -2162,26 +2131,6 @@ int __init nfs_init_writepagecache(void) if (nfs_commit_mempool == NULL) goto out_destroy_commit_cache; - /* - * NFS congestion size, scale with available memory. - * - * 64MB: 8192k - * 128MB: 11585k - * 256MB: 16384k - * 512MB: 23170k - * 1GB: 32768k - * 2GB: 46340k - * 4GB: 65536k - * 8GB: 92681k - * 16GB: 131072k - * - * This allows larger machines to have larger/more transfers. - * Limit the default to 256M - */ - nfs_congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10); - if (nfs_congestion_kb > 256*1024) - nfs_congestion_kb = 256*1024; - return 0; out_destroy_commit_cache: diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 4f71faacd8253..9e5dd6324ea1e 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -343,17 +343,6 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, struct bio *bio = wi->bio; int err; - if (segbuf->sb_nbio > 0 && - bdi_write_congested(segbuf->sb_super->s_bdi)) { - wait_for_completion(&segbuf->sb_bio_event); - segbuf->sb_nbio--; - if (unlikely(atomic_read(&segbuf->sb_err))) { - bio_put(bio); - err = -EIO; - goto failed; - } - } - bio->bi_end_io = nilfs_end_bio_write; bio->bi_private = segbuf; bio_set_op_attrs(bio, mode, mode_flags); @@ -366,7 +355,6 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, wi->start = wi->end; return 0; - failed: wi->bio = NULL; return err; } diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 4474adb393ca8..517b71c73aa96 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1881,6 +1881,10 @@ int ntfs_read_inode_mount(struct inode *vi) } /* Now allocate memory for the attribute list. */ ni->attr_list_size = (u32)ntfs_attr_size(a); + if (!ni->attr_list_size) { + ntfs_error(sb, "Attr_list_size is zero"); + goto put_err_out; + } ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); if (!ni->attr_list) { ntfs_error(sb, "Not enough memory to allocate buffer " diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index fc5f780fa2355..24321c44cd42e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -540,15 +540,12 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret) { - int ret; struct ocfs2_extent_tree et; ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh); - ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset, - clusters_to_add, mark_unwritten, - data_ac, meta_ac, reason_ret); - - return ret; + return ocfs2_add_clusters_in_btree(handle, &et, logical_offset, + clusters_to_add, mark_unwritten, + data_ac, meta_ac, reason_ret); } static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2c46ff6ba4ea2..0939186f010e3 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -453,8 +453,12 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns, leave: if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && new_fe_bh != NULL) + ocfs2_set_links_count((struct ocfs2_dinode *) + new_fe_bh->b_data, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) @@ -598,6 +602,8 @@ static int __ocfs2_mknod_locked(struct inode *dir, leave: if (status < 0) { if (*new_fe_bh) { + if (fe) + ocfs2_set_links_count(fe, 0); brelse(*new_fe_bh); *new_fe_bh = NULL; } @@ -634,7 +640,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, parent_fe_bh, handle, inode_ac, fe_blkno, suballoc_loc, suballoc_bit); - if (status < 0) { + if (status < 0 && !(OCFS2_I(inode)->ip_inode_lockres.l_flags & + OCFS2_LOCK_INITIALIZED)) { u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit); int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode, inode_ac->ac_bh, suballoc_bit, bg_blkno, 1); @@ -2027,8 +2034,12 @@ static int ocfs2_symlink(struct user_namespace *mnt_userns, ocfs2_clusters_to_bytes(osb->sb, 1)); if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && new_fe_bh != NULL) + ocfs2_set_links_count((struct ocfs2_dinode *) + new_fe_bh->b_data, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) @@ -2489,6 +2500,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir, } int ocfs2_create_inode_in_orphan(struct inode *dir, + struct buffer_head **dir_bh, int mode, struct inode **new_inode) { @@ -2597,13 +2609,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, brelse(new_di_bh); - if (!status) - *new_inode = inode; - ocfs2_free_dir_lookup_result(&orphan_insert); - ocfs2_inode_unlock(dir, 1); - brelse(parent_di_bh); + if (!status) { + *new_inode = inode; + *dir_bh = parent_di_bh; + } else { + ocfs2_inode_unlock(dir, 1); + brelse(parent_di_bh); + } + return status; } @@ -2760,11 +2775,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, } int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, struct dentry *dentry) { int status = 0; - struct buffer_head *parent_di_bh = NULL; handle_t *handle = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_dinode *dir_di, *di; @@ -2778,14 +2793,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, (unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_inode_lock(dir, &parent_di_bh, 1); - if (status < 0) { - if (status != -ENOENT) - mlog_errno(status); - return status; - } - - dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data; + dir_di = (struct ocfs2_dinode *) dir_bh->b_data; if (!dir_di->i_links_count) { /* can't make a file in a deleted directory. */ status = -ENOENT; @@ -2798,7 +2806,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto leave; /* get a spot inside the dir. */ - status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh, + status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh, dentry->d_name.name, dentry->d_name.len, &lookup); if (status < 0) { @@ -2862,7 +2870,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, ocfs2_journal_dirty(handle, di_bh); status = ocfs2_add_entry(handle, dentry, inode, - OCFS2_I(inode)->ip_blkno, parent_di_bh, + OCFS2_I(inode)->ip_blkno, dir_bh, &lookup); if (status < 0) { mlog_errno(status); @@ -2886,10 +2894,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, iput(orphan_dir_inode); leave: - ocfs2_inode_unlock(dir, 1); - brelse(di_bh); - brelse(parent_di_bh); brelse(orphan_dir_bh); ocfs2_free_dir_lookup_result(&lookup); diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index 9cc891eb874e0..03a2c526e2c1b 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, struct buffer_head *orphan_dir_bh, bool dio); int ocfs2_create_inode_in_orphan(struct inode *dir, + struct buffer_head **dir_bh, int mode, struct inode **new_inode); int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *di_bh, int update_isize, loff_t end); int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct buffer_head *dir_bh, struct inode *new_inode, struct dentry *new_dentry); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7f6355cbb5875..a9a0c7c37e8ed 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4250,7 +4250,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, { int error, had_lock; struct inode *inode = d_inode(old_dentry); - struct buffer_head *old_bh = NULL; + struct buffer_head *old_bh = NULL, *dir_bh = NULL; struct inode *new_orphan_inode = NULL; struct ocfs2_lock_holder oh; @@ -4258,7 +4258,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, return -EOPNOTSUPP; - error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, + error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode, &new_orphan_inode); if (error) { mlog_errno(error); @@ -4304,13 +4304,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { - error = ocfs2_init_security_and_acl(dir, new_orphan_inode, + error = ocfs2_init_security_and_acl(dir, dir_bh, + new_orphan_inode, &new_dentry->d_name); if (error) mlog_errno(error); } if (!error) { - error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, + error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh, + new_orphan_inode, new_dentry); if (error) mlog_errno(error); @@ -4328,6 +4330,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, iput(new_orphan_inode); } + if (dir_bh) { + ocfs2_inode_unlock(dir, 1); + brelse(dir_bh); + } + return error; } diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 85a47621e0c07..a75e2b7d67f56 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -683,28 +683,22 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn, void *name, unsigned int namelen) { - int ret; - if (!lksb->lksb_fsdlm.sb_lvbptr) lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + sizeof(struct dlm_lksb); - ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, - flags|DLM_LKF_NODLCKWT, name, namelen, 0, - fsdlm_lock_ast_wrapper, lksb, - fsdlm_blocking_ast_wrapper); - return ret; + return dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, + flags|DLM_LKF_NODLCKWT, name, namelen, 0, + fsdlm_lock_ast_wrapper, lksb, + fsdlm_blocking_ast_wrapper); } static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, struct ocfs2_dlm_lksb *lksb, u32 flags) { - int ret; - - ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, - flags, &lksb->lksb_fsdlm, lksb); - return ret; + return dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, + flags, &lksb->lksb_fsdlm, lksb); } static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index dd784eb0cd7c4..3f23e3a5018ce 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, /* * Initialize security and acl for a already created inode. * Used for reflink a non-preserve-security file. - * - * It uses common api like ocfs2_xattr_set, so the caller - * must not hold any lock expect i_mutex. */ int ocfs2_init_security_and_acl(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, const struct qstr *qstr) { int ret = 0; - struct buffer_head *dir_bh = NULL; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (ret) { @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir, goto leave; } - ret = ocfs2_inode_lock(dir, &dir_bh, 0); - if (ret) { - mlog_errno(ret); - goto leave; - } ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); if (ret) mlog_errno(ret); - ocfs2_inode_unlock(dir, 0); - brelse(dir_bh); leave: return ret; } diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 00308b57f64f1..b27fd8ba00196 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, struct buffer_head *new_bh, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, const struct qstr *qstr); #endif /* OCFS2_XATTR_H */ diff --git a/fs/pipe.c b/fs/pipe.c index cc28623a67b61..71946832e33f9 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -606,7 +606,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe = filp->private_data; - int count, head, tail, mask; + unsigned int count, head, tail, mask; switch (cmd) { case FIONREAD: @@ -803,7 +803,7 @@ struct pipe_inode_info *alloc_pipe_info(void) if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user()) goto out_revert_acct; - pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), + pipe->bufs = kvcalloc(pipe_bufs, sizeof(struct pipe_buffer), GFP_KERNEL_ACCOUNT); if (pipe->bufs) { @@ -828,7 +828,7 @@ struct pipe_inode_info *alloc_pipe_info(void) void free_pipe_info(struct pipe_inode_info *pipe) { - int i; + unsigned int i; #ifdef CONFIG_WATCH_QUEUE if (pipe->watch_queue) { @@ -846,7 +846,7 @@ void free_pipe_info(struct pipe_inode_info *pipe) } if (pipe->tmp_page) __free_page(pipe->tmp_page); - kfree(pipe->bufs); + kvfree(pipe->bufs); kfree(pipe); } @@ -1261,8 +1261,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) if (nr_slots < n) return -EBUSY; - bufs = kcalloc(nr_slots, sizeof(*bufs), - GFP_KERNEL_ACCOUNT | __GFP_NOWARN); + bufs = kvcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT); if (unlikely(!bufs)) return -ENOMEM; @@ -1289,7 +1288,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) head = n; tail = 0; - kfree(pipe->bufs); + kvfree(pipe->bufs); pipe->bufs = bufs; pipe->ring_size = nr_slots; if (pipe->max_usage > nr_slots) diff --git a/fs/proc/base.c b/fs/proc/base.c index d654ce7150fdd..76bf1aa3cfe88 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1764,25 +1764,25 @@ static const char *proc_pid_get_link(struct dentry *dentry, static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) { - char *tmp = (char *)__get_free_page(GFP_KERNEL); + char *tmp = kmalloc(PATH_MAX, GFP_KERNEL); char *pathname; int len; if (!tmp) return -ENOMEM; - pathname = d_path(path, tmp, PAGE_SIZE); + pathname = d_path(path, tmp, PATH_MAX); len = PTR_ERR(pathname); if (IS_ERR(pathname)) goto out; - len = tmp + PAGE_SIZE - 1 - pathname; + len = tmp + PATH_MAX - 1 - pathname; if (len > buflen) len = buflen; if (copy_to_user(buffer, pathname, len)) len = -EFAULT; out: - free_page((unsigned long)tmp); + kfree(tmp); return len; } diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 172c86270b312..913bef0d2a36c 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -72,7 +72,7 @@ static int seq_show(struct seq_file *m, void *v) return 0; } -static int seq_fdinfo_open(struct inode *inode, struct file *file) +static int proc_fdinfo_access_allowed(struct inode *inode) { bool allowed = false; struct task_struct *task = get_proc_task(inode); @@ -86,6 +86,16 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file) if (!allowed) return -EACCES; + return 0; +} + +static int seq_fdinfo_open(struct inode *inode, struct file *file) +{ + int ret = proc_fdinfo_access_allowed(inode); + + if (ret) + return ret; + return single_open(file, seq_show, inode); } @@ -348,12 +358,23 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx) proc_fdinfo_instantiate); } +static int proc_open_fdinfo(struct inode *inode, struct file *file) +{ + int ret = proc_fdinfo_access_allowed(inode); + + if (ret) + return ret; + + return 0; +} + const struct inode_operations proc_fdinfo_inode_operations = { .lookup = proc_lookupfdinfo, .setattr = proc_setattr, }; const struct file_operations proc_fdinfo_operations = { + .open = proc_open_fdinfo, .read = generic_read_dir, .iterate_shared = proc_readfdinfo, .llseek = generic_file_llseek, diff --git a/fs/proc/page.c b/fs/proc/page.c index 9f1077d94cde1..4dcbcd506cb6e 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -115,7 +115,10 @@ u64 stable_page_flags(struct page *page) * it differentiates a memory hole from a page with no flags */ if (!page) - return 1 << KPF_NOPAGE; + return BIT_ULL(KPF_NOPAGE); + + if (pfn_zone_device_reserved(page_to_pfn(page))) + return BIT_ULL(KPF_RESERVED); k = page->flags; u = 0; @@ -127,22 +130,22 @@ u64 stable_page_flags(struct page *page) * simple test in page_mapped() is not enough. */ if (!PageSlab(page) && page_mapped(page)) - u |= 1 << KPF_MMAP; + u |= BIT_ULL(KPF_MMAP); if (PageAnon(page)) - u |= 1 << KPF_ANON; + u |= BIT_ULL(KPF_ANON); if (PageKsm(page)) - u |= 1 << KPF_KSM; + u |= BIT_ULL(KPF_KSM); /* * compound pages: export both head/tail info * they together define a compound page's start/end pos and order */ if (PageHead(page)) - u |= 1 << KPF_COMPOUND_HEAD; + u |= BIT_ULL(KPF_COMPOUND_HEAD); if (PageTail(page)) - u |= 1 << KPF_COMPOUND_TAIL; + u |= BIT_ULL(KPF_COMPOUND_TAIL); if (PageHuge(page)) - u |= 1 << KPF_HUGE; + u |= BIT_ULL(KPF_HUGE); /* * PageTransCompound can be true for non-huge compound pages (slab * pages or pages allocated by drivers with __GFP_COMP) because it @@ -153,14 +156,13 @@ u64 stable_page_flags(struct page *page) struct page *head = compound_head(page); if (PageLRU(head) || PageAnon(head)) - u |= 1 << KPF_THP; + u |= BIT_ULL(KPF_THP); else if (is_huge_zero_page(head)) { - u |= 1 << KPF_ZERO_PAGE; - u |= 1 << KPF_THP; + u |= BIT_ULL(KPF_ZERO_PAGE); + u |= BIT_ULL(KPF_THP); } } else if (is_zero_pfn(page_to_pfn(page))) - u |= 1 << KPF_ZERO_PAGE; - + u |= BIT_ULL(KPF_ZERO_PAGE); /* * Caveats on high order pages: page->_refcount will only be set @@ -168,23 +170,23 @@ u64 stable_page_flags(struct page *page) * SLOB won't set PG_slab at all on compound pages. */ if (PageBuddy(page)) - u |= 1 << KPF_BUDDY; + u |= BIT_ULL(KPF_BUDDY); else if (page_count(page) == 0 && is_free_buddy_page(page)) - u |= 1 << KPF_BUDDY; + u |= BIT_ULL(KPF_BUDDY); if (PageOffline(page)) - u |= 1 << KPF_OFFLINE; + u |= BIT_ULL(KPF_OFFLINE); if (PageTable(page)) - u |= 1 << KPF_PGTABLE; + u |= BIT_ULL(KPF_PGTABLE); if (page_is_idle(page)) - u |= 1 << KPF_IDLE; + u |= BIT_ULL(KPF_IDLE); u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); if (PageTail(page) && PageSlab(compound_head(page))) - u |= 1 << KPF_SLAB; + u |= BIT_ULL(KPF_SLAB); u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); @@ -197,7 +199,7 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim); if (PageSwapCache(page)) - u |= 1 << KPF_SWAPCACHE; + u |= BIT_ULL(KPF_SWAPCACHE); u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 18f8c3acbb85e..6e97ed775074c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -440,7 +440,8 @@ static void smaps_page_accumulate(struct mem_size_stats *mss, } static void smaps_account(struct mem_size_stats *mss, struct page *page, - bool compound, bool young, bool dirty, bool locked) + bool compound, bool young, bool dirty, bool locked, + bool migration) { int i, nr = compound ? compound_nr(page) : 1; unsigned long size = nr * PAGE_SIZE; @@ -467,8 +468,15 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, * page_count(page) == 1 guarantees the page is mapped exactly once. * If any subpage of the compound page mapped with PTE it would elevate * page_count(). + * + * The page_mapcount() is called to get a snapshot of the mapcount. + * Without holding the page lock this snapshot can be slightly wrong as + * we cannot always read the mapcount atomically. It is not safe to + * call page_mapcount() even with PTL held if the page is not mapped, + * especially for migration entries. Treat regular migration entries + * as mapcount == 1. */ - if (page_count(page) == 1) { + if ((page_count(page) == 1) || migration) { smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty, locked, true); return; @@ -517,6 +525,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; + bool migration = false; if (pte_present(*pte)) { page = vm_normal_page(vma, addr, *pte); @@ -536,8 +545,11 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } else { mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; } - } else if (is_pfn_swap_entry(swpent)) + } else if (is_pfn_swap_entry(swpent)) { + if (is_migration_entry(swpent)) + migration = true; page = pfn_swap_entry_to_page(swpent); + } } else { smaps_pte_hole_lookup(addr, walk); return; @@ -546,7 +558,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (!page) return; - smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked); + smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), + locked, migration); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -557,6 +570,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; + bool migration = false; if (pmd_present(*pmd)) { /* FOLL_DUMP will return -EFAULT on huge zero page */ @@ -564,8 +578,10 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { swp_entry_t entry = pmd_to_swp_entry(*pmd); - if (is_migration_entry(entry)) + if (is_migration_entry(entry)) { + migration = true; page = pfn_swap_entry_to_page(entry); + } } if (IS_ERR_OR_NULL(page)) return; @@ -577,7 +593,9 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, /* pass */; else mss->file_thp += HPAGE_PMD_SIZE; - smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked); + + smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), + locked, migration); } #else static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, @@ -1378,6 +1396,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, { u64 frame = 0, flags = 0; struct page *page = NULL; + bool migration = false; if (pte_present(pte)) { if (pm->show_pfn) @@ -1399,13 +1418,14 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); flags |= PM_SWAP; + migration = is_migration_entry(entry); if (is_pfn_swap_entry(entry)) page = pfn_swap_entry_to_page(entry); } if (page && !PageAnon(page)) flags |= PM_FILE; - if (page && page_mapcount(page) == 1) + if (page && !migration && page_mapcount(page) == 1) flags |= PM_MMAP_EXCLUSIVE; if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; @@ -1421,8 +1441,9 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, spinlock_t *ptl; pte_t *pte, *orig_pte; int err = 0; - #ifdef CONFIG_TRANSPARENT_HUGEPAGE + bool migration = false; + ptl = pmd_trans_huge_lock(pmdp, vma); if (ptl) { u64 flags = 0, frame = 0; @@ -1461,11 +1482,12 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; VM_BUG_ON(!is_pmd_migration_entry(pmd)); + migration = is_migration_entry(entry); page = pfn_swap_entry_to_page(entry); } #endif - if (page && page_mapcount(page) == 1) + if (page && !migration && page_mapcount(page) == 1) flags |= PM_MMAP_EXCLUSIVE; for (; addr != end; addr += PAGE_SIZE) { diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 702754dd1daff..6f1b8ddc6f7a4 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -62,7 +62,8 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0); /* Device Dump Size */ static size_t vmcoredd_orig_sz; -static DECLARE_RWSEM(vmcore_cb_rwsem); +static DEFINE_SPINLOCK(vmcore_cb_lock); +DEFINE_STATIC_SRCU(vmcore_cb_srcu); /* List of registered vmcore callbacks. */ static LIST_HEAD(vmcore_cb_list); /* Whether the vmcore has been opened once. */ @@ -70,8 +71,8 @@ static bool vmcore_opened; void register_vmcore_cb(struct vmcore_cb *cb) { - down_write(&vmcore_cb_rwsem); INIT_LIST_HEAD(&cb->next); + spin_lock(&vmcore_cb_lock); list_add_tail(&cb->next, &vmcore_cb_list); /* * Registering a vmcore callback after the vmcore was opened is @@ -79,14 +80,14 @@ void register_vmcore_cb(struct vmcore_cb *cb) */ if (vmcore_opened) pr_warn_once("Unexpected vmcore callback registration\n"); - up_write(&vmcore_cb_rwsem); + spin_unlock(&vmcore_cb_lock); } EXPORT_SYMBOL_GPL(register_vmcore_cb); void unregister_vmcore_cb(struct vmcore_cb *cb) { - down_write(&vmcore_cb_rwsem); - list_del(&cb->next); + spin_lock(&vmcore_cb_lock); + list_del_rcu(&cb->next); /* * Unregistering a vmcore callback after the vmcore was opened is * very unusual (e.g., forced driver removal), but we cannot stop @@ -94,7 +95,9 @@ void unregister_vmcore_cb(struct vmcore_cb *cb) */ if (vmcore_opened) pr_warn_once("Unexpected vmcore callback unregistration\n"); - up_write(&vmcore_cb_rwsem); + spin_unlock(&vmcore_cb_lock); + + synchronize_srcu(&vmcore_cb_srcu); } EXPORT_SYMBOL_GPL(unregister_vmcore_cb); @@ -103,9 +106,8 @@ static bool pfn_is_ram(unsigned long pfn) struct vmcore_cb *cb; bool ret = true; - lockdep_assert_held_read(&vmcore_cb_rwsem); - - list_for_each_entry(cb, &vmcore_cb_list, next) { + list_for_each_entry_srcu(cb, &vmcore_cb_list, next, + srcu_read_lock_held(&vmcore_cb_srcu)) { if (unlikely(!cb->pfn_is_ram)) continue; ret = cb->pfn_is_ram(cb, pfn); @@ -118,9 +120,9 @@ static bool pfn_is_ram(unsigned long pfn) static int open_vmcore(struct inode *inode, struct file *file) { - down_read(&vmcore_cb_rwsem); + spin_lock(&vmcore_cb_lock); vmcore_opened = true; - up_read(&vmcore_cb_rwsem); + spin_unlock(&vmcore_cb_lock); return 0; } @@ -133,6 +135,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, unsigned long pfn, offset; size_t nr_bytes; ssize_t read = 0, tmp; + int idx; if (!count) return 0; @@ -140,7 +143,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, offset = (unsigned long)(*ppos % PAGE_SIZE); pfn = (unsigned long)(*ppos / PAGE_SIZE); - down_read(&vmcore_cb_rwsem); + idx = srcu_read_lock(&vmcore_cb_srcu); do { if (count > (PAGE_SIZE - offset)) nr_bytes = PAGE_SIZE - offset; @@ -165,7 +168,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, offset, userbuf); } if (tmp < 0) { - up_read(&vmcore_cb_rwsem); + srcu_read_unlock(&vmcore_cb_srcu, idx); return tmp; } @@ -176,8 +179,8 @@ ssize_t read_from_oldmem(char *buf, size_t count, ++pfn; offset = 0; } while (count); + srcu_read_unlock(&vmcore_cb_srcu, idx); - up_read(&vmcore_cb_rwsem); return read; } @@ -477,7 +480,7 @@ static const struct vm_operations_struct vmcore_mmap_ops = { /** * vmcore_alloc_buf - allocate buffer in vmalloc memory - * @sizez: size of buffer + * @size: size of buffer * * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap * the buffer to user-space by means of remap_vmalloc_range(). @@ -568,18 +571,18 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot) { - int ret; + int ret, idx; /* - * Check if oldmem_pfn_is_ram was registered to avoid - * looping over all pages without a reason. + * Check if a callback was registered to avoid looping over all + * pages without a reason. */ - down_read(&vmcore_cb_rwsem); + idx = srcu_read_lock(&vmcore_cb_srcu); if (!list_empty(&vmcore_cb_list)) ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot); else ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot); - up_read(&vmcore_cb_rwsem); + srcu_read_unlock(&vmcore_cb_srcu, idx); return ret; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index ae87fd95b17e2..44795018df1d8 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -843,9 +843,6 @@ xfs_buf_readahead_map( { struct xfs_buf *bp; - if (bdi_read_congested(target->bt_bdev->bd_disk->bdi)) - return; - xfs_buf_read_map(target, map, nmaps, XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops, __this_address); diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 993c5628a7263..e863c88df95f9 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -207,14 +207,6 @@ struct backing_dev_info { #endif }; -enum { - BLK_RW_ASYNC = 0, - BLK_RW_SYNC = 1, -}; - -void clear_bdi_congested(struct backing_dev_info *bdi, int sync); -void set_bdi_congested(struct backing_dev_info *bdi, int sync); - struct wb_lock_cookie { bool locked; unsigned long flags; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 483979c1b9f43..87ce24d238f34 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -135,13 +135,6 @@ static inline bool writeback_in_progress(struct bdi_writeback *wb) struct backing_dev_info *inode_to_bdi(struct inode *inode); -static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) -{ - return wb->congested & cong_bits; -} - -long congestion_wait(int sync, long timeout); - static inline bool mapping_can_writeback(struct address_space *mapping) { return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; @@ -162,7 +155,6 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, gfp_t gfp); void wb_memcg_offline(struct mem_cgroup *memcg); void wb_blkcg_offline(struct blkcg *blkcg); -int inode_congested(struct inode *inode, int cong_bits); /** * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode @@ -390,50 +382,8 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) { } -static inline int inode_congested(struct inode *inode, int cong_bits) -{ - return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); -} - #endif /* CONFIG_CGROUP_WRITEBACK */ -static inline int inode_read_congested(struct inode *inode) -{ - return inode_congested(inode, 1 << WB_sync_congested); -} - -static inline int inode_write_congested(struct inode *inode) -{ - return inode_congested(inode, 1 << WB_async_congested); -} - -static inline int inode_rw_congested(struct inode *inode) -{ - return inode_congested(inode, (1 << WB_sync_congested) | - (1 << WB_async_congested)); -} - -static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits) -{ - return wb_congested(&bdi->wb, cong_bits); -} - -static inline int bdi_read_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, 1 << WB_sync_congested); -} - -static inline int bdi_write_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, 1 << WB_async_congested); -} - -static inline int bdi_rw_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, (1 << WB_sync_congested) | - (1 << WB_async_congested)); -} - const char *bdi_dev_name(struct backing_dev_info *bdi); #endif /* _LINUX_BACKING_DEV_H */ diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 338aa27e4773b..edb7f6d41faa0 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -80,12 +80,6 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) #ifdef CONFIG_BALLOON_COMPACTION extern const struct address_space_operations balloon_aops; -extern bool balloon_page_isolate(struct page *page, - isolate_mode_t mode); -extern void balloon_page_putback(struct page *page); -extern int balloon_page_migrate(struct address_space *mapping, - struct page *newpage, - struct page *page, enum migrate_mode mode); /* * balloon_page_insert - insert a page into the balloon's page list and make @@ -155,22 +149,6 @@ static inline void balloon_page_delete(struct page *page) list_del(&page->lru); } -static inline bool balloon_page_isolate(struct page *page) -{ - return false; -} - -static inline void balloon_page_putback(struct page *page) -{ - return; -} - -static inline int balloon_page_migrate(struct page *newpage, - struct page *page, enum migrate_mode mode) -{ - return 0; -} - static inline gfp_t balloon_mapping_gfp_mask(void) { return GFP_HIGHUSER; diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h index 6093fa6db2600..c9be1657f03d9 100644 --- a/include/linux/bitfield.h +++ b/include/linux/bitfield.h @@ -19,6 +19,9 @@ * * Example: * + * #include + * #include + * * #define REG_FIELD_A GENMASK(6, 0) * #define REG_FIELD_B BIT(7) * #define REG_FIELD_C GENMASK(15, 8) diff --git a/include/linux/cma.h b/include/linux/cma.h index bd801023504b2..51d540eee18ac 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -50,4 +50,6 @@ extern bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); + +extern void cma_reserve_pages_on_error(struct cma *cma); #endif diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 3f31ff400432b..9102ea2e4f8fe 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -140,8 +140,6 @@ struct ftrace_likely_data { */ #define __naked __attribute__((__naked__)) notrace -#define __compiler_offsetof(a, b) __builtin_offsetof(a, b) - /* * Prefer gnu_inline, so that extern inline functions do not emit an * externally visible function. This makes extern inline behave as per gnu89 diff --git a/include/linux/damon.h b/include/linux/damon.h index 5e1e3a128b77a..7c1d915b35875 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -60,19 +60,18 @@ struct damon_region { /** * struct damon_target - Represents a monitoring target. - * @id: Unique identifier for this target. + * @pid: The PID of the virtual address space to monitor. * @nr_regions: Number of monitoring target regions of this target. * @regions_list: Head of the monitoring target regions of this target. * @list: List head for siblings. * * Each monitoring context could have multiple targets. For example, a context * for virtual memory address spaces could have multiple target processes. The - * @id of each target should be unique among the targets of the context. For - * example, in the virtual address monitoring context, it could be a pidfd or - * an address of an mm_struct. + * @pid should be set for appropriate address space monitoring primitives + * including the virtual address spaces monitoring primitives. */ struct damon_target { - unsigned long id; + struct pid *pid; unsigned int nr_regions; struct list_head regions_list; struct list_head list; @@ -475,7 +474,7 @@ struct damos *damon_new_scheme( void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); -struct damon_target *damon_new_target(unsigned long id); +struct damon_target *damon_new_target(void); void damon_add_target(struct damon_ctx *ctx, struct damon_target *t); bool damon_targets_empty(struct damon_ctx *ctx); void damon_free_target(struct damon_target *t); @@ -484,8 +483,6 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); -int damon_set_targets(struct damon_ctx *ctx, - unsigned long *ids, ssize_t nr_ids); int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, unsigned long aggr_int, unsigned long primitive_upd_int, unsigned long min_nr_reg, unsigned long max_nr_reg); diff --git a/include/linux/fs.h b/include/linux/fs.h index ce5aabb609cd2..7c128f303b3d4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1435,6 +1435,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ +#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ /* Possible states of 'frozen' field */ enum { diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 80f63c862be57..6eef3e4475401 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -39,7 +39,7 @@ struct vm_area_struct; #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u #define ___GFP_ZERO 0x100u -#define ___GFP_ATOMIC 0x200u +/* 0x200u unused */ #define ___GFP_DIRECT_RECLAIM 0x400u #define ___GFP_KSWAPD_RECLAIM 0x800u #define ___GFP_WRITE 0x1000u @@ -54,9 +54,17 @@ struct vm_area_struct; #define ___GFP_THISNODE 0x200000u #define ___GFP_ACCOUNT 0x400000u #define ___GFP_ZEROTAGS 0x800000u -#define ___GFP_SKIP_KASAN_POISON 0x1000000u +#ifdef CONFIG_KASAN_HW_TAGS +#define ___GFP_SKIP_ZERO 0x1000000u +#define ___GFP_SKIP_KASAN_UNPOISON 0x2000000u +#define ___GFP_SKIP_KASAN_POISON 0x4000000u +#else +#define ___GFP_SKIP_ZERO 0 +#define ___GFP_SKIP_KASAN_UNPOISON 0 +#define ___GFP_SKIP_KASAN_POISON 0 +#endif #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x2000000u +#define ___GFP_NOLOCKDEP 0x8000000u #else #define ___GFP_NOLOCKDEP 0 #endif @@ -116,11 +124,8 @@ struct vm_area_struct; * * %__GFP_HIGH indicates that the caller is high-priority and that granting * the request is necessary before the system can make forward progress. - * For example, creating an IO context to clean pages. - * - * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is - * high priority. Users are typically interrupt handlers. This may be - * used in conjunction with %__GFP_HIGH + * For example creating an IO context to clean pages and requests + * from atomic context. * * %__GFP_MEMALLOC allows access to all memory. This should only be used when * the caller guarantees the allocation will allow more memory to be freed @@ -135,7 +140,6 @@ struct vm_area_struct; * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. * This takes precedence over the %__GFP_MEMALLOC flag if both are set. */ -#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) #define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) #define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC) #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) @@ -232,24 +236,33 @@ struct vm_area_struct; * * %__GFP_ZERO returns a zeroed page on success. * - * %__GFP_ZEROTAGS returns a page with zeroed memory tags on success, if - * __GFP_ZERO is set. + * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself + * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that + * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting + * memory tags at the same time as zeroing memory has minimal additional + * performace impact. + * + * %__GFP_SKIP_KASAN_UNPOISON makes KASAN skip unpoisoning on page allocation. + * Only effective in HW_TAGS mode. * - * %__GFP_SKIP_KASAN_POISON returns a page which does not need to be poisoned - * on deallocation. Typically used for userspace pages. Currently only has an - * effect in HW tags mode. + * %__GFP_SKIP_KASAN_POISON makes KASAN skip poisoning on page deallocation. + * Typically, used for userspace pages. Only effective in HW_TAGS mode. */ #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) #define __GFP_COMP ((__force gfp_t)___GFP_COMP) #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) #define __GFP_ZEROTAGS ((__force gfp_t)___GFP_ZEROTAGS) -#define __GFP_SKIP_KASAN_POISON ((__force gfp_t)___GFP_SKIP_KASAN_POISON) +#define __GFP_SKIP_ZERO ((__force gfp_t)___GFP_SKIP_ZERO) +#define __GFP_SKIP_KASAN_UNPOISON ((__force gfp_t)___GFP_SKIP_KASAN_UNPOISON) +#define __GFP_SKIP_KASAN_POISON ((__force gfp_t)___GFP_SKIP_KASAN_POISON) /* Disable lockdep for GFP context tracking */ #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP)) +#define __GFP_BITS_SHIFT (24 + \ + 3 * IS_ENABLED(CONFIG_KASAN_HW_TAGS) + \ + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /** @@ -322,7 +335,7 @@ struct vm_area_struct; * version does not attempt reclaim/compaction at all and is by default used * in page fault path, while the non-light is used by khugepaged. */ -#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) +#define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 0a0b2b09b1b8d..a77be56302094 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -246,6 +246,16 @@ do { \ __kunmap_atomic(__addr); \ } while (0) +/** + * kunmap_local - Unmap a page mapped via kmap_local_page(). + * @__addr: An address within the page mapped + * + * @__addr can be any address within the mapped page. Commonly it is the + * address return from kmap_local_page(), but it can also include offsets. + * + * Unmapping should be done in the reverse order of the mapping. See + * kmap_local_page() for details. + */ #define kunmap_local(__addr) \ do { \ BUILD_BUG_ON(__same_type((__addr), struct page *)); \ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d1897a69c5406..08357b4c7be73 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -754,7 +754,7 @@ static inline void arch_clear_hugepage_flags(struct page *page) { } static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { - return entry; + return pte_mkhuge(entry); } #endif @@ -1075,12 +1075,6 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr } #endif /* CONFIG_HUGETLB_PAGE */ -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -extern bool hugetlb_free_vmemmap_enabled; -#else -#define hugetlb_free_vmemmap_enabled false -#endif - static inline spinlock_t *huge_pte_lock(struct hstate *h, struct mm_struct *mm, pte_t *pte) { diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 5fb17dd4b6fad..ebfeb6099c283 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -163,6 +163,33 @@ static inline bool kallsyms_show_value(const struct cred *cred) return false; } +#ifdef CONFIG_MODULES +static inline int fill_minimal_module_info(char *sym, int size, unsigned long value) +{ + struct module *mod; + unsigned long offset; + int ret = 0; + + preempt_disable(); + mod = __module_address(value); + if (mod) { + offset = value - (unsigned long)mod->core_layout.base; + snprintf(sym, size - 1, "0x%lx+0x%lx [%s]", + (unsigned long)mod->core_layout.base, offset, mod->name); + + sym[size - 1] = '\0'; + ret = 1; + } + + preempt_enable(); + return ret; +} +#else +static inline int fill_minimal_module_info(char *sym, int size, unsigned long value) +{ + return 0; +} +#endif /*CONFIG_MODULES*/ #endif /*CONFIG_KALLSYMS*/ static inline void print_ip_sym(const char *loglvl, unsigned long ip) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 4a45562d88937..3593c95d1fa54 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -25,6 +25,13 @@ struct kunit_kasan_expectation { #endif +typedef unsigned int __bitwise kasan_vmalloc_flags_t; + +#define KASAN_VMALLOC_NONE 0x00u +#define KASAN_VMALLOC_INIT 0x01u +#define KASAN_VMALLOC_VM_ALLOC 0x02u +#define KASAN_VMALLOC_PROT_NORMAL 0x04u + #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #include @@ -95,9 +102,6 @@ static inline bool kasan_hw_tags_enabled(void) return kasan_enabled(); } -void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags); -void kasan_free_pages(struct page *page, unsigned int order); - #else /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_enabled(void) @@ -110,20 +114,6 @@ static inline bool kasan_hw_tags_enabled(void) return false; } -static __always_inline void kasan_alloc_pages(struct page *page, - unsigned int order, gfp_t flags) -{ - /* Only available for integrated init. */ - BUILD_BUG(); -} - -static __always_inline void kasan_free_pages(struct page *page, - unsigned int order) -{ - /* Only available for integrated init. */ - BUILD_BUG(); -} - #endif /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_has_integrated_init(void) @@ -435,34 +425,71 @@ static inline void kasan_init_hw_tags(void) { } #ifdef CONFIG_KASAN_VMALLOC +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + +void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); int kasan_populate_vmalloc(unsigned long addr, unsigned long size); -void kasan_poison_vmalloc(const void *start, unsigned long size); -void kasan_unpoison_vmalloc(const void *start, unsigned long size); void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); -void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); +#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + +static inline void kasan_populate_early_vm_area_shadow(void *start, + unsigned long size) +{ } +static inline int kasan_populate_vmalloc(unsigned long start, + unsigned long size) +{ + return 0; +} +static inline void kasan_release_vmalloc(unsigned long start, + unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end) { } + +#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, + kasan_vmalloc_flags_t flags); +static __always_inline void *kasan_unpoison_vmalloc(const void *start, + unsigned long size, + kasan_vmalloc_flags_t flags) +{ + if (kasan_enabled()) + return __kasan_unpoison_vmalloc(start, size, flags); + return (void *)start; +} + +void __kasan_poison_vmalloc(const void *start, unsigned long size); +static __always_inline void kasan_poison_vmalloc(const void *start, + unsigned long size) +{ + if (kasan_enabled()) + __kasan_poison_vmalloc(start, size); +} #else /* CONFIG_KASAN_VMALLOC */ +static inline void kasan_populate_early_vm_area_shadow(void *start, + unsigned long size) { } static inline int kasan_populate_vmalloc(unsigned long start, unsigned long size) { return 0; } - -static inline void kasan_poison_vmalloc(const void *start, unsigned long size) -{ } -static inline void kasan_unpoison_vmalloc(const void *start, unsigned long size) -{ } static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, - unsigned long free_region_end) {} + unsigned long free_region_end) { } -static inline void kasan_populate_early_vm_area_shadow(void *start, - unsigned long size) +static inline void *kasan_unpoison_vmalloc(const void *start, + unsigned long size, + kasan_vmalloc_flags_t flags) +{ + return (void *)start; +} +static inline void kasan_poison_vmalloc(const void *start, unsigned long size) { } #endif /* CONFIG_KASAN_VMALLOC */ @@ -471,17 +498,17 @@ static inline void kasan_populate_early_vm_area_shadow(void *start, !defined(CONFIG_KASAN_VMALLOC) /* - * These functions provide a special case to support backing module - * allocations with real shadow memory. With KASAN vmalloc, the special - * case is unnecessary, as the work is handled in the generic case. + * These functions allocate and free shadow memory for kernel modules. + * They are only required when KASAN_VMALLOC is not supported, as otherwise + * shadow memory is allocated by the generic vmalloc handlers. */ -int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask); -void kasan_free_shadow(const struct vm_struct *vm); +int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask); +void kasan_free_module_shadow(const struct vm_struct *vm); #else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ -static inline int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { return 0; } -static inline void kasan_free_shadow(const struct vm_struct *vm) {} +static inline int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) { return 0; } +static inline void kasan_free_module_shadow(const struct vm_struct *vm) {} #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 0c994ae37729e..58d1b58a971e3 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -20,6 +20,12 @@ #include +/* Location of a reserved region to hold the crash kernel. + */ +extern struct resource crashk_res; +extern struct resource crashk_low_res; +extern note_buf_t __percpu *crash_notes; + #ifdef CONFIG_KEXEC_CORE #include #include @@ -350,12 +356,6 @@ extern int kexec_load_disabled; #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ KEXEC_FILE_NO_INITRAMFS) -/* Location of a reserved region to hold the crash kernel. - */ -extern struct resource crashk_res; -extern struct resource crashk_low_res; -extern note_buf_t __percpu *crash_notes; - /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 4b5e3679a72c7..f49e64222628a 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -17,6 +17,8 @@ #include #include +extern unsigned long kfence_sample_interval; + /* * We allocate an even number of pages, as it simplifies calculations to map * address to metadata indices; effectively, the very first page serves as an diff --git a/include/linux/log2.h b/include/linux/log2.h index df0b155c21417..9f30d087a1281 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -18,7 +18,7 @@ * - the arch is not required to handle n==0 if implementing the fallback */ #ifndef CONFIG_ARCH_HAS_ILOG2_U32 -static inline __attribute__((const)) +static __always_inline __attribute__((const)) int __ilog2_u32(u32 n) { return fls(n) - 1; @@ -26,7 +26,7 @@ int __ilog2_u32(u32 n) #endif #ifndef CONFIG_ARCH_HAS_ILOG2_U64 -static inline __attribute__((const)) +static __always_inline __attribute__((const)) int __ilog2_u64(u64 n) { return fls64(n) - 1; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b72d75141e125..ef4b445392a9e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -34,6 +34,7 @@ enum memcg_stat_item { MEMCG_SOCK, MEMCG_PERCPU_B, MEMCG_VMALLOC, + MEMCG_KMEM, MEMCG_NR_STAT, }; @@ -219,7 +220,7 @@ struct obj_cgroup { struct mem_cgroup *memcg; atomic_t nr_charged_bytes; union { - struct list_head list; + struct list_head list; /* protected by objcg_lock */ struct rcu_head rcu; }; }; @@ -315,7 +316,8 @@ struct mem_cgroup { #ifdef CONFIG_MEMCG_KMEM int kmemcg_id; struct obj_cgroup __rcu *objcg; - struct list_head objcg_list; /* list of inherited objcgs */ + /* list of inherited objcgs, protected by objcg_lock */ + struct list_head objcg_list; #endif MEMCG_PADDING(_pad2_); @@ -840,9 +842,7 @@ static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) */ static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { - if (!memcg->memory.parent) - return NULL; - return mem_cgroup_from_counter(memcg->memory.parent, memory); + return mem_cgroup_from_css(memcg->css.parent); } static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index be48e003a5183..76bf2de86defc 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -16,6 +16,62 @@ struct memory_group; struct resource; struct vmem_altmap; +#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION +/* + * For supporting node-hotadd, we have to allocate a new pgdat. + * + * If an arch has generic style NODE_DATA(), + * node_data[nid] = kzalloc() works well. But it depends on the architecture. + * + * In general, generic_alloc_nodedata() is used. + * + */ +extern pg_data_t *arch_alloc_nodedata(int nid); +extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); + +#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ + +#define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid) + +#ifdef CONFIG_NUMA +/* + * XXX: node aware allocation can't work well to get new node's memory at this time. + * Because, pgdat for the new node is not allocated/initialized yet itself. + * To use new node's memory, more consideration will be necessary. + */ +#define generic_alloc_nodedata(nid) \ +({ \ + memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); \ +}) +/* + * This definition is just for error path in node hotadd. + * For node hotremove, we have to replace this. + */ +#define generic_free_nodedata(pgdat) kfree(pgdat) + +extern pg_data_t *node_data[]; +static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) +{ + node_data[nid] = pgdat; +} + +#else /* !CONFIG_NUMA */ + +/* never called */ +static inline pg_data_t *generic_alloc_nodedata(int nid) +{ + BUG(); + return NULL; +} +static inline void generic_free_nodedata(pg_data_t *pgdat) +{ +} +static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) +{ +} +#endif /* CONFIG_NUMA */ +#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ + #ifdef CONFIG_MEMORY_HOTPLUG struct page *pfn_to_online_page(unsigned long pfn); @@ -154,66 +210,6 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, struct mhp_params *params); #endif /* ARCH_HAS_ADD_PAGES */ -#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION -/* - * For supporting node-hotadd, we have to allocate a new pgdat. - * - * If an arch has generic style NODE_DATA(), - * node_data[nid] = kzalloc() works well. But it depends on the architecture. - * - * In general, generic_alloc_nodedata() is used. - * Now, arch_free_nodedata() is just defined for error path of node_hot_add. - * - */ -extern pg_data_t *arch_alloc_nodedata(int nid); -extern void arch_free_nodedata(pg_data_t *pgdat); -extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); - -#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ - -#define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid) -#define arch_free_nodedata(pgdat) generic_free_nodedata(pgdat) - -#ifdef CONFIG_NUMA -/* - * If ARCH_HAS_NODEDATA_EXTENSION=n, this func is used to allocate pgdat. - * XXX: kmalloc_node() can't work well to get new node's memory at this time. - * Because, pgdat for the new node is not allocated/initialized yet itself. - * To use new node's memory, more consideration will be necessary. - */ -#define generic_alloc_nodedata(nid) \ -({ \ - kzalloc(sizeof(pg_data_t), GFP_KERNEL); \ -}) -/* - * This definition is just for error path in node hotadd. - * For node hotremove, we have to replace this. - */ -#define generic_free_nodedata(pgdat) kfree(pgdat) - -extern pg_data_t *node_data[]; -static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) -{ - node_data[nid] = pgdat; -} - -#else /* !CONFIG_NUMA */ - -/* never called */ -static inline pg_data_t *generic_alloc_nodedata(int nid) -{ - BUG(); - return NULL; -} -static inline void generic_free_nodedata(pg_data_t *pgdat) -{ -} -static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) -{ -} -#endif /* CONFIG_NUMA */ -#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ - void get_online_mems(void); void put_online_mems(void); @@ -323,7 +319,7 @@ extern void set_zone_contiguous(struct zone *zone); extern void clear_zone_contiguous(struct zone *zone); #ifdef CONFIG_MEMORY_HOTPLUG -extern void __ref free_area_init_core_hotplug(int nid); +extern void __ref free_area_init_core_hotplug(struct pglist_data *pgdat); extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); extern int add_memory_resource(int nid, struct resource *resource, diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 668389b4b53d7..44383ab8af554 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -6,6 +6,7 @@ #ifndef _LINUX_MEMPOLICY_H #define _LINUX_MEMPOLICY_H 1 +#include #include #include #include @@ -42,7 +43,7 @@ struct mm_struct; * to 1, representing the caller of mpol_dup(). */ struct mempolicy { - atomic_t refcnt; + refcount_t refcnt; unsigned short mode; /* See MPOL_* above */ unsigned short flags; /* See set_mempolicy() MPOL_F_* above */ nodemask_t nodes; /* interleave/bind/perfer */ @@ -94,7 +95,7 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol) static inline void mpol_get(struct mempolicy *pol) { if (pol) - atomic_inc(&pol->refcnt); + refcount_inc(&pol->refcnt); } extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 1fafcc38acbad..b6032560ec2db 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -39,6 +39,13 @@ struct vmem_altmap { * A more complete discussion of unaddressable memory may be found in * include/linux/hmm.h and Documentation/vm/hmm.rst. * + * MEMORY_DEVICE_COHERENT: + * Device memory that is cache coherent from device and CPU point of view. This + * is used on platforms that have an advanced system bus (like CAPI or CXL). A + * driver can hotplug the device memory using ZONE_DEVICE and with that memory + * type. Any page of a process can be migrated to such memory. However no one + * should be allowed to pin such memory so that it can always be evicted. + * * MEMORY_DEVICE_FS_DAX: * Host memory that has similar access semantics as System RAM i.e. DMA * coherent and supports page pinning. In support of coordinating page @@ -59,6 +66,7 @@ struct vmem_altmap { enum memory_type { /* 0 is reserved to catch uninitialized type fields */ MEMORY_DEVICE_PRIVATE = 1, + MEMORY_DEVICE_COHERENT, MEMORY_DEVICE_FS_DAX, MEMORY_DEVICE_GENERIC, MEMORY_DEVICE_PCI_P2PDMA, @@ -130,6 +138,7 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) } #ifdef CONFIG_ZONE_DEVICE +bool pfn_zone_device_reserved(unsigned long pfn); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); @@ -142,6 +151,11 @@ unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); unsigned long memremap_compat_align(void); #else +static inline bool pfn_zone_device_reserved(unsigned long pfn) +{ + return false; +} + static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { diff --git a/include/linux/migrate.h b/include/linux/migrate.h index db96e10eb8da2..66a34eae8cb63 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -130,6 +130,7 @@ static inline unsigned long migrate_pfn(unsigned long pfn) enum migrate_vma_direction { MIGRATE_VMA_SELECT_SYSTEM = 1 << 0, MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1, + MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2, }; struct migrate_vma { diff --git a/include/linux/mm.h b/include/linux/mm.h index 213cc569b1922..bf60d947503ef 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -820,16 +820,11 @@ static inline int page_mapcount(struct page *page) #ifdef CONFIG_TRANSPARENT_HUGEPAGE int total_mapcount(struct page *page); -int page_trans_huge_mapcount(struct page *page); #else static inline int total_mapcount(struct page *page) { return page_mapcount(page); } -static inline int page_trans_huge_mapcount(struct page *page) -{ - return page_mapcount(page); -} #endif static inline struct page *virt_to_head_page(const void *x) @@ -1106,6 +1101,7 @@ static inline bool page_is_devmap_managed(struct page *page) return false; switch (page->pgmap->type) { case MEMORY_DEVICE_PRIVATE: + case MEMORY_DEVICE_COHERENT: case MEMORY_DEVICE_FS_DAX: return true; default: @@ -1135,6 +1131,21 @@ static inline bool is_device_private_page(const struct page *page) page->pgmap->type == MEMORY_DEVICE_PRIVATE; } +static inline bool is_device_coherent_page(const struct page *page) +{ + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && + is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_COHERENT; +} + +static inline bool is_dev_private_or_coherent_page(const struct page *page) +{ + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && + is_zone_device_page(page) && + (page->pgmap->type == MEMORY_DEVICE_PRIVATE || + page->pgmap->type == MEMORY_DEVICE_COHERENT); +} + static inline bool is_pci_p2pdma_page(const struct page *page) { return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && @@ -1916,10 +1927,6 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); -long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, int *locked); -long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, @@ -2925,13 +2932,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ #define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO * and return without waiting upon it */ -#define FOLL_POPULATE 0x40 /* fault in pages (with FOLL_MLOCK) */ #define FOLL_NOFAULT 0x80 /* do not fault in pages */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ -#define FOLL_MLOCK 0x1000 /* lock present pages */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ #define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ @@ -3151,10 +3156,12 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) } #endif +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse); int vmemmap_remap_alloc(unsigned long start, unsigned long end, unsigned long reuse, gfp_t gfp_mask); +#endif void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index b725839dfe715..884d6f6af05bb 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -99,7 +99,8 @@ void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); - list_add(&folio->lru, &lruvec->lists[lru]); + if (lru != LRU_UNEVICTABLE) + list_add(&folio->lru, &lruvec->lists[lru]); } static __always_inline void add_page_to_lru_list(struct page *page, @@ -115,6 +116,7 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); + /* This is not expected to be used on LRU_UNEVICTABLE */ list_add_tail(&folio->lru, &lruvec->lists[lru]); } @@ -127,8 +129,11 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { - list_del(&folio->lru); - update_lru_size(lruvec, folio_lru_list(folio), folio_zonenum(folio), + enum lru_list lru = folio_lru_list(folio); + + if (lru != LRU_UNEVICTABLE) + list_del(&folio->lru); + update_lru_size(lruvec, lru, folio_zonenum(folio), -folio_nr_pages(folio)); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5140e5feb4866..475bdb2827697 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -85,7 +85,16 @@ struct page { * lruvec->lru_lock. Sometimes used as a generic list * by the page owner. */ - struct list_head lru; + union { + struct list_head lru; + /* Or, for the Unevictable "LRU list" slot */ + struct { + /* Always even, to negate PageTail */ + void *__filler; + /* Count page's or folio's mlocks */ + unsigned int mlock_count; + }; + }; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; pgoff_t index; /* Our offset within mapping. */ @@ -241,7 +250,13 @@ struct folio { struct { /* public: */ unsigned long flags; - struct list_head lru; + union { + struct list_head lru; + struct { + void *__filler; + unsigned int mlock_count; + }; + }; struct address_space *mapping; pgoff_t index; void *private; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index aed44e9b5d899..3fff6deca2c08 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -83,6 +83,17 @@ static inline bool is_migrate_movable(int mt) return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE; } +/* + * Check whether a migratetype can be merged with another migratetype. + * + * It is only mergeable when it can fall back to other migratetypes for + * allocation. See fallbacks[MIGRATE_TYPES][3] in page_alloc.c. + */ +static inline bool migratetype_is_mergeable(int mt) +{ + return mt < MIGRATE_PCPTYPES; +} + #define for_each_migratetype_order(order, type) \ for (order = 0; order < MAX_ORDER; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) @@ -210,6 +221,9 @@ enum node_stat_item { NR_PAGETABLE, /* used for pagetables */ #ifdef CONFIG_SWAP NR_SWAPCACHE, +#endif +#ifdef CONFIG_NUMA_BALANCING + PGPROMOTE_SUCCESS, /* promote successfully */ #endif NR_VM_NODE_STAT_ITEMS }; @@ -920,12 +934,6 @@ typedef struct pglist_data { #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) -#ifdef CONFIG_FLATMEM -#define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) -#else -#define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr)) -#endif -#define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) @@ -1101,7 +1109,6 @@ static inline struct pglist_data *NODE_DATA(int nid) { return &contig_page_data; } -#define NODE_MEM_MAP(nid) mem_map #else /* CONFIG_NUMA */ @@ -1390,11 +1397,9 @@ static inline unsigned long *section_to_usemap(struct mem_section *ms) static inline struct mem_section *__nr_to_section(unsigned long nr) { #ifdef CONFIG_SPARSEMEM_EXTREME - if (!mem_section) + if (!*mem_section || !mem_section[SECTION_NR_TO_ROOT(nr)]) return NULL; #endif - if (!mem_section[SECTION_NR_TO_ROOT(nr)]) - return NULL; return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; } extern size_t mem_section_usage_size(void); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 68f81d8d36def..4829e6869f2ad 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -570,7 +570,6 @@ extern void nfs_complete_unlink(struct dentry *dentry, struct inode *); /* * linux/fs/nfs/write.c */ -extern int nfs_congestion_kb; extern int nfs_writepage(struct page *page, struct writeback_control *wbc); extern int nfs_writepages(struct address_space *, struct writeback_control *); extern int nfs_flush_incompatible(struct file *file, struct page *page); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index ca0959e51e817..3444ebbc63b6c 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -137,7 +137,6 @@ struct nfs_server { struct rpc_clnt * client_acl; /* ACL RPC client handle */ struct nlm_host *nlm_host; /* NLM client handle */ struct nfs_iostats __percpu *io_stats; /* I/O statistics */ - atomic_long_t writeback; /* number of writeback pages */ unsigned int flags; /* various flags */ /* The following are for internal use only. Also see uapi/linux/nfs_mount.h */ diff --git a/include/linux/node.h b/include/linux/node.h index bb21fd631b162..7f876d48af11f 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -112,6 +112,7 @@ static inline void link_mem_sections(int nid, unsigned long start_pfn, extern void unregister_node(struct node *node); #ifdef CONFIG_NUMA +extern void node_dev_init(void); /* Core of the node registration - only memory hotplug should use this */ extern int __register_one_node(int nid); @@ -149,6 +150,9 @@ extern void register_hugetlbfs_with_node(node_registration_func_t doregister, node_registration_func_t unregister); #endif #else +static inline void node_dev_init(void) +{ +} static inline int __register_one_node(int nid) { return 0; @@ -181,4 +185,9 @@ static inline void register_hugetlbfs_with_node(node_registration_func_t reg, #define to_node(device) container_of(device, struct node, dev) +static inline bool node_is_toptier(int node) +{ + return node_state(node, N_CPU); +} + #endif /* _LINUX_NODE_H_ */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 1c3b6e5c8bfd3..340cb81565683 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -190,13 +190,81 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + hugetlb_free_vmemmap_enabled_key); + +static __always_inline bool hugetlb_free_vmemmap_enabled(void) +{ + return static_branch_maybe(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + &hugetlb_free_vmemmap_enabled_key); +} + +/* + * If the feature of freeing some vmemmap pages associated with each HugeTLB + * page is enabled, the head vmemmap page frame is reused and all of the tail + * vmemmap addresses map to the head vmemmap page frame (furture details can + * refer to the figure at the head of the mm/hugetlb_vmemmap.c). In other + * words, there are more than one page struct with PG_head associated with each + * HugeTLB page. We __know__ that there is only one head page struct, the tail + * page structs with PG_head are fake head page structs. We need an approach + * to distinguish between those two different types of page structs so that + * compound_head() can return the real head page struct when the parameter is + * the tail page struct but with PG_head. + * + * The page_fixed_fake_head() returns the real head page struct if the @page is + * fake page head, otherwise, returns @page which can either be a true page + * head or tail. + */ +static __always_inline const struct page *page_fixed_fake_head(const struct page *page) +{ + if (!hugetlb_free_vmemmap_enabled()) + return page; + + /* + * Only addresses aligned with PAGE_SIZE of struct page may be fake head + * struct page. The alignment check aims to avoid access the fields ( + * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly) + * cold cacheline in some cases. + */ + if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && + test_bit(PG_head, &page->flags)) { + /* + * We can safely access the field of the @page[1] with PG_head + * because the @page is a compound page composed with at least + * two contiguous pages. + */ + unsigned long head = READ_ONCE(page[1].compound_head); + + if (likely(head & 1)) + return (const struct page *)(head - 1); + } + return page; +} +#else +static inline const struct page *page_fixed_fake_head(const struct page *page) +{ + return page; +} + +static inline bool hugetlb_free_vmemmap_enabled(void) +{ + return false; +} +#endif + +static __always_inline int page_is_fake_head(struct page *page) +{ + return page_fixed_fake_head(page) != page; +} + static inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head); if (unlikely(head & 1)) return head - 1; - return (unsigned long)page; + return (unsigned long)page_fixed_fake_head(page); } #define compound_head(page) ((typeof(page))_compound_head(page)) @@ -231,12 +299,13 @@ static inline unsigned long _compound_head(const struct page *page) static __always_inline int PageTail(struct page *page) { - return READ_ONCE(page->compound_head) & 1; + return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page); } static __always_inline int PageCompound(struct page *page) { - return test_bit(PG_head, &page->flags) || PageTail(page); + return test_bit(PG_head, &page->flags) || + READ_ONCE(page->compound_head) & 1; } #define PAGE_POISON_PATTERN -1l @@ -695,7 +764,20 @@ static inline bool test_set_page_writeback(struct page *page) return set_page_writeback(page); } -__PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) +static __always_inline bool folio_test_head(struct folio *folio) +{ + return test_bit(PG_head, folio_flags(folio, FOLIO_PF_ANY)); +} + +static __always_inline int PageHead(struct page *page) +{ + PF_POISONED_CHECK(page); + return test_bit(PG_head, &page->flags) && !page_is_fake_head(page); +} + +__SETPAGEFLAG(Head, head, PF_ANY) +__CLEARPAGEFLAG(Head, head, PF_ANY) +CLEARPAGEFLAG(Head, head, PF_ANY) /** * folio_test_large() - Does this folio contain more than one page? diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 270bf5136c34e..dc31eb981ea2b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -594,13 +594,6 @@ static inline struct page *find_subpage(struct page *head, pgoff_t index) unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, pgoff_t end, unsigned int nr_pages, struct page **pages); -static inline unsigned find_get_pages(struct address_space *mapping, - pgoff_t *start, unsigned int nr_pages, - struct page **pages) -{ - return find_get_pages_range(mapping, start, (pgoff_t)-1, nr_pages, - pages); -} unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, diff --git a/include/linux/rmap.h b/include/linux/rmap.h index e704b1a4c06c0..73cce292d32c0 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -12,6 +12,8 @@ #include #include +#include + /* * The anon_vma heads a list of private "related" vmas, to scan if * an anonymous page pointing to this anon_vma needs to be unmapped: @@ -36,7 +38,7 @@ struct anon_vma { * the reference is responsible for clearing up the * anon_vma if they are the last user on release */ - atomic_t refcount; + refcount_t refcount; /* * Count of child anon_vmas and VMAs which points to this anon_vma. @@ -100,14 +102,14 @@ enum ttu_flags { #ifdef CONFIG_MMU static inline void get_anon_vma(struct anon_vma *anon_vma) { - atomic_inc(&anon_vma->refcount); + refcount_inc(&anon_vma->refcount); } void __put_anon_vma(struct anon_vma *anon_vma); static inline void put_anon_vma(struct anon_vma *anon_vma) { - if (atomic_dec_and_test(&anon_vma->refcount)) + if (refcount_dec_and_test(&anon_vma->refcount)) __put_anon_vma(anon_vma); } @@ -167,18 +169,19 @@ struct anon_vma *page_get_anon_vma(struct page *page); */ void page_move_anon_rmap(struct page *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long, bool); + unsigned long address, bool compound); void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long, int); + unsigned long address, int flags); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long, bool); -void page_add_file_rmap(struct page *, bool); -void page_remove_rmap(struct page *, bool); - + unsigned long address, bool compound); +void page_add_file_rmap(struct page *, struct vm_area_struct *, + bool compound); +void page_remove_rmap(struct page *, struct vm_area_struct *, + bool compound); void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long); + unsigned long address); void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long); + unsigned long address); static inline void page_dup_rmap(struct page *page, bool compound) { @@ -237,12 +240,6 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); */ int folio_mkclean(struct folio *); -/* - * called in munlock()/munmap() path to check for other vmas holding - * the page mlocked. - */ -void page_mlock(struct page *page); - void remove_migration_ptes(struct page *old, struct page *new, bool locked); /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 75ba8aa60248b..d7d232e7e654d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1065,6 +1065,7 @@ struct task_struct { #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; unsigned long last_switch_time; + unsigned long killed_time; #endif /* Filesystem information: */ struct fs_struct *fs; @@ -1492,6 +1493,13 @@ struct task_struct { struct callback_head l1d_flush_kill; #endif +#ifdef CONFIG_DEBUG_AID_FOR_SYZBOT + unsigned long getblk_stamp; + unsigned int getblk_executed; + unsigned int getblk_bh_count; + unsigned long getblk_bh_state; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 3f2b70f8d32ce..c1076b5e17fb1 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -23,6 +23,16 @@ enum sched_tunable_scaling { SCHED_TUNABLESCALING_END, }; +#define NUMA_BALANCING_DISABLED 0x0 +#define NUMA_BALANCING_NORMAL 0x1 +#define NUMA_BALANCING_MEMORY_TIERING 0x2 + +#ifdef CONFIG_NUMA_BALANCING +extern int sysctl_numa_balancing_mode; +#else +#define sysctl_numa_balancing_mode 0 +#endif + /* * control realtime throttling: * diff --git a/include/linux/stddef.h b/include/linux/stddef.h index ca507bd5f8082..929d67710cc51 100644 --- a/include/linux/stddef.h +++ b/include/linux/stddef.h @@ -13,11 +13,7 @@ enum { }; #undef offsetof -#ifdef __compiler_offsetof -#define offsetof(TYPE, MEMBER) __compiler_offsetof(TYPE, MEMBER) -#else -#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) -#endif +#define offsetof(TYPE, MEMBER) __builtin_offsetof(TYPE, MEMBER) /** * sizeof_field() - Report the size of a struct field in bytes diff --git a/include/linux/swap.h b/include/linux/swap.h index 1d38d9475c4d0..b546e4bd5c5a2 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -514,7 +514,6 @@ extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); -extern bool reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); @@ -680,9 +679,6 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -#define reuse_swap_page(page) \ - (page_trans_huge_mapcount(page) == 1) - static inline int try_to_free_swap(struct page *page) { return 0; diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 73a6f34b3847a..1087d1e2be5c4 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -209,10 +209,7 @@ __bad_copy_from(void); extern void __compiletime_error("copy destination size is too small") __bad_copy_to(void); -static inline void copy_overflow(int size, unsigned long count) -{ - WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); -} +void copy_overflow(int size, unsigned long count); static __always_inline __must_check bool check_copy_size(const void *addr, size_t bytes, bool is_source) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index ac0394087f7d4..bca27b4e5eb2d 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -401,8 +401,6 @@ static inline void user_access_restore(unsigned long flags) { } #endif #ifdef CONFIG_HARDENED_USERCOPY -void usercopy_warn(const char *name, const char *detail, bool to_user, - unsigned long offset, unsigned long len); void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, unsigned long len); diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 7b2363388bfa2..16a0a4fd000be 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -129,6 +129,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_SWAP SWAP_RA, SWAP_RA_HIT, +#ifdef CONFIG_KSM + KSM_SWPIN_COPY, +#endif #endif #ifdef CONFIG_X86 DIRECT_MAP_LEVEL2_SPLIT, diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 880227b9f0440..7b879c77bec5f 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -35,17 +35,6 @@ struct notifier_block; /* in notifier.h */ #define VM_DEFER_KMEMLEAK 0 #endif -/* - * VM_KASAN is used slightly differently depending on CONFIG_KASAN_VMALLOC. - * - * If IS_ENABLED(CONFIG_KASAN_VMALLOC), VM_KASAN is set on a vm_struct after - * shadow memory has been mapped. It's used to handle allocation errors so that - * we don't try to poison shadow on free if it was never allocated. - * - * Otherwise, VM_KASAN is set for kasan_module_alloc() allocations and used to - * determine which allocations need the module shadow freed. - */ - /* bits [20..32] reserved for arch specific ioremap internals */ /* @@ -126,6 +115,13 @@ static inline int arch_vmap_pte_supported_shift(unsigned long size) } #endif +#ifndef arch_vmap_pgprot_tagged +static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot) +{ + return prot; +} +#endif + /* * Highlevel APIs for driver use */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index fec248ab1fec5..dc2b94e6a94f0 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -345,28 +345,13 @@ void wb_domain_exit(struct wb_domain *dom); extern struct wb_domain global_wb_domain; /* These are exported to sysctl. */ -extern int dirty_background_ratio; -extern unsigned long dirty_background_bytes; -extern int vm_dirty_ratio; -extern unsigned long vm_dirty_bytes; extern unsigned int dirty_writeback_interval; extern unsigned int dirty_expire_interval; extern unsigned int dirtytime_expire_interval; -extern int vm_highmem_is_dirtyable; extern int laptop_mode; -int dirty_background_ratio_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int dirty_background_bytes_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int dirty_ratio_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int dirty_bytes_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); int dirtytime_interval_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 4fdb14a81108b..d651f3437367d 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -29,7 +29,6 @@ EM( SCAN_VMA_NULL, "vma_null") \ EM( SCAN_VMA_CHECK, "vma_check_failed") \ EM( SCAN_ADDRESS_RANGE, "not_suitable_address_range") \ - EM( SCAN_SWAP_CACHE_PAGE, "page_swap_cache") \ EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\ EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \ EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \ diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index 779f3fad9ecd5..061b5128f335a 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -105,6 +105,37 @@ TRACE_EVENT(mm_migrate_pages_start, __print_symbolic(__entry->reason, MIGRATE_REASON)) ); +DECLARE_EVENT_CLASS(migration_pte, + + TP_PROTO(unsigned long addr, unsigned long pte, int order), + + TP_ARGS(addr, pte, order), + + TP_STRUCT__entry( + __field(unsigned long, addr) + __field(unsigned long, pte) + __field(int, order) + ), + + TP_fast_assign( + __entry->addr = addr; + __entry->pte = pte; + __entry->order = order; + ), + + TP_printk("addr=%lx, pte=%lx order=%d", __entry->addr, __entry->pte, __entry->order) +); + +DEFINE_EVENT(migration_pte, set_migration_pte, + TP_PROTO(unsigned long addr, unsigned long pte, int order), + TP_ARGS(addr, pte, order) +); + +DEFINE_EVENT(migration_pte, remove_migration_pte, + TP_PROTO(unsigned long addr, unsigned long pte, int order), + TP_ARGS(addr, pte, order) +); + #endif /* _TRACE_MIGRATE_H */ /* This part must be outside protection */ diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 116ed4d5d0f88..0698c5d0f1947 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -29,7 +29,6 @@ {(unsigned long)__GFP_HIGHMEM, "__GFP_HIGHMEM"}, \ {(unsigned long)GFP_DMA32, "GFP_DMA32"}, \ {(unsigned long)__GFP_HIGH, "__GFP_HIGH"}, \ - {(unsigned long)__GFP_ATOMIC, "__GFP_ATOMIC"}, \ {(unsigned long)__GFP_IO, "__GFP_IO"}, \ {(unsigned long)__GFP_FS, "__GFP_FS"}, \ {(unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \ @@ -49,12 +48,20 @@ {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\ - {(unsigned long)__GFP_ZEROTAGS, "__GFP_ZEROTAGS"}, \ - {(unsigned long)__GFP_SKIP_KASAN_POISON,"__GFP_SKIP_KASAN_POISON"}\ + {(unsigned long)__GFP_ZEROTAGS, "__GFP_ZEROTAGS"} \ + +#ifdef CONFIG_KASAN_HW_TAGS +#define __def_gfpflag_names_kasan , \ + {(unsigned long)__GFP_SKIP_ZERO, "__GFP_SKIP_ZERO"}, \ + {(unsigned long)__GFP_SKIP_KASAN_POISON, "__GFP_SKIP_KASAN_POISON"}, \ + {(unsigned long)__GFP_SKIP_KASAN_UNPOISON, "__GFP_SKIP_KASAN_UNPOISON"} +#else +#define __def_gfpflag_names_kasan +#endif #define show_gfp_flags(flags) \ (flags) ? __print_flags(flags, "|", \ - __def_gfpflag_names \ + __def_gfpflag_names __def_gfpflag_names_kasan \ ) : "none" #ifdef CONFIG_MMU diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h index ca3f2767828a6..202b3e3e67ff2 100644 --- a/include/trace/events/thp.h +++ b/include/trace/events/thp.h @@ -48,6 +48,33 @@ TRACE_EVENT(hugepage_update, TP_printk("hugepage update at addr 0x%lx and pte = 0x%lx clr = 0x%lx, set = 0x%lx", __entry->addr, __entry->pte, __entry->clr, __entry->set) ); +DECLARE_EVENT_CLASS(migration_pmd, + + TP_PROTO(unsigned long addr, unsigned long pmd), + + TP_ARGS(addr, pmd), + + TP_STRUCT__entry( + __field(unsigned long, addr) + __field(unsigned long, pmd) + ), + + TP_fast_assign( + __entry->addr = addr; + __entry->pmd = pmd; + ), + TP_printk("addr=%lx, pmd=%lx", __entry->addr, __entry->pmd) +); + +DEFINE_EVENT(migration_pmd, set_migration_pmd, + TP_PROTO(unsigned long addr, unsigned long pmd), + TP_ARGS(addr, pmd) +); + +DEFINE_EVENT(migration_pmd, remove_migration_pmd, + TP_PROTO(unsigned long addr, unsigned long pmd), + TP_ARGS(addr, pmd) +); #endif /* _TRACE_THP_H */ /* This part must be outside protection */ diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index a345b1e12daf3..86b2a82da546a 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -735,34 +735,6 @@ TRACE_EVENT(writeback_sb_inodes_requeue, ) ); -DECLARE_EVENT_CLASS(writeback_congest_waited_template, - - TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), - - TP_ARGS(usec_timeout, usec_delayed), - - TP_STRUCT__entry( - __field( unsigned int, usec_timeout ) - __field( unsigned int, usec_delayed ) - ), - - TP_fast_assign( - __entry->usec_timeout = usec_timeout; - __entry->usec_delayed = usec_delayed; - ), - - TP_printk("usec_timeout=%u usec_delayed=%u", - __entry->usec_timeout, - __entry->usec_delayed) -); - -DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait, - - TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), - - TP_ARGS(usec_timeout, usec_delayed) -); - DECLARE_EVENT_CLASS(writeback_single_inode_template, TP_PROTO(struct inode *inode, diff --git a/init/main.c b/init/main.c index ada50f5a15e43..e2a046a199127 100644 --- a/init/main.c +++ b/init/main.c @@ -181,7 +181,7 @@ EXPORT_SYMBOL_GPL(static_key_initialized); unsigned int reset_devices; EXPORT_SYMBOL(reset_devices); -static int __init set_reset_devices(char *str) +static int __init set_reset_devices(char *str __always_unused) { reset_devices = 1; return 1; @@ -231,13 +231,13 @@ static bool __init obsolete_checksetup(char *line) unsigned long loops_per_jiffy = (1<<12); EXPORT_SYMBOL(loops_per_jiffy); -static int __init debug_kernel(char *str) +static int __init debug_kernel(char *str __always_unused) { console_loglevel = CONSOLE_LOGLEVEL_DEBUG; return 0; } -static int __init quiet_kernel(char *str) +static int __init quiet_kernel(char *str __always_unused) { console_loglevel = CONSOLE_LOGLEVEL_QUIET; return 0; @@ -474,7 +474,7 @@ static void __init setup_boot_config(void) get_boot_config_from_initrd(NULL, NULL); } -static int __init warn_bootconfig(char *str) +static int __init warn_bootconfig(char *str __always_unused) { pr_warn("WARNING: 'bootconfig' found on the kernel command line but CONFIG_BOOT_CONFIG is not set.\n"); return 0; @@ -503,7 +503,8 @@ static void __init repair_env_string(char *param, char *val) /* Anything after -- gets handed straight to init. */ static int __init set_init_arg(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { unsigned int i; @@ -528,7 +529,8 @@ static int __init set_init_arg(char *param, char *val, * unused parameters (modprobe will find them in /proc/cmdline). */ static int __init unknown_bootoption(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { size_t len = strlen(param); @@ -728,7 +730,8 @@ noinline void __ref rest_init(void) /* Check for early params. */ static int __init do_early_param(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { const struct obs_kernel_param *p; @@ -1354,8 +1357,10 @@ static const char *initcall_level_names[] __initdata = { "late", }; -static int __init ignore_unknown_bootoption(char *param, char *val, - const char *unused, void *arg) +static int __init ignore_unknown_bootoption(char *param __always_unused, + char *val __always_unused, + const char *unused __always_unused, + void *arg __always_unused) { return 0; } @@ -1492,7 +1497,7 @@ void __weak free_initmem(void) free_initmem_default(POISON_FREE_INITMEM); } -static int __ref kernel_init(void *unused) +static int __ref kernel_init(void *unused __always_unused) { int ret; diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 5becca9be867c..089c34d0732cf 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -45,6 +45,7 @@ struct mqueue_fs_context { struct ipc_namespace *ipc_ns; + bool newns; /* Set if newly created ipc namespace */ }; #define MQUEUE_MAGIC 0x19800202 @@ -427,6 +428,14 @@ static int mqueue_get_tree(struct fs_context *fc) { struct mqueue_fs_context *ctx = fc->fs_private; + /* + * With a newly created ipc namespace, we don't need to do a search + * for an ipc namespace match, but we still need to set s_fs_info. + */ + if (ctx->newns) { + fc->s_fs_info = ctx->ipc_ns; + return get_tree_nodev(fc, mqueue_fill_super); + } return get_tree_keyed(fc, mqueue_fill_super, ctx->ipc_ns); } @@ -454,6 +463,10 @@ static int mqueue_init_fs_context(struct fs_context *fc) return 0; } +/* + * mq_init_ns() is currently the only caller of mq_create_mount(). + * So the ns parameter is always a newly created ipc namespace. + */ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns) { struct mqueue_fs_context *ctx; @@ -465,6 +478,7 @@ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns) return ERR_CAST(fc); ctx = fc->fs_private; + ctx->newns = true; put_ipc_ns(ctx->ipc_ns); ctx->ipc_ns = get_ipc_ns(ns); put_user_ns(fc->user_ns); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6357c3580d07b..eed2f7437d963 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -173,7 +173,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, return err; } - /* For try_to_free_swap() and munlock_vma_page() below */ + /* For try_to_free_swap() below */ lock_page(old_page); mmu_notifier_invalidate_range_start(&range); @@ -201,13 +201,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, set_pte_at_notify(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot)); - page_remove_rmap(old_page, false); + page_remove_rmap(old_page, vma, false); if (!page_mapped(old_page)) try_to_free_swap(old_page); page_vma_mapped_walk_done(&pvmw); - - if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page)) - munlock_vma_page(old_page); put_page(old_page); err = 0; diff --git a/kernel/fork.c b/kernel/fork.c index d75a528f7b219..57d624f05182e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -254,6 +254,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) * so cache the vm_struct. */ if (stack) { + stack = kasan_reset_tag(stack); tsk->stack_vm_area = find_vm_area(stack); tsk->stack = stack; } diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 52501e5f76554..40220dfd6fa93 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -147,6 +147,47 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) touch_nmi_watchdog(); } +static void check_killed_task(struct task_struct *t, unsigned long timeout) +{ + unsigned long stamp = t->killed_time; + + /* + * Ensure the task is not frozen. + * Also, skip vfork and any other user process that freezer should skip. + */ + if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) + return; + /* + * Skip threads which are already inside do_exit(), for exit_mm() etc. + * might take many seconds. + */ + if (t->flags & PF_EXITING) + return; + if (!stamp) { + stamp = jiffies; + if (!stamp) + stamp++; + t->killed_time = stamp; + return; + } + if (time_is_after_jiffies(stamp + timeout * HZ)) + return; + trace_sched_process_hang(t); + if (sysctl_hung_task_panic) { + console_verbose(); + hung_task_call_panic = true; + } + /* + * This thread failed to terminate for more than + * sysctl_hung_task_timeout_secs seconds, complain: + */ + pr_err("INFO: task %s:%d can't die for more than %ld seconds.\n", + t->comm, t->pid, (jiffies - stamp) / HZ); + sched_show_task(t); + hung_task_show_lock = true; + touch_nmi_watchdog(); +} + /* * To avoid extending the RCU grace period for an unbounded amount of time, * periodically exit the critical section and enter a new one. @@ -198,6 +239,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) goto unlock; last_break = jiffies; } + /* Check threads which are about to terminate. */ + if (unlikely(fatal_signal_pending(t))) + check_killed_task(t, timeout); /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE) check_hung_task(t, timeout); diff --git a/kernel/kcov.c b/kernel/kcov.c index 36ca640c4f8e7..475524bd900ab 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -459,37 +459,28 @@ void kcov_task_exit(struct task_struct *t) static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) { int res = 0; - void *area; struct kcov *kcov = vma->vm_file->private_data; unsigned long size, off; struct page *page; unsigned long flags; - area = vmalloc_user(vma->vm_end - vma->vm_start); - if (!area) - return -ENOMEM; - spin_lock_irqsave(&kcov->lock, flags); size = kcov->size * sizeof(unsigned long); - if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 || + if (kcov->area == NULL || vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != size) { res = -EINVAL; goto exit; } - if (!kcov->area) { - kcov->area = area; - vma->vm_flags |= VM_DONTEXPAND; - spin_unlock_irqrestore(&kcov->lock, flags); - for (off = 0; off < size; off += PAGE_SIZE) { - page = vmalloc_to_page(kcov->area + off); - if (vm_insert_page(vma, vma->vm_start + off, page)) - WARN_ONCE(1, "vm_insert_page() failed"); - } - return 0; + spin_unlock_irqrestore(&kcov->lock, flags); + vma->vm_flags |= VM_DONTEXPAND; + for (off = 0; off < size; off += PAGE_SIZE) { + page = vmalloc_to_page(kcov->area + off); + if (vm_insert_page(vma, vma->vm_start + off, page)) + WARN_ONCE(1, "vm_insert_page() failed"); } + return 0; exit: spin_unlock_irqrestore(&kcov->lock, flags); - vfree(area); return res; } @@ -564,31 +555,12 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, unsigned long arg) { struct task_struct *t; - unsigned long size, unused; + unsigned long flags, unused; int mode, i; struct kcov_remote_arg *remote_arg; struct kcov_remote *remote; - unsigned long flags; switch (cmd) { - case KCOV_INIT_TRACE: - /* - * Enable kcov in trace mode and setup buffer size. - * Must happen before anything else. - */ - if (kcov->mode != KCOV_MODE_DISABLED) - return -EBUSY; - /* - * Size must be at least 2 to hold current position and one PC. - * Later we allocate size * sizeof(unsigned long) memory, - * that must not overflow. - */ - size = arg; - if (size < 2 || size > INT_MAX / sizeof(unsigned long)) - return -EINVAL; - kcov->size = size; - kcov->mode = KCOV_MODE_INIT; - return 0; case KCOV_ENABLE: /* * Enable coverage for the current task. @@ -692,9 +664,37 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) struct kcov_remote_arg *remote_arg = NULL; unsigned int remote_num_handles; unsigned long remote_arg_size; - unsigned long flags; + unsigned long size, flags; + void *area; - if (cmd == KCOV_REMOTE_ENABLE) { + kcov = filep->private_data; + switch (cmd) { + case KCOV_INIT_TRACE: + /* + * Enable kcov in trace mode and setup buffer size. + * Must happen before anything else. + * + * First check the size argument - it must be at least 2 + * to hold the current position and one PC. + */ + size = arg; + if (size < 2 || size > INT_MAX / sizeof(unsigned long)) + return -EINVAL; + area = vmalloc_user(size * sizeof(unsigned long)); + if (area == NULL) + return -ENOMEM; + spin_lock_irqsave(&kcov->lock, flags); + if (kcov->mode != KCOV_MODE_DISABLED) { + spin_unlock_irqrestore(&kcov->lock, flags); + vfree(area); + return -EBUSY; + } + kcov->area = area; + kcov->size = size; + kcov->mode = KCOV_MODE_INIT; + spin_unlock_irqrestore(&kcov->lock, flags); + return 0; + case KCOV_REMOTE_ENABLE: if (get_user(remote_num_handles, (unsigned __user *)(arg + offsetof(struct kcov_remote_arg, num_handles)))) return -EFAULT; @@ -710,16 +710,18 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) return -EINVAL; } arg = (unsigned long)remote_arg; + fallthrough; + default: + /* + * All other commands can be normally executed under a spin lock, so we + * obtain and release it here in order to simplify kcov_ioctl_locked(). + */ + spin_lock_irqsave(&kcov->lock, flags); + res = kcov_ioctl_locked(kcov, cmd, arg); + spin_unlock_irqrestore(&kcov->lock, flags); + kfree(remote_arg); + return res; } - - kcov = filep->private_data; - spin_lock_irqsave(&kcov->lock, flags); - res = kcov_ioctl_locked(kcov, cmd, arg); - spin_unlock_irqrestore(&kcov->lock, flags); - - kfree(remote_arg); - - return res; } static const struct file_operations kcov_fops = { diff --git a/kernel/panic.c b/kernel/panic.c index 55b50e052ec3a..3c3fb36d8d414 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -66,6 +66,7 @@ EXPORT_SYMBOL_GPL(panic_timeout); #define PANIC_PRINT_LOCK_INFO 0x00000008 #define PANIC_PRINT_FTRACE_INFO 0x00000010 #define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 +#define PANIC_PRINT_ALL_CPU_BT 0x00000040 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -152,6 +153,9 @@ static void panic_print_sys_info(void) if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) console_flush_on_panic(CONSOLE_REPLAY_ALL); + if (panic_print & PANIC_PRINT_ALL_CPU_BT) + trigger_all_cpu_backtrace(); + if (panic_print & PANIC_PRINT_TASK_INFO) show_state(); @@ -185,6 +189,16 @@ void panic(const char *fmt, ...) int old_cpu, this_cpu; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; + if (panic_on_warn) { + /* + * This thread may hit another WARN() in the panic path. + * Resetting this prevents additional WARN() from panicking the + * system on this thread. Other threads are blocked by the + * panic_mutex in panic(). + */ + panic_on_warn = 0; + } + /* * Disable local interrupts. This will prevent panic_smp_self_stop * from deadlocking the first cpu that invokes the panic, since @@ -576,16 +590,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, if (regs) show_regs(regs); - if (panic_on_warn) { - /* - * This thread may hit another WARN() in the panic path. - * Resetting this prevents additional WARN() from panicking the - * system on this thread. Other threads are blocked by the - * panic_mutex in panic(). - */ - panic_on_warn = 0; + if (panic_on_warn) panic("panic_on_warn set ...\n"); - } if (!regs) dump_stack(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a506685e878aa..28d1b7af03dcb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4280,7 +4280,9 @@ DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); #ifdef CONFIG_NUMA_BALANCING -void set_numabalancing_state(bool enabled) +int sysctl_numa_balancing_mode; + +static void __set_numabalancing_state(bool enabled) { if (enabled) static_branch_enable(&sched_numa_balancing); @@ -4288,13 +4290,22 @@ void set_numabalancing_state(bool enabled) static_branch_disable(&sched_numa_balancing); } +void set_numabalancing_state(bool enabled) +{ + if (enabled) + sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL; + else + sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED; + __set_numabalancing_state(enabled); +} + #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; - int state = static_branch_likely(&sched_numa_balancing); + int state = sysctl_numa_balancing_mode; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4304,8 +4315,10 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; - if (write) - set_numabalancing_state(state); + if (write) { + sysctl_numa_balancing_mode = state; + __set_numabalancing_state(state); + } return err; } #endif diff --git a/kernel/scs.c b/kernel/scs.c index 579841be88646..b83bc9251f996 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -32,7 +32,7 @@ static void *__scs_alloc(int node) for (i = 0; i < NR_CACHED_SCS; i++) { s = this_cpu_xchg(scs_cache[i], NULL); if (s) { - kasan_unpoison_vmalloc(s, SCS_SIZE); + kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_NONE); memset(s, 0, SCS_SIZE); return s; } @@ -78,7 +78,7 @@ void scs_free(void *s) if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL) return; - kasan_unpoison_vmalloc(s, SCS_SIZE); + kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_NONE); vfree_atomic(s); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6a494ab55beaf..788b9a34d5ab4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -100,8 +100,6 @@ static const int six_hundred_forty_kb = 640 * 1024; #endif -/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ -static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -1698,7 +1696,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sysctl_numa_balancing, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = SYSCTL_FOUR, }, #endif /* CONFIG_NUMA_BALANCING */ { @@ -2399,55 +2397,6 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, - { - .procname = "dirty_background_ratio", - .data = &dirty_background_ratio, - .maxlen = sizeof(dirty_background_ratio), - .mode = 0644, - .proc_handler = dirty_background_ratio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "dirty_background_bytes", - .data = &dirty_background_bytes, - .maxlen = sizeof(dirty_background_bytes), - .mode = 0644, - .proc_handler = dirty_background_bytes_handler, - .extra1 = SYSCTL_LONG_ONE, - }, - { - .procname = "dirty_ratio", - .data = &vm_dirty_ratio, - .maxlen = sizeof(vm_dirty_ratio), - .mode = 0644, - .proc_handler = dirty_ratio_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "dirty_bytes", - .data = &vm_dirty_bytes, - .maxlen = sizeof(vm_dirty_bytes), - .mode = 0644, - .proc_handler = dirty_bytes_handler, - .extra1 = (void *)&dirty_bytes_min, - }, - { - .procname = "dirty_writeback_centisecs", - .data = &dirty_writeback_interval, - .maxlen = sizeof(dirty_writeback_interval), - .mode = 0644, - .proc_handler = dirty_writeback_centisecs_handler, - }, - { - .procname = "dirty_expire_centisecs", - .data = &dirty_expire_interval, - .maxlen = sizeof(dirty_expire_interval), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, { .procname = "dirtytime_expire_seconds", .data = &dirtytime_expire_interval, @@ -2619,13 +2568,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, }, #endif - { - .procname = "laptop_mode", - .data = &laptop_mode, - .maxlen = sizeof(laptop_mode), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, { .procname = "vfs_cache_pressure", .data = &sysctl_vfs_cache_pressure, @@ -2723,17 +2665,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, }, #endif -#ifdef CONFIG_HIGHMEM - { - .procname = "highmem_is_dirtyable", - .data = &vm_highmem_is_dirtyable, - .maxlen = sizeof(vm_highmem_is_dirtyable), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_MEMORY_FAILURE { .procname = "memory_failure_early_kill", diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 6613bcf460c69..fa933475826f5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -208,20 +208,87 @@ config DEBUG_BUGVERBOSE endmenu # "printk and dmesg options" +config DEBUG_KERNEL + bool "Kernel debugging" + help + Say Y here if you are developing drivers or trying to debug and + identify kernel problems. + +config DEBUG_MISC + bool "Miscellaneous debug code" + default DEBUG_KERNEL + depends on DEBUG_KERNEL + help + Say Y here if you need to enable miscellaneous debug code that should + be under a more specific debug option but isn't. + menu "Compile-time checks and compiler options" config DEBUG_INFO - bool "Compile the kernel with debug info" - depends on DEBUG_KERNEL && !COMPILE_TEST + bool help - If you say Y here the resulting kernel image will include - debugging info resulting in a larger kernel image. + A kernel debug info option other than "None" has been selected + in the "Debug information" choice below, indicating that debug + information will be generated for build targets. + +choice + prompt "Debug information" + depends on DEBUG_KERNEL + help + Selecting something other than "None" results in a kernel image + that will include debugging info resulting in a larger kernel image. This adds debug symbols to the kernel and modules (gcc -g), and is needed if you intend to use kernel crashdump or binary object tools like crash, kgdb, LKCD, gdb, etc on the kernel. - Say Y here only if you plan to debug the kernel. - If unsure, say N. + Choose which version of DWARF debug info to emit. If unsure, + select "Toolchain default". + +config DEBUG_INFO_NONE + bool "Disable debug information" + help + Do not build the kernel with debugging information, which will + result in a faster and smaller build. + +config DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT + bool "Rely on the toolchain's implicit default DWARF version" + select DEBUG_INFO + help + The implicit default version of DWARF debug info produced by a + toolchain changes over time. + + This can break consumers of the debug info that haven't upgraded to + support newer revisions, and prevent testing newer versions, but + those should be less common scenarios. + +config DEBUG_INFO_DWARF4 + bool "Generate DWARF Version 4 debuginfo" + select DEBUG_INFO + help + Generate DWARF v4 debug info. This requires gcc 4.5+ and gdb 7.0+. + + If you have consumers of DWARF debug info that are not ready for + newer revisions of DWARF, you may wish to choose this or have your + config select this. + +config DEBUG_INFO_DWARF5 + bool "Generate DWARF Version 5 debuginfo" + select DEBUG_INFO + depends on !CC_IS_CLANG || (CC_IS_CLANG && (AS_IS_LLVM || (AS_IS_GNU && AS_VERSION >= 23502))) + help + Generate DWARF v5 debug info. Requires binutils 2.35.2, gcc 5.0+ (gcc + 5.0+ accepts the -gdwarf-5 flag but only had partial support for some + draft features until 7.0), and gdb 8.0+. + + Changes to the structure of debug info in Version 5 allow for around + 15-18% savings in resulting image and debug info section sizes as + compared to DWARF Version 4. DWARF Version 5 standardizes previous + extensions such as accelerators for symbol indexing and the format + for fission (.dwo/.dwp) files. Users may not want to select this + config if they rely on tooling that has not yet been updated to + support DWARF Version 5. + +endchoice # "Debug information" if DEBUG_INFO @@ -267,56 +334,12 @@ config DEBUG_INFO_SPLIT to know about the .dwo files and include them. Incompatible with older versions of ccache. -choice - prompt "DWARF version" - help - Which version of DWARF debug info to emit. - -config DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT - bool "Rely on the toolchain's implicit default DWARF version" - help - The implicit default version of DWARF debug info produced by a - toolchain changes over time. - - This can break consumers of the debug info that haven't upgraded to - support newer revisions, and prevent testing newer versions, but - those should be less common scenarios. - - If unsure, say Y. - -config DEBUG_INFO_DWARF4 - bool "Generate DWARF Version 4 debuginfo" - help - Generate DWARF v4 debug info. This requires gcc 4.5+ and gdb 7.0+. - - If you have consumers of DWARF debug info that are not ready for - newer revisions of DWARF, you may wish to choose this or have your - config select this. - -config DEBUG_INFO_DWARF5 - bool "Generate DWARF Version 5 debuginfo" - depends on !CC_IS_CLANG || (CC_IS_CLANG && (AS_IS_LLVM || (AS_IS_GNU && AS_VERSION >= 23502))) - depends on !DEBUG_INFO_BTF || PAHOLE_VERSION >= 121 - help - Generate DWARF v5 debug info. Requires binutils 2.35.2, gcc 5.0+ (gcc - 5.0+ accepts the -gdwarf-5 flag but only had partial support for some - draft features until 7.0), and gdb 8.0+. - - Changes to the structure of debug info in Version 5 allow for around - 15-18% savings in resulting image and debug info section sizes as - compared to DWARF Version 4. DWARF Version 5 standardizes previous - extensions such as accelerators for symbol indexing and the format - for fission (.dwo/.dwp) files. Users may not want to select this - config if they rely on tooling that has not yet been updated to - support DWARF Version 5. - -endchoice # "DWARF version" - config DEBUG_INFO_BTF bool "Generate BTF typeinfo" depends on !DEBUG_INFO_SPLIT && !DEBUG_INFO_REDUCED depends on !GCC_PLUGIN_RANDSTRUCT || COMPILE_TEST depends on BPF_SYSCALL + depends on !DEBUG_INFO_DWARF5 || PAHOLE_VERSION >= 121 help Generate deduplicated BTF type information from DWARF debug info. Turning this on expects presence of pahole tool, which will convert @@ -593,20 +616,6 @@ source "lib/Kconfig.kcsan" endmenu -config DEBUG_KERNEL - bool "Kernel debugging" - help - Say Y here if you are developing drivers or trying to debug and - identify kernel problems. - -config DEBUG_MISC - bool "Miscellaneous debug code" - default DEBUG_KERNEL - depends on DEBUG_KERNEL - help - Say Y here if you need to enable miscellaneous debug code that should - be under a more specific debug option but isn't. - menu "Networking Debugging" source "net/Kconfig.debug" @@ -1797,6 +1806,12 @@ config IO_STRICT_DEVMEM menu "$(SRCARCH) Debugging" +config DEBUG_AID_FOR_SYZBOT + bool "Additional debug code for syzbot" + default n + help + This option is intended for testing by syzbot. + source "arch/$(SRCARCH)/Kconfig.debug" endmenu diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 879757b6dd149..1f3e620188a24 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -178,17 +178,17 @@ config KASAN_TAGS_IDENTIFY memory consumption. config KASAN_VMALLOC - bool "Back mappings in vmalloc space with real shadow memory" - depends on KASAN_GENERIC && HAVE_ARCH_KASAN_VMALLOC + bool "Check accesses to vmalloc allocations" + depends on HAVE_ARCH_KASAN_VMALLOC help - By default, the shadow region for vmalloc space is the read-only - zero page. This means that KASAN cannot detect errors involving - vmalloc space. - - Enabling this option will hook in to vmap/vmalloc and back those - mappings with real shadow memory allocated on demand. This allows - for KASAN to detect more sorts of errors (and to support vmapped - stacks), but at the cost of higher memory usage. + This mode makes KASAN check accesses to vmalloc allocations for + validity. + + With software KASAN modes, checking is done for all types of vmalloc + allocations. Enabling this option leads to higher memory usage. + + With hardware tag-based KASAN, only VM_ALLOC mappings are checked. + There is no additional memory usage. config KASAN_KUNIT_TEST tristate "KUnit-compatible tests of KASAN bug detection capabilities" if !KUNIT_ALL_TESTS diff --git a/lib/Kconfig.kcsan b/lib/Kconfig.kcsan index 63b70b8c55519..de022445fbba5 100644 --- a/lib/Kconfig.kcsan +++ b/lib/Kconfig.kcsan @@ -10,21 +10,10 @@ config HAVE_KCSAN_COMPILER For the list of compilers that support KCSAN, please see . -config KCSAN_KCOV_BROKEN - def_bool KCOV && CC_HAS_SANCOV_TRACE_PC - depends on CC_IS_CLANG - depends on !$(cc-option,-Werror=unused-command-line-argument -fsanitize=thread -fsanitize-coverage=trace-pc) - help - Some versions of clang support either KCSAN and KCOV but not the - combination of the two. - See https://bugs.llvm.org/show_bug.cgi?id=45831 for the status - in newer releases. - menuconfig KCSAN bool "KCSAN: dynamic data race detector" depends on HAVE_ARCH_KCSAN && HAVE_KCSAN_COMPILER depends on DEBUG_KERNEL && !KASAN - depends on !KCSAN_KCOV_BROKEN select STACKTRACE help The Kernel Concurrency Sanitizer (KCSAN) is a dynamic diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index 236c5cefc4cc5..f3c57ed518381 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -27,16 +27,6 @@ config UBSAN_TRAP the system. For some system builders this is an acceptable trade-off. -config UBSAN_KCOV_BROKEN - def_bool KCOV && CC_HAS_SANCOV_TRACE_PC - depends on CC_IS_CLANG - depends on !$(cc-option,-Werror=unused-command-line-argument -fsanitize=bounds -fsanitize-coverage=trace-pc) - help - Some versions of clang support either UBSAN or KCOV but not the - combination of the two. - See https://bugs.llvm.org/show_bug.cgi?id=45831 for the status - in newer releases. - config CC_HAS_UBSAN_BOUNDS def_bool $(cc-option,-fsanitize=bounds) @@ -46,7 +36,6 @@ config CC_HAS_UBSAN_ARRAY_BOUNDS config UBSAN_BOUNDS bool "Perform array index bounds checking" default UBSAN - depends on !UBSAN_KCOV_BROKEN depends on CC_HAS_UBSAN_ARRAY_BOUNDS || CC_HAS_UBSAN_BOUNDS help This option enables detection of directly indexed out of bounds @@ -72,7 +61,6 @@ config UBSAN_ARRAY_BOUNDS config UBSAN_LOCAL_BOUNDS bool "Perform array local bounds checking" depends on UBSAN_TRAP - depends on !UBSAN_KCOV_BROKEN depends on $(cc-option,-fsanitize=local-bounds) help This option enables -fsanitize=local-bounds which traps when an diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index 926f4823d5eac..fd1728d94babb 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -271,8 +271,12 @@ static FORCE_INLINE int LZ4_decompress_generic( ip += length; op += length; - /* Necessarily EOF, due to parsing restrictions */ - if (!partialDecoding || (cpy == oend)) + /* Necessarily EOF when !partialDecoding. + * When partialDecoding, it is EOF if we've either + * filled the output buffer or + * can't proceed with reading an offset for following match. + */ + if (!partialDecoding || (cpy == oend) || (ip >= (iend - 2))) break; } else { /* may overwrite up to WILDCOPYLENGTH beyond cpy */ diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 767538089a62e..dedce7908ac63 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -29,11 +29,32 @@ #include "test_hmm_uapi.h" -#define DMIRROR_NDEVICES 2 +#define DMIRROR_NDEVICES 4 #define DMIRROR_RANGE_FAULT_TIMEOUT 1000 #define DEVMEM_CHUNK_SIZE (256 * 1024 * 1024U) #define DEVMEM_CHUNKS_RESERVE 16 +/* + * For device_private pages, dpage is just a dummy struct page + * representing a piece of device memory. dmirror_devmem_alloc_page + * allocates a real system memory page as backing storage to fake a + * real device. zone_device_data points to that backing page. But + * for device_coherent memory, the struct page represents real + * physical CPU-accessible memory that we can use directly. + */ +#define BACKING_PAGE(page) (is_device_private_page((page)) ? \ + (page)->zone_device_data : (page)) + +static unsigned long spm_addr_dev0; +module_param(spm_addr_dev0, long, 0644); +MODULE_PARM_DESC(spm_addr_dev0, + "Specify start address for SPM (special purpose memory) used for device 0. By setting this Coherent device type will be used. Make sure spm_addr_dev1 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE."); + +static unsigned long spm_addr_dev1; +module_param(spm_addr_dev1, long, 0644); +MODULE_PARM_DESC(spm_addr_dev1, + "Specify start address for SPM (special purpose memory) used for device 1. By setting this Coherent device type will be used. Make sure spm_addr_dev0 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE."); + static const struct dev_pagemap_ops dmirror_devmem_ops; static const struct mmu_interval_notifier_ops dmirror_min_ops; static dev_t dmirror_dev; @@ -84,6 +105,7 @@ struct dmirror_chunk { struct dmirror_device { struct cdev cdevice; struct hmm_devmem *devmem; + unsigned int zone_device_type; unsigned int devmem_capacity; unsigned int devmem_count; @@ -111,6 +133,21 @@ static int dmirror_bounce_init(struct dmirror_bounce *bounce, return 0; } +static bool dmirror_is_private_zone(struct dmirror_device *mdevice) +{ + return (mdevice->zone_device_type == + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? true : false; +} + +static enum migrate_vma_direction + dmirror_select_device(struct dmirror *dmirror) +{ + return (dmirror->mdevice->zone_device_type == + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? + MIGRATE_VMA_SELECT_DEVICE_PRIVATE : + MIGRATE_VMA_SELECT_DEVICE_COHERENT; +} + static void dmirror_bounce_fini(struct dmirror_bounce *bounce) { vfree(bounce->ptr); @@ -451,28 +488,44 @@ static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd) return ret; } -static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, +static int dmirror_allocate_chunk(struct dmirror_device *mdevice, struct page **ppage) { struct dmirror_chunk *devmem; - struct resource *res; + struct resource *res = NULL; unsigned long pfn; unsigned long pfn_first; unsigned long pfn_last; void *ptr; + int ret = -ENOMEM; devmem = kzalloc(sizeof(*devmem), GFP_KERNEL); if (!devmem) - return false; + return ret; - res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE, - "hmm_dmirror"); - if (IS_ERR(res)) + switch (mdevice->zone_device_type) { + case HMM_DMIRROR_MEMORY_DEVICE_PRIVATE: + res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE, + "hmm_dmirror"); + if (IS_ERR_OR_NULL(res)) + goto err_devmem; + devmem->pagemap.range.start = res->start; + devmem->pagemap.range.end = res->end; + devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; + break; + case HMM_DMIRROR_MEMORY_DEVICE_COHERENT: + devmem->pagemap.range.start = (MINOR(mdevice->cdevice.dev) - 2) ? + spm_addr_dev0 : + spm_addr_dev1; + devmem->pagemap.range.end = devmem->pagemap.range.start + + DEVMEM_CHUNK_SIZE - 1; + devmem->pagemap.type = MEMORY_DEVICE_COHERENT; + break; + default: + ret = -EINVAL; goto err_devmem; + } - devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; - devmem->pagemap.range.start = res->start; - devmem->pagemap.range.end = res->end; devmem->pagemap.nr_range = 1; devmem->pagemap.ops = &dmirror_devmem_ops; devmem->pagemap.owner = mdevice; @@ -493,10 +546,14 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, mdevice->devmem_capacity = new_capacity; mdevice->devmem_chunks = new_chunks; } - ptr = memremap_pages(&devmem->pagemap, numa_node_id()); - if (IS_ERR(ptr)) + if (IS_ERR_OR_NULL(ptr)) { + if (ptr) + ret = PTR_ERR(ptr); + else + ret = -EFAULT; goto err_release; + } devmem->mdevice = mdevice; pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT; @@ -525,30 +582,35 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, } spin_unlock(&mdevice->lock); - return true; + return 0; err_release: mutex_unlock(&mdevice->devmem_lock); - release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range)); + if (res && devmem->pagemap.type == MEMORY_DEVICE_PRIVATE) + release_mem_region(devmem->pagemap.range.start, + range_len(&devmem->pagemap.range)); err_devmem: kfree(devmem); - return false; + return ret; } static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) { struct page *dpage = NULL; - struct page *rpage; + struct page *rpage = NULL; /* - * This is a fake device so we alloc real system memory to store - * our device memory. + * For ZONE_DEVICE private type, this is a fake device so we alloc real + * system memory to store our device memory. + * For ZONE_DEVICE coherent type we use the actual dpage to store the data + * and ignore rpage. */ - rpage = alloc_page(GFP_HIGHUSER); - if (!rpage) - return NULL; - + if (dmirror_is_private_zone(mdevice)) { + rpage = alloc_page(GFP_HIGHUSER); + if (!rpage) + return NULL; + } spin_lock(&mdevice->lock); if (mdevice->free_pages) { @@ -558,7 +620,7 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) spin_unlock(&mdevice->lock); } else { spin_unlock(&mdevice->lock); - if (!dmirror_allocate_chunk(mdevice, &dpage)) + if (dmirror_allocate_chunk(mdevice, &dpage)) goto error; } @@ -568,7 +630,8 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) return dpage; error: - __free_page(rpage); + if (rpage) + __free_page(rpage); return NULL; } @@ -594,12 +657,16 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, * unallocated pte_none() or read-only zero page. */ spage = migrate_pfn_to_page(*src); + if (WARN(spage && is_zone_device_page(spage), + "page already in device spage pfn: 0x%lx\n", + page_to_pfn(spage))) + continue; dpage = dmirror_devmem_alloc_page(mdevice); if (!dpage) continue; - rpage = dpage->zone_device_data; + rpage = BACKING_PAGE(dpage); if (spage) copy_highpage(rpage, spage); else @@ -613,6 +680,8 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, */ rpage->zone_device_data = dmirror; + pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n", + page_to_pfn(spage), page_to_pfn(dpage)); *dst = migrate_pfn(page_to_pfn(dpage)); if ((*src & MIGRATE_PFN_WRITE) || (!spage && args->vma->vm_flags & VM_WRITE)) @@ -690,11 +759,7 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args, if (!dpage) continue; - /* - * Store the page that holds the data so the page table - * doesn't have to deal with ZONE_DEVICE private pages. - */ - entry = dpage->zone_device_data; + entry = BACKING_PAGE(dpage); if (*dst & MIGRATE_PFN_WRITE) entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE); entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC); @@ -774,15 +839,124 @@ static int dmirror_exclusive(struct dmirror *dmirror, return ret; } -static int dmirror_migrate(struct dmirror *dmirror, - struct hmm_dmirror_cmd *cmd) +static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, + struct dmirror *dmirror) +{ + const unsigned long *src = args->src; + unsigned long *dst = args->dst; + unsigned long start = args->start; + unsigned long end = args->end; + unsigned long addr; + + for (addr = start; addr < end; addr += PAGE_SIZE, + src++, dst++) { + struct page *dpage, *spage; + + spage = migrate_pfn_to_page(*src); + if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) + continue; + + if (WARN_ON(!is_dev_private_or_coherent_page(spage))) + continue; + spage = BACKING_PAGE(spage); + dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); + if (!dpage) + continue; + pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n", + page_to_pfn(spage), page_to_pfn(dpage)); + + lock_page(dpage); + xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); + copy_highpage(dpage, spage); + *dst = migrate_pfn(page_to_pfn(dpage)); + if (*src & MIGRATE_PFN_WRITE) + *dst |= MIGRATE_PFN_WRITE; + } + return 0; +} + +static unsigned long dmirror_successful_migrated_pages(struct migrate_vma *migrate) +{ + unsigned long cpages = 0; + unsigned long i; + + for (i = 0; i < migrate->npages; i++) { + if (migrate->src[i] & MIGRATE_PFN_VALID && + migrate->src[i] & MIGRATE_PFN_MIGRATE) + cpages++; + } + return cpages; +} + +static int dmirror_migrate_to_system(struct dmirror *dmirror, + struct hmm_dmirror_cmd *cmd) +{ + unsigned long start, end, addr; + unsigned long size = cmd->npages << PAGE_SHIFT; + struct mm_struct *mm = dmirror->notifier.mm; + struct vm_area_struct *vma; + unsigned long src_pfns[64] = { 0 }; + unsigned long dst_pfns[64] = { 0 }; + struct migrate_vma args; + unsigned long next; + int ret; + + start = cmd->addr; + end = start + size; + if (end < start) + return -EINVAL; + + /* Since the mm is for the mirrored process, get a reference first. */ + if (!mmget_not_zero(mm)) + return -EINVAL; + + cmd->cpages = 0; + mmap_read_lock(mm); + for (addr = start; addr < end; addr = next) { + vma = vma_lookup(mm, addr); + if (!vma || !(vma->vm_flags & VM_READ)) { + ret = -EINVAL; + goto out; + } + next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT)); + if (next > vma->vm_end) + next = vma->vm_end; + + args.vma = vma; + args.src = src_pfns; + args.dst = dst_pfns; + args.start = addr; + args.end = next; + args.pgmap_owner = dmirror->mdevice; + args.flags = dmirror_select_device(dmirror); + + ret = migrate_vma_setup(&args); + if (ret) + goto out; + + pr_debug("Migrating from device mem to sys mem\n"); + dmirror_devmem_fault_alloc_and_copy(&args, dmirror); + + migrate_vma_pages(&args); + cmd->cpages += dmirror_successful_migrated_pages(&args); + migrate_vma_finalize(&args); + } +out: + mmap_read_unlock(mm); + mmput(mm); + + return ret; +} + +static int dmirror_migrate_to_device(struct dmirror *dmirror, + struct hmm_dmirror_cmd *cmd) { unsigned long start, end, addr; unsigned long size = cmd->npages << PAGE_SHIFT; struct mm_struct *mm = dmirror->notifier.mm; struct vm_area_struct *vma; - unsigned long src_pfns[64]; - unsigned long dst_pfns[64]; + unsigned long src_pfns[64] = { 0 }; + unsigned long dst_pfns[64] = { 0 }; struct dmirror_bounce bounce; struct migrate_vma args; unsigned long next; @@ -819,6 +993,7 @@ static int dmirror_migrate(struct dmirror *dmirror, if (ret) goto out; + pr_debug("Migrating from sys mem to device mem\n"); dmirror_migrate_alloc_and_copy(&args, dmirror); migrate_vma_pages(&args); dmirror_migrate_finalize_and_map(&args, dmirror); @@ -827,7 +1002,7 @@ static int dmirror_migrate(struct dmirror *dmirror, mmap_read_unlock(mm); mmput(mm); - /* Return the migrated data for verification. */ + /* Return the migrated data for verification. only for pages in device zone */ ret = dmirror_bounce_init(&bounce, start, size); if (ret) return ret; @@ -864,12 +1039,22 @@ static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range, } page = hmm_pfn_to_page(entry); - if (is_device_private_page(page)) { - /* Is the page migrated to this device or some other? */ - if (dmirror->mdevice == dmirror_page_to_device(page)) + if (is_dev_private_or_coherent_page(page)) { + /* Is page ZONE_DEVICE coherent? */ + if (is_device_coherent_page(page)) { + if (dmirror->mdevice == dmirror_page_to_device(page)) + *perm = HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL; + else + *perm = HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE; + /* + * Is page ZONE_DEVICE private migrated to + * this device or some other? + */ + } else if (dmirror->mdevice == dmirror_page_to_device(page)) { *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL; - else + } else { *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE; + } } else if (is_zero_pfn(page_to_pfn(page))) *perm = HMM_DMIRROR_PROT_ZERO; else @@ -1024,6 +1209,15 @@ static int dmirror_snapshot(struct dmirror *dmirror, return ret; } +static int dmirror_get_device_type(struct dmirror *dmirror, + struct hmm_dmirror_cmd *cmd) +{ + mutex_lock(&dmirror->mutex); + cmd->zone_device_type = dmirror->mdevice->zone_device_type; + mutex_unlock(&dmirror->mutex); + + return 0; +} static long dmirror_fops_unlocked_ioctl(struct file *filp, unsigned int command, unsigned long arg) @@ -1057,8 +1251,12 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp, ret = dmirror_write(dmirror, &cmd); break; - case HMM_DMIRROR_MIGRATE: - ret = dmirror_migrate(dmirror, &cmd); + case HMM_DMIRROR_MIGRATE_TO_DEV: + ret = dmirror_migrate_to_device(dmirror, &cmd); + break; + + case HMM_DMIRROR_MIGRATE_TO_SYS: + ret = dmirror_migrate_to_system(dmirror, &cmd); break; case HMM_DMIRROR_EXCLUSIVE: @@ -1074,6 +1272,9 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp, ret = dmirror_snapshot(dmirror, &cmd); break; + case HMM_DMIRROR_GET_MEM_DEV_TYPE: + ret = dmirror_get_device_type(dmirror, &cmd); + break; default: return -EINVAL; } @@ -1120,14 +1321,13 @@ static const struct file_operations dmirror_fops = { static void dmirror_devmem_free(struct page *page) { - struct page *rpage = page->zone_device_data; + struct page *rpage = BACKING_PAGE(page); struct dmirror_device *mdevice; - if (rpage) + if (rpage != page) __free_page(rpage); mdevice = dmirror_page_to_device(page); - spin_lock(&mdevice->lock); mdevice->cfree++; page->zone_device_data = mdevice->free_pages; @@ -1135,43 +1335,11 @@ static void dmirror_devmem_free(struct page *page) spin_unlock(&mdevice->lock); } -static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, - struct dmirror *dmirror) -{ - const unsigned long *src = args->src; - unsigned long *dst = args->dst; - unsigned long start = args->start; - unsigned long end = args->end; - unsigned long addr; - - for (addr = start; addr < end; addr += PAGE_SIZE, - src++, dst++) { - struct page *dpage, *spage; - - spage = migrate_pfn_to_page(*src); - if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) - continue; - spage = spage->zone_device_data; - - dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); - if (!dpage) - continue; - - lock_page(dpage); - xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); - copy_highpage(dpage, spage); - *dst = migrate_pfn(page_to_pfn(dpage)); - if (*src & MIGRATE_PFN_WRITE) - *dst |= MIGRATE_PFN_WRITE; - } - return 0; -} - static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) { struct migrate_vma args; - unsigned long src_pfns; - unsigned long dst_pfns; + unsigned long src_pfns = 0; + unsigned long dst_pfns = 0; struct page *rpage; struct dmirror *dmirror; vm_fault_t ret; @@ -1191,7 +1359,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) args.src = &src_pfns; args.dst = &dst_pfns; args.pgmap_owner = dmirror->mdevice; - args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; + args.flags = dmirror_select_device(dmirror); if (migrate_vma_setup(&args)) return VM_FAULT_SIGBUS; @@ -1229,10 +1397,8 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id) if (ret) return ret; - /* Build a list of free ZONE_DEVICE private struct pages */ - dmirror_allocate_chunk(mdevice, NULL); - - return 0; + /* Build a list of free ZONE_DEVICE struct pages */ + return dmirror_allocate_chunk(mdevice, NULL); } static void dmirror_device_remove(struct dmirror_device *mdevice) @@ -1245,8 +1411,9 @@ static void dmirror_device_remove(struct dmirror_device *mdevice) mdevice->devmem_chunks[i]; memunmap_pages(&devmem->pagemap); - release_mem_region(devmem->pagemap.range.start, - range_len(&devmem->pagemap.range)); + if (devmem->pagemap.type == MEMORY_DEVICE_PRIVATE) + release_mem_region(devmem->pagemap.range.start, + range_len(&devmem->pagemap.range)); kfree(devmem); } kfree(mdevice->devmem_chunks); @@ -1258,14 +1425,26 @@ static void dmirror_device_remove(struct dmirror_device *mdevice) static int __init hmm_dmirror_init(void) { int ret; - int id; + int id = 0; + int ndevices = 0; ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES, "HMM_DMIRROR"); if (ret) goto err_unreg; - for (id = 0; id < DMIRROR_NDEVICES; id++) { + memset(dmirror_devices, 0, DMIRROR_NDEVICES * sizeof(dmirror_devices[0])); + dmirror_devices[ndevices++].zone_device_type = + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE; + dmirror_devices[ndevices++].zone_device_type = + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE; + if (spm_addr_dev0 && spm_addr_dev1) { + dmirror_devices[ndevices++].zone_device_type = + HMM_DMIRROR_MEMORY_DEVICE_COHERENT; + dmirror_devices[ndevices++].zone_device_type = + HMM_DMIRROR_MEMORY_DEVICE_COHERENT; + } + for (id = 0; id < ndevices; id++) { ret = dmirror_device_init(dmirror_devices + id, id); if (ret) goto err_chrdev; @@ -1287,7 +1466,8 @@ static void __exit hmm_dmirror_exit(void) int id; for (id = 0; id < DMIRROR_NDEVICES; id++) - dmirror_device_remove(dmirror_devices + id); + if (dmirror_devices[id].zone_device_type) + dmirror_device_remove(dmirror_devices + id); unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES); } diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h index f14dea5dcd062..e190b2ab6f199 100644 --- a/lib/test_hmm_uapi.h +++ b/lib/test_hmm_uapi.h @@ -19,6 +19,7 @@ * @npages: (in) number of pages to read/write * @cpages: (out) number of pages copied * @faults: (out) number of device page faults seen + * @zone_device_type: (out) zone device memory type */ struct hmm_dmirror_cmd { __u64 addr; @@ -26,15 +27,18 @@ struct hmm_dmirror_cmd { __u64 npages; __u64 cpages; __u64 faults; + __u64 zone_device_type; }; /* Expose the address space of the calling process through hmm device file */ #define HMM_DMIRROR_READ _IOWR('H', 0x00, struct hmm_dmirror_cmd) #define HMM_DMIRROR_WRITE _IOWR('H', 0x01, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_MIGRATE _IOWR('H', 0x02, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x03, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x04, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_MIGRATE_TO_DEV _IOWR('H', 0x02, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_MIGRATE_TO_SYS _IOWR('H', 0x03, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x04, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x06, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_GET_MEM_DEV_TYPE _IOWR('H', 0x07, struct hmm_dmirror_cmd) /* * Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT. @@ -49,6 +53,8 @@ struct hmm_dmirror_cmd { * device the ioctl() is made * HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE: Migrated device private page on some * other device + * HMM_DMIRROR_PROT_DEV_COHERENT: Migrate device coherent page on the device + * the ioctl() is made */ enum { HMM_DMIRROR_PROT_ERROR = 0xFF, @@ -60,6 +66,14 @@ enum { HMM_DMIRROR_PROT_ZERO = 0x10, HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL = 0x20, HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30, + HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL = 0x40, + HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE = 0x50, +}; + +enum { + /* 0 is reserved to catch uninitialized type fields */ + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE = 1, + HMM_DMIRROR_MEMORY_DEVICE_COHERENT, }; #endif /* _LIB_TEST_HMM_UAPI_H */ diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 26a5c9007653a..4311cf5319557 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -869,11 +870,14 @@ static void kmem_cache_invalid_free(struct kunit *test) kmem_cache_destroy(cache); } +static void empty_cache_ctor(void *object) { } + static void kmem_cache_double_destroy(struct kunit *test) { struct kmem_cache *cache; - cache = kmem_cache_create("test_cache", 200, 0, 0, NULL); + /* Provide a constructor to prevent cache merging. */ + cache = kmem_cache_create("test_cache", 200, 0, 0, empty_cache_ctor); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); kmem_cache_destroy(cache); KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_destroy(cache)); @@ -1054,21 +1058,183 @@ static void kmalloc_double_kzfree(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, kfree_sensitive(ptr)); } +static void vmalloc_helpers_tags(struct kunit *test) +{ + void *ptr; + int rv; + + /* This test is intended for tag-based modes. */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + + ptr = vmalloc(PAGE_SIZE); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + /* Check that the returned pointer is tagged. */ + KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure exported vmalloc helpers handle tagged pointers. */ + KUNIT_ASSERT_TRUE(test, is_vmalloc_addr(ptr)); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, vmalloc_to_page(ptr)); + + /* Make sure vmalloc'ed memory permissions can be changed. */ + rv = set_memory_ro((unsigned long)ptr, 1); + KUNIT_ASSERT_GE(test, rv, 0); + rv = set_memory_rw((unsigned long)ptr, 1); + KUNIT_ASSERT_GE(test, rv, 0); + + vfree(ptr); +} + static void vmalloc_oob(struct kunit *test) { - void *area; + char *v_ptr, *p_ptr; + struct page *page; + size_t size = PAGE_SIZE / 2 - KASAN_GRANULE_SIZE - 5; KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + v_ptr = vmalloc(size); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + OPTIMIZER_HIDE_VAR(v_ptr); + /* - * We have to be careful not to hit the guard page. + * We have to be careful not to hit the guard page in vmalloc tests. * The MMU will catch that and crash us. */ - area = vmalloc(3000); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, area); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)area)[3100]); - vfree(area); + /* Make sure in-bounds accesses are valid. */ + v_ptr[0] = 0; + v_ptr[size - 1] = 0; + + /* + * An unaligned access past the requested vmalloc size. + * Only generic KASAN can precisely detect these. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]); + + /* An aligned access into the first out-of-bounds granule. */ + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size + 5]); + + /* Check that in-bounds accesses to the physical page are valid. */ + page = vmalloc_to_page(v_ptr); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page); + p_ptr = page_address(page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + p_ptr[0] = 0; + + vfree(v_ptr); + + /* + * We can't check for use-after-unmap bugs in this nor in the following + * vmalloc tests, as the page might be fully unmapped and accessing it + * will crash the kernel. + */ +} + +static void vmap_tags(struct kunit *test) +{ + char *p_ptr, *v_ptr; + struct page *p_page, *v_page; + size_t order = 1; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons vmap mappings. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + + p_page = alloc_pages(GFP_KERNEL, order); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_page); + p_ptr = page_address(p_page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + + v_ptr = vmap(&p_page, 1 << order, VM_MAP, PAGE_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + /* + * We can't check for out-of-bounds bugs in this nor in the following + * vmalloc tests, as allocations have page granularity and accessing + * the guard page will crash the kernel. + */ + + KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses through both pointers work. */ + *p_ptr = 0; + *v_ptr = 0; + + /* Make sure vmalloc_to_page() correctly recovers the page pointer. */ + v_page = vmalloc_to_page(v_ptr); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_page); + KUNIT_EXPECT_PTR_EQ(test, p_page, v_page); + + vunmap(v_ptr); + free_pages((unsigned long)p_ptr, order); +} + +static void vm_map_ram_tags(struct kunit *test) +{ + char *p_ptr, *v_ptr; + struct page *page; + size_t order = 1; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons vm_map_ram mappings. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + page = alloc_pages(GFP_KERNEL, order); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page); + p_ptr = page_address(page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + + v_ptr = vm_map_ram(&page, 1 << order, -1); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses through both pointers work. */ + *p_ptr = 0; + *v_ptr = 0; + + vm_unmap_ram(v_ptr, 1 << order); + free_pages((unsigned long)p_ptr, order); +} + +static void vmalloc_percpu(struct kunit *test) +{ + char __percpu *ptr; + int cpu; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons percpu mappings. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + ptr = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); + + for_each_possible_cpu(cpu) { + char *c_ptr = per_cpu_ptr(ptr, cpu); + + KUNIT_EXPECT_GE(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses don't crash the kernel. */ + *c_ptr = 0; + } + + free_percpu(ptr); } /* @@ -1102,6 +1268,18 @@ static void match_all_not_assigned(struct kunit *test) KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); free_pages((unsigned long)ptr, order); } + + if (!IS_ENABLED(CONFIG_KASAN_VMALLOC)) + return; + + for (i = 0; i < 256; i++) { + size = (get_random_int() % 1024) + 1; + ptr = vmalloc(size); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + vfree(ptr); + } } /* Check that 0xff works as a match-all pointer tag for tag-based modes. */ @@ -1207,7 +1385,11 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kasan_bitops_generic), KUNIT_CASE(kasan_bitops_tags), KUNIT_CASE(kmalloc_double_kzfree), + KUNIT_CASE(vmalloc_helpers_tags), KUNIT_CASE(vmalloc_oob), + KUNIT_CASE(vmap_tags), + KUNIT_CASE(vm_map_ram_tags), + KUNIT_CASE(vmalloc_percpu), KUNIT_CASE(match_all_not_assigned), KUNIT_CASE(match_all_ptr_tag), KUNIT_CASE(match_all_mem_tag), diff --git a/lib/test_printf.c b/lib/test_printf.c index 07309c45f3279..8010de49b6c5d 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -673,17 +673,17 @@ flags(void) gfp = GFP_ATOMIC|__GFP_DMA; test("GFP_ATOMIC|GFP_DMA", "%pGg", &gfp); - gfp = __GFP_ATOMIC; - test("__GFP_ATOMIC", "%pGg", &gfp); + gfp = __GFP_HIGH; + test("__GFP_HIGH", "%pGg", &gfp); /* Any flags not translated by the table should remain numeric */ gfp = ~__GFP_BITS_MASK; snprintf(cmp_buffer, BUF_SIZE, "%#lx", (unsigned long) gfp); test(cmp_buffer, "%pGg", &gfp); - snprintf(cmp_buffer, BUF_SIZE, "__GFP_ATOMIC|%#lx", + snprintf(cmp_buffer, BUF_SIZE, "__GFP_HIGH|%#lx", (unsigned long) gfp); - gfp |= __GFP_ATOMIC; + gfp |= __GFP_HIGH; test(cmp_buffer, "%pGg", &gfp); kfree(cmp_buffer); diff --git a/lib/ubsan.c b/lib/ubsan.c index bdc380ff5d5c7..36bd75e334263 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -154,16 +154,8 @@ static void ubsan_epilogue(void) current->in_ubsan--; - if (panic_on_warn) { - /* - * This thread may hit another WARN() in the panic path. - * Resetting this prevents additional WARN() from panicking the - * system on this thread. Other threads are blocked by the - * panic_mutex in panic(). - */ - panic_on_warn = 0; + if (panic_on_warn) panic("panic_on_warn set ...\n"); - } } void __ubsan_handle_divrem_overflow(void *_data, void *lhs, void *rhs) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index a1babe5e07d10..229cbc666e020 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -986,9 +986,7 @@ char *symbol_string(char *buf, char *end, void *ptr, struct printf_spec spec, const char *fmt) { unsigned long value; -#ifdef CONFIG_KALLSYMS char sym[KSYM_SYMBOL_LEN]; -#endif if (fmt[1] == 'R') ptr = __builtin_extract_return_addr(ptr); @@ -1008,6 +1006,9 @@ char *symbol_string(char *buf, char *end, void *ptr, return string_nocheck(buf, end, sym, spec); #else + if (fill_minimal_module_info(sym, KSYM_SYMBOL_LEN, value)) + return string_nocheck(buf, end, sym, spec); + return special_hex_number(buf, end, value, sizeof(void *)); #endif } @@ -2905,13 +2906,15 @@ int vscnprintf(char *buf, size_t size, const char *fmt, va_list args) { int i; + if (unlikely(!size)) + return 0; + i = vsnprintf(buf, size, fmt, args); if (likely(i < size)) return i; - if (size != 0) - return size - 1; - return 0; + + return size - 1; } EXPORT_SYMBOL(vscnprintf); diff --git a/mm/Kconfig b/mm/Kconfig index 3326ee3903f33..edb6ced1c8e4f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -411,6 +411,9 @@ choice benefit. endchoice +config ARCH_WANT_GENERAL_HUGETLB + bool + config ARCH_WANTS_THP_SWAP def_bool n @@ -744,6 +747,9 @@ config IDLE_PAGE_TRACKING config ARCH_HAS_CACHE_LINE_SIZE bool +config ARCH_HAS_FILTER_PGPROT + bool + config ARCH_HAS_PTE_DEVMAP bool diff --git a/mm/backing-dev.c b/mm/backing-dev.c index eae96dfe0261c..7176af65b103a 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1005,60 +1005,3 @@ const char *bdi_dev_name(struct backing_dev_info *bdi) return bdi->dev_name; } EXPORT_SYMBOL_GPL(bdi_dev_name); - -static wait_queue_head_t congestion_wqh[2] = { - __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), - __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) - }; -static atomic_t nr_wb_congested[2]; - -void clear_bdi_congested(struct backing_dev_info *bdi, int sync) -{ - wait_queue_head_t *wqh = &congestion_wqh[sync]; - enum wb_congested_state bit; - - bit = sync ? WB_sync_congested : WB_async_congested; - if (test_and_clear_bit(bit, &bdi->wb.congested)) - atomic_dec(&nr_wb_congested[sync]); - smp_mb__after_atomic(); - if (waitqueue_active(wqh)) - wake_up(wqh); -} -EXPORT_SYMBOL(clear_bdi_congested); - -void set_bdi_congested(struct backing_dev_info *bdi, int sync) -{ - enum wb_congested_state bit; - - bit = sync ? WB_sync_congested : WB_async_congested; - if (!test_and_set_bit(bit, &bdi->wb.congested)) - atomic_inc(&nr_wb_congested[sync]); -} -EXPORT_SYMBOL(set_bdi_congested); - -/** - * congestion_wait - wait for a backing_dev to become uncongested - * @sync: SYNC or ASYNC IO - * @timeout: timeout in jiffies - * - * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit - * write congestion. If no backing_devs are congested then just wait for the - * next write to be completed. - */ -long congestion_wait(int sync, long timeout) -{ - long ret; - unsigned long start = jiffies; - DEFINE_WAIT(wait); - wait_queue_head_t *wqh = &congestion_wqh[sync]; - - prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); - ret = io_schedule_timeout(timeout); - finish_wait(wqh, &wait); - - trace_writeback_congestion_wait(jiffies_to_usecs(timeout), - jiffies_to_usecs(jiffies - start)); - - return ret; -} -EXPORT_SYMBOL(congestion_wait); diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 907fefde25728..4b8eab4b3f456 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -203,7 +203,7 @@ EXPORT_SYMBOL_GPL(balloon_page_dequeue); #ifdef CONFIG_BALLOON_COMPACTION -bool balloon_page_isolate(struct page *page, isolate_mode_t mode) +static bool balloon_page_isolate(struct page *page, isolate_mode_t mode) { struct balloon_dev_info *b_dev_info = balloon_page_device(page); @@ -217,7 +217,7 @@ bool balloon_page_isolate(struct page *page, isolate_mode_t mode) return true; } -void balloon_page_putback(struct page *page) +static void balloon_page_putback(struct page *page) { struct balloon_dev_info *b_dev_info = balloon_page_device(page); unsigned long flags; @@ -230,7 +230,7 @@ void balloon_page_putback(struct page *page) /* move_to_new_page() counterpart for a ballooned page */ -int balloon_page_migrate(struct address_space *mapping, +static int balloon_page_migrate(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) { diff --git a/mm/cma.c b/mm/cma.c index bc9ca8f3c4871..766f1b82b5322 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -131,8 +131,10 @@ static void __init cma_activate_area(struct cma *cma) bitmap_free(cma->bitmap); out_error: /* Expose all pages to the buddy, they are useless for CMA. */ - for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++) - free_reserved_page(pfn_to_page(pfn)); + if (!cma->reserve_pages_on_error) { + for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++) + free_reserved_page(pfn_to_page(pfn)); + } totalcma_pages -= cma->count; cma->count = 0; pr_err("CMA area %s could not be activated\n", cma->name); @@ -150,6 +152,11 @@ static int __init cma_init_reserved_areas(void) } core_initcall(cma_init_reserved_areas); +void __init cma_reserve_pages_on_error(struct cma *cma) +{ + cma->reserve_pages_on_error = true; +} + /** * cma_init_reserved_mem() - create custom contiguous area from reserved memory * @base: Base address of the reserved area diff --git a/mm/cma.h b/mm/cma.h index 2c775877eae24..88a0595670b76 100644 --- a/mm/cma.h +++ b/mm/cma.h @@ -30,6 +30,7 @@ struct cma { /* kobject requires dynamic object */ struct cma_kobject *cma_kobj; #endif + bool reserve_pages_on_error; }; extern struct cma cma_areas[MAX_CMA_AREAS]; diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 7008c3735e99f..b4085deb9fa05 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -24,7 +24,7 @@ static void damon_test_regions(struct kunit *test) KUNIT_EXPECT_EQ(test, 2ul, r->ar.end); KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses); - t = damon_new_target(42); + t = damon_new_target(); KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); damon_add_region(r, t); @@ -52,8 +52,7 @@ static void damon_test_target(struct kunit *test) struct damon_ctx *c = damon_new_ctx(); struct damon_target *t; - t = damon_new_target(42); - KUNIT_EXPECT_EQ(test, 42ul, t->id); + t = damon_new_target(); KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); damon_add_target(c, t); @@ -78,7 +77,6 @@ static void damon_test_target(struct kunit *test) static void damon_test_aggregate(struct kunit *test) { struct damon_ctx *ctx = damon_new_ctx(); - unsigned long target_ids[] = {1, 2, 3}; unsigned long saddr[][3] = {{10, 20, 30}, {5, 42, 49}, {13, 33, 55} }; unsigned long eaddr[][3] = {{15, 27, 40}, {31, 45, 55}, {23, 44, 66} }; unsigned long accesses[][3] = {{42, 95, 84}, {10, 20, 30}, {0, 1, 2} }; @@ -86,7 +84,10 @@ static void damon_test_aggregate(struct kunit *test) struct damon_region *r; int it, ir; - damon_set_targets(ctx, target_ids, 3); + for (it = 0; it < 3; it++) { + t = damon_new_target(); + damon_add_target(ctx, t); + } it = 0; damon_for_each_target(t, ctx) { @@ -122,7 +123,7 @@ static void damon_test_split_at(struct kunit *test) struct damon_target *t; struct damon_region *r; - t = damon_new_target(42); + t = damon_new_target(); r = damon_new_region(0, 100); damon_add_region(r, t); damon_split_region_at(c, t, r, 25); @@ -143,7 +144,7 @@ static void damon_test_merge_two(struct kunit *test) struct damon_region *r, *r2, *r3; int i; - t = damon_new_target(42); + t = damon_new_target(); r = damon_new_region(0, 100); r->nr_accesses = 10; damon_add_region(r, t); @@ -191,7 +192,7 @@ static void damon_test_merge_regions_of(struct kunit *test) unsigned long eaddrs[] = {112, 130, 156, 170, 230}; int i; - t = damon_new_target(42); + t = damon_new_target(); for (i = 0; i < ARRAY_SIZE(sa); i++) { r = damon_new_region(sa[i], ea[i]); r->nr_accesses = nrs[i]; @@ -215,14 +216,14 @@ static void damon_test_split_regions_of(struct kunit *test) struct damon_target *t; struct damon_region *r; - t = damon_new_target(42); + t = damon_new_target(); r = damon_new_region(0, 22); damon_add_region(r, t); damon_split_regions_of(c, t, 2); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); damon_free_target(t); - t = damon_new_target(42); + t = damon_new_target(); r = damon_new_region(0, 220); damon_add_region(r, t); damon_split_regions_of(c, t, 4); diff --git a/mm/damon/core.c b/mm/damon/core.c index 1dd153c31c9e2..bf495236d741b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -144,7 +144,7 @@ void damon_destroy_scheme(struct damos *s) * * Returns the pointer to the new struct if success, or NULL otherwise */ -struct damon_target *damon_new_target(unsigned long id) +struct damon_target *damon_new_target(void) { struct damon_target *t; @@ -152,7 +152,7 @@ struct damon_target *damon_new_target(unsigned long id) if (!t) return NULL; - t->id = id; + t->pid = NULL; t->nr_regions = 0; INIT_LIST_HEAD(&t->regions_list); @@ -245,38 +245,6 @@ void damon_destroy_ctx(struct damon_ctx *ctx) kfree(ctx); } -/** - * damon_set_targets() - Set monitoring targets. - * @ctx: monitoring context - * @ids: array of target ids - * @nr_ids: number of entries in @ids - * - * This function should not be called while the kdamond is running. - * - * Return: 0 on success, negative error code otherwise. - */ -int damon_set_targets(struct damon_ctx *ctx, - unsigned long *ids, ssize_t nr_ids) -{ - ssize_t i; - struct damon_target *t, *next; - - damon_destroy_targets(ctx); - - for (i = 0; i < nr_ids; i++) { - t = damon_new_target(ids[i]); - if (!t) { - /* The caller should do cleanup of the ids itself */ - damon_for_each_target_safe(t, next, ctx) - damon_destroy_target(t); - return -ENOMEM; - } - damon_add_target(ctx, t); - } - - return 0; -} - /** * damon_set_attrs() - Set attributes for the monitoring. * @ctx: monitoring context diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h index 86b9f9528231e..0d3a14c00acfb 100644 --- a/mm/damon/dbgfs-test.h +++ b/mm/damon/dbgfs-test.h @@ -12,66 +12,58 @@ #include -static void damon_dbgfs_test_str_to_target_ids(struct kunit *test) +static void damon_dbgfs_test_str_to_ints(struct kunit *test) { char *question; - unsigned long *answers; - unsigned long expected[] = {12, 35, 46}; + int *answers; + int expected[] = {12, 35, 46}; ssize_t nr_integers = 0, i; question = "123"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); - KUNIT_EXPECT_EQ(test, 123ul, answers[0]); + KUNIT_EXPECT_EQ(test, 123, answers[0]); kfree(answers); question = "123abc"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); - KUNIT_EXPECT_EQ(test, 123ul, answers[0]); + KUNIT_EXPECT_EQ(test, 123, answers[0]); kfree(answers); question = "a123"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); question = "12 35"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); for (i = 0; i < nr_integers; i++) KUNIT_EXPECT_EQ(test, expected[i], answers[i]); kfree(answers); question = "12 35 46"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers); for (i = 0; i < nr_integers; i++) KUNIT_EXPECT_EQ(test, expected[i], answers[i]); kfree(answers); question = "12 35 abc 46"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); for (i = 0; i < 2; i++) KUNIT_EXPECT_EQ(test, expected[i], answers[i]); kfree(answers); question = ""; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); question = "\n"; - answers = str_to_target_ids(question, strlen(question), - &nr_integers); + answers = str_to_ints(question, strlen(question), &nr_integers); KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); kfree(answers); } @@ -79,30 +71,20 @@ static void damon_dbgfs_test_str_to_target_ids(struct kunit *test) static void damon_dbgfs_test_set_targets(struct kunit *test) { struct damon_ctx *ctx = dbgfs_new_ctx(); - unsigned long ids[] = {1, 2, 3}; char buf[64]; - /* Make DAMON consider target id as plain number */ - ctx->primitive.target_valid = NULL; - ctx->primitive.cleanup = NULL; + /* Make DAMON consider target has no pid */ + ctx->primitive = (struct damon_primitive){}; - damon_set_targets(ctx, ids, 3); - sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2 3\n"); - - damon_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, 0, NULL); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); - damon_set_targets(ctx, (unsigned long []){1, 2}, 2); - sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2\n"); - - damon_set_targets(ctx, (unsigned long []){2}, 1); + dbgfs_set_targets(ctx, 1, NULL); sprint_target_ids(ctx, buf, 64); - KUNIT_EXPECT_STREQ(test, (char *)buf, "2\n"); + KUNIT_EXPECT_STREQ(test, (char *)buf, "42\n"); - damon_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, 0, NULL); sprint_target_ids(ctx, buf, 64); KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); @@ -112,25 +94,24 @@ static void damon_dbgfs_test_set_targets(struct kunit *test) static void damon_dbgfs_test_set_init_regions(struct kunit *test) { struct damon_ctx *ctx = damon_new_ctx(); - unsigned long ids[] = {1, 2, 3}; - /* Each line represents one region in `` `` */ - char * const valid_inputs[] = {"2 10 20\n 2 20 30\n2 35 45", - "2 10 20\n", - "2 10 20\n1 39 59\n1 70 134\n 2 20 25\n", + /* Each line represents one region in `` `` */ + char * const valid_inputs[] = {"1 10 20\n 1 20 30\n1 35 45", + "1 10 20\n", + "1 10 20\n0 39 59\n0 70 134\n 1 20 25\n", ""}; /* Reading the file again will show sorted, clean output */ - char * const valid_expects[] = {"2 10 20\n2 20 30\n2 35 45\n", - "2 10 20\n", - "1 39 59\n1 70 134\n2 10 20\n2 20 25\n", + char * const valid_expects[] = {"1 10 20\n1 20 30\n1 35 45\n", + "1 10 20\n", + "0 39 59\n0 70 134\n1 10 20\n1 20 25\n", ""}; - char * const invalid_inputs[] = {"4 10 20\n", /* target not exists */ - "2 10 20\n 2 14 26\n", /* regions overlap */ - "1 10 20\n2 30 40\n 1 5 8"}; /* not sorted by address */ + char * const invalid_inputs[] = {"3 10 20\n", /* target not exists */ + "1 10 20\n 1 14 26\n", /* regions overlap */ + "0 10 20\n1 30 40\n 0 5 8"}; /* not sorted by address */ char *input, *expect; int i, rc; char buf[256]; - damon_set_targets(ctx, ids, 3); + dbgfs_set_targets(ctx, 3, NULL); /* Put valid inputs and check the results */ for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) { @@ -158,12 +139,12 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test) KUNIT_EXPECT_STREQ(test, (char *)buf, ""); } - damon_set_targets(ctx, NULL, 0); + dbgfs_set_targets(ctx, 0, NULL); damon_destroy_ctx(ctx); } static struct kunit_case damon_test_cases[] = { - KUNIT_CASE(damon_dbgfs_test_str_to_target_ids), + KUNIT_CASE(damon_dbgfs_test_str_to_ints), KUNIT_CASE(damon_dbgfs_test_set_targets), KUNIT_CASE(damon_dbgfs_test_set_init_regions), {}, diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 5b899601e56c3..78ff645433c64 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -275,7 +275,7 @@ static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, return ret; } -static inline bool targetid_is_pid(const struct damon_ctx *ctx) +static inline bool target_has_pid(const struct damon_ctx *ctx) { return ctx->primitive.target_valid == damon_va_target_valid; } @@ -283,17 +283,19 @@ static inline bool targetid_is_pid(const struct damon_ctx *ctx) static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) { struct damon_target *t; - unsigned long id; + int id; int written = 0; int rc; damon_for_each_target(t, ctx) { - id = t->id; - if (targetid_is_pid(ctx)) + if (target_has_pid(ctx)) /* Show pid numbers to debugfs users */ - id = (unsigned long)pid_vnr((struct pid *)id); + id = pid_vnr(t->pid); + else + /* Show 42 for physical address space, just for fun */ + id = 42; - rc = scnprintf(&buf[written], len - written, "%lu ", id); + rc = scnprintf(&buf[written], len - written, "%d ", id); if (!rc) return -ENOMEM; written += rc; @@ -321,54 +323,129 @@ static ssize_t dbgfs_target_ids_read(struct file *file, } /* - * Converts a string into an array of unsigned long integers + * Converts a string into an integers array * - * Returns an array of unsigned long integers if the conversion success, or - * NULL otherwise. + * Returns an array of integers array if the conversion success, or NULL + * otherwise. */ -static unsigned long *str_to_target_ids(const char *str, ssize_t len, - ssize_t *nr_ids) +static int *str_to_ints(const char *str, ssize_t len, ssize_t *nr_ints) { - unsigned long *ids; - const int max_nr_ids = 32; - unsigned long id; + int *array; + const int max_nr_ints = 32; + int nr; int pos = 0, parsed, ret; - *nr_ids = 0; - ids = kmalloc_array(max_nr_ids, sizeof(id), GFP_KERNEL); - if (!ids) + *nr_ints = 0; + array = kmalloc_array(max_nr_ints, sizeof(*array), GFP_KERNEL); + if (!array) return NULL; - while (*nr_ids < max_nr_ids && pos < len) { - ret = sscanf(&str[pos], "%lu%n", &id, &parsed); + while (*nr_ints < max_nr_ints && pos < len) { + ret = sscanf(&str[pos], "%d%n", &nr, &parsed); pos += parsed; if (ret != 1) break; - ids[*nr_ids] = id; - *nr_ids += 1; + array[*nr_ints] = nr; + *nr_ints += 1; } - return ids; + return array; } -static void dbgfs_put_pids(unsigned long *ids, int nr_ids) +static void dbgfs_put_pids(struct pid **pids, int nr_pids) { int i; - for (i = 0; i < nr_ids; i++) - put_pid((struct pid *)ids[i]); + for (i = 0; i < nr_pids; i++) + put_pid(pids[i]); +} + +/* + * Converts a string into an struct pid pointers array + * + * Returns an array of struct pid pointers if the conversion success, or NULL + * otherwise. + */ +static struct pid **str_to_pids(const char *str, ssize_t len, ssize_t *nr_pids) +{ + int *ints; + ssize_t nr_ints; + struct pid **pids; + + *nr_pids = 0; + + ints = str_to_ints(str, len, &nr_ints); + if (!ints) + return NULL; + + pids = kmalloc_array(nr_ints, sizeof(*pids), GFP_KERNEL); + if (!pids) + goto out; + + for (; *nr_pids < nr_ints; (*nr_pids)++) { + pids[*nr_pids] = find_get_pid(ints[*nr_pids]); + if (!pids[*nr_pids]) { + dbgfs_put_pids(pids, *nr_pids); + kfree(ints); + kfree(pids); + return NULL; + } + } + +out: + kfree(ints); + return pids; +} + +/* + * dbgfs_set_targets() - Set monitoring targets. + * @ctx: monitoring context + * @nr_targets: number of targets + * @pids: array of target pids (size is same to @nr_targets) + * + * This function should not be called while the kdamond is running. @pids is + * ignored if the context is not configured to have pid in each target. On + * failure, reference counts of all pids in @pids are decremented. + * + * Return: 0 on success, negative error code otherwise. + */ +static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets, + struct pid **pids) +{ + ssize_t i; + struct damon_target *t, *next; + + damon_for_each_target_safe(t, next, ctx) { + if (target_has_pid(ctx)) + put_pid(t->pid); + damon_destroy_target(t); + } + + for (i = 0; i < nr_targets; i++) { + t = damon_new_target(); + if (!t) { + damon_for_each_target_safe(t, next, ctx) + damon_destroy_target(t); + if (target_has_pid(ctx)) + dbgfs_put_pids(pids, nr_targets); + return -ENOMEM; + } + if (target_has_pid(ctx)) + t->pid = pids[i]; + damon_add_target(ctx, t); + } + + return 0; } static ssize_t dbgfs_target_ids_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct damon_ctx *ctx = file->private_data; - struct damon_target *t, *next_t; bool id_is_pid = true; char *kbuf; - unsigned long *targets; + struct pid **target_pids = NULL; ssize_t nr_targets; ssize_t ret; - int i; kbuf = user_input_str(buf, count, ppos); if (IS_ERR(kbuf)) @@ -376,42 +453,27 @@ static ssize_t dbgfs_target_ids_write(struct file *file, if (!strncmp(kbuf, "paddr\n", count)) { id_is_pid = false; - /* target id is meaningless here, but we set it just for fun */ - scnprintf(kbuf, count, "42 "); - } - - targets = str_to_target_ids(kbuf, count, &nr_targets); - if (!targets) { - ret = -ENOMEM; - goto out; + nr_targets = 1; } if (id_is_pid) { - for (i = 0; i < nr_targets; i++) { - targets[i] = (unsigned long)find_get_pid( - (int)targets[i]); - if (!targets[i]) { - dbgfs_put_pids(targets, i); - ret = -EINVAL; - goto free_targets_out; - } + target_pids = str_to_pids(kbuf, count, &nr_targets); + if (!target_pids) { + ret = -ENOMEM; + goto out; } } mutex_lock(&ctx->kdamond_lock); if (ctx->kdamond) { if (id_is_pid) - dbgfs_put_pids(targets, nr_targets); + dbgfs_put_pids(target_pids, nr_targets); ret = -EBUSY; goto unlock_out; } /* remove previously set targets */ - damon_for_each_target_safe(t, next_t, ctx) { - if (targetid_is_pid(ctx)) - put_pid((struct pid *)t->id); - damon_destroy_target(t); - } + dbgfs_set_targets(ctx, 0, NULL); /* Configure the context for the address space type */ if (id_is_pid) @@ -419,18 +481,13 @@ static ssize_t dbgfs_target_ids_write(struct file *file, else damon_pa_set_primitives(ctx); - ret = damon_set_targets(ctx, targets, nr_targets); - if (ret) { - if (id_is_pid) - dbgfs_put_pids(targets, nr_targets); - } else { + ret = dbgfs_set_targets(ctx, nr_targets, target_pids); + if (!ret) ret = count; - } unlock_out: mutex_unlock(&ctx->kdamond_lock); -free_targets_out: - kfree(targets); + kfree(target_pids); out: kfree(kbuf); return ret; @@ -440,18 +497,20 @@ static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len) { struct damon_target *t; struct damon_region *r; + int target_idx = 0; int written = 0; int rc; damon_for_each_target(t, c) { damon_for_each_region(r, t) { rc = scnprintf(&buf[written], len - written, - "%lu %lu %lu\n", - t->id, r->ar.start, r->ar.end); + "%d %lu %lu\n", + target_idx, r->ar.start, r->ar.end); if (!rc) return -ENOMEM; written += rc; } + target_idx++; } return written; } @@ -485,22 +544,19 @@ static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf, return len; } -static int add_init_region(struct damon_ctx *c, - unsigned long target_id, struct damon_addr_range *ar) +static int add_init_region(struct damon_ctx *c, int target_idx, + struct damon_addr_range *ar) { struct damon_target *t; struct damon_region *r, *prev; - unsigned long id; + unsigned long idx = 0; int rc = -EINVAL; if (ar->start >= ar->end) return -EINVAL; damon_for_each_target(t, c) { - id = t->id; - if (targetid_is_pid(c)) - id = (unsigned long)pid_vnr((struct pid *)id); - if (id == target_id) { + if (idx++ == target_idx) { r = damon_new_region(ar->start, ar->end); if (!r) return -ENOMEM; @@ -523,7 +579,7 @@ static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len) struct damon_target *t; struct damon_region *r, *next; int pos = 0, parsed, ret; - unsigned long target_id; + int target_idx; struct damon_addr_range ar; int err; @@ -533,11 +589,11 @@ static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len) } while (pos < len) { - ret = sscanf(&str[pos], "%lu %lu %lu%n", - &target_id, &ar.start, &ar.end, &parsed); + ret = sscanf(&str[pos], "%d %lu %lu%n", + &target_idx, &ar.start, &ar.end, &parsed); if (ret != 3) break; - err = add_init_region(c, target_id, &ar); + err = add_init_region(c, target_idx, &ar); if (err) goto fail; pos += parsed; @@ -660,12 +716,12 @@ static void dbgfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; - if (!targetid_is_pid(ctx)) + if (!target_has_pid(ctx)) return; mutex_lock(&ctx->kdamond_lock); damon_for_each_target_safe(t, next, ctx) { - put_pid((struct pid *)t->id); + put_pid(t->pid); damon_destroy_target(t); } mutex_unlock(&ctx->kdamond_lock); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index bc476cef688e8..29da37192e4a0 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -387,8 +387,7 @@ static int __init damon_reclaim_init(void) damon_pa_set_primitives(ctx); ctx->callback.after_aggregation = damon_reclaim_after_aggregation; - /* 4242 means nothing but fun */ - target = damon_new_target(4242); + target = damon_new_target(); if (!target) { damon_destroy_ctx(ctx); return -ENOMEM; diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index 6a1b9272ea123..f0d0ba591792c 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -139,7 +139,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test, struct damon_region *r; int i; - t = damon_new_target(42); + t = damon_new_target(); for (i = 0; i < nr_regions / 2; i++) { r = damon_new_region(regions[i * 2], regions[i * 2 + 1]); damon_add_region(r, t); @@ -251,7 +251,7 @@ static void damon_test_apply_three_regions4(struct kunit *test) static void damon_test_split_evenly_fail(struct kunit *test, unsigned long start, unsigned long end, unsigned int nr_pieces) { - struct damon_target *t = damon_new_target(42); + struct damon_target *t = damon_new_target(); struct damon_region *r = damon_new_region(start, end); damon_add_region(r, t); @@ -270,7 +270,7 @@ static void damon_test_split_evenly_fail(struct kunit *test, static void damon_test_split_evenly_succ(struct kunit *test, unsigned long start, unsigned long end, unsigned int nr_pieces) { - struct damon_target *t = damon_new_target(42); + struct damon_target *t = damon_new_target(); struct damon_region *r = damon_new_region(start, end); unsigned long expected_width = (end - start) / nr_pieces; unsigned long i = 0; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 89b6468da2b9b..6d3454dd3204b 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -23,12 +23,12 @@ #endif /* - * 't->id' should be the pointer to the relevant 'struct pid' having reference + * 't->pid' should be the pointer to the relevant 'struct pid' having reference * count. Caller must put the returned task, unless it is NULL. */ static inline struct task_struct *damon_get_task_struct(struct damon_target *t) { - return get_pid_task((struct pid *)t->id, PIDTYPE_PID); + return get_pid_task(t->pid, PIDTYPE_PID); } /* @@ -402,9 +402,6 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, pte_t entry = huge_ptep_get(pte); struct page *page = pte_page(entry); - if (!page) - return; - get_page(page); if (pte_young(entry)) { @@ -564,9 +561,6 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, goto out; page = pte_page(entry); - if (!page) - goto out; - get_page(page); if (pte_young(entry) || !page_is_idle(page) || diff --git a/mm/debug.c b/mm/debug.c index bc9ac87f0e08d..8b43dbb2f17be 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -265,5 +265,4 @@ void page_init_poison(struct page *page, size_t size) if (page_init_poisoning) memset(page, PAGE_POISON_PATTERN, size); } -EXPORT_SYMBOL_GPL(page_init_poison); #endif /* CONFIG_DEBUG_VM */ diff --git a/mm/fadvise.c b/mm/fadvise.c index d6baa4f451c5f..338f160220129 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -109,9 +109,8 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) case POSIX_FADV_NOREUSE: break; case POSIX_FADV_DONTNEED: - if (!inode_write_congested(mapping->host)) - __filemap_fdatawrite_range(mapping, offset, endbyte, - WB_SYNC_NONE); + __filemap_fdatawrite_range(mapping, offset, endbyte, + WB_SYNC_NONE); /* * First and last FULL page! Partial pages are deliberately diff --git a/mm/filemap.c b/mm/filemap.c index ad8c39d90bf94..90afe301cd527 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2229,8 +2229,9 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, * @nr_pages: The maximum number of pages * @pages: Where the resulting pages are placed * - * find_get_pages_contig() works exactly like find_get_pages(), except - * that the returned number of pages are guaranteed to be contiguous. + * find_get_pages_contig() works exactly like find_get_pages_range(), + * except that the returned number of pages are guaranteed to be + * contiguous. * * Return: the number of pages which were found. */ @@ -2290,9 +2291,9 @@ EXPORT_SYMBOL(find_get_pages_contig); * @nr_pages: the maximum number of pages * @pages: where the resulting pages are placed * - * Like find_get_pages(), except we only return head pages which are tagged - * with @tag. @index is updated to the index immediately after the last - * page we return, ready for the next iteration. + * Like find_get_pages_range(), except we only return head pages which are + * tagged with @tag. @index is updated to the index immediately after the + * last page we return, ready for the next iteration. * * Return: the number of pages which were found. */ diff --git a/mm/gup.c b/mm/gup.c index a9d4d724aef74..0cf59858114d5 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -464,10 +464,6 @@ static struct page *no_page_table(struct vm_area_struct *vma, static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, unsigned int flags) { - /* No page to get reference */ - if (flags & FOLL_GET) - return -EFAULT; - if (flags & FOLL_TOUCH) { pte_t entry = *pte; @@ -597,32 +593,6 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, */ mark_page_accessed(page); } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - /* Do not mlock pte-mapped THP */ - if (PageTransCompound(page)) - goto out; - - /* - * The preliminary mapping check is mainly to avoid the - * pointless overhead of lock_page on the ZERO_PAGE - * which might bounce very badly if there is contention. - * - * If the page is already locked, we don't need to - * handle it now - vmscan will handle it later if and - * when it attempts to reclaim the page. - */ - if (page->mapping && trylock_page(page)) { - lru_add_drain(); /* push cached pages to LRU */ - /* - * Because we lock page here, and migration is - * blocked by the pte's page reference, and we - * know the page is still mapped, we don't even - * need to check for file-cache page truncation. - */ - mlock_vma_page(page); - unlock_page(page); - } - } out: pte_unmap_unlock(ptep, ptl); return page; @@ -945,9 +915,6 @@ static int faultin_page(struct vm_area_struct *vma, unsigned int fault_flags = 0; vm_fault_t ret; - /* mlock all present pages, but do not fault in new pages */ - if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) - return -ENOENT; if (*flags & FOLL_NOFAULT) return -EFAULT; if (*flags & FOLL_WRITE) @@ -1198,15 +1165,20 @@ static long __get_user_pages(struct mm_struct *mm, case -ENOMEM: case -EHWPOISON: goto out; - case -ENOENT: - goto next_page; } BUG(); } else if (PTR_ERR(page) == -EEXIST) { /* * Proper page table entry exists, but no corresponding - * struct page. + * struct page. If the caller expects **pages to be + * filled in, bail out now, because that can't be done + * for this page. */ + if (pages) { + ret = PTR_ERR(page); + goto out; + } + goto next_page; } else if (IS_ERR(page)) { ret = PTR_ERR(page); @@ -1497,9 +1469,14 @@ long populate_vma_page_range(struct vm_area_struct *vma, VM_BUG_ON_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); - gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; + /* + * Rightly or wrongly, the VM_LOCKONFAULT case has never used + * faultin_page() to break COW, so it has no work to do here. + */ if (vma->vm_flags & VM_LOCKONFAULT) - gup_flags &= ~FOLL_POPULATE; + return nr_pages; + + gup_flags = FOLL_TOUCH; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW @@ -1566,10 +1543,9 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, * in the page table. * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit * a poisoned page. - * FOLL_POPULATE: Always populate memory with VM_LOCKONFAULT. * !FOLL_FORCE: Require proper access permissions. */ - gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK | FOLL_HWPOISON; + gup_flags = FOLL_TOUCH | FOLL_HWPOISON; if (write) gup_flags |= FOLL_WRITE; @@ -1858,6 +1834,69 @@ struct page *get_dump_page(unsigned long addr) #endif /* CONFIG_ELF_CORE */ #ifdef CONFIG_MIGRATION +#ifdef CONFIG_DEVICE_PRIVATE +/* + * Migrates a device coherent page back to normal memory. Caller should have a + * reference on page which will be copied to the new page if migration is + * successful or dropped on failure. + */ +static struct page *migrate_device_page(struct page *page, + unsigned int gup_flags) +{ + struct page *dpage; + struct migrate_vma args; + unsigned long src_pfn, dst_pfn = 0; + + lock_page(page); + src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; + args.src = &src_pfn; + args.dst = &dst_pfn; + args.cpages = 1; + args.npages = 1; + args.vma = NULL; + migrate_vma_setup(&args); + if (!(src_pfn & MIGRATE_PFN_MIGRATE)) + return NULL; + + dpage = alloc_pages(GFP_USER | __GFP_NOWARN, 0); + + /* + * get/pin the new page now so we don't have to retry gup after + * migrating. We already have a reference so this should never fail. + */ + if (dpage && WARN_ON_ONCE(!try_grab_page(dpage, gup_flags))) { + __free_pages(dpage, 0); + dpage = NULL; + } + + if (dpage) { + lock_page(dpage); + dst_pfn = migrate_pfn(page_to_pfn(dpage)); + } + + migrate_vma_pages(&args); + if (src_pfn & MIGRATE_PFN_MIGRATE) + copy_highpage(dpage, page); + migrate_vma_finalize(&args); + if (dpage && !(src_pfn & MIGRATE_PFN_MIGRATE)) { + if (gup_flags & FOLL_PIN) + unpin_user_page(dpage); + else + put_page(dpage); + dpage = NULL; + } + + return dpage; +} +#else +static inline struct page *migrate_device_page(struct page *page, + unsigned int gup_flags) +{ + return NULL; +} +#endif /* CONFIG_DEVICE_PRIVATE */ + + /* * Check whether all pages are pinnable, if so return number of pages. If some * pages are not pinnable, migrate them, and unpin all pages. Return zero if @@ -1885,6 +1924,37 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, if (head == prev_head) continue; prev_head = head; + /* + * Device coherent pages are managed by a driver and should not + * be pinned indefinitely as it prevents the driver moving the + * page. So when trying to pin with FOLL_LONGTERM instead try + * migrating page out of device memory. + */ + if (is_dev_private_or_coherent_page(head)) { + /* + * device private pages will get faulted in during gup + * so it shouldn't be possible to see one here. + */ + WARN_ON_ONCE(is_device_private_page(head)); + WARN_ON_ONCE(PageCompound(head)); + + /* + * migration will fail if the page is pinned, so convert + * the pin on the source page to a normal reference. + */ + if (gup_flags & FOLL_PIN) { + get_page(head); + unpin_user_page(head); + } + + pages[i] = migrate_device_page(head, gup_flags); + if (!pages[i]) { + ret = -EBUSY; + break; + } + continue; + } + /* * If we get a movable page, since we are going to be pinning * these entries, try to move them out if possible. @@ -1916,15 +1986,22 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, * If list is empty, and no isolation errors, means that all pages are * in the correct zone. */ - if (list_empty(&movable_page_list) && !isolation_error_count) + if (!ret && list_empty(&movable_page_list) && !isolation_error_count) return nr_pages; - if (gup_flags & FOLL_PIN) { - unpin_user_pages(pages, nr_pages); - } else { - for (i = 0; i < nr_pages; i++) + for (i = 0; i < nr_pages; i++) + if (!pages[i]) + continue; + else if (gup_flags & FOLL_PIN) + unpin_user_page(pages[i]); + else put_page(pages[i]); + + if (ret && !list_empty(&movable_page_list)) { + putback_movable_pages(&movable_page_list); + return ret; } + if (!list_empty(&movable_page_list)) { ret = migrate_pages(&movable_page_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, @@ -2142,65 +2219,6 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages); -/** - * get_user_pages_locked() - variant of get_user_pages() - * - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @gup_flags: flags modifying lookup behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. Or NULL, if caller - * only intends to ensure the pages are faulted in. - * @locked: pointer to lock flag indicating whether lock is held and - * subsequently whether VM_FAULT_RETRY functionality can be - * utilised. Lock must initially be held. - * - * It is suitable to replace the form: - * - * mmap_read_lock(mm); - * do_something() - * get_user_pages(mm, ..., pages, NULL); - * mmap_read_unlock(mm); - * - * to: - * - * int locked = 1; - * mmap_read_lock(mm); - * do_something() - * get_user_pages_locked(mm, ..., pages, &locked); - * if (locked) - * mmap_read_unlock(mm); - * - * We can leverage the VM_FAULT_RETRY functionality in the page fault - * paths better by using either get_user_pages_locked() or - * get_user_pages_unlocked(). - * - */ -long get_user_pages_locked(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - int *locked) -{ - /* - * FIXME: Current FOLL_LONGTERM behavior is incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. As there are no users of this flag in this call we simply - * disallow this option for now. - */ - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) - return -EINVAL; - /* - * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, - * never directly by the caller, so enforce that: - */ - if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) - return -EINVAL; - - return __get_user_pages_locked(current->mm, start, nr_pages, - pages, NULL, locked, - gup_flags | FOLL_TOUCH); -} -EXPORT_SYMBOL(get_user_pages_locked); - /* * get_user_pages_unlocked() is suitable to replace the form: * @@ -3143,32 +3161,3 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); } EXPORT_SYMBOL(pin_user_pages_unlocked); - -/* - * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked(). - * Behavior is the same, except that this one sets FOLL_PIN and rejects - * FOLL_GET. - */ -long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - int *locked) -{ - /* - * FIXME: Current FOLL_LONGTERM behavior is incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. As there are no users of this flag in this call we simply - * disallow this option for now. - */ - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) - return -EINVAL; - - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ - if (WARN_ON_ONCE(gup_flags & FOLL_GET)) - return -EINVAL; - - gup_flags |= FOLL_PIN; - return __get_user_pages_locked(current->mm, start, nr_pages, - pages, NULL, locked, - gup_flags | FOLL_TOUCH); -} -EXPORT_SYMBOL(pin_user_pages_locked); diff --git a/mm/highmem.c b/mm/highmem.c index 762679050c9a0..0cc0c4da7ed9f 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -736,11 +736,11 @@ void *page_address(const struct page *page) list_for_each_entry(pam, &pas->lh, list) { if (pam->page == page) { ret = pam->virtual; - goto done; + break; } } } -done: + spin_unlock_irqrestore(&pas->lock, flags); return ret; } @@ -773,13 +773,12 @@ void set_page_address(struct page *page, void *virtual) list_for_each_entry(pam, &pas->lh, list) { if (pam->page == page) { list_del(&pam->list); - spin_unlock_irqrestore(&pas->lock, flags); - goto done; + break; } } spin_unlock_irqrestore(&pas->lock, flags); } -done: + return; } diff --git a/mm/hmm.c b/mm/hmm.c index bd56641c79d4e..af71aac3140e4 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -417,7 +417,6 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, struct hmm_range *range = hmm_vma_walk->range; unsigned long addr = start; pud_t pud; - int ret = 0; spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma); if (!ptl) @@ -466,7 +465,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, out_unlock: spin_unlock(ptl); - return ret; + return 0; } #else #define hmm_vma_walk_pud NULL diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 406a3c28c0266..09fb65a80e636 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -34,11 +34,15 @@ #include #include #include +#include #include #include #include "internal.h" +#define CREATE_TRACE_POINTS +#include + /* * By default, transparent hugepage support is disabled in order to avoid * risking an increased memory footprint for applications that are not @@ -1303,7 +1307,6 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) page = pmd_page(orig_pmd); VM_BUG_ON_PAGE(!PageHead(page), page); - /* Lock page for reuse_swap_page() */ if (!trylock_page(page)) { get_page(page); spin_unlock(vmf->ptl); @@ -1319,10 +1322,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) } /* - * We can only reuse the page if nobody else maps the huge page or it's - * part. + * See do_wp_page(): we can only map the page writable if there are + * no additional references. Note that we always drain the LRU + * pagevecs immediately after adding a THP. */ - if (reuse_swap_page(page)) { + if (page_count(page) > 1 + PageSwapCache(page) * thp_nr_pages(page)) + goto unlock_fallback; + if (PageSwapCache(page)) + try_to_free_swap(page); + if (page_count(page) == 1) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -1333,6 +1341,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) return VM_FAULT_WRITE; } +unlock_fallback: unlock_page(page); spin_unlock(vmf->ptl); fallback: @@ -1380,39 +1389,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, if (flags & FOLL_TOUCH) touch_pmd(vma, addr, pmd, flags); - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - /* - * We don't mlock() pte-mapped THPs. This way we can avoid - * leaking mlocked pages into non-VM_LOCKED VMAs. - * - * For anon THP: - * - * In most cases the pmd is the only mapping of the page as we - * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for - * writable private mappings in populate_vma_page_range(). - * - * The only scenario when we have the page shared here is if we - * mlocking read-only mapping shared over fork(). We skip - * mlocking such pages. - * - * For file THP: - * - * We can expect PageDoubleMap() to be stable under page lock: - * for file pages we set it in page_add_file_rmap(), which - * requires page to be locked. - */ - - if (PageAnon(page) && compound_mapcount(page) != 1) - goto skip_mlock; - if (PageDoubleMap(page) || !page->mapping) - goto skip_mlock; - if (!trylock_page(page)) - goto skip_mlock; - if (page->mapping && !PageDoubleMap(page)) - mlock_vma_page(page); - unlock_page(page); - } -skip_mlock: page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); @@ -1610,7 +1586,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (pmd_present(orig_pmd)) { page = pmd_page(orig_pmd); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); VM_BUG_ON_PAGE(!PageHead(page), page); } else if (thp_migration_supported()) { @@ -1766,17 +1742,28 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, } #endif - /* - * Avoid trapping faults against the zero page. The read-only - * data is likely to be read-cached on the local CPU and - * local/remote hits to the zero page are not interesting. - */ - if (prot_numa && is_huge_zero_pmd(*pmd)) - goto unlock; + if (prot_numa) { + struct page *page; + /* + * Avoid trapping faults against the zero page. The read-only + * data is likely to be read-cached on the local CPU and + * local/remote hits to the zero page are not interesting. + */ + if (is_huge_zero_pmd(*pmd)) + goto unlock; - if (prot_numa && pmd_protnone(*pmd)) - goto unlock; + if (pmd_protnone(*pmd)) + goto unlock; + page = pmd_page(*pmd); + /* + * Skip scanning top tier node if normal numa + * balancing is disabled + */ + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && + node_is_toptier(page_to_nid(page))) + goto unlock; + } /* * In case prot_numa, we are under mmap_read_lock(mm). It's critical * to not clear pmd intermittently to avoid race with MADV_DONTNEED @@ -1995,7 +1982,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, set_page_dirty(page); if (!PageReferenced(page) && pmd_young(old_pmd)) SetPageReferenced(page); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); put_page(page); } add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); @@ -2129,6 +2116,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } } unlock_page_memcg(page); + + /* Above is effectively page_remove_rmap(page, vma, true) */ + munlock_vma_page(page, vma, true); } smp_wmb(); /* make pte visible before pmd */ @@ -2136,7 +2126,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (freeze) { for (i = 0; i < HPAGE_PMD_NR; i++) { - page_remove_rmap(page + i, false); + page_remove_rmap(page + i, vma, false); put_page(page + i); } } @@ -2147,8 +2137,6 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, { spinlock_t *ptl; struct mmu_notifier_range range; - bool do_unlock_page = false; - pmd_t _pmd; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address & HPAGE_PMD_MASK, @@ -2167,44 +2155,14 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, goto out; } -repeat: if (pmd_trans_huge(*pmd)) { - if (!page) { + if (!page) page = pmd_page(*pmd); - /* - * An anonymous page must be locked, to ensure that a - * concurrent reuse_swap_page() sees stable mapcount; - * but reuse_swap_page() is not used on shmem or file, - * and page lock must not be taken when zap_pmd_range() - * calls __split_huge_pmd() while i_mmap_lock is held. - */ - if (PageAnon(page)) { - if (unlikely(!trylock_page(page))) { - get_page(page); - _pmd = *pmd; - spin_unlock(ptl); - lock_page(page); - spin_lock(ptl); - if (unlikely(!pmd_same(*pmd, _pmd))) { - unlock_page(page); - put_page(page); - page = NULL; - goto repeat; - } - put_page(page); - } - do_unlock_page = true; - } - } - if (PageMlocked(page)) - clear_page_mlock(page); } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd))) goto out; __split_huge_pmd_locked(vma, pmd, range.start, freeze); out: spin_unlock(ptl); - if (do_unlock_page) - unlock_page(page); /* * No need to double call mmu_notifier->invalidate_range() callback. * They are 3 cases to consider inside __split_huge_pmd_locked(): @@ -2332,8 +2290,11 @@ static void lru_add_page_tail(struct page *head, struct page *tail, } else { /* head is still on lru (and we have it frozen) */ VM_WARN_ON(!PageLRU(head)); + if (PageUnevictable(tail)) + tail->mlock_count = 0; + else + list_add_tail(&tail->lru, &head->lru); SetPageLRU(tail); - list_add_tail(&tail->lru, &head->lru); } } @@ -2518,54 +2479,6 @@ int total_mapcount(struct page *page) return ret; } -/* - * This calculates accurately how many mappings a transparent hugepage - * has (unlike page_mapcount() which isn't fully accurate). This full - * accuracy is primarily needed to know if copy-on-write faults can - * reuse the page and change the mapping to read-write instead of - * copying them. At the same time this returns the total_mapcount too. - * - * The function returns the highest mapcount any one of the subpages - * has. If the return value is one, even if different processes are - * mapping different subpages of the transparent hugepage, they can - * all reuse it, because each process is reusing a different subpage. - * - * The total_mapcount is instead counting all virtual mappings of the - * subpages. If the total_mapcount is equal to "one", it tells the - * caller all mappings belong to the same "mm" and in turn the - * anon_vma of the transparent hugepage can become the vma->anon_vma - * local one as no other process may be mapping any of the subpages. - * - * It would be more accurate to replace page_mapcount() with - * page_trans_huge_mapcount(), however we only use - * page_trans_huge_mapcount() in the copy-on-write faults where we - * need full accuracy to avoid breaking page pinning, because - * page_trans_huge_mapcount() is slower than page_mapcount(). - */ -int page_trans_huge_mapcount(struct page *page) -{ - int i, ret; - - /* hugetlbfs shouldn't call it */ - VM_BUG_ON_PAGE(PageHuge(page), page); - - if (likely(!PageTransCompound(page))) - return atomic_read(&page->_mapcount) + 1; - - page = compound_head(page); - - ret = 0; - for (i = 0; i < thp_nr_pages(page); i++) { - int mapcount = atomic_read(&page[i]._mapcount) + 1; - ret = max(ret, mapcount); - } - - if (PageDoubleMap(page)) - ret -= 1; - - return ret + compound_mapcount(page); -} - /* Racy check whether the huge page can be split */ bool can_split_huge_page(struct page *page, int *pextra_pins) { @@ -3171,8 +3084,9 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); set_pmd_at(mm, address, pvmw->pmd, pmdswp); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); put_page(page); + trace_set_migration_pmd(address, pmd_val(pmdswp)); } void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) @@ -3197,14 +3111,14 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) if (pmd_swp_uffd_wp(*pvmw->pmd)) pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde)); - flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE); if (PageAnon(new)) page_add_anon_rmap(new, vma, mmun_start, true); else - page_add_file_rmap(new, true); + page_add_file_rmap(new, vma, true); set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); - if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new)) - mlock_vma_page(new); + + /* No need to invalidate - it was non-present before */ update_mmu_cache_pmd(vma, address, pvmw->pmd); + trace_remove_migration_pmd(address, pmd_val(pmde)); } #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 61895cc01d098..5d9838f9044a8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4637,7 +4637,6 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, vma->vm_page_prot)); } entry = pte_mkyoung(entry); - entry = pte_mkhuge(entry); entry = arch_make_huge_pte(entry, shift, vma->vm_flags); return entry; @@ -5014,7 +5013,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct set_page_dirty(page); hugetlb_count_sub(pages_per_huge_page(h), mm); - page_remove_rmap(page, true); + page_remove_rmap(page, vma, true); spin_unlock(ptl); tlb_remove_page_size(tlb, page, huge_page_size(h)); @@ -5259,7 +5258,7 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, /* Break COW */ huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); - page_remove_rmap(old_page, true); + page_remove_rmap(old_page, vma, true); hugepage_add_new_anon_rmap(new_page, vma, haddr); set_huge_pte_at(mm, haddr, ptep, make_huge_pte(vma, new_page, 1)); @@ -5818,7 +5817,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, *pagep = NULL; goto out; } - folio_copy(page_folio(page), page_folio(*pagep)); + copy_user_huge_page(page, *pagep, dst_addr, dst_vma, + pages_per_huge_page(h)); put_page(*pagep); *pagep = NULL; } @@ -6172,7 +6172,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned int shift = huge_page_shift(hstate_vma(vma)); old_pte = huge_ptep_modify_prot_start(vma, address, ptep); - pte = pte_mkhuge(huge_pte_modify(old_pte, newprot)); + pte = huge_pte_modify(old_pte, newprot); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index c540c21e26f5b..791626983c2e1 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -124,9 +124,9 @@ * page of page structs (page 0) associated with the HugeTLB page contains the 4 * page structs necessary to describe the HugeTLB. The only use of the remaining * pages of page structs (page 1 to page 7) is to point to page->compound_head. - * Therefore, we can remap pages 2 to 7 to page 1. Only 2 pages of page structs + * Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs * will be used for each HugeTLB page. This will allow us to free the remaining - * 6 pages to the buddy allocator. + * 7 pages to the buddy allocator. * * Here is how things look after remapping. * @@ -134,30 +134,30 @@ * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ * | | | 0 | -------------> | 0 | * | | +-----------+ +-----------+ - * | | | 1 | -------------> | 1 | - * | | +-----------+ +-----------+ - * | | | 2 | ----------------^ ^ ^ ^ ^ ^ - * | | +-----------+ | | | | | - * | | | 3 | ------------------+ | | | | - * | | +-----------+ | | | | - * | | | 4 | --------------------+ | | | - * | PMD | +-----------+ | | | - * | level | | 5 | ----------------------+ | | - * | mapping | +-----------+ | | - * | | | 6 | ------------------------+ | - * | | +-----------+ | - * | | | 7 | --------------------------+ + * | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^ + * | | +-----------+ | | | | | | + * | | | 2 | -----------------+ | | | | | + * | | +-----------+ | | | | | + * | | | 3 | -------------------+ | | | | + * | | +-----------+ | | | | + * | | | 4 | ---------------------+ | | | + * | PMD | +-----------+ | | | + * | level | | 5 | -----------------------+ | | + * | mapping | +-----------+ | | + * | | | 6 | -------------------------+ | + * | | +-----------+ | + * | | | 7 | ---------------------------+ * | | +-----------+ * | | * | | * | | * +-----------+ * - * When a HugeTLB is freed to the buddy system, we should allocate 6 pages for + * When a HugeTLB is freed to the buddy system, we should allocate 7 pages for * vmemmap pages and restore the previous mapping relationship. * * For the HugeTLB page of the pud level mapping. It is similar to the former. - * We also can use this approach to free (PAGE_SIZE - 2) vmemmap pages. + * We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages. * * Apart from the HugeTLB page of the pmd/pud level mapping, some architectures * (e.g. aarch64) provides a contiguous bit in the translation table entries @@ -166,7 +166,13 @@ * * The contiguous bit is used to increase the mapping size at the pmd and pte * (last) level. So this type of HugeTLB page can be optimized only when its - * size of the struct page structs is greater than 2 pages. + * size of the struct page structs is greater than 1 page. + * + * Notice: The head vmemmap page is not freed to the buddy allocator and all + * tail vmemmap pages are mapped to the head vmemmap page frame. So we can see + * more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page) + * associated with each HugeTLB page. The compound_head() can handle this + * correctly (more details refer to the comment above compound_head()). */ #define pr_fmt(fmt) "HugeTLB: " fmt @@ -175,19 +181,21 @@ /* * There are a lot of struct page structures associated with each HugeTLB page. * For tail pages, the value of compound_head is the same. So we can reuse first - * page of tail page structures. We map the virtual addresses of the remaining - * pages of tail page structures to the first tail page struct, and then free - * these page frames. Therefore, we need to reserve two pages as vmemmap areas. + * page of head page structures. We map the virtual addresses of all the pages + * of tail page structures to the head page struct, and then free these page + * frames. Therefore, we need to reserve one pages as vmemmap areas. */ -#define RESERVE_VMEMMAP_NR 2U +#define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) -bool hugetlb_free_vmemmap_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON); +DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON, + hugetlb_free_vmemmap_enabled_key); +EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key); static int __init early_hugetlb_free_vmemmap_param(char *buf) { /* We cannot optimize if a "struct page" crosses page boundaries. */ - if ((!is_power_of_2(sizeof(struct page)))) { + if (!is_power_of_2(sizeof(struct page))) { pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n"); return 0; } @@ -196,9 +204,9 @@ static int __init early_hugetlb_free_vmemmap_param(char *buf) return -EINVAL; if (!strcmp(buf, "on")) - hugetlb_free_vmemmap_enabled = true; + static_branch_enable(&hugetlb_free_vmemmap_enabled_key); else if (!strcmp(buf, "off")) - hugetlb_free_vmemmap_enabled = false; + static_branch_disable(&hugetlb_free_vmemmap_enabled_key); else return -EINVAL; @@ -236,7 +244,6 @@ int alloc_huge_page_vmemmap(struct hstate *h, struct page *head) */ ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); - if (!ret) ClearHPageVmemmapOptimized(head); @@ -277,14 +284,13 @@ void __init hugetlb_vmemmap_init(struct hstate *h) BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - if (!hugetlb_free_vmemmap_enabled) + if (!hugetlb_free_vmemmap_enabled()) return; vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; /* - * The head page and the first tail page are not to be freed to buddy - * allocator, the other pages will map to the first tail page, so they - * can be freed. + * The head page is not to be freed to buddy allocator, the other tail + * pages will map to the head page, so they can be freed. * * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true * on some architectures (e.g. aarch64). See Documentation/arm64/ diff --git a/mm/internal.h b/mm/internal.h index d80300392a194..9a5674bd0a742 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -23,7 +23,7 @@ struct folio_batch; #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC|__GFP_NOLOCKDEP) + __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) @@ -71,11 +71,6 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); -static inline bool can_madv_lru_vma(struct vm_area_struct *vma) -{ - return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); -} - struct zap_details; void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, @@ -398,32 +393,42 @@ extern long populate_vma_page_range(struct vm_area_struct *vma, extern long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool write, int *locked); -extern void munlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end); -static inline void munlock_vma_pages_all(struct vm_area_struct *vma) -{ - munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); -} - -/* - * must be called with vma's mmap_lock held for read or write, and page locked. - */ -extern void mlock_vma_page(struct page *page); -extern unsigned int munlock_vma_page(struct page *page); - extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, unsigned long len); - /* - * Clear the page's PageMlocked(). This can be useful in a situation where - * we want to unconditionally remove a page from the pagecache -- e.g., - * on truncation or freeing. + * mlock_vma_page() and munlock_vma_page(): + * should be called with vma's mmap_lock held for read or write, + * under page table lock for the pte/pmd being added or removed. * - * It is legal to call this function for any page, mlocked or not. - * If called for a page that is still mapped by mlocked vmas, all we do - * is revert to lazy LRU behaviour -- semantics are not broken. + * mlock is usually called at the end of page_add_*_rmap(), + * munlock at the end of page_remove_rmap(); but new anon + * pages are managed by lru_cache_add_inactive_or_unevictable() + * calling mlock_new_page(). + * + * @compound is used to include pmd mappings of THPs, but filter out + * pte mappings of THPs, which cannot be consistently counted: a pte + * mapping of the THP head cannot be distinguished by the page alone. */ -extern void clear_page_mlock(struct page *page); +void mlock_page(struct page *page); +static inline void mlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + /* VM_IO check prevents migration from double-counting during mlock */ + if (unlikely((vma->vm_flags & (VM_LOCKED|VM_IO)) == VM_LOCKED) && + (compound || !PageTransCompound(page))) + mlock_page(page); +} +void munlock_page(struct page *page); +static inline void munlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + if (unlikely(vma->vm_flags & VM_LOCKED) && + (compound || !PageTransCompound(page))) + munlock_page(page); +} +void mlock_new_page(struct page *page); +bool need_mlock_page_drain(int cpu); +void mlock_page_drain(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); @@ -498,8 +503,13 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, } #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } -static inline void clear_page_mlock(struct page *page) { } -static inline void mlock_vma_page(struct page *page) { } +static inline void mlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) { } +static inline void munlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) { } +static inline void mlock_new_page(struct page *page) { } +static inline bool need_mlock_page_drain(int cpu) { return false; } +static inline void mlock_page_drain(int cpu) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } @@ -572,17 +582,6 @@ static inline void mminit_verify_zonelist(void) } #endif /* CONFIG_DEBUG_MEMORY_INIT */ -/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ -#if defined(CONFIG_SPARSEMEM) -extern void mminit_validate_memmodel_limits(unsigned long *start_pfn, - unsigned long *end_pfn); -#else -static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, - unsigned long *end_pfn) -{ -} -#endif /* CONFIG_SPARSEMEM */ - #define NODE_RECLAIM_NOSCAN -2 #define NODE_RECLAIM_FULL -1 #define NODE_RECLAIM_SOME 0 @@ -718,4 +717,6 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); +DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 92196562687b6..d9079ec11f313 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -387,7 +387,7 @@ static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) } /* - * The object will be poisoned by kasan_free_pages() or + * The object will be poisoned by kasan_poison_pages() or * kasan_slab_free_mempool(). */ @@ -538,7 +538,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, return NULL; /* - * The object has already been unpoisoned by kasan_alloc_pages() for + * The object has already been unpoisoned by kasan_unpoison_pages() for * alloc_pages() or by kasan_krealloc() for krealloc(). */ diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 7355cb534e4f8..fad1887e54c05 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -32,6 +32,12 @@ enum kasan_arg_mode { KASAN_ARG_MODE_ASYMM, }; +enum kasan_arg_vmalloc { + KASAN_ARG_VMALLOC_DEFAULT, + KASAN_ARG_VMALLOC_OFF, + KASAN_ARG_VMALLOC_ON, +}; + enum kasan_arg_stacktrace { KASAN_ARG_STACKTRACE_DEFAULT, KASAN_ARG_STACKTRACE_OFF, @@ -40,18 +46,28 @@ enum kasan_arg_stacktrace { static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; -static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init; +static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata; +static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; -/* Whether KASAN is enabled at all. */ +/* + * Whether KASAN is enabled at all. + * The value remains false until KASAN is initialized by kasan_init_hw_tags(). + */ DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); EXPORT_SYMBOL(kasan_flag_enabled); -/* Whether the selected mode is synchronous/asynchronous/asymmetric.*/ +/* + * Whether the selected mode is synchronous, asynchronous, or asymmetric. + * Defaults to KASAN_MODE_SYNC. + */ enum kasan_mode kasan_mode __ro_after_init; EXPORT_SYMBOL_GPL(kasan_mode); +/* Whether to enable vmalloc tagging. */ +DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc); + /* Whether to collect alloc/free stack traces. */ -DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace); +DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace); /* kasan=off/on */ static int __init early_kasan_flag(char *arg) @@ -89,6 +105,23 @@ static int __init early_kasan_mode(char *arg) } early_param("kasan.mode", early_kasan_mode); +/* kasan.vmalloc=off/on */ +static int __init early_kasan_flag_vmalloc(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "off")) + kasan_arg_vmalloc = KASAN_ARG_VMALLOC_OFF; + else if (!strcmp(arg, "on")) + kasan_arg_vmalloc = KASAN_ARG_VMALLOC_ON; + else + return -EINVAL; + + return 0; +} +early_param("kasan.vmalloc", early_kasan_flag_vmalloc); + /* kasan.stacktrace=off/on */ static int __init early_kasan_flag_stacktrace(char *arg) { @@ -116,7 +149,10 @@ static inline const char *kasan_mode_info(void) return "sync"; } -/* kasan_init_hw_tags_cpu() is called for each CPU. */ +/* + * kasan_init_hw_tags_cpu() is called for each CPU. + * Not marked as __init as a CPU can be hot-plugged after boot. + */ void kasan_init_hw_tags_cpu(void) { /* @@ -124,7 +160,11 @@ void kasan_init_hw_tags_cpu(void) * as this function is only called for MTE-capable hardware. */ - /* If KASAN is disabled via command line, don't initialize it. */ + /* + * If KASAN is disabled via command line, don't initialize it. + * When this function is called, kasan_flag_enabled is not yet + * set by kasan_init_hw_tags(). Thus, check kasan_arg instead. + */ if (kasan_arg == KASAN_ARG_OFF) return; @@ -151,79 +191,156 @@ void __init kasan_init_hw_tags(void) if (kasan_arg == KASAN_ARG_OFF) return; - /* Enable KASAN. */ - static_branch_enable(&kasan_flag_enabled); - switch (kasan_arg_mode) { case KASAN_ARG_MODE_DEFAULT: - /* - * Default to sync mode. - */ - fallthrough; + /* Default is specified by kasan_mode definition. */ + break; case KASAN_ARG_MODE_SYNC: - /* Sync mode enabled. */ kasan_mode = KASAN_MODE_SYNC; break; case KASAN_ARG_MODE_ASYNC: - /* Async mode enabled. */ kasan_mode = KASAN_MODE_ASYNC; break; case KASAN_ARG_MODE_ASYMM: - /* Asymm mode enabled. */ kasan_mode = KASAN_MODE_ASYMM; break; } + switch (kasan_arg_vmalloc) { + case KASAN_ARG_VMALLOC_DEFAULT: + /* Default is specified by kasan_flag_vmalloc definition. */ + break; + case KASAN_ARG_VMALLOC_OFF: + static_branch_disable(&kasan_flag_vmalloc); + break; + case KASAN_ARG_VMALLOC_ON: + static_branch_enable(&kasan_flag_vmalloc); + break; + } + switch (kasan_arg_stacktrace) { case KASAN_ARG_STACKTRACE_DEFAULT: - /* Default to enabling stack trace collection. */ - static_branch_enable(&kasan_flag_stacktrace); + /* Default is specified by kasan_flag_stacktrace definition. */ break; case KASAN_ARG_STACKTRACE_OFF: - /* Do nothing, kasan_flag_stacktrace keeps its default value. */ + static_branch_disable(&kasan_flag_stacktrace); break; case KASAN_ARG_STACKTRACE_ON: static_branch_enable(&kasan_flag_stacktrace); break; } - pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, stacktrace=%s)\n", + /* KASAN is now initialized, enable it. */ + static_branch_enable(&kasan_flag_enabled); + + pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n", kasan_mode_info(), + kasan_vmalloc_enabled() ? "on" : "off", kasan_stack_collection_enabled() ? "on" : "off"); } -void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags) +#ifdef CONFIG_KASAN_VMALLOC + +static void unpoison_vmalloc_pages(const void *addr, u8 tag) { + struct vm_struct *area; + int i; + /* - * This condition should match the one in post_alloc_hook() in - * page_alloc.c. + * As hardware tag-based KASAN only tags VM_ALLOC vmalloc allocations + * (see the comment in __kasan_unpoison_vmalloc), all of the pages + * should belong to a single area. */ - bool init = !want_init_on_free() && want_init_on_alloc(flags); - - if (flags & __GFP_SKIP_KASAN_POISON) - SetPageSkipKASanPoison(page); + area = find_vm_area((void *)addr); + if (WARN_ON(!area)) + return; - if (flags & __GFP_ZEROTAGS) { - int i; + for (i = 0; i < area->nr_pages; i++) { + struct page *page = area->pages[i]; - for (i = 0; i != 1 << order; ++i) - tag_clear_highpage(page + i); - } else { - kasan_unpoison_pages(page, order, init); + page_kasan_tag_set(page, tag); } } -void kasan_free_pages(struct page *page, unsigned int order) +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, + kasan_vmalloc_flags_t flags) { + u8 tag; + unsigned long redzone_start, redzone_size; + + if (!kasan_vmalloc_enabled()) + return (void *)start; + + if (!is_vmalloc_or_module_addr(start)) + return (void *)start; + + /* + * Skip unpoisoning and assigning a pointer tag for non-VM_ALLOC + * mappings as: + * + * 1. Unlike the software KASAN modes, hardware tag-based KASAN only + * supports tagging physical memory. Therefore, it can only tag a + * single mapping of normal physical pages. + * 2. Hardware tag-based KASAN can only tag memory mapped with special + * mapping protection bits, see arch_vmalloc_pgprot_modify(). + * As non-VM_ALLOC mappings can be mapped outside of vmalloc code, + * providing these bits would require tracking all non-VM_ALLOC + * mappers. + * + * Thus, for VM_ALLOC mappings, hardware tag-based KASAN only tags + * the first virtual mapping, which is created by vmalloc(). + * Tagging the page_alloc memory backing that vmalloc() allocation is + * skipped, see ___GFP_SKIP_KASAN_UNPOISON. + * + * For non-VM_ALLOC allocations, page_alloc memory is tagged as usual. + */ + if (!(flags & KASAN_VMALLOC_VM_ALLOC)) + return (void *)start; + /* - * This condition should match the one in free_pages_prepare() in - * page_alloc.c. + * Don't tag executable memory. + * The kernel doesn't tolerate having the PC register tagged. */ - bool init = want_init_on_free(); + if (!(flags & KASAN_VMALLOC_PROT_NORMAL)) + return (void *)start; - kasan_poison_pages(page, order, init); + tag = kasan_random_tag(); + start = set_tag(start, tag); + + /* Unpoison and initialize memory up to size. */ + kasan_unpoison(start, size, flags & KASAN_VMALLOC_INIT); + + /* + * Explicitly poison and initialize the in-page vmalloc() redzone. + * Unlike software KASAN modes, hardware tag-based KASAN doesn't + * unpoison memory when populating shadow for vmalloc() space. + */ + redzone_start = round_up((unsigned long)start + size, + KASAN_GRANULE_SIZE); + redzone_size = round_up(redzone_start, PAGE_SIZE) - redzone_start; + kasan_poison((void *)redzone_start, redzone_size, KASAN_TAG_INVALID, + flags & KASAN_VMALLOC_INIT); + + /* + * Set per-page tag flags to allow accessing physical memory for the + * vmalloc() mapping through page_address(vmalloc_to_page()). + */ + unpoison_vmalloc_pages(start, tag); + + return (void *)start; } +void __kasan_poison_vmalloc(const void *start, unsigned long size) +{ + /* + * No tagging here. + * The physical pages backing the vmalloc() allocation are poisoned + * through the usual page_alloc paths. + */ +} + +#endif + #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_enable_tagging_sync(void) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index c17fa8d26ffe5..4d67408e84076 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -12,7 +12,8 @@ #include #include "../slab.h" -DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace); +DECLARE_STATIC_KEY_TRUE(kasan_flag_vmalloc); +DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace); enum kasan_mode { KASAN_MODE_SYNC, @@ -22,6 +23,11 @@ enum kasan_mode { extern enum kasan_mode kasan_mode __ro_after_init; +static inline bool kasan_vmalloc_enabled(void) +{ + return static_branch_likely(&kasan_flag_vmalloc); +} + static inline bool kasan_stack_collection_enabled(void) { return static_branch_unlikely(&kasan_flag_stacktrace); @@ -71,17 +77,19 @@ static inline bool kasan_sync_fault_possible(void) #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ -#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */ +#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */ #else #define KASAN_FREE_PAGE KASAN_TAG_INVALID #define KASAN_PAGE_REDZONE KASAN_TAG_INVALID #define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID #define KASAN_KMALLOC_FREE KASAN_TAG_INVALID -#define KASAN_KMALLOC_FREETRACK KASAN_TAG_INVALID +#define KASAN_VMALLOC_INVALID KASAN_TAG_INVALID /* only for SW_TAGS */ #endif +#ifdef CONFIG_KASAN_GENERIC + +#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */ #define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */ -#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */ /* * Stack redzone shadow values @@ -110,6 +118,8 @@ static inline bool kasan_sync_fault_possible(void) #define KASAN_ABI_VERSION 1 #endif +#endif /* CONFIG_KASAN_GENERIC */ + /* Metadata layout customization. */ #define META_BYTES_PER_BLOCK 1 #define META_BLOCKS_PER_ROW 16 diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 3ad9624dcc561..f14146563d412 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -117,16 +117,8 @@ static void end_report(unsigned long *flags, unsigned long addr) pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); - if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) { - /* - * This thread may hit another WARN() in the panic path. - * Resetting this prevents additional WARN() from panicking the - * system on this thread. Other threads are blocked by the - * panic_mutex in panic(). - */ - panic_on_warn = 0; + if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) panic("panic_on_warn set ...\n"); - } if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC) panic("kasan.fault=panic set ...\n"); kasan_enable_current(); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 94136f84b4497..7272e248db87d 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -345,27 +345,6 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) return 0; } -/* - * Poison the shadow for a vmalloc region. Called as part of the - * freeing process at the time the region is freed. - */ -void kasan_poison_vmalloc(const void *start, unsigned long size) -{ - if (!is_vmalloc_or_module_addr(start)) - return; - - size = round_up(size, KASAN_GRANULE_SIZE); - kasan_poison(start, size, KASAN_VMALLOC_INVALID, false); -} - -void kasan_unpoison_vmalloc(const void *start, unsigned long size) -{ - if (!is_vmalloc_or_module_addr(start)) - return; - - kasan_unpoison(start, size, false); -} - static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, void *unused) { @@ -496,9 +475,47 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, } } +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, + kasan_vmalloc_flags_t flags) +{ + /* + * Software KASAN modes unpoison both VM_ALLOC and non-VM_ALLOC + * mappings, so the KASAN_VMALLOC_VM_ALLOC flag is ignored. + * Software KASAN modes can't optimize zeroing memory by combining it + * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored. + */ + + if (!is_vmalloc_or_module_addr(start)) + return (void *)start; + + /* + * Don't tag executable memory. + * The kernel doesn't tolerate having the PC register tagged. + */ + if (!(flags & KASAN_VMALLOC_PROT_NORMAL)) + return (void *)start; + + start = set_tag(start, kasan_random_tag()); + kasan_unpoison(start, size, false); + return (void *)start; +} + +/* + * Poison the shadow for a vmalloc region. Called as part of the + * freeing process at the time the region is freed. + */ +void __kasan_poison_vmalloc(const void *start, unsigned long size) +{ + if (!is_vmalloc_or_module_addr(start)) + return; + + size = round_up(size, KASAN_GRANULE_SIZE); + kasan_poison(start, size, KASAN_VMALLOC_INVALID, false); +} + #else /* CONFIG_KASAN_VMALLOC */ -int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) +int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) { void *ret; size_t scaled_size; @@ -534,7 +551,7 @@ int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) return -ENOMEM; } -void kasan_free_shadow(const struct vm_struct *vm) +void kasan_free_module_shadow(const struct vm_struct *vm) { if (vm->flags & VM_KASAN) vfree(kasan_mem_to_shadow(vm->addr)); diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 5ad40e3add457..13128fa130625 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -47,7 +47,8 @@ static bool kfence_enabled __read_mostly; -static unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL; +unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL; +EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */ #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index a22b1af85577d..50dbb815a2a83 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -268,13 +268,13 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat * 100x the sample interval should be more than enough to ensure we get * a KFENCE allocation eventually. */ - timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL); + timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval); /* * Especially for non-preemption kernels, ensure the allocation-gate * timer can catch up: after @resched_after, every failed allocation * attempt yields, to ensure the allocation-gate timer is scheduled. */ - resched_after = jiffies + msecs_to_jiffies(CONFIG_KFENCE_SAMPLE_INTERVAL); + resched_after = jiffies + msecs_to_jiffies(kfence_sample_interval); do { if (test_cache) alloc = kmem_cache_alloc(test_cache, gfp); @@ -608,7 +608,7 @@ static void test_gfpzero(struct kunit *test) int i; /* Skip if we think it'd take too long. */ - KFENCE_TEST_REQUIRES(test, CONFIG_KFENCE_SAMPLE_INTERVAL <= 100); + KFENCE_TEST_REQUIRES(test, kfence_sample_interval <= 100); setup_test_cache(test, size, 0, NULL); buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); @@ -739,7 +739,7 @@ static void test_memcache_alloc_bulk(struct kunit *test) * 100x the sample interval should be more than enough to ensure we get * a KFENCE allocation eventually. */ - timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL); + timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval); do { void *objects[100]; int i, num = kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, ARRAY_SIZE(objects), diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 131492fd1148b..7d45d463acf55 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -46,7 +46,6 @@ enum scan_result { SCAN_VMA_NULL, SCAN_VMA_CHECK, SCAN_ADDRESS_RANGE, - SCAN_SWAP_CACHE_PAGE, SCAN_DEL_PAGE_LRU, SCAN_ALLOC_HUGE_PAGE_FAIL, SCAN_CGROUP_CHARGE_FAIL, @@ -683,16 +682,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_PAGE_COUNT; goto out; } - if (!pte_write(pteval) && PageSwapCache(page) && - !reuse_swap_page(page)) { - /* - * Page is in the swap cache and cannot be re-used. - * It cannot be collapsed into a THP. - */ - unlock_page(page); - result = SCAN_SWAP_CACHE_PAGE; - goto out; - } /* * Isolate the page to avoid collapsing an hugepage @@ -774,7 +763,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, */ spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); - page_remove_rmap(src_page, false); + page_remove_rmap(src_page, vma, false); spin_unlock(ptl); free_page_and_swap_cache(src_page); } @@ -1513,7 +1502,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (pte_none(*pte)) continue; page = vm_normal_page(vma, addr, *pte); - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); } pte_unmap_unlock(start_pte, ptl); @@ -1834,13 +1823,12 @@ static void collapse_file(struct mm_struct *mm, } if (page_mapped(page)) - unmap_mapping_pages(mapping, index, 1, false); + try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); xas_lock_irq(&xas); xas_set(&xas, index); VM_BUG_ON_PAGE(page != xas_load(&xas), page); - VM_BUG_ON_PAGE(page_mapped(page), page); /* * The page is expected to have page_count() == 3: @@ -1904,6 +1892,13 @@ static void collapse_file(struct mm_struct *mm, xas_unlock_irq(&xas); xa_unlocked: + /* + * If collapse is successful, flush must be done now before copying. + * If collapse is unsuccessful, does flush actually need to be done? + * Do it anyway, to clear the state. + */ + try_to_unmap_flush(); + if (result == SCAN_SUCCEED) { struct page *page, *tmp; diff --git a/mm/ksm.c b/mm/ksm.c index c20bd4d9a0d9e..e246d650266ac 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1177,7 +1177,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); if (!page_mapped(page)) try_to_free_swap(page); put_page(page); @@ -1252,16 +1252,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, err = replace_page(vma, page, kpage, orig_pte); } - if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { - munlock_vma_page(page); - if (!PageMlocked(kpage)) { - unlock_page(page); - lock_page(kpage); - mlock_vma_page(kpage); - page = kpage; /* for final unlock */ - } - } - out_unlock: unlock_page(page); out: @@ -2595,6 +2585,9 @@ struct page *ksm_might_need_to_copy(struct page *page, SetPageDirty(new_page); __SetPageUptodate(new_page); __SetPageLocked(new_page); +#ifdef CONFIG_SWAP + count_vm_event(KSM_SWPIN_COPY); +#endif } return new_page; diff --git a/mm/maccess.c b/mm/maccess.c index d3f1a1f0b1c1a..de43f29c52f0a 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -335,3 +335,9 @@ long strnlen_user_nofault(const void __user *unsafe_addr, long count) return ret; } + +void copy_overflow(int size, unsigned long count) +{ + WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); +} +EXPORT_SYMBOL_GPL(copy_overflow); diff --git a/mm/madvise.c b/mm/madvise.c index 5604064df4646..ae35d72627efa 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -530,6 +530,11 @@ static void madvise_cold_page_range(struct mmu_gather *tlb, tlb_end_vma(tlb, vma); } +static inline bool can_madv_lru_vma(struct vm_area_struct *vma) +{ + return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)); +} + static long madvise_cold(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 09d342c7cbd0d..a0e9d9f12cf5b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -254,7 +254,7 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr) } #ifdef CONFIG_MEMCG_KMEM -extern spinlock_t css_set_lock; +static DEFINE_SPINLOCK(objcg_lock); bool mem_cgroup_kmem_disabled(void) { @@ -298,9 +298,9 @@ static void obj_cgroup_release(struct percpu_ref *ref) if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); - spin_lock_irqsave(&css_set_lock, flags); + spin_lock_irqsave(&objcg_lock, flags); list_del(&objcg->list); - spin_unlock_irqrestore(&css_set_lock, flags); + spin_unlock_irqrestore(&objcg_lock, flags); percpu_ref_exit(ref); kfree_rcu(objcg, rcu); @@ -332,7 +332,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg, objcg = rcu_replace_pointer(memcg->objcg, NULL, true); - spin_lock_irq(&css_set_lock); + spin_lock_irq(&objcg_lock); /* 1) Ready to reparent active objcg. */ list_add(&objcg->list, &memcg->objcg_list); @@ -342,7 +342,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg, /* 3) Move already reparented objcgs to the parent's list */ list_splice(&memcg->objcg_list, &parent->objcg_list); - spin_unlock_irq(&css_set_lock); + spin_unlock_irq(&objcg_lock); percpu_ref_kill(&objcg->refcnt); } @@ -1257,8 +1257,7 @@ struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, * @nr_pages: positive when adding or negative when removing * * This function must be called under lru_lock, just before a page is added - * to or just after a page is removed from an lru list (that ordering being - * so as to allow it to check that lru_size 0 is consistent with list_empty). + * to or just after a page is removed from an lru list. */ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, int nr_pages) @@ -1371,6 +1370,7 @@ struct memory_stat { static const struct memory_stat memory_stats[] = { { "anon", NR_ANON_MAPPED }, { "file", NR_FILE_PAGES }, + { "kernel", MEMCG_KMEM }, { "kernel_stack", NR_KERNEL_STACK_KB }, { "pagetables", NR_PAGETABLE }, { "percpu", MEMCG_PERCPU_B }, @@ -2114,6 +2114,7 @@ static DEFINE_MUTEX(percpu_charge_mutex); static void drain_obj_stock(struct obj_stock *stock); static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg); +static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages); #else static inline void drain_obj_stock(struct obj_stock *stock) @@ -2124,6 +2125,9 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, { return false; } +static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) +{ +} #endif /** @@ -2575,7 +2579,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, * put the burden of reclaim on regular allocation requests * and let these go through as privileged allocations. */ - if (gfp_mask & __GFP_ATOMIC) + if (gfp_mask & __GFP_HIGH) goto force; /* @@ -2688,7 +2692,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, READ_ONCE(memcg->swap.high); /* Don't bother a random interrupted task */ - if (in_interrupt()) { + if (!in_task()) { if (mem_high) { schedule_work(&memcg->high_work); break; @@ -2979,6 +2983,18 @@ static void memcg_free_cache_id(int id) ida_simple_remove(&memcg_cache_ida, id); } +static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) +{ + mod_memcg_state(memcg, MEMCG_KMEM, nr_pages); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + if (nr_pages > 0) + page_counter_charge(&memcg->kmem, nr_pages); + else + page_counter_uncharge(&memcg->kmem, -nr_pages); + } +} + + /* * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg * @objcg: object cgroup to uncharge @@ -2991,8 +3007,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, memcg = get_mem_cgroup_from_objcg(objcg); - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->kmem, nr_pages); + memcg_account_kmem(memcg, -nr_pages); refill_stock(memcg, nr_pages); css_put(&memcg->css); @@ -3018,8 +3033,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, if (ret) goto out; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_charge(&memcg->kmem, nr_pages); + memcg_account_kmem(memcg, nr_pages); out: css_put(&memcg->css); @@ -5067,18 +5081,8 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id) static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; - int tmp = node; - /* - * This routine is called against possible nodes. - * But it's BUG to call kmalloc() against offline node. - * - * TODO: this routine can waste much memory for nodes which will - * never be onlined. It's better to use memory hotplug callback - * function. - */ - if (!node_state(node, N_NORMAL_MEMORY)) - tmp = -1; - pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); + + pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node); if (!pn) return 1; @@ -5090,8 +5094,6 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) } lruvec_init(&pn->lruvec); - pn->usage_in_excess = 0; - pn->on_tree = false; pn->memcg = memcg; memcg->nodeinfo[node] = pn; @@ -5691,8 +5693,8 @@ static int mem_cgroup_move_account(struct page *page, * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a * target for charge migration. if @target is not NULL, the entry is stored * in target->ent. - * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE - * (so ZONE_DEVICE page and thus not on the lru). + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is device memory and + * thus not on the lru. * For now we such page is charge like a regular page would be as for all * intent and purposes it is just special memory taking the place of a * regular page. @@ -5726,7 +5728,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, */ if (page_memcg(page) == mc.from) { ret = MC_TARGET_PAGE; - if (is_device_private_page(page)) + if (is_dev_private_or_coherent_page(page)) ret = MC_TARGET_DEVICE; if (target) target->page = page; @@ -6801,8 +6803,8 @@ static void uncharge_batch(const struct uncharge_gather *ug) page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); if (do_memsw_account()) page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) - page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); + if (ug->nr_kmem) + memcg_account_kmem(ug->memcg, -ug->nr_kmem); memcg_oom_recover(ug->memcg); } @@ -6968,7 +6970,7 @@ void mem_cgroup_sk_alloc(struct sock *sk) return; /* Do not associate the sock with unrelated interrupted task's memcg. */ - if (in_interrupt()) + if (!in_task()) return; rcu_read_lock(); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 97a9ed8f87a96..55edb0cc38483 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -707,8 +707,10 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn, (void *)&priv); if (ret == 1 && priv.tk.addr) kill_proc(&priv.tk, pfn, flags); + else + ret = 0; mmap_read_unlock(p->mm); - return ret ? -EFAULT : -EHWPOISON; + return ret > 0 ? -EHWPOISON : -EFAULT; } static const char *action_name[] = { @@ -1617,12 +1619,16 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, goto unlock; } - if (pgmap->type == MEMORY_DEVICE_PRIVATE) { + switch (pgmap->type) { + case MEMORY_DEVICE_PRIVATE: + case MEMORY_DEVICE_COHERENT: /* - * TODO: Handle HMM pages which may need coordination + * TODO: Handle device pages which may need coordination * with device-side memory. */ goto unlock; + default: + break; } /* @@ -2150,12 +2156,6 @@ static int __soft_offline_page(struct page *page) .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; - /* - * Check PageHWPoison again inside page lock because PageHWPoison - * is set by memory_failure() outside page lock. Note that - * memory_failure() also double-checks PageHWPoison inside page lock, - * so there's no race between soft_offline_page() and memory_failure(). - */ lock_page(page); if (!PageHuge(page)) wait_on_page_writeback(page); diff --git a/mm/memory.c b/mm/memory.c index c125c4969913a..7ace240802d85 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -735,9 +735,6 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, set_pte_at(vma->vm_mm, address, ptep, pte); - if (vma->vm_flags & VM_LOCKED) - mlock_vma_page(page); - /* * No need to invalidate - it was non-present before. However * secondary CPUs may have mappings that need invalidating. @@ -1377,7 +1374,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, mark_page_accessed(page); } rss[mm_counter(page)]--; - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); if (unlikely(__tlb_remove_page(tlb, page))) { @@ -1397,10 +1394,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, continue; pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); rss[mm_counter(page)]--; - if (is_device_private_entry(entry)) - page_remove_rmap(page, false); - + page_remove_rmap(page, vma, false); put_page(page); continue; } @@ -1753,16 +1748,16 @@ static int validate_page_before_insert(struct page *page) return 0; } -static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte, +static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { if (!pte_none(*pte)) return -EBUSY; /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter_fast(mm, mm_counter_file(page)); - page_add_file_rmap(page, false); - set_pte_at(mm, addr, pte, mk_pte(page, prot)); + inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); + page_add_file_rmap(page, vma, false); + set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); return 0; } @@ -1776,7 +1771,6 @@ static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte, static int insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot) { - struct mm_struct *mm = vma->vm_mm; int retval; pte_t *pte; spinlock_t *ptl; @@ -1785,17 +1779,17 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, if (retval) goto out; retval = -ENOMEM; - pte = get_locked_pte(mm, addr, &ptl); + pte = get_locked_pte(vma->vm_mm, addr, &ptl); if (!pte) goto out; - retval = insert_page_into_pte_locked(mm, pte, addr, page, prot); + retval = insert_page_into_pte_locked(vma, pte, addr, page, prot); pte_unmap_unlock(pte, ptl); out: return retval; } #ifdef pte_index -static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte, +static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { int err; @@ -1805,7 +1799,7 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte, err = validate_page_before_insert(page); if (err) return err; - return insert_page_into_pte_locked(mm, pte, addr, page, prot); + return insert_page_into_pte_locked(vma, pte, addr, page, prot); } /* insert_pages() amortizes the cost of spinlock operations @@ -1842,7 +1836,7 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr, start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { - int err = insert_page_in_batch_locked(mm, pte, + int err = insert_page_in_batch_locked(vma, pte, addr, pages[curr_page_idx], prot); if (unlikely(err)) { pte_unmap_unlock(start_pte, pte_lock); @@ -3098,7 +3092,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ - page_remove_rmap(old_page, false); + page_remove_rmap(old_page, vma, false); } /* Free the old page.. */ @@ -3118,16 +3112,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ mmu_notifier_invalidate_range_only_end(&range); if (old_page) { - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if (page_copied && (vma->vm_flags & VM_LOCKED)) { - lock_page(old_page); /* LRU manipulation */ - if (PageMlocked(old_page)) - munlock_vma_page(old_page); - unlock_page(old_page); - } if (page_copied) free_swap_cache(old_page); put_page(old_page); @@ -3291,19 +3275,35 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) if (PageAnon(vmf->page)) { struct page *page = vmf->page; - /* PageKsm() doesn't necessarily raise the page refcount */ - if (PageKsm(page) || page_count(page) != 1) + /* + * We have to verify under page lock: these early checks are + * just an optimization to avoid locking the page and freeing + * the swapcache if there is little hope that we can reuse. + * + * PageKsm() doesn't necessarily raise the page refcount. + */ + if (PageKsm(page) || page_count(page) > 3) + goto copy; + if (!PageLRU(page)) + /* + * Note: We cannot easily detect+handle references from + * remote LRU pagevecs or references to PageLRU() pages. + */ + lru_add_drain(); + if (page_count(page) > 1 + PageSwapCache(page)) goto copy; if (!trylock_page(page)) goto copy; - if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) { + if (PageSwapCache(page)) + try_to_free_swap(page); + if (PageKsm(page) || page_count(page) != 1) { unlock_page(page); goto copy; } /* - * Ok, we've got the only map reference, and the only - * page count reference, and the page is locked, - * it's dark out, and we're wearing sunglasses. Hit it. + * Ok, we've got the only page reference from our mapping + * and the page is locked, it's dark out, and we're wearing + * sunglasses. Hit it. */ unlock_page(page); wp_page_reuse(vmf); @@ -3481,6 +3481,25 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) return 0; } +static inline bool should_try_to_free_swap(struct page *page, + struct vm_area_struct *vma, + unsigned int fault_flags) +{ + if (!PageSwapCache(page)) + return false; + if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) || + PageMlocked(page)) + return true; + /* + * If we want to map a page that's in the swapcache writable, we + * have to detect via the refcount if we're really the exclusive + * user. Try freeing the swapcache to get rid of the swapcache + * reference only in case it's likely that we'll be the exlusive user. + */ + return (fault_flags & FAULT_FLAG_WRITE) && !PageKsm(page) && + page_count(page) == 2; +} + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -3599,21 +3618,39 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_release; } - /* - * Make sure try_to_free_swap or reuse_swap_page or swapoff did not - * release the swapcache from under us. The page pin, and pte_same - * test below, are not enough to exclude that. Even if it is still - * swapcache, we need to check that the page's swap has not changed. - */ - if (unlikely((!PageSwapCache(page) || - page_private(page) != entry.val)) && swapcache) - goto out_page; - - page = ksm_might_need_to_copy(page, vma, vmf->address); - if (unlikely(!page)) { - ret = VM_FAULT_OOM; - page = swapcache; - goto out_page; + if (swapcache) { + /* + * Make sure try_to_free_swap or reuse_swap_page or swapoff did + * not release the swapcache from under us. The page pin, and + * pte_same test below, are not enough to exclude that. Even if + * it is still swapcache, we need to check that the page's swap + * has not changed. + */ + if (unlikely(!PageSwapCache(page) || + page_private(page) != entry.val)) + goto out_page; + + /* + * KSM sometimes has to copy on read faults, for example, if + * page->index of !PageKSM() pages would be nonlinear inside the + * anon VMA -- PageKSM() is lost on actual swapout. + */ + page = ksm_might_need_to_copy(page, vma, vmf->address); + if (unlikely(!page)) { + ret = VM_FAULT_OOM; + page = swapcache; + goto out_page; + } + + /* + * If we want to map a page that's in the swapcache writable, we + * have to detect via the refcount if we're really the exclusive + * owner. Try removing the extra reference from the local LRU + * pagevecs if required. + */ + if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache && + !PageKsm(page) && !PageLRU(page)) + lru_add_drain(); } cgroup_throttle_swaprate(page, GFP_KERNEL); @@ -3632,19 +3669,25 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } /* - * The page isn't present yet, go ahead with the fault. - * - * Be careful about the sequence of operations here. - * To get its accounting right, reuse_swap_page() must be called - * while the page is counted on swap but not yet in mapcount i.e. - * before page_add_anon_rmap() and swap_free(); try_to_free_swap() - * must be called after the swap_free(), or it will never succeed. + * Remove the swap entry and conditionally try to free up the swapcache. + * We're already holding a reference on the page but haven't mapped it + * yet. */ + swap_free(entry); + if (should_try_to_free_swap(page, vma, vmf->flags)) + try_to_free_swap(page); inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { + + /* + * Same logic as in do_wp_page(); however, optimize for fresh pages + * that are certainly not shared because we just allocated them without + * exposing them to the swapcache. + */ + if ((vmf->flags & FAULT_FLAG_WRITE) && !PageKsm(page) && + (page != swapcache || page_count(page) == 1)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; @@ -3670,10 +3713,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); - swap_free(entry); - if (mem_cgroup_swap_full(page) || - (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) - try_to_free_swap(page); unlock_page(page); if (page != swapcache && swapcache) { /* @@ -3947,7 +3986,8 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); - page_add_file_rmap(page, true); + page_add_file_rmap(page, vma, true); + /* * deposit and withdraw with pmd lock held */ @@ -3996,7 +4036,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) lru_cache_add_inactive_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); - page_add_file_rmap(page, false); + page_add_file_rmap(page, vma, false); } set_pte_at(vma->vm_mm, addr, vmf->pte, entry); } @@ -5444,6 +5484,8 @@ long copy_huge_page_from_user(struct page *dst_page, if (rc) break; + flush_dcache_page(subpage); + cond_resched(); } return ret_val; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2a9627dc784c3..ce68098832aa9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -295,12 +295,6 @@ struct page *pfn_to_online_page(unsigned long pfn) } EXPORT_SYMBOL_GPL(pfn_to_online_page); -/* - * Reasonably generic function for adding memory. It is - * expected that archs that support memory hotplug will - * call this function after deciding the zone to which to - * add the new pages. - */ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, struct mhp_params *params) { @@ -829,7 +823,7 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn struct pglist_data *pgdat = NODE_DATA(nid); int zid; - for (zid = 0; zid <= ZONE_NORMAL; zid++) { + for (zid = 0; zid < ZONE_NORMAL; zid++) { struct zone *zone = &pgdat->node_zones[zid]; if (zone_intersects(zone, start_pfn, nr_pages)) @@ -1162,43 +1156,20 @@ static void reset_node_present_pages(pg_data_t *pgdat) } /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -static pg_data_t __ref *hotadd_new_pgdat(int nid) +static pg_data_t __ref *hotadd_init_pgdat(int nid) { struct pglist_data *pgdat; + /* + * NODE_DATA is preallocated (free_area_init) but its internal + * state is not allocated completely. Add missing pieces. + * Completely offline nodes stay around and they just need + * reintialization. + */ pgdat = NODE_DATA(nid); - if (!pgdat) { - pgdat = arch_alloc_nodedata(nid); - if (!pgdat) - return NULL; - - pgdat->per_cpu_nodestats = - alloc_percpu(struct per_cpu_nodestat); - arch_refresh_nodedata(nid, pgdat); - } else { - int cpu; - /* - * Reset the nr_zones, order and highest_zoneidx before reuse. - * Note that kswapd will init kswapd_highest_zoneidx properly - * when it starts in the near future. - */ - pgdat->nr_zones = 0; - pgdat->kswapd_order = 0; - pgdat->kswapd_highest_zoneidx = 0; - for_each_online_cpu(cpu) { - struct per_cpu_nodestat *p; - - p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu); - memset(p, 0, sizeof(*p)); - } - } - - /* we can use NODE_DATA(nid) from here */ - pgdat->node_id = nid; - pgdat->node_start_pfn = 0; /* init node's zones as empty zones, we don't have any present pages.*/ - free_area_init_core_hotplug(nid); + free_area_init_core_hotplug(pgdat); /* * The node we allocated has no zone fallback lists. For avoiding @@ -1210,6 +1181,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid) * When memory is hot-added, all the memory is in offline state. So * clear all zones' present_pages because they will be updated in * online_pages() and offline_pages(). + * TODO: should be in free_area_init_core_hotplug? */ reset_node_managed_pages(pgdat); reset_node_present_pages(pgdat); @@ -1217,16 +1189,6 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid) return pgdat; } -static void rollback_node_hotadd(int nid) -{ - pg_data_t *pgdat = NODE_DATA(nid); - - arch_refresh_nodedata(nid, NULL); - free_percpu(pgdat->per_cpu_nodestats); - arch_free_nodedata(pgdat); -} - - /* * __try_online_node - online a node if offlined * @nid: the node ID @@ -1246,7 +1208,7 @@ static int __try_online_node(int nid, bool set_node_online) if (node_online(nid)) return 0; - pgdat = hotadd_new_pgdat(nid); + pgdat = hotadd_init_pgdat(nid); if (!pgdat) { pr_err("Cannot online node %d due to NULL pgdat\n", nid); ret = -ENOMEM; @@ -1327,7 +1289,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size) * populate a single PMD. */ return memmap_on_memory && - !hugetlb_free_vmemmap_enabled && + !hugetlb_free_vmemmap_enabled() && IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) && size == memory_block_size_bytes() && IS_ALIGNED(vmemmap_size, PMD_SIZE) && @@ -1445,9 +1407,6 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) return ret; error: - /* rollback pgdat allocation and others */ - if (new_node) - rollback_node_hotadd(nid); if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) memblock_remove(start, size); error_mem_hotplug_end: @@ -2004,6 +1963,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, return 0; failed_removal_isolated: + /* pushback to free area */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); memory_notify(MEM_CANCEL_OFFLINE, &arg); failed_removal_pcplists_disabled: @@ -2014,7 +1974,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, (unsigned long long) start_pfn << PAGE_SHIFT, ((unsigned long long) end_pfn << PAGE_SHIFT) - 1, reason); - /* pushback to free area */ mem_hotplug_done(); return ret; } @@ -2046,12 +2005,12 @@ static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg) return mem->nr_vmemmap_pages; } -static int check_cpu_on_node(pg_data_t *pgdat) +static int check_cpu_on_node(int nid) { int cpu; for_each_present_cpu(cpu) { - if (cpu_to_node(cpu) == pgdat->node_id) + if (cpu_to_node(cpu) == nid) /* * the cpu on this node isn't removed, and we can't * offline this node. @@ -2085,7 +2044,6 @@ static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg) */ void try_offline_node(int nid) { - pg_data_t *pgdat = NODE_DATA(nid); int rc; /* @@ -2093,7 +2051,7 @@ void try_offline_node(int nid) * offline it. A node spans memory after move_pfn_range_to_zone(), * e.g., after the memory block was onlined. */ - if (pgdat->node_spanned_pages) + if (node_spanned_pages(nid)) return; /* @@ -2105,7 +2063,7 @@ void try_offline_node(int nid) if (rc) return; - if (check_cpu_on_node(pgdat)) + if (check_cpu_on_node(nid)) return; /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 028e8dd82b442..7c852793d9e85 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -123,7 +123,7 @@ enum zone_type policy_zone = 0; * run-time system-wide default policy => local allocation */ static struct mempolicy default_policy = { - .refcnt = ATOMIC_INIT(1), /* never free it */ + .refcnt = { ATOMIC_INIT(1), }, /* never free it */ .mode = MPOL_LOCAL, }; @@ -295,7 +295,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); - atomic_set(&policy->refcnt, 1); + refcount_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; policy->home_node = NUMA_NO_NODE; @@ -306,7 +306,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, /* Slow path of a mpol destructor. */ void __mpol_put(struct mempolicy *p) { - if (!atomic_dec_and_test(&p->refcnt)) + if (!refcount_dec_and_test(&p->refcnt)) return; kmem_cache_free(policy_cache, p); } @@ -907,17 +907,14 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) static int lookup_node(struct mm_struct *mm, unsigned long addr) { struct page *p = NULL; - int err; + int ret; - int locked = 1; - err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked); - if (err > 0) { - err = page_to_nid(p); + ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p); + if (ret > 0) { + ret = page_to_nid(p); put_page(p); } - if (locked) - mmap_read_unlock(mm); - return err; + return ret; } /* Retrieve NUMA policy */ @@ -968,14 +965,14 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { /* - * Take a refcount on the mpol, lookup_node() - * will drop the mmap_lock, so after calling - * lookup_node() only "pol" remains valid, "vma" - * is stale. + * Take a refcount on the mpol, because we are about to + * drop the mmap_lock, after which only "pol" remains + * valid, "vma" is stale. */ pol_refcount = pol; vma = NULL; mpol_get(pol); + mmap_read_unlock(mm); err = lookup_node(mm, addr); if (err < 0) goto out; @@ -2409,7 +2406,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) nodemask_t mems = cpuset_mems_allowed(current); mpol_rebind_policy(new, &mems); } - atomic_set(&new->refcnt, 1); + refcount_set(&new->refcnt, 1); return new; } @@ -2706,7 +2703,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, goto alloc_new; *mpol_new = *n->policy; - atomic_set(&mpol_new->refcnt, 1); + refcount_set(&mpol_new->refcnt, 1); sp_node_init(n_new, end, n->end, mpol_new); n->end = start; sp_insert(sp, n_new); @@ -2900,7 +2897,7 @@ void __init numa_policy_init(void) for_each_node(nid) { preferred_node_policy[nid] = (struct mempolicy) { - .refcnt = ATOMIC_INIT(1), + .refcnt = { ATOMIC_INIT(1), }, .mode = MPOL_PREFERRED, .flags = MPOL_F_MOF | MPOL_F_MORON, .nodes = nodemask_of_node(nid), diff --git a/mm/memremap.c b/mm/memremap.c index 6aa5f0c2d11fd..71b8d42d820c0 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -44,6 +44,7 @@ EXPORT_SYMBOL(devmap_managed_key); static void devmap_managed_enable_put(struct dev_pagemap *pgmap) { if (pgmap->type == MEMORY_DEVICE_PRIVATE || + pgmap->type == MEMORY_DEVICE_COHERENT || pgmap->type == MEMORY_DEVICE_FS_DAX) static_branch_dec(&devmap_managed_key); } @@ -51,6 +52,7 @@ static void devmap_managed_enable_put(struct dev_pagemap *pgmap) static void devmap_managed_enable_get(struct dev_pagemap *pgmap) { if (pgmap->type == MEMORY_DEVICE_PRIVATE || + pgmap->type == MEMORY_DEVICE_COHERENT || pgmap->type == MEMORY_DEVICE_FS_DAX) static_branch_inc(&devmap_managed_key); } @@ -115,6 +117,26 @@ static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id) pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift; } +/* + * This returns true if the page is reserved by ZONE_DEVICE driver. + */ +bool pfn_zone_device_reserved(unsigned long pfn) +{ + struct dev_pagemap *pgmap; + struct vmem_altmap *altmap; + bool ret = false; + + pgmap = get_dev_pagemap(pfn, NULL); + if (!pgmap) + return ret; + altmap = pgmap_altmap(pgmap); + if (altmap && pfn < (altmap->base_pfn + altmap->reserve)) + ret = true; + put_dev_pagemap(pgmap); + + return ret; +} + #define for_each_device_pfn(pfn, map, i) \ for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \ pfn = pfn_next(map, pfn)) @@ -282,7 +304,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, return 0; err_add_memory: - kasan_remove_zero_shadow(__va(range->start), range_len(range)); + if (!is_private) + kasan_remove_zero_shadow(__va(range->start), range_len(range)); err_kasan: untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range)); err_pfn_remap: @@ -327,6 +350,16 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) return ERR_PTR(-EINVAL); } break; + case MEMORY_DEVICE_COHERENT: + if (!pgmap->ops->page_free) { + WARN(1, "Missing page_free method\n"); + return ERR_PTR(-EINVAL); + } + if (!pgmap->owner) { + WARN(1, "Missing owner\n"); + return ERR_PTR(-EINVAL); + } + break; case MEMORY_DEVICE_FS_DAX: if (!IS_ENABLED(CONFIG_ZONE_DEVICE) || IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { @@ -469,7 +502,7 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap); void free_devmap_managed_page(struct page *page) { /* notify page idle for dax */ - if (!is_device_private_page(page)) { + if (!is_dev_private_or_coherent_page(page)) { wake_up_var(&page->_refcount); return; } diff --git a/mm/migrate.c b/mm/migrate.c index c7da064b4781b..23b5ac8892b4e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -51,10 +51,10 @@ #include #include #include +#include #include -#define CREATE_TRACE_POINTS #include #include "internal.h" @@ -248,14 +248,14 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (PageAnon(new)) page_add_anon_rmap(new, vma, pvmw.address, false); else - page_add_file_rmap(new, false); + page_add_file_rmap(new, vma, false); set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } - if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) - mlock_vma_page(new); + if (vma->vm_flags & VM_LOCKED) + mlock_page_drain(smp_processor_id()); - if (PageTransHuge(page) && PageMlocked(page)) - clear_page_mlock(page); + trace_remove_migration_pte(pvmw.address, pte_val(pte), + compound_order(new)); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); @@ -345,7 +345,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page) * Device private pages have an extra refcount as they are * ZONE_DEVICE pages. */ - expected_count += is_device_private_page(page); + expected_count += is_dev_private_or_coherent_page(page); if (mapping) expected_count += compound_nr(page) + page_has_private(page); @@ -917,8 +917,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, page->mapping = NULL; if (likely(!is_zone_device_page(newpage))) - flush_dcache_page(newpage); - + flush_dcache_folio(page_folio(newpage)); } out: return rc; @@ -1037,6 +1036,21 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (!page_mapped(page)) rc = move_to_new_page(newpage, page, mode); + /* + * When successful, push newpage to LRU immediately: so that if it + * turns out to be an mlocked page, remove_migration_ptes() will + * automatically build up the correct newpage->mlock_count for it. + * + * We would like to do something similar for the old page, when + * unsuccessful, and other cases when a page has been temporarily + * isolated from the unevictable LRU: but this case is the easiest. + */ + if (rc == MIGRATEPAGE_SUCCESS) { + lru_cache_add(newpage); + if (page_was_mapped) + lru_add_drain(); + } + if (page_was_mapped) remove_migration_ptes(page, rc == MIGRATEPAGE_SUCCESS ? newpage : page, false); @@ -1050,20 +1064,12 @@ static int __unmap_and_move(struct page *page, struct page *newpage, unlock_page(page); out: /* - * If migration is successful, decrease refcount of the newpage + * If migration is successful, decrease refcount of the newpage, * which will not free the page because new page owner increased - * refcounter. As well, if it is LRU page, add the page to LRU - * list in here. Use the old state of the isolated source page to - * determine if we migrated a LRU page. newpage was already unlocked - * and possibly modified by its owner - don't rely on the page - * state. + * refcounter. */ - if (rc == MIGRATEPAGE_SUCCESS) { - if (unlikely(!is_lru)) - put_page(newpage); - else - putback_lru_page(newpage); - } + if (rc == MIGRATEPAGE_SUCCESS) + put_page(newpage); return rc; } @@ -1761,6 +1767,13 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, continue; } + /* + * The move_pages() man page does not have an -EEXIST choice, so + * use -EFAULT instead. + */ + if (err == -EEXIST) + err = -EFAULT; + /* * If the page is already on the target node (!err), store the * node, otherwise, store the err. @@ -2034,16 +2047,30 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { int page_lru; int nr_pages = thp_nr_pages(page); + int order = compound_order(page); - VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); + VM_BUG_ON_PAGE(order && !PageTransHuge(page), page); /* Do not migrate THP mapped by multiple processes */ if (PageTransHuge(page) && total_mapcount(page) > 1) return 0; /* Avoid migrating to a node that is nearly full */ - if (!migrate_balanced_pgdat(pgdat, nr_pages)) + if (!migrate_balanced_pgdat(pgdat, nr_pages)) { + int z; + + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) || + !numa_demotion_enabled) + return 0; + if (next_demotion_node(pgdat->node_id) == NUMA_NO_NODE) + return 0; + for (z = pgdat->nr_zones - 1; z >= 0; z--) { + if (populated_zone(pgdat->node_zones + z)) + break; + } + wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE); return 0; + } if (isolate_lru_page(page)) return 0; @@ -2072,6 +2099,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, pg_data_t *pgdat = NODE_DATA(node); int isolated; int nr_remaining; + int nr_succeeded; LIST_HEAD(migratepages); new_page_t *new; bool compound; @@ -2110,7 +2138,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, list_add(&page->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, *new, NULL, node, - MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL); + MIGRATE_ASYNC, MR_NUMA_MISPLACED, + &nr_succeeded); if (nr_remaining) { if (!list_empty(&migratepages)) { list_del(&page->lru); @@ -2119,8 +2148,13 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, putback_lru_page(page); } isolated = 0; - } else - count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_pages); + } + if (nr_succeeded) { + count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded); + if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node)) + mod_node_page_state(NODE_DATA(node), PGPROMOTE_SUCCESS, + nr_succeeded); + } BUG_ON(!list_empty(&migratepages)); return isolated; @@ -2264,15 +2298,21 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, if (is_writable_device_private_entry(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { - if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) - goto next; pfn = pte_pfn(pte); - if (is_zero_pfn(pfn)) { + if (is_zero_pfn(pfn) && + (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) { mpfn = MIGRATE_PFN_MIGRATE; migrate->cpages++; goto next; } page = vm_normal_page(migrate->vma, addr, pte); + if (page && !is_zone_device_page(page) && + !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) + goto next; + else if (page && is_device_coherent_page(page) && + (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) || + page->pgmap->owner != migrate->pgmap_owner)) + goto next; mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; } @@ -2331,7 +2371,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, * drop page refcount. Page won't be freed, as we took * a reference just above. */ - page_remove_rmap(page, false); + page_remove_rmap(page, vma, false); put_page(page); if (pte_present(pte)) @@ -2573,24 +2613,24 @@ int migrate_vma_setup(struct migrate_vma *args) args->start &= PAGE_MASK; args->end &= PAGE_MASK; - if (!args->vma || is_vm_hugetlb_page(args->vma) || - (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) - return -EINVAL; - if (nr_pages <= 0) - return -EINVAL; - if (args->start < args->vma->vm_start || - args->start >= args->vma->vm_end) - return -EINVAL; - if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) - return -EINVAL; if (!args->src || !args->dst) return -EINVAL; - - memset(args->src, 0, sizeof(*args->src) * nr_pages); - args->cpages = 0; - args->npages = 0; - - migrate_vma_collect(args); + if (args->vma) { + if (is_vm_hugetlb_page(args->vma) || + (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) + return -EINVAL; + if (args->start < args->vma->vm_start || + args->start >= args->vma->vm_end) + return -EINVAL; + if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) + return -EINVAL; + + memset(args->src, 0, sizeof(*args->src) * nr_pages); + args->cpages = 0; + args->npages = 0; + + migrate_vma_collect(args); + } if (args->cpages) migrate_vma_unmap(args); @@ -2611,7 +2651,7 @@ EXPORT_SYMBOL(migrate_vma_setup); * handle_pte_fault() * do_anonymous_page() * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE - * private page. + * private or coherent page. */ static void migrate_vma_insert_page(struct migrate_vma *migrate, unsigned long addr, @@ -2676,25 +2716,24 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, */ __SetPageUptodate(page); - if (is_zone_device_page(page)) { - if (is_device_private_page(page)) { - swp_entry_t swp_entry; + if (is_device_private_page(page)) { + swp_entry_t swp_entry; - if (vma->vm_flags & VM_WRITE) - swp_entry = make_writable_device_private_entry( - page_to_pfn(page)); - else - swp_entry = make_readable_device_private_entry( - page_to_pfn(page)); - entry = swp_entry_to_pte(swp_entry); - } else { - /* - * For now we only support migrating to un-addressable - * device memory. - */ - pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); - goto abort; - } + if (vma->vm_flags & VM_WRITE) + swp_entry = make_writable_device_private_entry( + page_to_pfn(page)); + else + swp_entry = make_readable_device_private_entry( + page_to_pfn(page)); + entry = swp_entry_to_pte(swp_entry); + } else if (is_zone_device_page(page) && + !is_device_coherent_page(page)) { + /* + * We support migrating to private and coherent types + * for device zone memory. + */ + pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); + goto abort; } else { entry = mk_pte(page, vma->vm_page_prot); if (vma->vm_flags & VM_WRITE) @@ -2776,7 +2815,7 @@ void migrate_vma_pages(struct migrate_vma *migrate) continue; } - if (!page) { + if (!page && migrate->vma) { if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) continue; if (!notified) { @@ -2796,10 +2835,10 @@ void migrate_vma_pages(struct migrate_vma *migrate) mapping = page_mapping(page); if (is_zone_device_page(newpage)) { - if (is_device_private_page(newpage)) { + if (is_dev_private_or_coherent_page(newpage)) { /* - * For now only support private anonymous when - * migrating to un-addressable device memory. + * For now only support private and coherent + * anonymous when migrating to device memory. */ if (mapping) { migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; @@ -3082,18 +3121,21 @@ static int establish_migrate_target(int node, nodemask_t *used, if (best_distance != -1) { val = node_distance(node, migration_target); if (val > best_distance) - return NUMA_NO_NODE; + goto out_clear; } index = nd->nr; if (WARN_ONCE(index >= DEMOTION_TARGET_NODES, "Exceeds maximum demotion target nodes\n")) - return NUMA_NO_NODE; + goto out_clear; nd->nodes[index] = migration_target; nd->nr++; return migration_target; +out_clear: + node_clear(migration_target, *used); + return NUMA_NO_NODE; } /* diff --git a/mm/mlock.c b/mm/mlock.c index 8f584eddd3053..d50d48961b228 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,8 @@ #include "internal.h" +static DEFINE_PER_CPU(struct pagevec, mlock_pvec); + bool can_do_mlock(void) { if (rlimit(RLIMIT_MEMLOCK) != 0) @@ -46,441 +49,312 @@ EXPORT_SYMBOL(can_do_mlock); * be placed on the LRU "unevictable" list, rather than the [in]active lists. * The unevictable list is an LRU sibling list to the [in]active lists. * PageUnevictable is set to indicate the unevictable state. - * - * When lazy mlocking via vmscan, it is important to ensure that the - * vma's VM_LOCKED status is not concurrently being modified, otherwise we - * may have mlocked a page that is being munlocked. So lazy mlock must take - * the mmap_lock for read, and verify that the vma really is locked - * (see mm/rmap.c). */ -/* - * LRU accounting for clear_page_mlock() - */ -void clear_page_mlock(struct page *page) +static struct lruvec *__mlock_page(struct page *page, struct lruvec *lruvec) { - int nr_pages; + /* There is nothing more we can do while it's off LRU */ + if (!TestClearPageLRU(page)) + return lruvec; - if (!TestClearPageMlocked(page)) - return; + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); - nr_pages = thp_nr_pages(page); - mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); - count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); - /* - * The previous TestClearPageMlocked() corresponds to the smp_mb() - * in __pagevec_lru_add_fn(). - * - * See __pagevec_lru_add_fn for more explanation. - */ - if (!isolate_lru_page(page)) { - putback_lru_page(page); - } else { + if (unlikely(page_evictable(page))) { /* - * We lost the race. the page already moved to evictable list. + * This is a little surprising, but quite possible: + * PageMlocked must have got cleared already by another CPU. + * Could this page be on the Unevictable LRU? I'm not sure, + * but move it now if so. */ - if (PageUnevictable(page)) - count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); + if (PageUnevictable(page)) { + del_page_from_lru_list(page, lruvec); + ClearPageUnevictable(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGRESCUED, + thp_nr_pages(page)); + } + goto out; } + + if (PageUnevictable(page)) { + if (PageMlocked(page)) + page->mlock_count++; + goto out; + } + + del_page_from_lru_list(page, lruvec); + ClearPageActive(page); + SetPageUnevictable(page); + page->mlock_count = !!PageMlocked(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); +out: + SetPageLRU(page); + return lruvec; } -/* - * Mark page as mlocked if not already. - * If page on LRU, isolate and putback to move to unevictable list. - */ -void mlock_vma_page(struct page *page) +static struct lruvec *__mlock_new_page(struct page *page, struct lruvec *lruvec) { - /* Serialize with page migration */ - BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); - if (!TestSetPageMlocked(page)) { - int nr_pages = thp_nr_pages(page); + /* As above, this is a little surprising, but possible */ + if (unlikely(page_evictable(page))) + goto out; - mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); - count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); - if (!isolate_lru_page(page)) - putback_lru_page(page); - } + SetPageUnevictable(page); + page->mlock_count = !!PageMlocked(page); + __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); +out: + add_page_to_lru_list(page, lruvec); + SetPageLRU(page); + return lruvec; } -/* - * Finish munlock after successful page isolation - * - * Page must be locked. This is a wrapper for page_mlock() - * and putback_lru_page() with munlock accounting. - */ -static void __munlock_isolated_page(struct page *page) +static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec) { - /* - * Optimization: if the page was mapped just once, that's our mapping - * and we don't need to check all the other vmas. - */ - if (page_mapcount(page) > 1) - page_mlock(page); + int nr_pages = thp_nr_pages(page); + bool isolated = false; - /* Did try_to_unlock() succeed or punt? */ - if (!PageMlocked(page)) - count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page)); + if (!TestClearPageLRU(page)) + goto munlock; - putback_lru_page(page); + isolated = true; + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); + + if (PageUnevictable(page)) { + /* Then mlock_count is maintained, but might undercount */ + if (page->mlock_count) + page->mlock_count--; + if (page->mlock_count) + goto out; + } + /* else assume that was the last mlock: reclaim will fix it if not */ + +munlock: + if (TestClearPageMlocked(page)) { + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + if (isolated || !PageUnevictable(page)) + __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); + else + __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); + } + + /* page_evictable() has to be checked *after* clearing Mlocked */ + if (isolated && PageUnevictable(page) && page_evictable(page)) { + del_page_from_lru_list(page, lruvec); + ClearPageUnevictable(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); + } +out: + if (isolated) + SetPageLRU(page); + return lruvec; } /* - * Accounting for page isolation fail during munlock - * - * Performs accounting when page isolation fails in munlock. There is nothing - * else to do because it means some other task has already removed the page - * from the LRU. putback_lru_page() will take care of removing the page from - * the unevictable list, if necessary. vmscan [page_referenced()] will move - * the page back to the unevictable list if some other vma has it mlocked. + * Flags held in the low bits of a struct page pointer on the mlock_pvec. */ -static void __munlock_isolation_failed(struct page *page) -{ - int nr_pages = thp_nr_pages(page); - - if (PageUnevictable(page)) - __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); - else - __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); -} +#define LRU_PAGE 0x1 +#define NEW_PAGE 0x2 +#define mlock_lru(page) ((struct page *)((unsigned long)page + LRU_PAGE)) +#define mlock_new(page) ((struct page *)((unsigned long)page + NEW_PAGE)) -/** - * munlock_vma_page - munlock a vma page - * @page: page to be unlocked, either a normal page or THP page head - * - * returns the size of the page as a page mask (0 for normal page, - * HPAGE_PMD_NR - 1 for THP head page) - * - * called from munlock()/munmap() path with page supposedly on the LRU. - * When we munlock a page, because the vma where we found the page is being - * munlock()ed or munmap()ed, we want to check whether other vmas hold the - * page locked so that we can leave it on the unevictable lru list and not - * bother vmscan with it. However, to walk the page's rmap list in - * page_mlock() we must isolate the page from the LRU. If some other - * task has removed the page from the LRU, we won't be able to do that. - * So we clear the PageMlocked as we might not get another chance. If we - * can't isolate the page, we leave it for putback_lru_page() and vmscan - * [page_referenced()/try_to_unmap()] to deal with. +/* + * mlock_pagevec() is derived from pagevec_lru_move_fn(): + * perhaps that can make use of such page pointer flags in future, + * but for now just keep it for mlock. We could use three separate + * pagevecs instead, but one feels better (munlocking a full pagevec + * does not need to drain mlocking pagevecs first). */ -unsigned int munlock_vma_page(struct page *page) +static void mlock_pagevec(struct pagevec *pvec) { - int nr_pages; - - /* For page_mlock() and to serialize with page migration */ - BUG_ON(!PageLocked(page)); - VM_BUG_ON_PAGE(PageTail(page), page); + struct lruvec *lruvec = NULL; + unsigned long mlock; + struct page *page; + int i; - if (!TestClearPageMlocked(page)) { - /* Potentially, PTE-mapped THP: do not skip the rest PTEs */ - return 0; + for (i = 0; i < pagevec_count(pvec); i++) { + page = pvec->pages[i]; + mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE); + page = (struct page *)((unsigned long)page - mlock); + pvec->pages[i] = page; + + if (mlock & LRU_PAGE) + lruvec = __mlock_page(page, lruvec); + else if (mlock & NEW_PAGE) + lruvec = __mlock_new_page(page, lruvec); + else + lruvec = __munlock_page(page, lruvec); } - nr_pages = thp_nr_pages(page); - mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + if (lruvec) + unlock_page_lruvec_irq(lruvec); + release_pages(pvec->pages, pvec->nr); + pagevec_reinit(pvec); +} - if (!isolate_lru_page(page)) - __munlock_isolated_page(page); - else - __munlock_isolation_failed(page); +void mlock_page_drain(int cpu) +{ + struct pagevec *pvec; - return nr_pages - 1; + pvec = &per_cpu(mlock_pvec, cpu); + if (pagevec_count(pvec)) + mlock_pagevec(pvec); } -/* - * convert get_user_pages() return value to posix mlock() error - */ -static int __mlock_posix_error_return(long retval) +bool need_mlock_page_drain(int cpu) { - if (retval == -EFAULT) - retval = -ENOMEM; - else if (retval == -ENOMEM) - retval = -EAGAIN; - return retval; + return pagevec_count(&per_cpu(mlock_pvec, cpu)); } -/* - * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec() - * - * The fast path is available only for evictable pages with single mapping. - * Then we can bypass the per-cpu pvec and get better performance. - * when mapcount > 1 we need page_mlock() which can fail. - * when !page_evictable(), we need the full redo logic of putback_lru_page to - * avoid leaving evictable page in unevictable list. - * - * In case of success, @page is added to @pvec and @pgrescued is incremented - * in case that the page was previously unevictable. @page is also unlocked. +/** + * mlock_page - mlock a page already on (or temporarily off) LRU + * @page - page to be mlocked, either a normal page or a THP head. */ -static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec, - int *pgrescued) +void mlock_page(struct page *page) { - VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(!PageLocked(page), page); + struct pagevec *pvec = &get_cpu_var(mlock_pvec); - if (page_mapcount(page) <= 1 && page_evictable(page)) { - pagevec_add(pvec, page); - if (TestClearPageUnevictable(page)) - (*pgrescued)++; - unlock_page(page); - return true; + if (!TestSetPageMlocked(page)) { + int nr_pages = thp_nr_pages(page); + + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); } - return false; + get_page(page); + if (!pagevec_add(pvec, mlock_lru(page)) || + PageHead(page) || lru_cache_disabled()) + mlock_pagevec(pvec); + put_cpu_var(mlock_pvec); } -/* - * Putback multiple evictable pages to the LRU - * - * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of - * the pages might have meanwhile become unevictable but that is OK. +/** + * mlock_new_page - mlock a newly allocated page not yet on LRU + * @page - page to be mlocked, either a normal page or a THP head. */ -static void __putback_lru_fast(struct pagevec *pvec, int pgrescued) +void mlock_new_page(struct page *page) { - count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec)); - /* - *__pagevec_lru_add() calls release_pages() so we don't call - * put_page() explicitly - */ - __pagevec_lru_add(pvec); - count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); + struct pagevec *pvec = &get_cpu_var(mlock_pvec); + int nr_pages = thp_nr_pages(page); + + SetPageMlocked(page); + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); + + get_page(page); + if (!pagevec_add(pvec, mlock_new(page)) || + PageHead(page) || lru_cache_disabled()) + mlock_pagevec(pvec); + put_cpu_var(mlock_pvec); } -/* - * Munlock a batch of pages from the same zone - * - * The work is split to two main phases. First phase clears the Mlocked flag - * and attempts to isolate the pages, all under a single zone lru lock. - * The second phase finishes the munlock only for pages where isolation - * succeeded. - * - * Note that the pagevec may be modified during the process. +/** + * munlock_page - munlock a page + * @page - page to be munlocked, either a normal page or a THP head. */ -static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) +void munlock_page(struct page *page) { - int i; - int nr = pagevec_count(pvec); - int delta_munlocked = -nr; - struct pagevec pvec_putback; - struct lruvec *lruvec = NULL; - int pgrescued = 0; - - pagevec_init(&pvec_putback); - - /* Phase 1: page isolation */ - for (i = 0; i < nr; i++) { - struct page *page = pvec->pages[i]; - struct folio *folio = page_folio(page); - - if (TestClearPageMlocked(page)) { - /* - * We already have pin from follow_page_mask() - * so we can spare the get_page() here. - */ - if (TestClearPageLRU(page)) { - lruvec = folio_lruvec_relock_irq(folio, lruvec); - del_page_from_lru_list(page, lruvec); - continue; - } else - __munlock_isolation_failed(page); - } else { - delta_munlocked++; - } - - /* - * We won't be munlocking this page in the next phase - * but we still need to release the follow_page_mask() - * pin. We cannot do it under lru_lock however. If it's - * the last pin, __page_cache_release() would deadlock. - */ - pagevec_add(&pvec_putback, pvec->pages[i]); - pvec->pages[i] = NULL; - } - if (lruvec) { - __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); - unlock_page_lruvec_irq(lruvec); - } else if (delta_munlocked) { - mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); - } - - /* Now we can release pins of pages that we are not munlocking */ - pagevec_release(&pvec_putback); - - /* Phase 2: page munlock */ - for (i = 0; i < nr; i++) { - struct page *page = pvec->pages[i]; - - if (page) { - lock_page(page); - if (!__putback_lru_fast_prepare(page, &pvec_putback, - &pgrescued)) { - /* - * Slow path. We don't want to lose the last - * pin before unlock_page() - */ - get_page(page); /* for putback_lru_page() */ - __munlock_isolated_page(page); - unlock_page(page); - put_page(page); /* from follow_page_mask() */ - } - } - } + struct pagevec *pvec = &get_cpu_var(mlock_pvec); /* - * Phase 3: page putback for pages that qualified for the fast path - * This will also call put_page() to return pin from follow_page_mask() + * TestClearPageMlocked(page) must be left to __munlock_page(), + * which will check whether the page is multiply mlocked. */ - if (pagevec_count(&pvec_putback)) - __putback_lru_fast(&pvec_putback, pgrescued); + + get_page(page); + if (!pagevec_add(pvec, page) || + PageHead(page) || lru_cache_disabled()) + mlock_pagevec(pvec); + put_cpu_var(mlock_pvec); } -/* - * Fill up pagevec for __munlock_pagevec using pte walk - * - * The function expects that the struct page corresponding to @start address is - * a non-TPH page already pinned and in the @pvec, and that it belongs to @zone. - * - * The rest of @pvec is filled by subsequent pages within the same pmd and same - * zone, as long as the pte's are present and vm_normal_page() succeeds. These - * pages also get pinned. - * - * Returns the address of the next page that should be scanned. This equals - * @start + PAGE_SIZE when no page could be added by the pte walk. - */ -static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, - struct vm_area_struct *vma, struct zone *zone, - unsigned long start, unsigned long end) +static int mlock_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) + { - pte_t *pte; + struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; + pte_t *start_pte, *pte; + struct page *page; - /* - * Initialize pte walk starting at the already pinned page where we - * are sure that there is a pte, as it was pinned under the same - * mmap_lock write op. - */ - pte = get_locked_pte(vma->vm_mm, start, &ptl); - /* Make sure we do not cross the page table boundary */ - end = pgd_addr_end(start, end); - end = p4d_addr_end(start, end); - end = pud_addr_end(start, end); - end = pmd_addr_end(start, end); - - /* The page next to the pinned page is the first we will try to get */ - start += PAGE_SIZE; - while (start < end) { - struct page *page = NULL; - pte++; - if (pte_present(*pte)) - page = vm_normal_page(vma, start, *pte); - /* - * Break if page could not be obtained or the page's node+zone does not - * match - */ - if (!page || page_zone(page) != zone) - break; + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { + if (!pmd_present(*pmd)) + goto out; + if (is_huge_zero_pmd(*pmd)) + goto out; + page = pmd_page(*pmd); + if (vma->vm_flags & VM_LOCKED) + mlock_page(page); + else + munlock_page(page); + goto out; + } - /* - * Do not use pagevec for PTE-mapped THP, - * munlock_vma_pages_range() will handle them. - */ + start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { + if (!pte_present(*pte)) + continue; + page = vm_normal_page(vma, addr, *pte); + if (!page) + continue; if (PageTransCompound(page)) - break; - - get_page(page); - /* - * Increase the address that will be returned *before* the - * eventual break due to pvec becoming full by adding the page - */ - start += PAGE_SIZE; - if (pagevec_add(pvec, page) == 0) - break; + continue; + if (vma->vm_flags & VM_LOCKED) + mlock_page(page); + else + munlock_page(page); } - pte_unmap_unlock(pte, ptl); - return start; + pte_unmap(start_pte); +out: + spin_unlock(ptl); + cond_resched(); + return 0; } /* - * munlock_vma_pages_range() - munlock all pages in the vma range.' - * @vma - vma containing range to be munlock()ed. + * mlock_vma_pages_range() - mlock any pages already in the range, + * or munlock all pages in the range. + * @vma - vma containing range to be mlock()ed or munlock()ed * @start - start address in @vma of the range - * @end - end of range in @vma. - * - * For mremap(), munmap() and exit(). - * - * Called with @vma VM_LOCKED. + * @end - end of range in @vma + * @newflags - the new set of flags for @vma. * - * Returns with VM_LOCKED cleared. Callers must be prepared to - * deal with this. - * - * We don't save and restore VM_LOCKED here because pages are - * still on lru. In unmap path, pages might be scanned by reclaim - * and re-mlocked by page_mlock/try_to_unmap before we unmap and - * free them. This will result in freeing mlocked pages. + * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED; + * called for munlock() and munlockall(), to clear VM_LOCKED from @vma. */ -void munlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) +static void mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, vm_flags_t newflags) { - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + static const struct mm_walk_ops mlock_walk_ops = { + .pmd_entry = mlock_pte_range, + }; - while (start < end) { - struct page *page; - unsigned int page_mask = 0; - unsigned long page_increm; - struct pagevec pvec; - struct zone *zone; + /* + * There is a slight chance that concurrent page migration, + * or page reclaim finding a page of this now-VM_LOCKED vma, + * will call mlock_vma_page() and raise page's mlock_count: + * double counting, leaving the page unevictable indefinitely. + * Communicate this danger to mlock_vma_page() with VM_IO, + * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas. + * mmap_lock is held in write mode here, so this weird + * combination should not be visible to others. + */ + if (newflags & VM_LOCKED) + newflags |= VM_IO; + WRITE_ONCE(vma->vm_flags, newflags); - pagevec_init(&pvec); - /* - * Although FOLL_DUMP is intended for get_dump_page(), - * it just so happens that its special treatment of the - * ZERO_PAGE (returning an error instead of doing get_page) - * suits munlock very well (and if somehow an abnormal page - * has sneaked into the range, we won't oops here: great). - */ - page = follow_page(vma, start, FOLL_GET | FOLL_DUMP); - - if (page && !IS_ERR(page)) { - if (PageTransTail(page)) { - VM_BUG_ON_PAGE(PageMlocked(page), page); - put_page(page); /* follow_page_mask() */ - } else if (PageTransHuge(page)) { - lock_page(page); - /* - * Any THP page found by follow_page_mask() may - * have gotten split before reaching - * munlock_vma_page(), so we need to compute - * the page_mask here instead. - */ - page_mask = munlock_vma_page(page); - unlock_page(page); - put_page(page); /* follow_page_mask() */ - } else { - /* - * Non-huge pages are handled in batches via - * pagevec. The pin from follow_page_mask() - * prevents them from collapsing by THP. - */ - pagevec_add(&pvec, page); - zone = page_zone(page); - - /* - * Try to fill the rest of pagevec using fast - * pte walk. This will also update start to - * the next page to process. Then munlock the - * pagevec. - */ - start = __munlock_pagevec_fill(&pvec, vma, - zone, start, end); - __munlock_pagevec(&pvec, zone); - goto next; - } - } - page_increm = 1 + page_mask; - start += page_increm * PAGE_SIZE; -next: - cond_resched(); + lru_add_drain(); + walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); + lru_add_drain(); + + if (newflags & VM_IO) { + newflags &= ~VM_IO; + WRITE_ONCE(vma->vm_flags, newflags); } } @@ -500,8 +374,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff_t pgoff; int nr_pages; int ret = 0; - int lock = !!(newflags & VM_LOCKED); - vm_flags_t old_flags = vma->vm_flags; + vm_flags_t oldflags = vma->vm_flags; if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || @@ -535,9 +408,9 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, * Keep track of amount of locked VM. */ nr_pages = (end - start) >> PAGE_SHIFT; - if (!lock) + if (!(newflags & VM_LOCKED)) nr_pages = -nr_pages; - else if (old_flags & VM_LOCKED) + else if (oldflags & VM_LOCKED) nr_pages = 0; mm->locked_vm += nr_pages; @@ -547,11 +420,12 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, * set VM_LOCKED, populate_vma_page_range will bring it back. */ - if (lock) + if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { + /* No work to do, and mlocking twice would be wrong */ vma->vm_flags = newflags; - else - munlock_vma_pages_range(vma, start, end); - + } else { + mlock_vma_pages_range(vma, start, end, newflags); + } out: *prev = vma; return ret; @@ -645,6 +519,18 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, return count >> PAGE_SHIFT; } +/* + * convert get_user_pages() return value to posix mlock() error + */ +static int __mlock_posix_error_return(long retval) +{ + if (retval == -EFAULT) + retval = -ENOMEM; + else if (retval == -ENOMEM) + retval = -EAGAIN; + return retval; +} + static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags) { unsigned long locked; diff --git a/mm/mmap.c b/mm/mmap.c index 1e8fdb0b51edd..64b5985b5295c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2674,6 +2674,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, vma->vm_prev = NULL; do { vma_rb_erase(vma, &mm->mm_rb); + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm -= vma_pages(vma); mm->map_count--; tail_vma = vma; vma = vma->vm_next; @@ -2778,22 +2780,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return __split_vma(mm, vma, addr, new_below); } -static inline void -unlock_range(struct vm_area_struct *start, unsigned long limit) -{ - struct mm_struct *mm = start->vm_mm; - struct vm_area_struct *tmp = start; - - while (tmp && tmp->vm_start < limit) { - if (tmp->vm_flags & VM_LOCKED) { - mm->locked_vm -= vma_pages(tmp); - munlock_vma_pages_all(tmp); - } - - tmp = tmp->vm_next; - } -} - /* Munmap is split into 2 main parts -- this part which finds * what needs doing, and the areas themselves, which do the * work. This now handles partial unmappings. @@ -2874,12 +2860,6 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return error; } - /* - * unlock any mlock()ed ranges before detaching vmas - */ - if (mm->locked_vm) - unlock_range(vma, end); - /* Detach vmas from rbtree */ if (!detach_vmas_to_be_unmapped(mm, vma, prev, end)) downgrade = false; @@ -3147,20 +3127,12 @@ void exit_mmap(struct mm_struct *mm) * Nothing can be holding mm->mmap_lock here and the above call * to mmu_notifier_release(mm) ensures mmu notifier callbacks in * __oom_reap_task_mm() will not block. - * - * This needs to be done before calling unlock_range(), - * which clears VM_LOCKED, otherwise the oom reaper cannot - * reliably test it. */ (void)__oom_reap_task_mm(mm); - set_bit(MMF_OOM_SKIP, &mm->flags); } mmap_write_lock(mm); - if (mm->locked_vm) - unlock_range(mm->mmap, ULONG_MAX); - arch_exit_mmap(mm); vma = mm->mmap; diff --git a/mm/mmzone.c b/mm/mmzone.c index eb89d6e018e29..0ae7571e35abb 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -81,6 +81,13 @@ void lruvec_init(struct lruvec *lruvec) for_each_lru(lru) INIT_LIST_HEAD(&lruvec->lists[lru]); + /* + * The "Unevictable LRU" is imaginary: though its size is maintained, + * it is never scanned, and unevictable pages are not threaded on it + * (so that their lru fields can be reused to hold mlock_count). + * Poison its list head, so that any operations on it would crash. + */ + list_del(&lruvec->lists[LRU_UNEVICTABLE]); } #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) @@ -89,13 +96,14 @@ int page_cpupid_xchg_last(struct page *page, int cpupid) unsigned long old_flags, flags; int last_cpupid; + old_flags = READ_ONCE(page->flags); do { - old_flags = flags = page->flags; - last_cpupid = page_cpupid_last(page); + flags = old_flags; + last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; - } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); + } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); return last_cpupid; } diff --git a/mm/mprotect.c b/mm/mprotect.c index 0138dfcdb1d80..2fe03e695c81c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +84,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, */ if (prot_numa) { struct page *page; + int nid; /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) @@ -109,7 +111,16 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * Don't mess with PTEs if page is already on the node * a single-threaded process is running on. */ - if (target_node == page_to_nid(page)) + nid = page_to_nid(page); + if (target_node == nid) + continue; + + /* + * Skip scanning top tier node if normal numa + * balancing is disabled + */ + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && + node_is_toptier(nid)) continue; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 832fb330376ef..6b875acabd1e7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -526,7 +526,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm) set_bit(MMF_UNSTABLE, &mm->flags); for (vma = mm->mmap ; vma; vma = vma->vm_next) { - if (!can_madv_lru_vma(vma)) + if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) continue; /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 91d163f8d36b2..48a8cf770e3f4 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -70,30 +70,33 @@ static long ratelimit_pages = 32; /* * Start background writeback (via writeback threads) at this percentage */ -int dirty_background_ratio = 10; +static int dirty_background_ratio = 10; /* * dirty_background_bytes starts at 0 (disabled) so that it is a function of * dirty_background_ratio * the amount of dirtyable memory */ -unsigned long dirty_background_bytes; +static unsigned long dirty_background_bytes; /* * free highmem will not be subtracted from the total free memory * for calculating free ratios if vm_highmem_is_dirtyable is true */ -int vm_highmem_is_dirtyable; +static int vm_highmem_is_dirtyable; /* * The generator of dirty data starts writeback at this percentage */ -int vm_dirty_ratio = 20; +static int vm_dirty_ratio = 20; + +/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ +static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; /* * vm_dirty_bytes starts at 0 (disabled) so that it is a function of * vm_dirty_ratio * the amount of dirtyable memory */ -unsigned long vm_dirty_bytes; +static unsigned long vm_dirty_bytes; /* * The interval between `kupdate'-style writebacks @@ -503,7 +506,8 @@ bool node_dirty_ok(struct pglist_data *pgdat) return nr_pages <= limit; } -int dirty_background_ratio_handler(struct ctl_table *table, int write, +#ifdef CONFIG_SYSCTL +static int dirty_background_ratio_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -514,7 +518,7 @@ int dirty_background_ratio_handler(struct ctl_table *table, int write, return ret; } -int dirty_background_bytes_handler(struct ctl_table *table, int write, +static int dirty_background_bytes_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -525,7 +529,7 @@ int dirty_background_bytes_handler(struct ctl_table *table, int write, return ret; } -int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, +static int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; @@ -539,7 +543,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, return ret; } -int dirty_bytes_handler(struct ctl_table *table, int write, +static int dirty_bytes_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; @@ -552,6 +556,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write, } return ret; } +#endif static unsigned long wp_next_time(unsigned long cur_time) { @@ -1993,10 +1998,11 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) return false; } +#ifdef CONFIG_SYSCTL /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ -int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, +static int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { unsigned int old_interval = dirty_writeback_interval; @@ -2017,6 +2023,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, return ret; } +#endif void laptop_mode_timer_fn(struct timer_list *t) { @@ -2081,6 +2088,79 @@ static int page_writeback_cpu_online(unsigned int cpu) return 0; } +#ifdef CONFIG_SYSCTL +static struct ctl_table vm_page_writeback_sysctls[] = { + { + .procname = "dirty_background_ratio", + .data = &dirty_background_ratio, + .maxlen = sizeof(dirty_background_ratio), + .mode = 0644, + .proc_handler = dirty_background_ratio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "dirty_background_bytes", + .data = &dirty_background_bytes, + .maxlen = sizeof(dirty_background_bytes), + .mode = 0644, + .proc_handler = dirty_background_bytes_handler, + .extra1 = SYSCTL_LONG_ONE, + }, + { + .procname = "dirty_ratio", + .data = &vm_dirty_ratio, + .maxlen = sizeof(vm_dirty_ratio), + .mode = 0644, + .proc_handler = dirty_ratio_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, + { + .procname = "dirty_bytes", + .data = &vm_dirty_bytes, + .maxlen = sizeof(vm_dirty_bytes), + .mode = 0644, + .proc_handler = dirty_bytes_handler, + .extra1 = (void *)&dirty_bytes_min, + }, + { + .procname = "dirty_writeback_centisecs", + .data = &dirty_writeback_interval, + .maxlen = sizeof(dirty_writeback_interval), + .mode = 0644, + .proc_handler = dirty_writeback_centisecs_handler, + }, + { + .procname = "dirty_expire_centisecs", + .data = &dirty_expire_interval, + .maxlen = sizeof(dirty_expire_interval), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, +#ifdef CONFIG_HIGHMEM + { + .procname = "highmem_is_dirtyable", + .data = &vm_highmem_is_dirtyable, + .maxlen = sizeof(vm_highmem_is_dirtyable), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { + .procname = "laptop_mode", + .data = &laptop_mode, + .maxlen = sizeof(laptop_mode), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + {} +}; +#endif + /* * Called early on to tune the page writeback dirty limits. * @@ -2105,6 +2185,9 @@ void __init page_writeback_init(void) page_writeback_cpu_online, NULL); cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL, page_writeback_cpu_online); +#ifdef CONFIG_SYSCTL + register_sysctl_init("vm", vm_page_writeback_sysctls); +#endif } /** diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3589febc6d319..cface1d380937 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -378,25 +378,9 @@ int page_group_by_mobility_disabled __read_mostly; */ static DEFINE_STATIC_KEY_TRUE(deferred_pages); -/* - * Calling kasan_poison_pages() only after deferred memory initialization - * has completed. Poisoning pages during deferred memory init will greatly - * lengthen the process and cause problem in large memory systems as the - * deferred pages initialization is done with interrupt disabled. - * - * Assuming that there will be no reference to those newly initialized - * pages before they are ever allocated, this should have no effect on - * KASAN memory tracking as the poison will be properly inserted at page - * allocation time. The only corner case is when pages are allocated by - * on-demand allocation and then freed again before the deferred pages - * initialization is done, but this is not likely to happen. - */ -static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +static inline bool deferred_pages_enabled(void) { - return static_branch_unlikely(&deferred_pages) || - (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - (fpi_flags & FPI_SKIP_KASAN_POISON)) || - PageSkipKASanPoison(page); + return static_branch_unlikely(&deferred_pages); } /* Returns true if the struct page for the pfn is uninitialised */ @@ -447,11 +431,9 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } #else -static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +static inline bool deferred_pages_enabled(void) { - return (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - (fpi_flags & FPI_SKIP_KASAN_POISON)) || - PageSkipKASanPoison(page); + return false; } static inline bool early_page_uninitialised(unsigned long pfn) @@ -1117,25 +1099,24 @@ static inline void __free_one_page(struct page *page, } if (order < MAX_ORDER - 1) { /* If we are here, it means order is >= pageblock_order. - * We want to prevent merge between freepages on isolate - * pageblock and normal pageblock. Without this, pageblock - * isolation could cause incorrect freepage or CMA accounting. + * We want to prevent merge between freepages on pageblock + * without fallbacks and normal pageblock. Without this, + * pageblock isolation could cause incorrect freepage or CMA + * accounting or HIGHATOMIC accounting. * * We don't want to hit this code for the more frequent * low-order merging. */ - if (unlikely(has_isolate_pageblock(zone))) { - int buddy_mt; + int buddy_mt; - buddy_pfn = __find_buddy_pfn(pfn, order); - buddy = page + (buddy_pfn - pfn); - buddy_mt = get_pageblock_migratetype(buddy); + buddy_pfn = __find_buddy_pfn(pfn, order); + buddy = page + (buddy_pfn - pfn); + buddy_mt = get_pageblock_migratetype(buddy); - if (migratetype != buddy_mt - && (is_migrate_isolate(migratetype) || - is_migrate_isolate(buddy_mt))) - goto done_merging; - } + if (migratetype != buddy_mt + && (!migratetype_is_mergeable(migratetype) || + !migratetype_is_mergeable(buddy_mt))) + goto done_merging; max_order = order + 1; goto continue_merging; } @@ -1271,15 +1252,38 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) return ret; } -static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags) +/* + * Skip KASAN memory poisoning when either: + * + * 1. Deferred memory initialization has not yet completed, + * see the explanation below. + * 2. Skipping poisoning is requested via FPI_SKIP_KASAN_POISON, + * see the comment next to it. + * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON, + * see the comment next to it. + * + * Poisoning pages during deferred memory init will greatly lengthen the + * process and cause problem in large memory systems as the deferred pages + * initialization is done with interrupt disabled. + * + * Assuming that there will be no reference to those newly initialized + * pages before they are ever allocated, this should have no effect on + * KASAN memory tracking as the poison will be properly inserted at page + * allocation time. The only corner case is when pages are allocated by + * on-demand allocation and then freed again before the deferred pages + * initialization is done, but this is not likely to happen. + */ +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) { - int i; + return deferred_pages_enabled() || + (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) || + PageSkipKASanPoison(page); +} - if (zero_tags) { - for (i = 0; i < numpages; i++) - tag_clear_highpage(page + i); - return; - } +static void kernel_init_free_pages(struct page *page, int numpages) +{ + int i; /* s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); @@ -1296,7 +1300,7 @@ static __always_inline bool free_pages_prepare(struct page *page, unsigned int order, bool check_free, fpi_t fpi_flags) { int bad = 0; - bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); + bool init = want_init_on_free(); VM_BUG_ON_PAGE(PageTail(page), page); @@ -1363,23 +1367,21 @@ static __always_inline bool free_pages_prepare(struct page *page, /* * As memory initialization might be integrated into KASAN, - * kasan_free_pages and kernel_init_free_pages must be + * KASAN poisoning and memory initialization code must be * kept together to avoid discrepancies in behavior. * * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - if (kasan_has_integrated_init()) { - if (!skip_kasan_poison) - kasan_free_pages(page, order); - } else { - bool init = want_init_on_free(); + if (!should_skip_kasan_poison(page, fpi_flags)) { + kasan_poison_pages(page, order, init); - if (init) - kernel_init_free_pages(page, 1 << order, false); - if (!skip_kasan_poison) - kasan_poison_pages(page, order, init); + /* Memory is already initialized if KASAN did it internally. */ + if (kasan_has_integrated_init()) + init = false; } + if (init) + kernel_init_free_pages(page, 1 << order); /* * arch_free_page() can make the page's contents inaccessible. s390 @@ -2392,9 +2394,43 @@ static bool check_new_pages(struct page *page, unsigned int order) return false; } +static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags) +{ + /* Don't skip if a software KASAN mode is enabled. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC) || + IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + return false; + + /* Skip, if hardware tag-based KASAN is not enabled. */ + if (!kasan_hw_tags_enabled()) + return true; + + /* + * With hardware tag-based KASAN enabled, skip if either: + * + * 1. Memory tags have already been cleared via tag_clear_highpage(). + * 2. Skipping has been requested via __GFP_SKIP_KASAN_UNPOISON. + */ + return init_tags || (flags & __GFP_SKIP_KASAN_UNPOISON); +} + +static inline bool should_skip_init(gfp_t flags) +{ + /* Don't skip, if hardware tag-based KASAN is not enabled. */ + if (!kasan_hw_tags_enabled()) + return false; + + /* For hardware tag-based KASAN, skip if requested. */ + return (flags & __GFP_SKIP_ZERO); +} + inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { + bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && + !should_skip_init(gfp_flags); + bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); + set_page_private(page, 0); set_page_refcounted(page); @@ -2410,19 +2446,38 @@ inline void post_alloc_hook(struct page *page, unsigned int order, /* * As memory initialization might be integrated into KASAN, - * kasan_alloc_pages and kernel_init_free_pages must be + * KASAN unpoisoning and memory initializion code must be * kept together to avoid discrepancies in behavior. */ - if (kasan_has_integrated_init()) { - kasan_alloc_pages(page, order, gfp_flags); - } else { - bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); + /* + * If memory tags should be zeroed (which happens only when memory + * should be initialized as well). + */ + if (init_tags) { + int i; + + /* Initialize both memory and tags. */ + for (i = 0; i != 1 << order; ++i) + tag_clear_highpage(page + i); + + /* Note that memory is already initialized by the loop above. */ + init = false; + } + if (!should_skip_kasan_unpoison(gfp_flags, init_tags)) { + /* Unpoison shadow memory or set memory tags. */ kasan_unpoison_pages(page, order, init); - if (init) - kernel_init_free_pages(page, 1 << order, - gfp_flags & __GFP_ZEROTAGS); + + /* Note that memory is already initialized by KASAN. */ + if (kasan_has_integrated_init()) + init = false; } + /* If memory is still not initialized, do it now. */ + if (init) + kernel_init_free_pages(page, 1 << order); + /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */ + if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON)) + SetPageSkipKASanPoison(page); set_page_owner(page, order, gfp_flags); page_table_check_alloc(page, order); @@ -2479,17 +2534,13 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, /* * This array describes the order lists are fallen back to when * the free lists for the desirable migrate type are depleted + * + * The other migratetypes do not have fallbacks. */ static int fallbacks[MIGRATE_TYPES][3] = { [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, -#ifdef CONFIG_CMA - [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ -#endif -#ifdef CONFIG_MEMORY_ISOLATION - [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */ -#endif }; #ifdef CONFIG_CMA @@ -2795,8 +2846,8 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, /* Yoink! */ mt = get_pageblock_migratetype(page); - if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt) - && !is_migrate_cma(mt)) { + /* Only reserve normal pageblocks (i.e., they can merge with others) */ + if (migratetype_is_mergeable(mt)) { zone->nr_reserved_highatomic += pageblock_nr_pages; set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); @@ -3371,8 +3422,8 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone) return min(READ_ONCE(pcp->batch) << 2, high); } -static void free_unref_page_commit(struct page *page, unsigned long pfn, - int migratetype, unsigned int order) +static void free_unref_page_commit(struct page *page, int migratetype, + unsigned int order) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; @@ -3421,7 +3472,7 @@ void free_unref_page(struct page *page, unsigned int order) } local_lock_irqsave(&pagesets.lock, flags); - free_unref_page_commit(page, pfn, migratetype, order); + free_unref_page_commit(page, migratetype, order); local_unlock_irqrestore(&pagesets.lock, flags); } @@ -3431,13 +3482,13 @@ void free_unref_page(struct page *page, unsigned int order) void free_unref_page_list(struct list_head *list) { struct page *page, *next; - unsigned long flags, pfn; + unsigned long flags; int batch_count = 0; int migratetype; /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { - pfn = page_to_pfn(page); + unsigned long pfn = page_to_pfn(page); if (!free_unref_page_prepare(page, pfn, 0)) { list_del(&page->lru); continue; @@ -3453,15 +3504,10 @@ void free_unref_page_list(struct list_head *list) free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); continue; } - - set_page_private(page, pfn); } local_lock_irqsave(&pagesets.lock, flags); list_for_each_entry_safe(page, next, list, lru) { - pfn = page_private(page); - set_page_private(page, 0); - /* * Non-isolated types over MIGRATE_PCPTYPES get added * to the MIGRATE_MOVABLE pcp list. @@ -3471,7 +3517,7 @@ void free_unref_page_list(struct list_head *list) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(page); - free_unref_page_commit(page, pfn, migratetype, 0); + free_unref_page_commit(page, migratetype, 0); /* * Guard against excessive IRQ disabled times when we get @@ -3545,8 +3591,11 @@ int __isolate_free_page(struct page *page, unsigned int order) struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); - if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) - && !is_migrate_highatomic(mt)) + /* + * Only change normal pageblocks (i.e., they can merge + * with others) + */ + if (migratetype_is_mergeable(mt)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); } @@ -3948,12 +3997,12 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, free_pages)) return true; /* - * Ignore watermark boosting for GFP_ATOMIC order-0 allocations + * Ignore watermark boosting for GFP_HIGH order-0 allocations * when checking the min watermark. The min watermark is the * point where boosting is ignored so that kswapd is woken up * when below the low watermark. */ - if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost + if (unlikely(!order && (alloc_flags & ALLOC_HARDER) && z->watermark_boost && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { mark = z->_watermark[WMARK_MIN]; return __zone_watermark_ok(z, order, mark, highest_zoneidx, @@ -4681,12 +4730,12 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). + * set both ALLOC_HARDER (unless __GFP_NOMEMALLOC) and ALLOC_HIGH. */ alloc_flags |= (__force int) (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); - if (gfp_mask & __GFP_ATOMIC) { + if (gfp_mask & __GFP_HIGH) { /* * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. @@ -4879,14 +4928,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int cpuset_mems_cookie; int reserve_flags; - /* - * We also sanity check to catch abuse of atomic reserves being used by - * callers that are not in atomic context. - */ - if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == - (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) - gfp_mask &= ~__GFP_ATOMIC; - retry_cpuset: compaction_retries = 0; no_progress_loops = 0; @@ -6265,13 +6306,12 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) static void build_zonelists(pg_data_t *pgdat) { static int node_order[MAX_NUMNODES]; - int node, load, nr_nodes = 0; + int node, nr_nodes = 0; nodemask_t used_mask = NODE_MASK_NONE; int local_node, prev_node; /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; - load = nr_online_nodes; prev_node = local_node; memset(node_order, 0, sizeof(node_order)); @@ -6281,13 +6321,13 @@ static void build_zonelists(pg_data_t *pgdat) * So adding penalty to the first node in same * distance group to make it round-robin. */ - if (node_distance(local_node, node) != - node_distance(local_node, prev_node)) - node_load[node] += load; + if ((node_distance(local_node, node) != + node_distance(local_node, prev_node)) || + node == local_node) + node_load[node] += nr_online_nodes; node_order[nr_nodes++] = node; prev_node = node; - load--; } build_zonelists_in_node_order(pgdat, node_order, nr_nodes); @@ -6380,7 +6420,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta #define BOOT_PAGESET_BATCH 1 static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset); static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats); -static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); +DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); static void __build_all_zonelists(void *data) { @@ -6402,7 +6442,11 @@ static void __build_all_zonelists(void *data) if (self && !node_online(self->node_id)) { build_zonelists(self); } else { - for_each_online_node(nid) { + /* + * All possible nodes have pgdat preallocated + * in free_area_init + */ + for_each_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); build_zonelists(pgdat); @@ -7502,12 +7546,33 @@ static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, * NOTE: this function is only called during memory hotplug */ #ifdef CONFIG_MEMORY_HOTPLUG -void __ref free_area_init_core_hotplug(int nid) +void __ref free_area_init_core_hotplug(struct pglist_data *pgdat) { + int nid = pgdat->node_id; enum zone_type z; - pg_data_t *pgdat = NODE_DATA(nid); + int cpu; pgdat_init_internals(pgdat); + + if (pgdat->per_cpu_nodestats == &boot_nodestats) + pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); + + /* + * Reset the nr_zones, order and highest_zoneidx before reuse. + * Note that kswapd will init kswapd_highest_zoneidx properly + * when it starts in the near future. + */ + pgdat->nr_zones = 0; + pgdat->kswapd_order = 0; + pgdat->kswapd_highest_zoneidx = 0; + pgdat->node_start_pfn = 0; + for_each_online_cpu(cpu) { + struct per_cpu_nodestat *p; + + p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu); + memset(p, 0, sizeof(*p)); + } + for (z = 0; z < MAX_NR_ZONES; z++) zone_init_internals(&pgdat->node_zones[z], z, nid, 0); } @@ -7657,9 +7722,14 @@ static void __init free_area_init_node(int nid) pgdat->node_start_pfn = start_pfn; pgdat->per_cpu_nodestats = NULL; - pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, - (u64)start_pfn << PAGE_SHIFT, - end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); + if (start_pfn != end_pfn) { + pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, + (u64)start_pfn << PAGE_SHIFT, + end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); + } else { + pr_info("Initmem setup node %d as memoryless\n", nid); + } + calculate_node_totalpages(pgdat, start_pfn, end_pfn); alloc_node_mem_map(pgdat); @@ -8096,8 +8166,36 @@ void __init free_area_init(unsigned long *max_zone_pfn) /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); - for_each_online_node(nid) { - pg_data_t *pgdat = NODE_DATA(nid); + for_each_node(nid) { + pg_data_t *pgdat; + + if (!node_online(nid)) { + pr_info("Initializing node %d as memoryless\n", nid); + + /* Allocator not initialized yet */ + pgdat = arch_alloc_nodedata(nid); + if (!pgdat) { + pr_err("Cannot allocate %zuB for node %d.\n", + sizeof(*pgdat), nid); + continue; + } + arch_refresh_nodedata(nid, pgdat); + free_area_init_memoryless_node(nid); + + /* + * We do not want to confuse userspace by sysfs + * files/directories for node without any memory + * attached to it, so this node is not marked as + * N_MEMORY and not marked online so that no sysfs + * hierarchy will be created via register_one_node for + * it. The pgdat will get fully initialized by + * hotadd_init_pgdat() when memory is hotplugged into + * this node. + */ + continue; + } + + pgdat = NODE_DATA(nid); free_area_init_node(nid); /* Any memory on that node */ diff --git a/mm/page_owner.c b/mm/page_owner.c index 99e360df94652..d56afa9c792ed 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "internal.h" @@ -28,6 +29,7 @@ struct page_owner { depot_stack_handle_t free_handle; u64 ts_nsec; u64 free_ts_nsec; + char comm[TASK_COMM_LEN]; pid_t pid; }; @@ -164,6 +166,8 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext, page_owner->last_migrate_reason = -1; page_owner->pid = current->pid; page_owner->ts_nsec = local_clock(); + strlcpy(page_owner->comm, current->comm, + sizeof(page_owner->comm)); __set_bit(PAGE_EXT_OWNER, &page_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); @@ -231,6 +235,7 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) new_page_owner->pid = old_page_owner->pid; new_page_owner->ts_nsec = old_page_owner->ts_nsec; new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; + strcpy(new_page_owner->comm, old_page_owner->comm); /* * We don't clear the bit on the old folio as it's going to be freed @@ -325,6 +330,45 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, seq_putc(m, '\n'); } +/* + * Looking for memcg information and print it out + */ +static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret, + struct page *page) +{ +#ifdef CONFIG_MEMCG + unsigned long memcg_data; + struct mem_cgroup *memcg; + bool online; + char name[80]; + + rcu_read_lock(); + memcg_data = READ_ONCE(page->memcg_data); + if (!memcg_data) + goto out_unlock; + + if (memcg_data & MEMCG_DATA_OBJCGS) + ret += scnprintf(kbuf + ret, count - ret, + "Slab cache page\n"); + + memcg = page_memcg_check(page); + if (!memcg) + goto out_unlock; + + online = (memcg->css.flags & CSS_ONLINE); + cgroup_name(memcg->css.cgroup, name, sizeof(name)); + ret += scnprintf(kbuf + ret, count - ret, + "Charged %sto %smemcg %s\n", + PageMemcgKmem(page) ? "(via objcg) " : "", + online ? "" : "offline ", + name); +out_unlock: + rcu_read_unlock(); +#endif /* CONFIG_MEMCG */ + + return ret; +} + static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, struct page *page, struct page_owner *page_owner, @@ -338,19 +382,17 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, if (!kbuf) return -ENOMEM; - ret = snprintf(kbuf, count, - "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n", + ret = scnprintf(kbuf, count, + "Page allocated via order %u, mask %#x(%pGg), pid %d (%s), ts %llu ns, free_ts %llu ns\n", page_owner->order, page_owner->gfp_mask, &page_owner->gfp_mask, page_owner->pid, - page_owner->ts_nsec, page_owner->free_ts_nsec); - - if (ret >= count) - goto err; + page_owner->comm, page_owner->ts_nsec, + page_owner->free_ts_nsec); /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); page_mt = gfp_migratetype(page_owner->gfp_mask); - ret += snprintf(kbuf + ret, count - ret, + ret += scnprintf(kbuf + ret, count - ret, "PFN %lu type %s Block %lu type %s Flags %pGp\n", pfn, migratetype_names[page_mt], @@ -358,21 +400,18 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, migratetype_names[pageblock_mt], &page->flags); - if (ret >= count) - goto err; - ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0); if (ret >= count) goto err; if (page_owner->last_migrate_reason != -1) { - ret += snprintf(kbuf + ret, count - ret, + ret += scnprintf(kbuf + ret, count - ret, "Page has been migrated, last migrate reason: %s\n", migrate_reason_names[page_owner->last_migrate_reason]); - if (ret >= count) - goto err; } + ret = print_page_owner_memcg(kbuf, count, ret, page); + ret += snprintf(kbuf + ret, count - ret, "\n"); if (ret >= count) goto err; @@ -415,9 +454,10 @@ void __dump_page_owner(const struct page *page) else pr_alert("page_owner tracks the page as freed\n"); - pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu, free_ts %llu\n", + pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d (%s), ts %llu, free_ts %llu\n", page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask, - page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec); + page_owner->pid, page_owner->comm, page_owner->ts_nsec, + page_owner->free_ts_nsec); handle = READ_ONCE(page_owner->handle); if (!handle) diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 2054c9213c433..f58d73c927892 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -84,15 +84,19 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, gfp_t gfp) { unsigned int cpu, tcpu; - int i; + int i, nid; gfp |= __GFP_HIGHMEM; for_each_possible_cpu(cpu) { + nid = cpu_to_node(cpu); + if (nid == NUMA_NO_NODE || !node_online(nid)) + nid = numa_mem_id(); + for (i = page_start; i < page_end; i++) { struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; - *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); + *pagep = alloc_pages_node(nid, gfp, 0); if (!*pagep) goto err; } diff --git a/mm/ptdump.c b/mm/ptdump.c index da751448d0e4e..eea3d28d173c2 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -40,8 +40,10 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 0, pgd_val(val)); - if (pgd_leaf(val)) + if (pgd_leaf(val)) { st->note_page(st, addr, 0, pgd_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -61,8 +63,10 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 1, p4d_val(val)); - if (p4d_leaf(val)) + if (p4d_leaf(val)) { st->note_page(st, addr, 1, p4d_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -82,8 +86,10 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 2, pud_val(val)); - if (pud_leaf(val)) + if (pud_leaf(val)) { st->note_page(st, addr, 2, pud_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } @@ -101,8 +107,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 3, pmd_val(val)); - if (pmd_leaf(val)) + if (pmd_leaf(val)) { st->note_page(st, addr, 3, pmd_val(val)); + walk->action = ACTION_CONTINUE; + } return 0; } diff --git a/mm/readahead.c b/mm/readahead.c index cf0dcf89eb69b..feda2b1702f1b 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -595,12 +595,6 @@ void page_cache_async_ra(struct readahead_control *ractl, folio_clear_readahead(folio); - /* - * Defer asynchronous read-ahead on IO congestion. - */ - if (inode_read_congested(ractl->mapping->host)) - return; - if (blk_cgroup_congested()) return; diff --git a/mm/rmap.c b/mm/rmap.c index 6a1e8c7f62136..b0356dc1ed424 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -76,7 +76,9 @@ #include +#define CREATE_TRACE_POINTS #include +#include #include "internal.h" @@ -89,7 +91,7 @@ static inline struct anon_vma *anon_vma_alloc(void) anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); if (anon_vma) { - atomic_set(&anon_vma->refcount, 1); + refcount_set(&anon_vma->refcount, 1); anon_vma->degree = 1; /* Reference for first vma */ anon_vma->parent = anon_vma; /* @@ -104,7 +106,7 @@ static inline struct anon_vma *anon_vma_alloc(void) static inline void anon_vma_free(struct anon_vma *anon_vma) { - VM_BUG_ON(atomic_read(&anon_vma->refcount)); + VM_BUG_ON(refcount_read(&anon_vma->refcount)); /* * Synchronize against page_lock_anon_vma_read() such that @@ -446,7 +448,7 @@ static void anon_vma_ctor(void *data) struct anon_vma *anon_vma = data; init_rwsem(&anon_vma->rwsem); - atomic_set(&anon_vma->refcount, 0); + refcount_set(&anon_vma->refcount, 0); anon_vma->rb_root = RB_ROOT_CACHED; } @@ -496,7 +498,7 @@ struct anon_vma *page_get_anon_vma(struct page *page) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); - if (!atomic_inc_not_zero(&anon_vma->refcount)) { + if (!refcount_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; } @@ -555,7 +557,7 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) } /* trylock failed, we got to sleep */ - if (!atomic_inc_not_zero(&anon_vma->refcount)) { + if (!refcount_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; } @@ -570,7 +572,7 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) rcu_read_unlock(); anon_vma_lock_read(anon_vma); - if (atomic_dec_and_test(&anon_vma->refcount)) { + if (refcount_dec_and_test(&anon_vma->refcount)) { /* * Oops, we held the last refcount, release the lock * and bail -- can't simply use put_anon_vma() because @@ -812,7 +814,10 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, while (page_vma_mapped_walk(&pvmw)) { address = pvmw.address; - if (vma->vm_flags & VM_LOCKED) { + if ((vma->vm_flags & VM_LOCKED) && + (!PageTransCompound(page) || !pvmw.pte)) { + /* Restore the mlock which got missed */ + mlock_vma_page(page, vma, !pvmw.pte); page_vma_mapped_walk_done(&pvmw); pra->vm_flags |= VM_LOCKED; return false; /* To break the loop */ @@ -851,7 +856,7 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, if (referenced) { pra->referenced++; - pra->vm_flags |= vma->vm_flags; + pra->vm_flags |= vma->vm_flags & ~VM_LOCKED; } if (!pra->mapcount) @@ -1181,17 +1186,17 @@ void do_page_add_anon_rmap(struct page *page, __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); } - if (unlikely(PageKsm(page))) { + if (unlikely(PageKsm(page))) unlock_page_memcg(page); - return; - } /* address might be in next vma when migration races vma_adjust */ - if (first) + else if (first) __page_set_anon_rmap(page, vma, address, flags & RMAP_EXCLUSIVE); else __page_check_anon_rmap(page, vma, address); + + mlock_vma_page(page, vma, compound); } /** @@ -1232,12 +1237,14 @@ void page_add_new_anon_rmap(struct page *page, /** * page_add_file_rmap - add pte mapping to a file page - * @page: the page to add the mapping to - * @compound: charge the page as compound or small page + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @compound: charge the page as compound or small page * * The caller needs to hold the pte lock. */ -void page_add_file_rmap(struct page *page, bool compound) +void page_add_file_rmap(struct page *page, + struct vm_area_struct *vma, bool compound) { int i, nr = 1; @@ -1260,13 +1267,8 @@ void page_add_file_rmap(struct page *page, bool compound) nr_pages); } else { if (PageTransCompound(page) && page_mapping(page)) { - struct page *head = compound_head(page); - VM_WARN_ON_ONCE(!PageLocked(page)); - - SetPageDoubleMap(head); - if (PageMlocked(page)) - clear_page_mlock(head); + SetPageDoubleMap(compound_head(page)); } if (!atomic_inc_and_test(&page->_mapcount)) goto out; @@ -1274,6 +1276,8 @@ void page_add_file_rmap(struct page *page, bool compound) __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); out: unlock_page_memcg(page); + + mlock_vma_page(page, vma, compound); } static void page_remove_file_rmap(struct page *page, bool compound) @@ -1316,9 +1320,6 @@ static void page_remove_file_rmap(struct page *page, bool compound) * pte lock(a spinlock) is held, which implies preemption disabled. */ __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); - - if (unlikely(PageMlocked(page))) - clear_page_mlock(page); } static void page_remove_anon_compound_rmap(struct page *page) @@ -1358,9 +1359,6 @@ static void page_remove_anon_compound_rmap(struct page *page) nr = thp_nr_pages(page); } - if (unlikely(PageMlocked(page))) - clear_page_mlock(page); - if (nr) __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr); } @@ -1368,11 +1366,13 @@ static void page_remove_anon_compound_rmap(struct page *page) /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from + * @vma: the vm area from which the mapping is removed * @compound: uncharge the page as compound or small page * * The caller needs to hold the pte lock. */ -void page_remove_rmap(struct page *page, bool compound) +void page_remove_rmap(struct page *page, + struct vm_area_struct *vma, bool compound) { lock_page_memcg(page); @@ -1397,9 +1397,6 @@ void page_remove_rmap(struct page *page, bool compound) */ __dec_lruvec_page_state(page, NR_ANON_MAPPED); - if (unlikely(PageMlocked(page))) - clear_page_mlock(page); - if (PageTransCompound(page)) deferred_split_huge_page(compound_head(page)); @@ -1414,6 +1411,8 @@ void page_remove_rmap(struct page *page, bool compound) */ out: unlock_page_memcg(page); + + munlock_vma_page(page, vma, compound); } /* @@ -1469,28 +1468,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { + /* Unexpected PMD-mapped THP? */ + VM_BUG_ON_PAGE(!pvmw.pte, page); + /* - * If the page is mlock()d, we cannot swap it out. + * If the page is in an mlock()d vma, we must not swap it out. */ if (!(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - /* - * PTE-mapped THP are never marked as mlocked: so do - * not set it on a DoubleMap THP, nor on an Anon THP - * (which may still be PTE-mapped after DoubleMap was - * cleared). But stop unmapping even in those cases. - */ - if (!PageTransCompound(page) || (PageHead(page) && - !PageDoubleMap(page) && !PageAnon(page))) - mlock_vma_page(page); + /* Restore the mlock which got missed */ + mlock_vma_page(page, vma, false); page_vma_mapped_walk_done(&pvmw); ret = false; break; } - /* Unexpected PMD-mapped THP? */ - VM_BUG_ON_PAGE(!pvmw.pte, page); - subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); address = pvmw.address; @@ -1553,7 +1545,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* Update high watermark before we lower rss */ update_hiwater_rss(mm); - if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { + if (PageHWPoison(subpage) && !(flags & TTU_IGNORE_HWPOISON)) { pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (PageHuge(page)) { hugetlb_count_sub(compound_nr(page), mm); @@ -1599,7 +1591,30 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* MADV_FREE page check */ if (!PageSwapBacked(page)) { - if (!PageDirty(page)) { + int ref_count, map_count; + + /* + * Synchronize with gup_pte_range(): + * - clear PTE; barrier; read refcount + * - inc refcount; barrier; read PTE + */ + smp_mb(); + + ref_count = page_count(page); + map_count = page_mapcount(page); + + /* + * Order reads for page refcount and dirty flag; + * see __remove_mapping(). + */ + smp_rmb(); + + /* + * The only page refs must be from the isolation + * plus one or more rmap's (dropped by discard:). + */ + if ((ref_count == 1 + map_count) && + !PageDirty(page)) { /* Invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); @@ -1668,7 +1683,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * * See Documentation/vm/mmu_notifier.rst */ - page_remove_rmap(subpage, PageHuge(page)); + page_remove_rmap(subpage, vma, PageHuge(page)); + if (vma->vm_flags & VM_LOCKED) + mlock_page_drain(smp_processor_id()); put_page(page); } @@ -1835,7 +1852,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, /* Update high watermark before we lower rss */ update_hiwater_rss(mm); - if (is_zone_device_page(page)) { + if (is_device_private_page(page)) { unsigned long pfn = page_to_pfn(page); swp_entry_t entry; pte_t swp_pte; @@ -1861,6 +1878,8 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, if (pte_swp_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); + trace_set_migration_pte(pvmw.address, pte_val(swp_pte), + compound_order(page)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. @@ -1873,7 +1892,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * memory are supported. */ subpage = page; - } else if (PageHWPoison(page)) { + } else if (PageHWPoison(subpage)) { pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (PageHuge(page)) { hugetlb_count_sub(compound_nr(page), mm); @@ -1929,6 +1948,8 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); + trace_set_migration_pte(address, pte_val(swp_pte), + compound_order(page)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. @@ -1942,7 +1963,9 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma, * * See Documentation/vm/mmu_notifier.rst */ - page_remove_rmap(subpage, PageHuge(page)); + page_remove_rmap(subpage, vma, PageHuge(page)); + if (vma->vm_flags & VM_LOCKED) + mlock_page_drain(smp_processor_id()); put_page(page); } @@ -1976,7 +1999,8 @@ void try_to_migrate(struct page *page, enum ttu_flags flags) TTU_SYNC))) return; - if (is_zone_device_page(page) && !is_device_private_page(page)) + if (is_zone_device_page(page) && + !is_dev_private_or_coherent_page(page)) return; /* @@ -1996,76 +2020,6 @@ void try_to_migrate(struct page *page, enum ttu_flags flags) rmap_walk(page, &rwc); } -/* - * Walks the vma's mapping a page and mlocks the page if any locked vma's are - * found. Once one is found the page is locked and the scan can be terminated. - */ -static bool page_mlock_one(struct page *page, struct vm_area_struct *vma, - unsigned long address, void *unused) -{ - struct page_vma_mapped_walk pvmw = { - .page = page, - .vma = vma, - .address = address, - }; - - /* An un-locked vma doesn't have any pages to lock, continue the scan */ - if (!(vma->vm_flags & VM_LOCKED)) - return true; - - while (page_vma_mapped_walk(&pvmw)) { - /* - * Need to recheck under the ptl to serialise with - * __munlock_pagevec_fill() after VM_LOCKED is cleared in - * munlock_vma_pages_range(). - */ - if (vma->vm_flags & VM_LOCKED) { - /* - * PTE-mapped THP are never marked as mlocked; but - * this function is never called on a DoubleMap THP, - * nor on an Anon THP (which may still be PTE-mapped - * after DoubleMap was cleared). - */ - mlock_vma_page(page); - /* - * No need to scan further once the page is marked - * as mlocked. - */ - page_vma_mapped_walk_done(&pvmw); - return false; - } - } - - return true; -} - -/** - * page_mlock - try to mlock a page - * @page: the page to be mlocked - * - * Called from munlock code. Checks all of the VMAs mapping the page and mlocks - * the page if any are found. The page will be returned with PG_mlocked cleared - * if it is not mapped by any locked vmas. - */ -void page_mlock(struct page *page) -{ - struct rmap_walk_control rwc = { - .rmap_one = page_mlock_one, - .done = page_not_mapped, - .anon_lock = page_lock_anon_vma_read, - - }; - - VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); - VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); - - /* Anon THP are only marked as mlocked when singly mapped */ - if (PageTransCompound(page) && PageAnon(page)) - return; - - rmap_walk(page, &rwc); -} - #ifdef CONFIG_DEVICE_PRIVATE struct make_exclusive_args { struct mm_struct *mm; @@ -2148,7 +2102,7 @@ static bool page_make_device_exclusive_one(struct page *page, * There is a reference on the page for the swap entry which has * been removed, so shouldn't take another. */ - page_remove_rmap(subpage, false); + page_remove_rmap(subpage, vma, false); } mmu_notifier_invalidate_range_end(&range); @@ -2257,7 +2211,7 @@ void __put_anon_vma(struct anon_vma *anon_vma) struct anon_vma *root = anon_vma->root; anon_vma_free(anon_vma); - if (root != anon_vma && atomic_dec_and_test(&root->refcount)) + if (root != anon_vma && refcount_dec_and_test(&root->refcount)) anon_vma_free(root); } @@ -2291,11 +2245,6 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the anon_vma struct it points to. - * - * When called from page_mlock(), the mmap_lock of the mm containing the vma - * where the page was found will be held for write. So, we won't recheck - * vm_flags for that VMA. That should be OK, because that vma shouldn't be - * LOCKED. */ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, bool locked) @@ -2344,11 +2293,6 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the address_space struct it points to. - * - * When called from page_mlock(), the mmap_lock of the mm containing the vma - * where the page was found will be held for write. So, we won't recheck - * vm_flags for that VMA. That should be OK, because that vma shouldn't be - * LOCKED. */ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, bool locked) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index db6df27c852a7..8aecd6b3896c7 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -34,6 +34,7 @@ #include #include +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP /** * struct vmemmap_remap_walk - walk vmemmap page table * @@ -53,8 +54,7 @@ struct vmemmap_remap_walk { struct list_head *vmemmap_pages; }; -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, - struct vmemmap_remap_walk *walk) +static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) { pmd_t __pmd; int i; @@ -76,15 +76,34 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, set_pte_at(&init_mm, addr, pte, entry); } - /* Make pte visible before pmd. See comment in pmd_install(). */ - smp_wmb(); - pmd_populate_kernel(&init_mm, pmd, pgtable); - - flush_tlb_kernel_range(start, start + PMD_SIZE); + spin_lock(&init_mm.page_table_lock); + if (likely(pmd_leaf(*pmd))) { + /* Make pte visible before pmd. See comment in pmd_install(). */ + smp_wmb(); + pmd_populate_kernel(&init_mm, pmd, pgtable); + flush_tlb_kernel_range(start, start + PMD_SIZE); + } else { + pte_free_kernel(&init_mm, pgtable); + } + spin_unlock(&init_mm.page_table_lock); return 0; } +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +{ + int leaf; + + spin_lock(&init_mm.page_table_lock); + leaf = pmd_leaf(*pmd); + spin_unlock(&init_mm.page_table_lock); + + if (!leaf) + return 0; + + return __split_vmemmap_huge_pmd(pmd, start); +} + static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct vmemmap_remap_walk *walk) @@ -121,13 +140,12 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr, pmd = pmd_offset(pud, addr); do { - if (pmd_leaf(*pmd)) { - int ret; + int ret; + + ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK); + if (ret) + return ret; - ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk); - if (ret) - return ret; - } next = pmd_addr_end(addr, end); vmemmap_pte_range(pmd, addr, next, walk); } while (pmd++, addr = next, addr != end); @@ -245,6 +263,26 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, set_pte_at(&init_mm, addr, pte, entry); } +/* + * How many struct page structs need to be reset. When we reuse the head + * struct page, the special metadata (e.g. page->flags or page->mapping) + * cannot copy to the tail struct page structs. The invalid value will be + * checked in the free_tail_pages_check(). In order to avoid the message + * of "corrupted mapping in tail page". We need to reset at least 3 (one + * head struct page struct and two tail struct page structs) struct page + * structs. + */ +#define NR_RESET_STRUCT_PAGE 3 + +static inline void reset_struct_pages(struct page *start) +{ + int i; + struct page *from = start + NR_RESET_STRUCT_PAGE; + + for (i = 0; i < NR_RESET_STRUCT_PAGE; i++) + memcpy(start + i, from, sizeof(*from)); +} + static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { @@ -258,6 +296,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, list_del(&page->lru); to = page_to_virt(page); copy_page(to, (void *)walk->reuse_addr); + reset_struct_pages(to); set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); } @@ -300,10 +339,8 @@ int vmemmap_remap_free(unsigned long start, unsigned long end, */ BUG_ON(start - reuse != PAGE_SIZE); - mmap_write_lock(&init_mm); + mmap_read_lock(&init_mm); ret = vmemmap_remap_range(reuse, end, &walk); - mmap_write_downgrade(&init_mm); - if (ret && walk.nr_walked) { end = reuse + walk.nr_walked * PAGE_SIZE; /* @@ -383,6 +420,7 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end, return 0; } +#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */ /* * Allocate a block of memory to be used to back the virtual memory map diff --git a/mm/sparse.c b/mm/sparse.c index d21c6e5910d07..952f06d8f3731 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -126,7 +126,7 @@ static inline int sparse_early_nid(struct mem_section *section) } /* Validate the physical addressing limitations of the model */ -void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, +static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, unsigned long *end_pfn) { unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); diff --git a/mm/swap.c b/mm/swap.c index bcf3ac288b56d..842d5cd92cf64 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -74,8 +74,8 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { }; /* - * This path almost never happens for VM activity - pages are normally - * freed via pagevecs. But it gets used by networking. + * This path almost never happens for VM activity - pages are normally freed + * via pagevecs. But it gets used by networking - and for compound pages. */ static void __page_cache_release(struct page *page) { @@ -89,6 +89,14 @@ static void __page_cache_release(struct page *page) __clear_page_lru_flags(page); unlock_page_lruvec_irqrestore(lruvec, flags); } + /* See comment on PageMlocked in release_pages() */ + if (unlikely(PageMlocked(page))) { + int nr_pages = thp_nr_pages(page); + + __ClearPageMlocked(page); + mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); + } __ClearPageWaiters(page); } @@ -482,22 +490,12 @@ EXPORT_SYMBOL(folio_add_lru); void lru_cache_add_inactive_or_unevictable(struct page *page, struct vm_area_struct *vma) { - bool unevictable; - VM_BUG_ON_PAGE(PageLRU(page), page); - unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED; - if (unlikely(unevictable) && !TestSetPageMlocked(page)) { - int nr_pages = thp_nr_pages(page); - /* - * We use the irq-unsafe __mod_zone_page_state because this - * counter is not modified from interrupt context, and the pte - * lock is held(spinlock), which implies preemption disabled. - */ - __mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); - count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); - } - lru_cache_add(page); + if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) + mlock_new_page(page); + else + lru_cache_add(page); } /* @@ -636,6 +634,7 @@ void lru_add_drain_cpu(int cpu) pagevec_lru_move_fn(pvec, lru_lazyfree_fn); activate_page_drain(cpu); + mlock_page_drain(cpu); } /** @@ -838,6 +837,7 @@ inline void __lru_add_drain_all(bool force_all_cpus) pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || need_activate_page_drain(cpu) || + need_mlock_page_drain(cpu) || has_bh_in_lru(cpu, NULL)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); @@ -969,6 +969,18 @@ void release_pages(struct page **pages, int nr) __clear_page_lru_flags(page); } + /* + * In rare cases, when truncation or holepunching raced with + * munlock after VM_LOCKED was cleared, Mlocked may still be + * found set here. This does not indicate a problem, unless + * "unevictable_pgs_cleared" appears worryingly large. + */ + if (unlikely(PageMlocked(page))) { + __ClearPageMlocked(page); + dec_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGCLEARED); + } + __ClearPageWaiters(page); list_add(&page->lru, &pages_to_free); @@ -1009,43 +1021,32 @@ static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + folio_set_lru(folio); /* - * A folio becomes evictable in two ways: - * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()]. - * 2) Before acquiring LRU lock to put the folio on the correct LRU - * and then - * a) do PageLRU check with lock [check_move_unevictable_pages] - * b) do PageLRU check before lock [clear_page_mlock] - * - * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need - * following strict ordering: - * - * #0: __pagevec_lru_add_fn #1: clear_page_mlock - * - * folio_set_lru() folio_test_clear_mlocked() - * smp_mb() // explicit ordering // above provides strict - * // ordering - * folio_test_mlocked() folio_test_lru() + * Is an smp_mb__after_atomic() still required here, before + * folio_evictable() tests PageMlocked, to rule out the possibility + * of stranding an evictable folio on an unevictable LRU? I think + * not, because __munlock_page() only clears PageMlocked while the LRU + * lock is held. * - * - * if '#1' does not observe setting of PG_lru by '#0' and - * fails isolation, the explicit barrier will make sure that - * folio_evictable check will put the folio on the correct - * LRU. Without smp_mb(), folio_set_lru() can be reordered - * after folio_test_mlocked() check and can make '#1' fail the - * isolation of the folio whose mlocked bit is cleared (#0 is - * also looking at the same folio) and the evictable folio will - * be stranded on an unevictable LRU. + * (That is not true of __page_cache_release(), and not necessarily + * true of release_pages(): but those only clear PageMlocked after + * put_page_testzero() has excluded any other users of the page.) */ - folio_set_lru(folio); - smp_mb__after_atomic(); - if (folio_evictable(folio)) { if (was_unevictable) __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } else { folio_clear_active(folio); folio_set_unevictable(folio); + /* + * folio->mlock_count = !!folio_test_mlocked(folio)? + * But that leaves __mlock_page() in doubt whether another + * actor has already counted the mlock or not. Err on the + * safe side, underestimate, let page reclaim fix it, rather + * than leaving a page on the unevictable LRU indefinitely. + */ + folio->mlock_count = 0; if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } diff --git a/mm/swapfile.c b/mm/swapfile.c index bf0df7aa7158f..a5183315dc585 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1167,16 +1167,6 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry) return NULL; } -static struct swap_info_struct *swap_info_get(swp_entry_t entry) -{ - struct swap_info_struct *p; - - p = _swap_info_get(entry); - if (p) - spin_lock(&p->lock); - return p; -} - static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, struct swap_info_struct *q) { @@ -1601,100 +1591,6 @@ static bool page_swapped(struct page *page) return false; } -static int page_trans_huge_map_swapcount(struct page *page, - int *total_swapcount) -{ - int i, map_swapcount, _total_swapcount; - unsigned long offset = 0; - struct swap_info_struct *si; - struct swap_cluster_info *ci = NULL; - unsigned char *map = NULL; - int swapcount = 0; - - /* hugetlbfs shouldn't call it */ - VM_BUG_ON_PAGE(PageHuge(page), page); - - if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) { - if (PageSwapCache(page)) - swapcount = page_swapcount(page); - if (total_swapcount) - *total_swapcount = swapcount; - return swapcount + page_trans_huge_mapcount(page); - } - - page = compound_head(page); - - _total_swapcount = map_swapcount = 0; - if (PageSwapCache(page)) { - swp_entry_t entry; - - entry.val = page_private(page); - si = _swap_info_get(entry); - if (si) { - map = si->swap_map; - offset = swp_offset(entry); - } - } - if (map) - ci = lock_cluster(si, offset); - for (i = 0; i < HPAGE_PMD_NR; i++) { - int mapcount = atomic_read(&page[i]._mapcount) + 1; - if (map) { - swapcount = swap_count(map[offset + i]); - _total_swapcount += swapcount; - } - map_swapcount = max(map_swapcount, mapcount + swapcount); - } - unlock_cluster(ci); - - if (PageDoubleMap(page)) - map_swapcount -= 1; - - if (total_swapcount) - *total_swapcount = _total_swapcount; - - return map_swapcount + compound_mapcount(page); -} - -/* - * We can write to an anon page without COW if there are no other references - * to it. And as a side-effect, free up its swap: because the old content - * on disk will never be read, and seeking back there to write new content - * later would only waste time away from clustering. - */ -bool reuse_swap_page(struct page *page) -{ - int count, total_swapcount; - - VM_BUG_ON_PAGE(!PageLocked(page), page); - if (unlikely(PageKsm(page))) - return false; - count = page_trans_huge_map_swapcount(page, &total_swapcount); - if (count == 1 && PageSwapCache(page) && - (likely(!PageTransCompound(page)) || - /* The remaining swap count will be freed soon */ - total_swapcount == page_swapcount(page))) { - if (!PageWriteback(page)) { - page = compound_head(page); - delete_from_swap_cache(page); - SetPageDirty(page); - } else { - swp_entry_t entry; - struct swap_info_struct *p; - - entry.val = page_private(page); - p = swap_info_get(entry); - if (p->flags & SWP_STABLE_WRITES) { - spin_unlock(&p->lock); - return false; - } - spin_unlock(&p->lock); - } - } - - return count <= 1; -} - /* * If swap is getting full, or if there are no more mappings of this page, * then try_to_free_swap is called to free its swap space. diff --git a/mm/usercopy.c b/mm/usercopy.c index d0d268135d96d..e7b0cb49daa1b 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -70,17 +70,6 @@ static noinline int check_stack_object(const void *obj, unsigned long len) * kmem_cache_create_usercopy() function to create the cache (and * carefully audit the whitelist range). */ -void usercopy_warn(const char *name, const char *detail, bool to_user, - unsigned long offset, unsigned long len) -{ - WARN_ONCE(1, "Bad or missing usercopy whitelist? Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n", - to_user ? "exposure" : "overwrite", - to_user ? "from" : "to", - name ? : "unknown?!", - detail ? " '" : "", detail ? : "", detail ? "'" : "", - offset, len); -} - void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, unsigned long len) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0780c2a57ff11..15d3e97a6e045 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -95,10 +95,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, if (!pte_none(*dst_pte)) goto out_unlock; - if (page_in_cache) - page_add_file_rmap(page, false); - else + if (page_in_cache) { + /* Usually, cache pages are already added to LRU */ + if (newly_allocated) + lru_cache_add(page); + page_add_file_rmap(page, dst_vma, false); + } else { page_add_new_anon_rmap(page, dst_vma, dst_addr, false); + lru_cache_add_inactive_or_unevictable(page, dst_vma); + } /* * Must happen after rmap, as mm_counter() checks mapping (via @@ -106,9 +111,6 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, */ inc_mm_counter(dst_mm, mm_counter(page)); - if (newly_allocated) - lru_cache_add_inactive_or_unevictable(page, dst_vma); - set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4165304d35471..b454cf1a261f9 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -74,7 +74,7 @@ static const bool vmap_allow_huge = false; bool is_vmalloc_addr(const void *x) { - unsigned long addr = (unsigned long)x; + unsigned long addr = (unsigned long)kasan_reset_tag(x); return addr >= VMALLOC_START && addr < VMALLOC_END; } @@ -118,7 +118,6 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (size != PAGE_SIZE) { pte_t entry = pfn_pte(pfn, prot); - entry = pte_mkhuge(entry); entry = arch_make_huge_pte(entry, ilog2(size), 0); set_huge_pte_at(&init_mm, addr, pte, entry); pfn += PFN_DOWN(size); @@ -479,6 +478,8 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, return -EBUSY; if (WARN_ON(!page)) return -ENOMEM; + if (WARN_ON(!pfn_valid(page_to_pfn(page)))) + return -EINVAL; set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); @@ -632,7 +633,7 @@ int is_vmalloc_or_module_addr(const void *x) * just put it in the vmalloc space. */ #if defined(CONFIG_MODULES) && defined(MODULES_VADDR) - unsigned long addr = (unsigned long)x; + unsigned long addr = (unsigned long)kasan_reset_tag(x); if (addr >= MODULES_VADDR && addr < MODULES_END) return 1; #endif @@ -776,23 +777,13 @@ get_subtree_max_size(struct rb_node *node) return va ? va->subtree_max_size : 0; } -/* - * Gets called when remove the node and rotate. - */ -static __always_inline unsigned long -compute_subtree_max_size(struct vmap_area *va) -{ - return max3(va_size(va), - get_subtree_max_size(va->rb_node.rb_left), - get_subtree_max_size(va->rb_node.rb_right)); -} - RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb, struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size) static void purge_vmap_area_lazy(void); static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); -static unsigned long lazy_max_pages(void); +static void drain_vmap_area_work(struct work_struct *work); +static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work); static atomic_long_t nr_vmalloc_pages; @@ -806,6 +797,8 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) struct vmap_area *va = NULL; struct rb_node *n = vmap_area_root.rb_node; + addr = (unsigned long)kasan_reset_tag((void *)addr); + while (n) { struct vmap_area *tmp; @@ -827,6 +820,8 @@ static struct vmap_area *__find_vmap_area(unsigned long addr) { struct rb_node *n = vmap_area_root.rb_node; + addr = (unsigned long)kasan_reset_tag((void *)addr); + while (n) { struct vmap_area *va; @@ -973,6 +968,17 @@ unlink_va(struct vmap_area *va, struct rb_root *root) } #if DEBUG_AUGMENT_PROPAGATE_CHECK +/* + * Gets called when remove the node and rotate. + */ +static __always_inline unsigned long +compute_subtree_max_size(struct vmap_area *va) +{ + return max3(va_size(va), + get_subtree_max_size(va->rb_node.rb_left), + get_subtree_max_size(va->rb_node.rb_right)); +} + static void augment_tree_propagate_check(void) { @@ -1189,22 +1195,28 @@ is_within_this_va(struct vmap_area *va, unsigned long size, /* * Find the first free block(lowest start address) in the tree, * that will accomplish the request corresponding to passing - * parameters. + * parameters. Please note, with an alignment bigger than PAGE_SIZE, + * a search length is adjusted to account for worst case alignment + * overhead. */ static __always_inline struct vmap_area * -find_vmap_lowest_match(unsigned long size, - unsigned long align, unsigned long vstart) +find_vmap_lowest_match(unsigned long size, unsigned long align, + unsigned long vstart, bool adjust_search_size) { struct vmap_area *va; struct rb_node *node; + unsigned long length; /* Start from the root. */ node = free_vmap_area_root.rb_node; + /* Adjust the search size for alignment overhead. */ + length = adjust_search_size ? size + align - 1 : size; + while (node) { va = rb_entry(node, struct vmap_area, rb_node); - if (get_subtree_max_size(node->rb_left) >= size && + if (get_subtree_max_size(node->rb_left) >= length && vstart < va->va_start) { node = node->rb_left; } else { @@ -1214,9 +1226,9 @@ find_vmap_lowest_match(unsigned long size, /* * Does not make sense to go deeper towards the right * sub-tree if it does not have a free block that is - * equal or bigger to the requested search size. + * equal or bigger to the requested search length. */ - if (get_subtree_max_size(node->rb_right) >= size) { + if (get_subtree_max_size(node->rb_right) >= length) { node = node->rb_right; continue; } @@ -1232,7 +1244,7 @@ find_vmap_lowest_match(unsigned long size, if (is_within_this_va(va, size, align, vstart)) return va; - if (get_subtree_max_size(node->rb_right) >= size && + if (get_subtree_max_size(node->rb_right) >= length && vstart <= va->va_start) { /* * Shift the vstart forward. Please note, we update it with @@ -1280,7 +1292,7 @@ find_vmap_lowest_match_check(unsigned long size, unsigned long align) get_random_bytes(&rnd, sizeof(rnd)); vstart = VMALLOC_START + rnd; - va_1 = find_vmap_lowest_match(size, align, vstart); + va_1 = find_vmap_lowest_match(size, align, vstart, false); va_2 = find_vmap_lowest_linear_match(size, align, vstart); if (va_1 != va_2) @@ -1431,12 +1443,25 @@ static __always_inline unsigned long __alloc_vmap_area(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend) { + bool adjust_search_size = true; unsigned long nva_start_addr; struct vmap_area *va; enum fit_type type; int ret; - va = find_vmap_lowest_match(size, align, vstart); + /* + * Do not adjust when: + * a) align <= PAGE_SIZE, because it does not make any sense. + * All blocks(their start addresses) are at least PAGE_SIZE + * aligned anyway; + * b) a short range where a requested size corresponds to exactly + * specified [vstart:vend] interval and an alignment > PAGE_SIZE. + * With adjusted search length an allocation would not succeed. + */ + if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size)) + adjust_search_size = false; + + va = find_vmap_lowest_match(size, align, vstart, adjust_search_size); if (unlikely(!va)) return vend; @@ -1719,18 +1744,6 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) return true; } -/* - * Kick off a purge of the outstanding lazy areas. Don't bother if somebody - * is already purging. - */ -static void try_purge_vmap_area_lazy(void) -{ - if (mutex_trylock(&vmap_purge_lock)) { - __purge_vmap_area_lazy(ULONG_MAX, 0); - mutex_unlock(&vmap_purge_lock); - } -} - /* * Kick off a purge of the outstanding lazy areas. */ @@ -1742,6 +1755,20 @@ static void purge_vmap_area_lazy(void) mutex_unlock(&vmap_purge_lock); } +static void drain_vmap_area_work(struct work_struct *work) +{ + unsigned long nr_lazy; + + do { + mutex_lock(&vmap_purge_lock); + __purge_vmap_area_lazy(ULONG_MAX, 0); + mutex_unlock(&vmap_purge_lock); + + /* Recheck if further work is required. */ + nr_lazy = atomic_long_read(&vmap_lazy_nr); + } while (nr_lazy > lazy_max_pages()); +} + /* * Free a vmap area, caller ensuring that the area has been unmapped * and flush_cache_vunmap had been called for the correct range @@ -1768,7 +1795,7 @@ static void free_vmap_area_noflush(struct vmap_area *va) /* After this point, we may free va at any time */ if (unlikely(nr_lazy > lazy_max_pages())) - try_purge_vmap_area_lazy(); + schedule_work(&drain_vmap_work); } /* @@ -2145,7 +2172,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases); void vm_unmap_ram(const void *mem, unsigned int count) { unsigned long size = (unsigned long)count << PAGE_SHIFT; - unsigned long addr = (unsigned long)mem; + unsigned long addr = (unsigned long)kasan_reset_tag(mem); struct vmap_area *va; might_sleep(); @@ -2206,14 +2233,19 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) mem = (void *)addr; } - kasan_unpoison_vmalloc(mem, size); - if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, PAGE_SHIFT) < 0) { vm_unmap_ram(mem, count); return NULL; } + /* + * Mark the pages as accessible, now that they are mapped. + * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). + */ + mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL); + return mem; } EXPORT_SYMBOL(vm_map_ram); @@ -2439,10 +2471,20 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return NULL; } - kasan_unpoison_vmalloc((void *)va->va_start, requested_size); - setup_vmalloc_vm(area, va, flags, caller); + /* + * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a + * best-effort approach, as they can be mapped outside of vmalloc code. + * For VM_ALLOC mappings, the pages are marked as accessible after + * getting mapped in __vmalloc_node_range(). + * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). + */ + if (!(flags & VM_ALLOC)) + area->addr = kasan_unpoison_vmalloc(area->addr, requested_size, + KASAN_VMALLOC_PROT_NORMAL); + return area; } @@ -2526,7 +2568,7 @@ struct vm_struct *remove_vm_area(const void *addr) va->vm = NULL; spin_unlock(&vmap_area_lock); - kasan_free_shadow(vm); + kasan_free_module_shadow(vm); free_unmap_vmap_area(va); return vm; @@ -2925,7 +2967,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - const gfp_t orig_gfp_mask = gfp_mask; bool nofail = gfp_mask & __GFP_NOFAIL; unsigned long addr = (unsigned long)area->addr; unsigned long size = get_vm_area_size(area); @@ -2949,7 +2990,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } if (!area->pages) { - warn_alloc(orig_gfp_mask, NULL, + warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to allocated page array size %lu", nr_small_pages * PAGE_SIZE, array_size); free_vm_area(area); @@ -2959,8 +3000,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, set_vm_area_page_order(area, page_shift - PAGE_SHIFT); page_order = vm_area_page_order(area); - area->nr_pages = vm_area_alloc_pages(gfp_mask, node, - page_order, nr_small_pages, area->pages); + area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN, + node, page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); if (gfp_mask & __GFP_ACCOUNT) { @@ -2976,7 +3017,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * allocation request, free them via __vfree() if any. */ if (area->nr_pages != nr_small_pages) { - warn_alloc(orig_gfp_mask, NULL, + warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, page order %u, failed to allocate pages", area->nr_pages * PAGE_SIZE, page_order); goto fail; @@ -3004,7 +3045,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, memalloc_noio_restore(flags); if (ret < 0) { - warn_alloc(orig_gfp_mask, NULL, + warn_alloc(gfp_mask, NULL, "vmalloc error: size %lu, failed to map pages", area->nr_pages * PAGE_SIZE); goto fail; @@ -3051,7 +3092,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, const void *caller) { struct vm_struct *area; - void *addr; + void *ret; + kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE; unsigned long real_size = size; unsigned long real_align = align; unsigned int shift = PAGE_SHIFT; @@ -3104,10 +3146,50 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, goto fail; } - addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); - if (!addr) + /* + * Prepare arguments for __vmalloc_area_node() and + * kasan_unpoison_vmalloc(). + */ + if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { + if (kasan_hw_tags_enabled()) { + /* + * Modify protection bits to allow tagging. + * This must be done before mapping. + */ + prot = arch_vmap_pgprot_tagged(prot); + + /* + * Skip page_alloc poisoning and zeroing for physical + * pages backing VM_ALLOC mapping. Memory is instead + * poisoned and zeroed by kasan_unpoison_vmalloc(). + */ + gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO; + } + + /* Take note that the mapping is PAGE_KERNEL. */ + kasan_flags |= KASAN_VMALLOC_PROT_NORMAL; + } + + /* Allocate physical pages and map them into vmalloc space. */ + ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node); + if (!ret) goto fail; + /* + * Mark the pages as accessible, now that they are mapped. + * The init condition should match the one in post_alloc_hook() + * (except for the should_skip_init() check) to make sure that memory + * is initialized under the same conditions regardless of the enabled + * KASAN mode. + * Tag-based KASAN modes only assign tags to normal non-executable + * allocations, see __kasan_unpoison_vmalloc(). + */ + kasan_flags |= KASAN_VMALLOC_VM_ALLOC; + if (!want_init_on_free() && want_init_on_alloc(gfp_mask)) + kasan_flags |= KASAN_VMALLOC_INIT; + /* KASAN_VMALLOC_PROT_NORMAL already set if required. */ + area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags); + /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED * flag. It means that vm_struct is not fully initialized. @@ -3119,7 +3201,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!(vm_flags & VM_DEFER_KMEMLEAK)) kmemleak_vmalloc(area, size, gfp_mask); - return addr; + return area->addr; fail: if (shift > PAGE_SHIFT) { @@ -3404,6 +3486,8 @@ long vread(char *buf, char *addr, unsigned long count) unsigned long buflen = count; unsigned long n; + addr = kasan_reset_tag(addr); + /* Don't allow overflow */ if ((unsigned long) addr + count < count) count = -(unsigned long) addr; @@ -3789,9 +3873,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, for (area = 0; area < nr_vms; area++) { if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) goto err_free_shadow; - - kasan_unpoison_vmalloc((void *)vas[area]->va_start, - sizes[area]); } /* insert all vm's */ @@ -3804,6 +3885,16 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, } spin_unlock(&vmap_area_lock); + /* + * Mark allocated areas as accessible. Do it now as a best-effort + * approach, as they can be mapped outside of vmalloc code. + * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). + */ + for (area = 0; area < nr_vms; area++) + vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, + vms[area]->size, KASAN_VMALLOC_PROT_NORMAL); + kfree(vas); return vms; diff --git a/mm/vmscan.c b/mm/vmscan.c index 090bfb605ecf0..5f471c1e279fe 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -56,6 +56,7 @@ #include #include +#include #include "internal.h" @@ -989,17 +990,6 @@ static inline int is_page_cache_freeable(struct page *page) return page_count(page) - page_has_private(page) == 1 + page_cache_pins; } -static int may_write_to_inode(struct inode *inode) -{ - if (current->flags & PF_SWAPWRITE) - return 1; - if (!inode_write_congested(inode)) - return 1; - if (inode_to_bdi(inode) == current->backing_dev_info) - return 1; - return 0; -} - /* * We detected a synchronous write error writing a page out. Probably * -ENOSPC. We need to propagate that into the address_space for a subsequent @@ -1066,8 +1056,10 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) * forward progress (e.g. journalling workqueues or kthreads). */ if (!current_is_kswapd() && - current->flags & (PF_IO_WORKER|PF_KTHREAD)) + current->flags & (PF_IO_WORKER|PF_KTHREAD)) { + cond_resched(); return; + } /* * These figures are pulled out of thin air. @@ -1199,8 +1191,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_inode(mapping->host)) - return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { int res; @@ -1386,11 +1376,11 @@ static enum page_references page_check_references(struct page *page, referenced_page = TestClearPageReferenced(page); /* - * Mlock lost the isolation race with us. Let try_to_unmap() - * move the page to the unevictable list. + * The supposedly reclaimable page was found to be in a VM_LOCKED vma. + * Let the page, now marked Mlocked, be moved to the unevictable list. */ if (vm_flags & VM_LOCKED) - return PAGEREF_RECLAIM; + return PAGEREF_ACTIVATE; if (referenced_ptes) { /* @@ -1576,9 +1566,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, * end of the LRU a second time. */ mapping = page_mapping(page); - if (((dirty || writeback) && mapping && - inode_write_congested(mapping->host)) || - (writeback && PageReclaim(page))) + if (writeback && PageReclaim(page)) stat->nr_congested++; /* @@ -1729,7 +1717,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, mapping = page_mapping(page); } } else if (unlikely(PageTransHuge(page))) { - /* Split file THP */ + /* Split file/lazyfree THP */ if (split_huge_page_to_list(page, page_list)) goto keep_locked; } @@ -2377,9 +2365,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec, */ static int current_may_throttle(void) { - return !(current->flags & PF_LOCAL_THROTTLE) || - current->backing_dev_info == NULL || - bdi_write_congested(current->backing_dev_info); + return !(current->flags & PF_LOCAL_THROTTLE); } /* @@ -3955,6 +3941,13 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) return false; } +/* + * Keep the free pages on fast memory node a little more than the high + * watermark to accommodate the promoted pages. + */ +#define NUMA_BALANCING_PROMOTE_WATERMARK_DIV 4 +#define NUMA_BALANCING_PROMOTE_WATERMARK_MIN (10UL * 1024 * 1024 >> PAGE_SHIFT) + /* * Returns true if there is an eligible zone balanced for the request order * and highest_zoneidx @@ -3976,6 +3969,15 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) continue; mark = high_wmark_pages(zone); + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && + numa_demotion_enabled && + next_demotion_node(pgdat->node_id) != NUMA_NO_NODE) { + unsigned long promote_mark; + + promote_mark = max(NUMA_BALANCING_PROMOTE_WATERMARK_MIN, + mark / NUMA_BALANCING_PROMOTE_WATERMARK_DIV); + mark += promote_mark; + } if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) return true; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 4057372745d04..d5cc8d739fac1 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1242,6 +1242,9 @@ const char * const vmstat_text[] = { #ifdef CONFIG_SWAP "nr_swapcached", #endif +#ifdef CONFIG_NUMA_BALANCING + "pgpromote_success", +#endif /* enum writeback_stat_item counters */ "nr_dirty_threshold", @@ -1385,6 +1388,9 @@ const char * const vmstat_text[] = { #ifdef CONFIG_SWAP "swap_ra", "swap_ra_hit", +#ifdef CONFIG_KSM + "ksm_swpin_copy", +#endif #endif #ifdef CONFIG_X86 "direct_map_level2_splits", diff --git a/mm/zswap.c b/mm/zswap.c index cdf6950fcb2e3..3efd8cae315e7 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -120,11 +120,19 @@ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */ module_param_named(accept_threshold_percent, zswap_accept_thr_percent, uint, 0644); -/* Enable/disable handling same-value filled pages (enabled by default) */ +/* + * Enable/disable handling same-value filled pages (enabled by default). + * If disabled every page is considered non-same-value filled. + */ static bool zswap_same_filled_pages_enabled = true; module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, bool, 0644); +/* Enable/disable handling non-same-value filled pages (enabled by default) */ +static bool zswap_non_same_filled_pages_enabled = true; +module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, + bool, 0644); + /********************************* * data structures **********************************/ @@ -1147,6 +1155,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, kunmap_atomic(src); } + if (!zswap_non_same_filled_pages_enabled) { + ret = -EINVAL; + goto freepage; + } + /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); if (!entry->pool) { diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index b01c36a15d9dd..071b3304f5d7c 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -584,8 +584,7 @@ sub hash_show_words { our $zero_initializer = qr{(?:(?:0[xX])?0+$Int_type?|NULL|false)\b}; -our $logFunctions = qr{(?x: - printk(?:_ratelimited|_once|_deferred_once|_deferred|)| +our $logFunctionsCore = qr{(?x: (?:[a-z0-9]+_){1,2}(?:printk|emerg|alert|crit|err|warning|warn|notice|info|debug|dbg|vdbg|devel|cont|WARN)(?:_ratelimited|_once|)| TP_printk| WARN(?:_RATELIMIT|_ONCE|)| @@ -594,6 +593,11 @@ sub hash_show_words { seq_vprintf|seq_printf|seq_puts )}; +our $logFunctions = qr{(?x: + printk(?:_ratelimited|_once|_deferred_once|_deferred|)| + $logFunctionsCore +)}; + our $allocFunctions = qr{(?x: (?:(?:devm_)? (?:kv|k|v)[czm]alloc(?:_array)?(?:_node)? | @@ -5551,6 +5555,7 @@ sub process { defined($stat) && defined($cond) && $line =~ /\b(?:if|while|for)\s*\(/ && $line !~ /^.\s*#/) { my ($s, $c) = ($stat, $cond); + my $fixed_assign_in_if = 0; if ($c =~ /\bif\s*\(.*[^<>!=]=[^=].*/s) { if (ERROR("ASSIGN_IN_IF", @@ -5575,6 +5580,7 @@ sub process { $newline .= ')'; $newline .= " {" if (defined($brace)); fix_insert_line($fixlinenr + 1, $newline); + $fixed_assign_in_if = 1; } } } @@ -5598,8 +5604,20 @@ sub process { $stat_real = "[...]\n$stat_real"; } - ERROR("TRAILING_STATEMENTS", - "trailing statements should be on next line\n" . $herecurr . $stat_real); + if (ERROR("TRAILING_STATEMENTS", + "trailing statements should be on next line\n" . $herecurr . $stat_real) && + !$fixed_assign_in_if && + $cond_lines == 0 && + $fix && $perl_version_ok && + $fixed[$fixlinenr] =~ /^\+(\s*)((?:if|while|for)\s*$balanced_parens)\s*(.*)$/) { + my $indent = $1; + my $test = $2; + my $rest = rtrim($4); + if ($rest =~ /;$/) { + $fixed[$fixlinenr] = "\+$indent$test"; + fix_insert_line($fixlinenr + 1, "$indent\t$rest"); + } + } } } @@ -6298,8 +6316,7 @@ sub process { } # check for logging functions with KERN_ - if ($line !~ /printk(?:_ratelimited|_once)?\s*\(/ && - $line =~ /\b$logFunctions\s*\(.*\b(KERN_[A-Z]+)\b/) { + if ($line =~ /\b$logFunctionsCore\s*\(.*\b(KERN_[A-Z]+)\b/) { my $level = $1; if (WARN("UNNECESSARY_KERN_LEVEL", "Possible unnecessary $level\n" . $herecurr) && @@ -7418,6 +7435,13 @@ sub process { WARN("MODULE_LICENSE", "unknown module license " . $extracted_string . "\n" . $herecurr); } + if (!$file && $extracted_string eq '"GPL v2"') { + if (WARN("MODULE_LICENSE", + "Prefer \"GPL\" over \"GPL v2\" - see commit bf7fbeeae6db (\"module: Cure the MODULE_LICENSE \"GPL\" vs. \"GPL v2\" bogosity\")\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/\bMODULE_LICENSE\s*\(\s*"GPL v2"\s*\)/MODULE_LICENSE("GPL")/; + } + } } # check for sysctl duplicate constants diff --git a/tools/include/linux/gfp.h b/tools/include/linux/gfp.h index b238dbc9eb858..56eec4445bc9e 100644 --- a/tools/include/linux/gfp.h +++ b/tools/include/linux/gfp.h @@ -12,7 +12,6 @@ #define __GFP_FS 0x80u #define __GFP_NOWARN 0x200u #define __GFP_ZERO 0x8000u -#define __GFP_ATOMIC 0x80000u #define __GFP_ACCOUNT 0x100000u #define __GFP_DIRECT_RECLAIM 0x400000u #define __GFP_KSWAPD_RECLAIM 0x2000000u @@ -20,7 +19,7 @@ #define __GFP_RECLAIM (__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM) #define GFP_ZONEMASK 0x0fu -#define GFP_ATOMIC (__GFP_HIGH | __GFP_ATOMIC | __GFP_KSWAPD_RECLAIM) +#define GFP_ATOMIC (__GFP_HIGH | __GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 99d7ff9a8effe..e5b38d0b08fb5 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -640,7 +640,6 @@ static const struct { { "__GFP_HIGHMEM", "HM" }, { "GFP_DMA32", "D32" }, { "__GFP_HIGH", "H" }, - { "__GFP_ATOMIC", "_A" }, { "__GFP_IO", "I" }, { "__GFP_FS", "F" }, { "__GFP_NOWARN", "NWR" }, diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index d6a54a1d9d3a1..4eda7c7c15694 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -115,23 +115,35 @@ ifdef building_out_of_srctree override LDFLAGS = endif -ifneq ($(O),) - BUILD := $(O)/kselftest +top_srcdir ?= ../../.. + +ifeq ("$(origin O)", "command line") + KBUILD_OUTPUT := $(O) +endif + +ifneq ($(KBUILD_OUTPUT),) + # Make's built-in functions such as $(abspath ...), $(realpath ...) cannot + # expand a shell special character '~'. We use a somewhat tedious way here. + abs_objtree := $(shell cd $(top_srcdir) && mkdir -p $(KBUILD_OUTPUT) && cd $(KBUILD_OUTPUT) && pwd) + $(if $(abs_objtree),, \ + $(error failed to create output directory "$(KBUILD_OUTPUT)")) + # $(realpath ...) resolves symlinks + abs_objtree := $(realpath $(abs_objtree)) + BUILD := $(abs_objtree)/kselftest + KHDR_INCLUDES := -I${abs_objtree}/usr/include else - ifneq ($(KBUILD_OUTPUT),) - BUILD := $(KBUILD_OUTPUT)/kselftest - else - BUILD := $(shell pwd) - DEFAULT_INSTALL_HDR_PATH := 1 - endif + BUILD := $(CURDIR) + abs_srctree := $(shell cd $(top_srcdir) && pwd) + KHDR_INCLUDES := -I${abs_srctree}/usr/include + DEFAULT_INSTALL_HDR_PATH := 1 endif # Prepare for headers install -top_srcdir ?= ../../.. include $(top_srcdir)/scripts/subarch.include ARCH ?= $(SUBARCH) export KSFT_KHDR_INSTALL_DONE := 1 export BUILD +export KHDR_INCLUDES # set default goal to all, so make without a target runs all, even when # all isn't the first target in the file. @@ -156,7 +168,7 @@ khdr: ifeq (1,$(DEFAULT_INSTALL_HDR_PATH)) $(MAKE) --no-builtin-rules ARCH=$(ARCH) -C $(top_srcdir) headers_install else - $(MAKE) --no-builtin-rules INSTALL_HDR_PATH=$$BUILD/usr \ + $(MAKE) --no-builtin-rules INSTALL_HDR_PATH=$(abs_objtree)/usr \ ARCH=$(ARCH) -C $(top_srcdir) headers_install endif diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index 12c5e27d32c16..551affb437fe1 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -10,6 +10,7 @@ TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir TEST_FILES := Makefile TEST_GEN_PROGS += recursion-depth +TEST_GEN_PROGS += null-argv EXTRA_CLEAN := $(OUTPUT)/subdir.moved $(OUTPUT)/execveat.moved $(OUTPUT)/xxxxx* \ $(OUTPUT)/S_I*.test diff --git a/tools/testing/selftests/exec/null-argv.c b/tools/testing/selftests/exec/null-argv.c new file mode 100644 index 0000000000000..c19726e710d19 --- /dev/null +++ b/tools/testing/selftests/exec/null-argv.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Test that empty argvs are swapped out for a single empty string. */ +#include +#include +#include +#include + +#include "../kselftest.h" + +#define FORK(exec) \ +do { \ + pid = fork(); \ + if (pid == 0) { \ + /* Child */ \ + exec; /* Some kind of exec */ \ + perror("# " #exec); \ + return 1; \ + } \ + check_result(pid, #exec); \ +} while (0) + +void check_result(pid_t pid, const char *msg) +{ + int wstatus; + + if (pid == (pid_t)-1) { + perror("# fork"); + ksft_test_result_fail("fork failed: %s\n", msg); + return; + } + if (waitpid(pid, &wstatus, 0) < 0) { + perror("# waitpid"); + ksft_test_result_fail("waitpid failed: %s\n", msg); + return; + } + if (!WIFEXITED(wstatus)) { + ksft_test_result_fail("child did not exit: %s\n", msg); + return; + } + if (WEXITSTATUS(wstatus) != 0) { + ksft_test_result_fail("non-zero exit: %s\n", msg); + return; + } + ksft_test_result_pass("%s\n", msg); +} + +int main(int argc, char *argv[], char *envp[]) +{ + pid_t pid; + static char * const args[] = { NULL }; + static char * const str[] = { "", NULL }; + + /* argc counting checks */ + if (argc < 1) { + fprintf(stderr, "# FAIL: saw argc == 0 (old kernel?)\n"); + return 1; + } + if (argc != 1) { + fprintf(stderr, "# FAIL: unknown argc (%d)\n", argc); + return 1; + } + if (argv[0][0] == '\0') { + /* Good, we found a NULL terminated string at argv[0]! */ + return 0; + } + + /* Test runner. */ + ksft_print_header(); + ksft_set_plan(5); + + FORK(execve(argv[0], str, NULL)); + FORK(execve(argv[0], NULL, NULL)); + FORK(execve(argv[0], NULL, envp)); + FORK(execve(argv[0], args, NULL)); + FORK(execve(argv[0], args, envp)); + + ksft_exit(ksft_cnt.ksft_pass == ksft_plan); +} diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index 9a8c3700d7738..b8152c573e8aa 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -INCLUDES := -I../include -I../../ -I../../../../../usr/include/ \ - -I$(KBUILD_OUTPUT)/kselftest/usr/include -CFLAGS := $(CFLAGS) -g -O2 -Wall -D_GNU_SOURCE -pthread $(INCLUDES) +INCLUDES := -I../include -I../../ -I../../../../../usr/include/ +CFLAGS := $(CFLAGS) -g -O2 -Wall -D_GNU_SOURCE -pthread $(INCLUDES) $(KHDR_INCLUDES) LDLIBS := -lpthread -lrt HEADERS := \ diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index f1180987492c9..b8f248018174d 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -28,6 +28,7 @@ * * When all tests are finished, clean up and exit the program with one of: * + * ksft_finished(); * ksft_exit(condition); * ksft_exit_pass(); * ksft_exit_fail(); @@ -235,6 +236,15 @@ static inline int ksft_exit_fail(void) ksft_exit_fail(); \ } while (0) +/** + * ksft_finished() - Exit selftest with success if all tests passed + */ +#define ksft_finished() \ + ksft_exit(ksft_plan == \ + ksft_cnt.ksft_pass + \ + ksft_cnt.ksft_xfail + \ + ksft_cnt.ksft_xskip) + static inline int ksft_exit_fail_msg(const char *msg, ...) { int saved_errno = errno; diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index a1ce7dd40e3ef..1771e188dd448 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -151,7 +151,7 @@ endif CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \ -I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \ - -I$(= HMM_COHERENCE_DEVICE_ONE); +} + FIXTURE_SETUP(hmm) { self->page_size = sysconf(_SC_PAGE_SIZE); self->page_shift = ffs(self->page_size) - 1; - self->fd = hmm_open(0); + self->fd = hmm_open(variant->device_number); + if (self->fd < 0 && hmm_is_coherent_type(variant->device_number)) + SKIP(exit(0), "DEVICE_COHERENT not available"); ASSERT_GE(self->fd, 0); } @@ -95,9 +146,11 @@ FIXTURE_SETUP(hmm2) self->page_size = sysconf(_SC_PAGE_SIZE); self->page_shift = ffs(self->page_size) - 1; - self->fd0 = hmm_open(0); + self->fd0 = hmm_open(variant->device_number0); + if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0)) + SKIP(exit(0), "DEVICE_COHERENT not available"); ASSERT_GE(self->fd0, 0); - self->fd1 = hmm_open(1); + self->fd1 = hmm_open(variant->device_number1); ASSERT_GE(self->fd1, 0); } @@ -144,6 +197,7 @@ static int hmm_dmirror_cmd(int fd, } buffer->cpages = cmd.cpages; buffer->faults = cmd.faults; + buffer->zone_device_type = cmd.zone_device_type; return 0; } @@ -211,6 +265,20 @@ static void hmm_nanosleep(unsigned int n) nanosleep(&t, NULL); } +static int hmm_migrate_sys_to_dev(int fd, + struct hmm_buffer *buffer, + unsigned long npages) +{ + return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages); +} + +static int hmm_migrate_dev_to_sys(int fd, + struct hmm_buffer *buffer, + unsigned long npages) +{ + return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages); +} + /* * Simple NULL test of device open/close. */ @@ -875,7 +943,7 @@ TEST_F(hmm, migrate) ptr[i] = i; /* Migrate memory to device. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, npages); @@ -923,7 +991,7 @@ TEST_F(hmm, migrate_fault) ptr[i] = i; /* Migrate memory to device. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, npages); @@ -936,7 +1004,7 @@ TEST_F(hmm, migrate_fault) ASSERT_EQ(ptr[i], i); /* Migrate memory to the device again. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, npages); @@ -976,7 +1044,7 @@ TEST_F(hmm, migrate_shared) ASSERT_NE(buffer->ptr, MAP_FAILED); /* Migrate memory to device. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, -ENOENT); hmm_buffer_free(buffer); @@ -1015,7 +1083,7 @@ TEST_F(hmm2, migrate_mixed) p = buffer->ptr; /* Migrating a protected area should be an error. */ - ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages); ASSERT_EQ(ret, -EINVAL); /* Punch a hole after the first page address. */ @@ -1023,7 +1091,7 @@ TEST_F(hmm2, migrate_mixed) ASSERT_EQ(ret, 0); /* We expect an error if the vma doesn't cover the range. */ - ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 3); + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 3); ASSERT_EQ(ret, -EINVAL); /* Page 2 will be a read-only zero page. */ @@ -1055,13 +1123,13 @@ TEST_F(hmm2, migrate_mixed) /* Now try to migrate pages 2-5 to device 1. */ buffer->ptr = p + 2 * self->page_size; - ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 4); + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 4); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, 4); /* Page 5 won't be migrated to device 0 because it's on device 1. */ buffer->ptr = p + 5 * self->page_size; - ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_MIGRATE, buffer, 1); + ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); ASSERT_EQ(ret, -ENOENT); buffer->ptr = p; @@ -1070,8 +1138,12 @@ TEST_F(hmm2, migrate_mixed) } /* - * Migrate anonymous memory to device private memory and fault it back to system - * memory multiple times. + * Migrate anonymous memory to device memory and back to system memory + * multiple times. In case of private zone configuration, this is done + * through fault pages accessed by CPU. In case of coherent zone configuration, + * the pages from the device should be explicitly migrated back to system memory. + * The reason is Coherent device zone has coherent access by CPU, therefore + * it will not generate any page fault. */ TEST_F(hmm, migrate_multiple) { @@ -1107,8 +1179,7 @@ TEST_F(hmm, migrate_multiple) ptr[i] = i; /* Migrate memory to device. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, - npages); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, npages); @@ -1116,7 +1187,13 @@ TEST_F(hmm, migrate_multiple) for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) ASSERT_EQ(ptr[i], i); - /* Fault pages back to system memory and check them. */ + /* Migrate back to system memory and check them. */ + if (hmm_is_coherent_type(variant->device_number)) { + ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + } + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) ASSERT_EQ(ptr[i], i); @@ -1354,13 +1431,13 @@ TEST_F(hmm2, snapshot) /* Page 5 will be migrated to device 0. */ buffer->ptr = p + 5 * self->page_size; - ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_MIGRATE, buffer, 1); + ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, 1); /* Page 6 will be migrated to device 1. */ buffer->ptr = p + 6 * self->page_size; - ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 1); + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 1); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, 1); @@ -1377,9 +1454,16 @@ TEST_F(hmm2, snapshot) ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ); ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ); ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE); - ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL | - HMM_DMIRROR_PROT_WRITE); - ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE); + if (!hmm_is_coherent_type(variant->device_number0)) { + ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL | + HMM_DMIRROR_PROT_WRITE); + ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE); + } else { + ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | + HMM_DMIRROR_PROT_WRITE); + ASSERT_EQ(m[6], HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE | + HMM_DMIRROR_PROT_WRITE); + } hmm_buffer_free(buffer); } @@ -1685,4 +1769,82 @@ TEST_F(hmm, exclusive_cow) hmm_buffer_free(buffer); } +/* + * Test get user device pages through gup_test. Setting PIN_LONGTERM flag. + * This should trigger a migration back to system memory for both, private + * and coherent type pages. + * This test makes use of gup_test module. Make sure GUP_TEST_CONFIG is added + * to your configuration before you run it. + */ +TEST_F(hmm, hmm_gup_test) +{ + struct hmm_buffer *buffer; + struct gup_test gup; + int gup_fd; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + unsigned char *m; + + gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); + if (gup_fd == -1) + SKIP(return, "Skipping test, could not find gup_test driver"); + + npages = 4; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + gup.nr_pages_per_call = npages; + gup.addr = (unsigned long)buffer->ptr; + gup.gup_flags = FOLL_WRITE; + gup.size = size; + /* + * Calling gup_test ioctl. It will try to PIN_LONGTERM these device pages + * causing a migration back to system memory for both, private and coherent + * type pages. + */ + if (ioctl(gup_fd, PIN_LONGTERM_BENCHMARK, &gup)) { + perror("ioctl on PIN_LONGTERM_BENCHMARK\n"); + goto out_test; + } + + /* Take snapshot to make sure pages have been migrated to sys memory */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + m = buffer->mirror; + for (i = 0; i < npages; i++) + ASSERT_EQ(m[i], HMM_DMIRROR_PROT_WRITE); +out_test: + close(gup_fd); + hmm_buffer_free(buffer); +} TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c index 2a7c33631a298..1d689084a54ba 100644 --- a/tools/testing/selftests/vm/hugepage-mremap.c +++ b/tools/testing/selftests/vm/hugepage-mremap.c @@ -3,9 +3,10 @@ * hugepage-mremap: * * Example of remapping huge page memory in a user application using the - * mremap system call. Code assumes a hugetlbfs filesystem is mounted - * at './huge'. The amount of memory used by this test is decided by a command - * line argument in MBs. If missing, the default amount is 10MB. + * mremap system call. The path to a file in a hugetlbfs filesystem must + * be passed as the last argument to this test. The amount of memory used + * by this test in MBs can optionally be passed as an argument. If no memory + * amount is passed, the default amount is 10MB. * * To make sure the test triggers pmd sharing and goes through the 'unshare' * path in the mremap code use 1GB (1024) or more. @@ -25,7 +26,6 @@ #define DEFAULT_LENGTH_MB 10UL #define MB_TO_BYTES(x) (x * 1024 * 1024) -#define FILE_NAME "huge/hugepagefile" #define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC) #define FLAGS (MAP_SHARED | MAP_ANONYMOUS) @@ -107,17 +107,26 @@ static void register_region_with_uffd(char *addr, size_t len) int main(int argc, char *argv[]) { + size_t length; + + if (argc != 2 && argc != 3) { + printf("Usage: %s [length_in_MB] \n", argv[0]); + exit(1); + } + /* Read memory length as the first arg if valid, otherwise fallback to - * the default length. Any additional args are ignored. + * the default length. */ - size_t length = argc > 1 ? (size_t)atoi(argv[1]) : 0UL; + if (argc == 3) + length = argc > 2 ? (size_t)atoi(argv[1]) : 0UL; length = length > 0 ? length : DEFAULT_LENGTH_MB; length = MB_TO_BYTES(length); int ret = 0; - int fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755); + /* last arg is the hugetlb file name */ + int fd = open(argv[argc-1], O_CREAT | O_RDWR, 0755); if (fd < 0) { perror("Open failed"); @@ -169,5 +178,8 @@ int main(int argc, char *argv[]) munmap(addr, length); + close(fd); + unlink(argv[argc-1]); + return ret; } diff --git a/tools/testing/selftests/vm/hugepage-vmemmap.c b/tools/testing/selftests/vm/hugepage-vmemmap.c new file mode 100644 index 0000000000000..557bdbd4f87e8 --- /dev/null +++ b/tools/testing/selftests/vm/hugepage-vmemmap.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test case of using hugepage memory in a user application using the + * mmap system call with MAP_HUGETLB flag. Before running this program + * make sure the administrator has allocated enough default sized huge + * pages to cover the 2 MB allocation. + */ +#include +#include +#include +#include +#include + +#define MAP_LENGTH (2UL * 1024 * 1024) + +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0x40000 /* arch specific */ +#endif + +#define PAGE_SIZE 4096 + +#define PAGE_COMPOUND_HEAD (1UL << 15) +#define PAGE_COMPOUND_TAIL (1UL << 16) +#define PAGE_HUGE (1UL << 17) + +#define HEAD_PAGE_FLAGS (PAGE_COMPOUND_HEAD | PAGE_HUGE) +#define TAIL_PAGE_FLAGS (PAGE_COMPOUND_TAIL | PAGE_HUGE) + +#define PM_PFRAME_BITS 55 +#define PM_PFRAME_MASK ~((1UL << PM_PFRAME_BITS) - 1) + +/* + * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. + * That means the addresses starting with 0x800000... will need to be + * specified. Specifying a fixed address is not required on ppc64, i386 + * or x86_64. + */ +#ifdef __ia64__ +#define MAP_ADDR (void *)(0x8000000000000000UL) +#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) +#else +#define MAP_ADDR NULL +#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +#endif + +static void write_bytes(char *addr, size_t length) +{ + unsigned long i; + + for (i = 0; i < length; i++) + *(addr + i) = (char)i; +} + +static unsigned long virt_to_pfn(void *addr) +{ + int fd; + unsigned long pagemap; + + fd = open("/proc/self/pagemap", O_RDONLY); + if (fd < 0) + return -1UL; + + lseek(fd, (unsigned long)addr / PAGE_SIZE * sizeof(pagemap), SEEK_SET); + read(fd, &pagemap, sizeof(pagemap)); + close(fd); + + return pagemap & ~PM_PFRAME_MASK; +} + +static int check_page_flags(unsigned long pfn) +{ + int fd, i; + unsigned long pageflags; + + fd = open("/proc/kpageflags", O_RDONLY); + if (fd < 0) + return -1; + + lseek(fd, pfn * sizeof(pageflags), SEEK_SET); + + read(fd, &pageflags, sizeof(pageflags)); + if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) { + close(fd); + printf("Head page flags (%lx) is invalid\n", pageflags); + return -1; + } + + /* + * pages other than the first page must be tail and shouldn't be head; + * this also verifies kernel has correctly set the fake page_head to tail + * while hugetlb_free_vmemmap is enabled. + */ + for (i = 1; i < MAP_LENGTH / PAGE_SIZE; i++) { + read(fd, &pageflags, sizeof(pageflags)); + if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS || + (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) { + close(fd); + printf("Tail page flags (%lx) is invalid\n", pageflags); + return -1; + } + } + + close(fd); + + return 0; +} + +int main(int argc, char **argv) +{ + void *addr; + unsigned long pfn; + + addr = mmap(MAP_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* Trigger allocation of HugeTLB page. */ + write_bytes(addr, MAP_LENGTH); + + pfn = virt_to_pfn(addr); + if (pfn == -1UL) { + munmap(addr, MAP_LENGTH); + perror("virt_to_pfn"); + exit(1); + } + + printf("Returned address is %p whose pfn is %lx\n", addr, pfn); + + if (check_page_flags(pfn) < 0) { + munmap(addr, MAP_LENGTH); + perror("check_page_flags"); + exit(1); + } + + /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ + if (munmap(addr, MAP_LENGTH)) { + perror("munmap"); + exit(1); + } + + return 0; +} diff --git a/tools/testing/selftests/vm/memfd_secret.c b/tools/testing/selftests/vm/memfd_secret.c index 93e7e7ffed337..957b9e18c7295 100644 --- a/tools/testing/selftests/vm/memfd_secret.c +++ b/tools/testing/selftests/vm/memfd_secret.c @@ -282,7 +282,7 @@ int main(int argc, char *argv[]) close(fd); - ksft_exit(!ksft_get_fail_cnt()); + ksft_finished(); } #else /* __NR_memfd_secret */ diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 75d4017413944..e10d50e0b8e83 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -111,7 +111,19 @@ fi echo "-----------------------" echo "running hugepage-mremap" echo "-----------------------" -./hugepage-mremap 256 +./hugepage-mremap $mnt/huge_mremap +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi +rm -f $mnt/huge_mremap + +echo "------------------------" +echo "running hugepage-vmemmap" +echo "------------------------" +./hugepage-vmemmap if [ $? -ne 0 ]; then echo "[FAIL]" exitcode=1 diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/vm/test_hmm.sh index 0647b525a6256..539c9371e592a 100755 --- a/tools/testing/selftests/vm/test_hmm.sh +++ b/tools/testing/selftests/vm/test_hmm.sh @@ -40,11 +40,26 @@ check_test_requirements() load_driver() { - modprobe $DRIVER > /dev/null 2>&1 + if [ $# -eq 0 ]; then + modprobe $DRIVER > /dev/null 2>&1 + else + if [ $# -eq 2 ]; then + modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2 + > /dev/null 2>&1 + else + echo "Missing module parameters. Make sure pass"\ + "spm_addr_dev0 and spm_addr_dev1" + usage + fi + fi if [ $? == 0 ]; then major=$(awk "\$2==\"HMM_DMIRROR\" {print \$1}" /proc/devices) mknod /dev/hmm_dmirror0 c $major 0 mknod /dev/hmm_dmirror1 c $major 1 + if [ $# -eq 2 ]; then + mknod /dev/hmm_dmirror2 c $major 2 + mknod /dev/hmm_dmirror3 c $major 3 + fi fi } @@ -58,7 +73,7 @@ run_smoke() { echo "Running smoke test. Note, this test provides basic coverage." - load_driver + load_driver $1 $2 $(dirname "${BASH_SOURCE[0]}")/hmm-tests unload_driver } @@ -75,6 +90,9 @@ usage() echo "# Smoke testing" echo "./${TEST_NAME}.sh smoke" echo + echo "# Smoke testing with SPM enabled" + echo "./${TEST_NAME}.sh smoke " + echo exit 0 } @@ -84,7 +102,7 @@ function run_test() usage else if [ "$1" = "smoke" ]; then - run_smoke + run_smoke $2 $3 else usage fi diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 2f49c9af1b582..50476ec6c0704 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -119,6 +119,9 @@ struct uffd_stats { ~(unsigned long)(sizeof(unsigned long long) \ - 1))) +#define swap(a, b) \ + do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) + const char *examples = "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" "./userfaultfd anon 100 99999\n\n" diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 9ebb84a9c7310..c8ec2d6b314de 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -20,22 +20,27 @@ #include #include #include +#include struct block_list { char *txt; + char *stacktrace; int len; int num; int page_num; + pid_t pid; + __u64 ts_nsec; + __u64 free_ts_nsec; }; -static int sort_by_memory; static regex_t order_pattern; +static regex_t pid_pattern; +static regex_t ts_nsec_pattern; +static regex_t free_ts_nsec_pattern; static struct block_list *list; static int list_size; static int max_size; -struct block_list *block_head; - int read_block(char *buf, int buf_size, FILE *fin) { char *curr = buf, *const buf_end = buf + buf_size; @@ -58,6 +63,13 @@ static int compare_txt(const void *p1, const void *p2) return strcmp(l1->txt, l2->txt); } +static int compare_stacktrace(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return strcmp(l1->stacktrace, l2->stacktrace); +} + static int compare_num(const void *p1, const void *p2) { const struct block_list *l1 = p1, *l2 = p2; @@ -72,34 +84,124 @@ static int compare_page_num(const void *p1, const void *p2) return l2->page_num - l1->page_num; } -static int get_page_num(char *buf) +static int compare_pid(const void *p1, const void *p2) { - int err, val_len, order_val; - char order_str[4] = {0}; - char *endptr; + const struct block_list *l1 = p1, *l2 = p2; + + return l1->pid - l2->pid; +} + +static int compare_ts(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->ts_nsec < l2->ts_nsec ? -1 : 1; +} + +static int compare_free_ts(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->free_ts_nsec < l2->free_ts_nsec ? -1 : 1; +} + +static int search_pattern(regex_t *pattern, char *pattern_str, char *buf) +{ + int err, val_len; regmatch_t pmatch[2]; - err = regexec(&order_pattern, buf, 2, pmatch, REG_NOTBOL); + err = regexec(pattern, buf, 2, pmatch, REG_NOTBOL); if (err != 0 || pmatch[1].rm_so == -1) { - printf("no order pattern in %s\n", buf); - return 0; + printf("no matching pattern in %s\n", buf); + return -1; } val_len = pmatch[1].rm_eo - pmatch[1].rm_so; - if (val_len > 2) /* max_order should not exceed 2 digits */ - goto wrong_order; - memcpy(order_str, buf + pmatch[1].rm_so, val_len); + memcpy(pattern_str, buf + pmatch[1].rm_so, val_len); + + return 0; +} +static void check_regcomp(regex_t *pattern, const char *regex) +{ + int err; + + err = regcomp(pattern, regex, REG_EXTENDED | REG_NEWLINE); + if (err != 0 || pattern->re_nsub != 1) { + printf("Invalid pattern %s code %d\n", regex, err); + exit(1); + } +} + +# define FIELD_BUFF 25 + +static int get_page_num(char *buf) +{ + int order_val; + char order_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&order_pattern, order_str, buf); errno = 0; order_val = strtol(order_str, &endptr, 10); - if (errno != 0 || endptr == order_str || *endptr != '\0') - goto wrong_order; + if (order_val > 64 || errno != 0 || endptr == order_str || *endptr != '\0') { + printf("wrong order in follow buf:\n%s\n", buf); + return 0; + } return 1 << order_val; +} + +static pid_t get_pid(char *buf) +{ + pid_t pid; + char pid_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&pid_pattern, pid_str, buf); + errno = 0; + pid = strtol(pid_str, &endptr, 10); + if (errno != 0 || endptr == pid_str || *endptr != '\0') { + printf("wrong/invalid pid in follow buf:\n%s\n", buf); + return -1; + } + + return pid; -wrong_order: - printf("wrong order in follow buf:\n%s\n", buf); - return 0; +} + +static __u64 get_ts_nsec(char *buf) +{ + __u64 ts_nsec; + char ts_nsec_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&ts_nsec_pattern, ts_nsec_str, buf); + errno = 0; + ts_nsec = strtoull(ts_nsec_str, &endptr, 10); + if (errno != 0 || endptr == ts_nsec_str || *endptr != '\0') { + printf("wrong ts_nsec in follow buf:\n%s\n", buf); + return -1; + } + + return ts_nsec; +} + +static __u64 get_free_ts_nsec(char *buf) +{ + __u64 free_ts_nsec; + char free_ts_nsec_str[FIELD_BUFF] = {0}; + char *endptr; + + search_pattern(&free_ts_nsec_pattern, free_ts_nsec_str, buf); + errno = 0; + free_ts_nsec = strtoull(free_ts_nsec_str, &endptr, 10); + if (errno != 0 || endptr == free_ts_nsec_str || *endptr != '\0') { + printf("wrong free_ts_nsec in follow buf:\n%s\n", buf); + return -1; + } + + return free_ts_nsec; } static void add_list(char *buf, int len) @@ -121,6 +223,10 @@ static void add_list(char *buf, int len) list[list_size].page_num = get_page_num(buf); memcpy(list[list_size].txt, buf, len); list[list_size].txt[len] = 0; + list[list_size].stacktrace = strchr(list[list_size].txt, '\n') ?: ""; + list[list_size].pid = get_pid(buf); + list[list_size].ts_nsec = get_ts_nsec(buf); + list[list_size].free_ts_nsec = get_free_ts_nsec(buf); list_size++; if (list_size % 1000 == 0) { printf("loaded %d\r", list_size); @@ -132,25 +238,55 @@ static void add_list(char *buf, int len) static void usage(void) { - printf("Usage: ./page_owner_sort [-m] \n" - "-m Sort by total memory. If this option is unset, sort by times\n" + printf("Usage: ./page_owner_sort [OPTIONS] \n" + "-m Sort by total memory.\n" + "-s Sort by the stack trace.\n" + "-t Sort by times (default).\n" + "-p Sort by pid.\n" + "-a Sort by memory allocate time.\n" + "-r Sort by memory release time.\n" + "-c Cull by comparing stacktrace instead of total block.\n" + "-f Filter out the information of blocks whose memory has not been released.\n" ); } int main(int argc, char **argv) { + int (*cmp)(const void *, const void *) = compare_num; + int cull_st = 0; + int filter = 0; FILE *fin, *fout; char *buf; int ret, i, count; struct block_list *list2; struct stat st; - int err; int opt; - while ((opt = getopt(argc, argv, "m")) != -1) + while ((opt = getopt(argc, argv, "acfmprst")) != -1) switch (opt) { + case 'a': + cmp = compare_ts; + break; + case 'c': + cull_st = 1; + break; + case 'f': + filter = 1; + break; case 'm': - sort_by_memory = 1; + cmp = compare_page_num; + break; + case 'p': + cmp = compare_pid; + break; + case 'r': + cmp = compare_free_ts; + break; + case 's': + cmp = compare_stacktrace; + break; + case 't': + cmp = compare_num; break; default: usage(); @@ -170,13 +306,10 @@ int main(int argc, char **argv) exit(1); } - err = regcomp(&order_pattern, "order\\s*([0-9]*),", REG_EXTENDED|REG_NEWLINE); - if (err != 0 || order_pattern.re_nsub != 1) { - printf("%s: Invalid pattern 'order\\s*([0-9]*),' code %d\n", - argv[0], err); - exit(1); - } - + check_regcomp(&order_pattern, "order\\s*([0-9]*),"); + check_regcomp(&pid_pattern, "pid\\s*([0-9]*),"); + check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,"); + check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns"); fstat(fileno(fin), &st); max_size = st.st_size / 100; /* hack ... */ @@ -199,7 +332,10 @@ int main(int argc, char **argv) printf("sorting ....\n"); - qsort(list, list_size, sizeof(list[0]), compare_txt); + if (cull_st == 1) + qsort(list, list_size, sizeof(list[0]), compare_stacktrace); + else + qsort(list, list_size, sizeof(list[0]), compare_txt); list2 = malloc(sizeof(*list) * list_size); if (!list2) { @@ -209,9 +345,11 @@ int main(int argc, char **argv) printf("culling\n"); + long offset = cull_st ? &list[0].stacktrace - &list[0].txt : 0; + for (i = count = 0; i < list_size; i++) { if (count == 0 || - strcmp(list2[count-1].txt, list[i].txt) != 0) { + strcmp(*(&list2[count-1].txt+offset), *(&list[i].txt+offset)) != 0) { list2[count++] = list[i]; } else { list2[count-1].num += list[i].num; @@ -219,15 +357,17 @@ int main(int argc, char **argv) } } - if (sort_by_memory) - qsort(list2, count, sizeof(list[0]), compare_page_num); - else - qsort(list2, count, sizeof(list[0]), compare_num); + qsort(list2, count, sizeof(list[0]), cmp); - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { + if (filter == 1 && list2[i].free_ts_nsec != 0) + continue; fprintf(fout, "%d times, %d pages:\n%s\n", list2[i].num, list2[i].page_num, list2[i].txt); - + } regfree(&order_pattern); + regfree(&pid_pattern); + regfree(&ts_nsec_pattern); + regfree(&free_ts_nsec_pattern); return 0; }