diff --git a/.mailmap b/.mailmap index 71577c3962521..38255d412f0b3 100644 --- a/.mailmap +++ b/.mailmap @@ -78,6 +78,7 @@ Boris Brezillon Boris Brezillon Boris Brezillon Boris Brezillon +Brendan Higgins Brian Avery Brian King Brian Silverman @@ -230,7 +231,7 @@ Kees Cook Keith Busch Keith Busch Kenneth W Chen -Kirill Tkhai +Kirill Tkhai Konstantin Khlebnikov Konstantin Khlebnikov Koushik diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 96a043d9255af..be4a77baf7841 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1237,6 +1237,13 @@ PAGE_SIZE multiple when read back. the target cgroup. If less bytes are reclaimed than the specified amount, -EAGAIN is returned. + Please note that the proactive reclaim (triggered by this + interface) is not meant to indicate memory pressure on the + memory cgroup. Therefore socket memory balancing triggered by + the memory reclaim normally is not exercised in this case. + This means that the networking layer will not adapt based on + reclaim induced by memory.reclaim. + memory.peak A read-only single value file which exists on non-root cgroups. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 963a4358ed4fd..98e5cb91faab1 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1735,12 +1735,13 @@ hugetlb_free_vmemmap= [KNL] Reguires CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP enabled. + Control if HugeTLB Vmemmap Optimization (HVO) is enabled. Allows heavy hugetlb users to free up some more memory (7 * PAGE_SIZE for each 2MB hugetlb page). - Format: { [oO][Nn]/Y/y/1 | [oO][Ff]/N/n/0 (default) } + Format: { on | off (default) } - [oO][Nn]/Y/y/1: enable the feature - [oO][Ff]/N/n/0: disable the feature + on: enable HVO + off: disable HVO Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y, the default is on. diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst index a90330d0a8373..8e2727dc18d4d 100644 --- a/Documentation/admin-guide/mm/hugetlbpage.rst +++ b/Documentation/admin-guide/mm/hugetlbpage.rst @@ -164,8 +164,8 @@ default_hugepagesz will all result in 256 2M huge pages being allocated. Valid default huge page size is architecture dependent. hugetlb_free_vmemmap - When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables optimizing - unused vmemmap pages associated with each HugeTLB page. + When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables HugeTLB + Vmemmap Optimization (HVO). When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages`` indicates the current number of pre-allocated huge pages of the default size. diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index 0f56ecd8ac054..a3c9e8ad8fa0d 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -653,8 +653,8 @@ block might fail: - Concurrent activity that operates on the same physical memory area, such as allocating gigantic pages, can result in temporary offlining failures. -- Out of memory when dissolving huge pages, especially when freeing unused - vmemmap pages associated with each hugetlb page is enabled. +- Out of memory when dissolving huge pages, especially when HugeTLB Vmemmap + Optimization (HVO) is enabled. 
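The savings quoted above ("7 * PAGE_SIZE for each 2MB hugetlb page") follow directly from the vmemmap arithmetic spelled out later in Documentation/mm/vmemmap_dedup.rst. Below is a minimal, illustration-only sketch of that calculation, not part of the patch itself, assuming the x86-64 defaults named in that document (4KB base pages, 64-byte struct page):

.. code-block:: c

    #include <stdio.h>

    int main(void)
    {
        /* Assumed x86-64 defaults taken from the documentation text. */
        unsigned long page_size      = 4096;      /* base page size             */
        unsigned long hugepage_size  = 2UL << 20; /* 2MB HugeTLB page           */
        unsigned long struct_page_sz = 64;        /* sizeof(struct page)        */

        /* struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE */
        unsigned long vmemmap_pages =
            hugepage_size / page_size * struct_page_sz / page_size;

        /* HVO keeps the head vmemmap page and remaps the tail pages to it. */
        printf("vmemmap pages per 2MB HugeTLB page: %lu, freed by HVO: %lu\n",
               vmemmap_pages, vmemmap_pages - 1);
        return 0;
    }

Compiled and run, this prints 8 vmemmap pages per 2MB HugeTLB page, 7 of which HVO returns to the buddy allocator, matching the figure in the kernel-parameters text above.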
Offlining code may be able to migrate huge page contents, but may not be able to dissolve the source huge page because it fails allocating (unmovable) pages diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index f74f722ad7028..9b833e439f097 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -569,8 +569,7 @@ This knob is not available when the size of 'struct page' (a structure defined in include/linux/mm_types.h) is not power of two (an unusual system config could result in this). -Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages -associated with each HugeTLB page. +Enable (set to 1) or disable (set to 0) HugeTLB Vmemmap Optimization (HVO). Once enabled, the vmemmap pages of subsequent allocation of HugeTLB pages from buddy allocator will be optimized (7 pages per 2MB HugeTLB page and 4095 pages diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index e7aafc82be999..47e95dbc820d5 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -1891,13 +1891,14 @@ if precise results are needed. 3.8 /proc//fdinfo/ - Information about opened file --------------------------------------------------------------- This file provides information associated with an opened file. The regular -files have at least four fields -- 'pos', 'flags', 'mnt_id' and 'ino'. +files have at least five fields -- 'pos', 'flags', 'mnt_id', 'ino', and 'size'. + The 'pos' represents the current offset of the opened file in decimal form [see lseek(2) for details], 'flags' denotes the octal O_xxx mask the file has been created with [see open(2) for details] and 'mnt_id' represents mount ID of the file system containing the opened file [see 3.5 /proc//mountinfo for details]. 'ino' represents the inode number of -the file. +the file, and 'size' represents the size of the file in bytes. A typical output is:: @@ -1905,11 +1906,15 @@ A typical output is:: flags: 0100002 mnt_id: 19 ino: 63107 + size: 0 All locks associated with a file descriptor are shown in its fdinfo too:: lock: 1: FLOCK ADVISORY WRITE 359 00:13:11691 0 EOF +Files with anonymous inodes have an additional 'path' field which represents +the anonymous file path. + The files such as eventfd, fsnotify, signalfd, epoll among the regular pos/flags pair provide additional information particular to the objects they represent. @@ -1922,6 +1927,8 @@ Eventfd files flags: 04002 mnt_id: 9 ino: 63107 + size: 0 + path: anon_inode:[eventfd] eventfd-count: 5a where 'eventfd-count' is hex value of a counter. @@ -1935,6 +1942,8 @@ Signalfd files flags: 04002 mnt_id: 9 ino: 63107 + size: 0 + path: anon_inode:[signalfd] sigmask: 0000000000000200 where 'sigmask' is hex value of the signal mask associated @@ -1949,6 +1958,8 @@ Epoll files flags: 02 mnt_id: 9 ino: 63107 + size: 0 + path: anon_inode:[eventpoll] tfd: 5 events: 1d data: ffffffffffffffff pos:0 ino:61af sdev:7 where 'tfd' is a target file descriptor number in decimal form, @@ -1967,6 +1978,8 @@ For inotify files the format is the following:: flags: 02000000 mnt_id: 9 ino: 63107 + size: 0 + path: anon_inode:inotify inotify wd:3 ino:9e7e sdev:800013 mask:800afce ignored_mask:0 fhandle-bytes:8 fhandle-type:1 f_handle:7e9e0000640d1b6d where 'wd' is a watch descriptor in decimal form, i.e. 
a target file @@ -1990,6 +2003,8 @@ For fanotify files the format is:: flags: 02 mnt_id: 9 ino: 63107 + size: 0 + path: anon_inode:[fanotify] fanotify flags:10 event-flags:0 fanotify mnt_id:12 mflags:40 mask:38 ignored_mask:40000003 fanotify ino:4f969 sdev:800013 mflags:0 mask:3b ignored_mask:40000000 fhandle-bytes:8 fhandle-type:1 f_handle:69f90400c275b5b4 @@ -2015,6 +2030,8 @@ Timerfd files flags: 02 mnt_id: 9 ino: 63107 + size: 0 + path: anon_inode:[timerfd] clockid: 0 ticks: 0 settime flags: 01 @@ -2039,6 +2056,7 @@ DMA Buffer files mnt_id: 9 ino: 63107 size: 32768 + path: /dmabuf: count: 2 exp_name: system-heap diff --git a/Documentation/mm/balance.rst b/Documentation/mm/balance.rst index 6a1fadf3e1735..e38e9d83c1c72 100644 --- a/Documentation/mm/balance.rst +++ b/Documentation/mm/balance.rst @@ -6,7 +6,7 @@ Memory Balancing Started Jan 2000 by Kanoj Sarcar -Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as +Memory balancing is needed for !__GFP_HIGH and !__GFP_KSWAPD_RECLAIM as well as for non __GFP_IO allocations. The first reason why a caller may avoid reclaim is that the caller can not diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst index c9c495f62d123..a4b12ff906c4d 100644 --- a/Documentation/mm/vmemmap_dedup.rst +++ b/Documentation/mm/vmemmap_dedup.rst @@ -7,23 +7,25 @@ A vmemmap diet for HugeTLB and Device DAX HugeTLB ======= -The struct page structures (page structs) are used to describe a physical -page frame. By default, there is a one-to-one mapping from a page frame to -it's corresponding page struct. +This section is to explain how HugeTLB Vmemmap Optimization (HVO) works. + +The ``struct page`` structures are used to describe a physical page frame. By +default, there is a one-to-one mapping from a page frame to it's corresponding +``struct page``. HugeTLB pages consist of multiple base page size pages and is supported by many architectures. See Documentation/admin-guide/mm/hugetlbpage.rst for more details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB are currently supported. Since the base page size on x86 is 4KB, a 2MB HugeTLB page consists of 512 base pages and a 1GB HugeTLB page consists of 4096 base pages. -For each base page, there is a corresponding page struct. +For each base page, there is a corresponding ``struct page``. -Within the HugeTLB subsystem, only the first 4 page structs are used to -contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides -this upper limit. The only 'useful' information in the remaining page structs +Within the HugeTLB subsystem, only the first 4 ``struct page`` are used to +contain unique information about a HugeTLB page. ``__NR_USED_SUBPAGE`` provides +this upper limit. The only 'useful' information in the remaining ``struct page`` is the compound_head field, and this field is the same for all tail pages. -By removing redundant page structs for HugeTLB pages, memory can be returned +By removing redundant ``struct page`` for HugeTLB pages, memory can be returned to the buddy allocator for other uses. Different architectures support different HugeTLB pages. For example, the @@ -44,7 +46,7 @@ page. 
| | 64KB | 2MB | 512MB | 16GB | | +--------------+-----------+-----------+-----------+-----------+-----------+ -When the system boot up, every HugeTLB page has more than one struct page +When the system boot up, every HugeTLB page has more than one ``struct page`` structs which size is (unit: pages):: struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE @@ -74,10 +76,10 @@ Where n is how many pte entries which one page can contains. So the value of n is (PAGE_SIZE / sizeof(pte_t)). This optimization only supports 64-bit system, so the value of sizeof(pte_t) -is 8. And this optimization also applicable only when the size of struct page -is a power of two. In most cases, the size of struct page is 64 bytes (e.g. +is 8. And this optimization also applicable only when the size of ``struct page`` +is a power of two. In most cases, the size of ``struct page`` is 64 bytes (e.g. x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the -size of struct page structs of it is 8 page frames which size depends on the +size of ``struct page`` structs of it is 8 page frames which size depends on the size of the base page. For the HugeTLB page of the pud level mapping, then:: @@ -86,7 +88,7 @@ For the HugeTLB page of the pud level mapping, then:: = PAGE_SIZE / 8 * 8 (pages) = PAGE_SIZE (pages) -Where the struct_size(pmd) is the size of the struct page structs of a +Where the struct_size(pmd) is the size of the ``struct page`` structs of a HugeTLB page of the pmd level mapping. E.g.: A 2MB HugeTLB page on x86_64 consists in 8 page frames while 1GB @@ -94,7 +96,7 @@ HugeTLB page consists in 4096. Next, we take the pmd level mapping of the HugeTLB page as an example to show the internal implementation of this optimization. There are 8 pages -struct page structs associated with a HugeTLB page which is pmd mapped. +``struct page`` structs associated with a HugeTLB page which is pmd mapped. Here is how things look before optimization:: @@ -122,10 +124,10 @@ Here is how things look before optimization:: +-----------+ The value of page->compound_head is the same for all tail pages. The first -page of page structs (page 0) associated with the HugeTLB page contains the 4 -page structs necessary to describe the HugeTLB. The only use of the remaining -pages of page structs (page 1 to page 7) is to point to page->compound_head. -Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs +page of ``struct page`` (page 0) associated with the HugeTLB page contains the 4 +``struct page`` necessary to describe the HugeTLB. The only use of the remaining +pages of ``struct page`` (page 1 to page 7) is to point to page->compound_head. +Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of ``struct page`` will be used for each HugeTLB page. This will allow us to free the remaining 7 pages to the buddy allocator. @@ -167,13 +169,37 @@ entries that can be cached in a single TLB entry. The contiguous bit is used to increase the mapping size at the pmd and pte (last) level. So this type of HugeTLB page can be optimized only when its -size of the struct page structs is greater than 1 page. +size of the ``struct page`` structs is greater than **1** page. Notice: The head vmemmap page is not freed to the buddy allocator and all tail vmemmap pages are mapped to the head vmemmap page frame. So we can see -more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page) -associated with each HugeTLB page. 
The compound_head() can handle this -correctly (more details refer to the comment above compound_head()). +more than one ``struct page`` struct with ``PG_head`` (e.g. 8 per 2 MB HugeTLB +page) associated with each HugeTLB page. The ``compound_head()`` can handle +this correctly. There is only **one** head ``struct page``, the tail +``struct page`` with ``PG_head`` are fake head ``struct page``. We need an +approach to distinguish between those two different types of ``struct page`` so +that ``compound_head()`` can return the real head ``struct page`` when the +parameter is the tail ``struct page`` but with ``PG_head``. The following code +snippet describes how to distinguish between real and fake head ``struct page``. + +.. code-block:: c + + if (test_bit(PG_head, &page->flags)) { + unsigned long head = READ_ONCE(page[1].compound_head); + + if (head & 1) { + if (head == (unsigned long)page + 1) + /* head struct page */ + else + /* tail struct page */ + } else { + /* head struct page */ + } + } + +We can safely access the field of the **page[1]** with ``PG_head`` because the +page is a compound page composed with at least two contiguous pages. +The implementation refers to ``page_fixed_fake_head()``. Device DAX ========== @@ -187,7 +213,7 @@ PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64). The differences with HugeTLB are relatively minor. -It only use 3 page structs for storing all information as opposed +It only use 3 ``struct page`` for storing all information as opposed to 4 on HugeTLB pages. There's no remapping of vmemmap given that device-dax memory is not part of diff --git a/MAINTAINERS b/MAINTAINERS index 8d2417ddf40fb..56af0182a93bc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11074,6 +11074,7 @@ F: fs/smbfs_common/ KERNEL UNIT TESTING FRAMEWORK (KUnit) M: Brendan Higgins +M: David Gow L: linux-kselftest@vger.kernel.org L: kunit-dev@googlegroups.com S: Maintained diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 4aa996423b0d1..763929e814e9a 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -76,6 +76,8 @@ #define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */ +#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index cb64e4797d2a8..f4e20f75438f8 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -497,12 +497,6 @@ smp_cpus_done(unsigned int max_cpus) ((bogosum + 2500) / (5000/HZ)) % 100); } -int -setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - static void send_ipi_message(const struct cpumask *to_whom, enum ipi_message_type operation) { diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c index 6c22a53711e9b..ad93fe6e4b77d 100644 --- a/arch/arc/kernel/smp.c +++ b/arch/arc/kernel/smp.c @@ -232,14 +232,6 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) return 0; } -/* - * not supported here - */ -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - /*****************************************************************************/ /* Inter Processor Interrupt Handling */ /*****************************************************************************/ diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 73fc645fc4c7e..978db2d96b446 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -787,14 +787,6 @@ void panic_smp_self_stop(void) cpu_relax(); } -/* - * 
not supported here - */ -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - #ifdef CONFIG_CPU_FREQ static DEFINE_PER_CPU(unsigned long, l_p_j_ref); diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 4e4c171f070bc..9dd08cd339c3f 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -381,6 +381,15 @@ static inline bool defer_reserve_crashkernel(void) # define INIT_MEMBLOCK_RESERVED_REGIONS (INIT_MEMBLOCK_REGIONS + NR_CPUS + 1) #endif +/* + * memory regions which marked with flag MEMBLOCK_NOMAP(for example, the memory + * of the EFI_UNUSABLE_MEMORY type) may divide a continuous memory block into + * multiple parts. As a result, the number of memory regions is large. + */ +#ifdef CONFIG_EFI +#define INIT_MEMBLOCK_MEMORY_REGIONS (INIT_MEMBLOCK_REGIONS * 8) +#endif + #include #endif /* __ASM_MEMORY_H */ diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 62ed361a4376b..ffc5d76cf6955 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -1078,14 +1078,6 @@ bool smp_crash_stop_failed(void) } #endif -/* - * not supported here - */ -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - static bool have_cpu_die(void) { #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index fc4f710e9820f..5f9379b3c8c87 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -76,17 +76,10 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache); void flush_dcache_page(struct page *page) { /* - * Only the head page's flags of HugeTLB can be cleared since the tail - * vmemmap pages associated with each HugeTLB page are mapped with - * read-only when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is enabled (more - * details can refer to vmemmap_remap_pte()). Although - * __sync_icache_dcache() only set PG_dcache_clean flag on the head - * page struct, there is more than one page struct with PG_dcache_clean - * associated with the HugeTLB page since the head vmemmap page frame - * is reused (more details can refer to the comments above - * page_fixed_fake_head()). + * HugeTLB pages are always fully mapped and only head page will be + * set PG_dcache_clean (see comments in __sync_icache_dcache()). 
*/ - if (hugetlb_optimize_vmemmap_enabled() && PageHuge(page)) + if (PageHuge(page)) page = compound_head(page); if (test_bit(PG_dcache_clean, &page->flags)) diff --git a/arch/csky/kernel/smp.c b/arch/csky/kernel/smp.c index 6bb38bc2f39b4..4b605aa2e1d65 100644 --- a/arch/csky/kernel/smp.c +++ b/arch/csky/kernel/smp.c @@ -243,11 +243,6 @@ void __init smp_cpus_done(unsigned int max_cpus) { } -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - void csky_start_secondary(void) { struct mm_struct *mm = &init_mm; diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c index 619c56420aa0c..4ba93e59370c4 100644 --- a/arch/hexagon/kernel/smp.c +++ b/arch/hexagon/kernel/smp.c @@ -240,11 +240,6 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask) send_ipi(mask, IPI_CALL_FUNC); } -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - void smp_start_cpus(void) { int i; diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index 7b7b64eb31297..e2cc59db86bc2 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -333,9 +333,3 @@ smp_send_stop (void) { send_IPI_allbutself(IPI_CPU_STOP); } - -int -setup_profiling_timer (unsigned int multiplier) -{ - return -EINVAL; -} diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 1be428663c102..c6e1fc77c9968 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -103,6 +103,8 @@ #define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */ +#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/kernel/mips-mt.c b/arch/mips/kernel/mips-mt.c index d5f7362e8c245..dc023a9798035 100644 --- a/arch/mips/kernel/mips-mt.c +++ b/arch/mips/kernel/mips-mt.c @@ -230,7 +230,7 @@ void mips_mt_set_cpuoptions(void) struct class *mt_class; -static int __init mt_init(void) +static int __init mips_mt_init(void) { struct class *mtc; @@ -243,4 +243,4 @@ static int __init mt_init(void) return 0; } -subsys_initcall(mt_init); +subsys_initcall(mips_mt_init); diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c index 27041db2c8b0f..e1419095a6f0a 100644 --- a/arch/openrisc/kernel/smp.c +++ b/arch/openrisc/kernel/smp.c @@ -197,12 +197,6 @@ void smp_send_stop(void) smp_call_function(stop_this_cpu, NULL, 0); } -/* not supported, yet */ -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - void __init set_smp_cross_call(void (*fn)(const struct cpumask *, unsigned int)) { smp_cross_call = fn; diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index a7ea3204a5faa..22133a6a506ef 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -70,6 +70,8 @@ #define MADV_WIPEONFORK 71 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 72 /* Undo MADV_WIPEONFORK */ +#define MADV_COLLAPSE 73 /* Synchronous hugepage collapse */ + #define MADV_HWPOISON 100 /* poison a page for testing */ #define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */ diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 24d0744c3b3ab..7dbd92cafae38 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -513,10 +513,3 @@ void __cpu_die(unsigned int cpu) pdc_cpu_rendezvous_unlock(); } - -#ifdef CONFIG_PROC_FS -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} -#endif diff --git 
a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index bcefab484ea61..c037c26540ddc 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1674,13 +1674,6 @@ void start_secondary(void *unused) BUG(); } -#ifdef CONFIG_PROFILING -int setup_profiling_timer(unsigned int multiplier) -{ - return 0; -} -#endif - static void __init fixup_topology(void) { int i; diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c index 018e7dc45df6c..760a64518c585 100644 --- a/arch/riscv/kernel/smp.c +++ b/arch/riscv/kernel/smp.c @@ -64,12 +64,6 @@ bool arch_match_cpu_phys_id(int cpu, u64 phys_id) return phys_id == cpuid_to_hartid_map(cpu); } -/* Unsupported */ -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - static void ipi_stop(void) { set_cpu_online(smp_processor_id(), false); diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index 22b148e5a5f88..ad8094d955eba 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -174,11 +174,6 @@ void smp_call_function_interrupt(void) irq_exit(); } -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - void __init smp_prepare_cpus(unsigned int max_cpus) { int i, cpuid, extra; diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index a1f78e9ddaf37..a55295d1b9244 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1186,12 +1186,6 @@ void __irq_entry smp_penguin_jailcell(int irq, struct pt_regs *regs) preempt_enable(); } -/* /proc/profile writes can call this, don't __init it please. */ -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - void __init smp_prepare_cpus(unsigned int max_cpus) { } diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index bd8ae0a7010ae..3415321c8240c 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -98,8 +98,6 @@ static inline bool apic_from_smp_config(void) #include #endif -extern int setup_profiling_timer(unsigned int); - static inline void native_apic_mem_write(u32 reg, u32 v) { volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a4347605ab005..6d303d1d276c3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1115,11 +1115,6 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt) set_irq_regs(old_regs); } -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - /* * Local APIC start and shutdown */ diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 509408da0da1e..6b3033845c6d3 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -30,9 +30,15 @@ int pmd_huge(pmd_t pmd) (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; } +/* + * pud_huge() returns 1 if @pud is hugetlb related entry, that is normal + * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry. + * Otherwise, returns 0. 
+ */ int pud_huge(pud_t pud) { - return !!(pud_val(pud) & _PAGE_PSE); + return !pud_none(pud) && + (pud_val(pud) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; } #ifdef CONFIG_HUGETLB_PAGE diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 7966a58af472a..1ff0c858544fa 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -111,6 +111,8 @@ #define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */ +#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 51b502217d000..1014beb128025 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -213,7 +213,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, if (mm) { mmap_read_lock(mm); - vma = alloc->vma; + vma = vma_lookup(mm, alloc->vma_addr); } if (!vma && need_mm) { @@ -313,16 +313,22 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, static inline void binder_alloc_set_vma(struct binder_alloc *alloc, struct vm_area_struct *vma) { - if (vma) - alloc->vma_vm_mm = vma->vm_mm; + unsigned long vm_start = 0; + /* - * If we see alloc->vma is not NULL, buffer data structures set up - * completely. Look at smp_rmb side binder_alloc_get_vma. - * We also want to guarantee new alloc->vma_vm_mm is always visible - * if alloc->vma is set. + * Allow clearing the vma with holding just the read lock to allow + * munmapping downgrade of the write lock before freeing and closing the + * file using binder_alloc_vma_close(). */ - smp_wmb(); - alloc->vma = vma; + if (vma) { + vm_start = vma->vm_start; + alloc->vma_vm_mm = vma->vm_mm; + mmap_assert_write_locked(alloc->vma_vm_mm); + } else { + mmap_assert_locked(alloc->vma_vm_mm); + } + + alloc->vma_addr = vm_start; } static inline struct vm_area_struct *binder_alloc_get_vma( @@ -330,11 +336,9 @@ static inline struct vm_area_struct *binder_alloc_get_vma( { struct vm_area_struct *vma = NULL; - if (alloc->vma) { - /* Look at description in binder_alloc_set_vma */ - smp_rmb(); - vma = alloc->vma; - } + if (alloc->vma_addr) + vma = vma_lookup(alloc->vma_vm_mm, alloc->vma_addr); + return vma; } @@ -817,7 +821,8 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) buffers = 0; mutex_lock(&alloc->mutex); - BUG_ON(alloc->vma); + BUG_ON(alloc->vma_addr && + vma_lookup(alloc->vma_vm_mm, alloc->vma_addr)); while ((n = rb_first(&alloc->allocated_buffers))) { buffer = rb_entry(n, struct binder_buffer, rb_node); diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index 7dea57a84c79b..1e4fd37af5e03 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -100,7 +100,7 @@ struct binder_lru_page { */ struct binder_alloc { struct mutex mutex; - struct vm_area_struct *vma; + unsigned long vma_addr; struct mm_struct *vma_vm_mm; void __user *buffer; struct list_head buffers; diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c index c2b323bc3b3a5..43a881073a428 100644 --- a/drivers/android/binder_alloc_selftest.c +++ b/drivers/android/binder_alloc_selftest.c @@ -287,7 +287,7 @@ void binder_selftest_alloc(struct binder_alloc *alloc) if (!binder_selftest_run) return; mutex_lock(&binder_selftest_lock); - if (!binder_selftest_run || !alloc->vma) + if (!binder_selftest_run || !alloc->vma_addr) goto done; 
pr_info("STARTED\n"); binder_selftest_alloc_offset(alloc, end_offset, 0); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index d80fc11ab4673..92cb929a45b79 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1389,9 +1389,9 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, __GFP_HIGHMEM | __GFP_MOVABLE); - if (unlikely(!handle)) { + if (IS_ERR((void *)handle)) { zcomp_stream_put(zram->comp); - return -ENOMEM; + return PTR_ERR((void *)handle); } alloced_pages = zs_get_total_pages(zram->mem_pool); diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index efb4990b29e10..44574fbe74828 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -484,7 +484,6 @@ static void dma_buf_show_fdinfo(struct seq_file *m, struct file *file) { struct dma_buf *dmabuf = file->private_data; - seq_printf(m, "size:\t%zu\n", dmabuf->size); /* Don't count the temporary reference taken inside procfs seq_show */ seq_printf(m, "count:\t%ld\n", file_count(dmabuf->file) - 1); seq_printf(m, "exp_name:\t%s\n", dmabuf->exp_name); diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 2a8de975fe63b..555ae07ce0277 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -671,12 +671,12 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as, * allocate page in a sleeping context if GFP flags permit. Hence * spinlock needs to be unlocked and re-locked after allocation. */ - if (!(gfp & __GFP_ATOMIC)) + if (gfp & __GFP_DIRECT_RECLAIM) spin_unlock_irqrestore(&as->lock, *flags); page = alloc_page(gfp | __GFP_DMA | __GFP_ZERO); - if (!(gfp & __GFP_ATOMIC)) + if (gfp & __GFP_DIRECT_RECLAIM) spin_lock_irqsave(&as->lock, *flags); /* diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index e07486f019992..0c2892ec68171 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -862,8 +862,7 @@ static void virtio_mem_sbm_notify_online(struct virtio_mem *vm, unsigned long mb_id, unsigned long start_pfn) { - const bool is_movable = page_zonenum(pfn_to_page(start_pfn)) == - ZONE_MOVABLE; + const bool is_movable = is_zone_movable_page(pfn_to_page(start_pfn)); int new_state; switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { @@ -1158,8 +1157,7 @@ static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) */ static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages) { - const bool is_movable = page_zonenum(pfn_to_page(pfn)) == - ZONE_MOVABLE; + const bool is_movable = is_zone_movable_page(pfn_to_page(pfn)); int rc, retry_count; /* diff --git a/fs/Kconfig b/fs/Kconfig index 5976eb33535ff..a547307c1ae82 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -247,8 +247,7 @@ config HUGETLB_PAGE # # Select this config option from the architecture Kconfig, if it is preferred -# to enable the feature of minimizing overhead of struct page associated with -# each HugeTLB page. +# to enable the feature of HugeTLB Vmemmap Optimization (HVO). # config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP bool @@ -259,14 +258,13 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP depends on SPARSEMEM_VMEMMAP config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON - bool "Default optimizing vmemmap pages of HugeTLB to on" + bool "HugeTLB Vmemmap Optimization (HVO) defaults to on" default n depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP help - When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap - pages associated with each HugeTLB page is default off. 
Say Y here - to enable optimizing vmemmap pages of HugeTLB by default. It can then - be disabled on the command line via hugetlb_free_vmemmap=off. + The HugeTLB VmemmapvOptimization (HVO) defaults to off. Say Y here to + enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off + (boot command line) or hugetlb_optimize_vmemmap (sysctl). config MEMFD_CREATE def_bool TMPFS || HUGETLBFS diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 20336cb3c040f..fe0e374b02a3d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -11,7 +11,6 @@ #include #include -#include /* remove ASAP */ #include #include #include @@ -40,7 +39,6 @@ #include #include -static const struct super_operations hugetlbfs_ops; static const struct address_space_operations hugetlbfs_aops; const struct file_operations hugetlbfs_file_operations; static const struct inode_operations hugetlbfs_dir_inode_operations; @@ -315,8 +313,7 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset, /* * Support for read() - Find the page attached to f_mapping and copy out the - * data. Its *very* similar to generic_file_buffered_read(), we can't use that - * since it has PAGE_SIZE assumptions. + * data. This provides functionality similar to filemap_read(). */ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) { @@ -1082,7 +1079,7 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bsize = huge_page_size(h); if (sbinfo) { spin_lock(&sbinfo->stat_lock); - /* If no limits set, just report 0 for max/free/used + /* If no limits set, just report 0 or -1 for max/free/used * blocks, like simple_statfs() */ if (sbinfo->spool) { long free_pages; @@ -1309,7 +1306,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par ps = memparse(param->string, &rest); ctx->hstate = size_to_hstate(ps); if (!ctx->hstate) { - pr_err("Unsupported page size %lu MB\n", ps >> 20); + pr_err("Unsupported page size %lu MB\n", ps / SZ_1M); return -EINVAL; } return 0; @@ -1385,7 +1382,7 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc) /* * Allocate and initialize subpool if maximum or minimum size is * specified. Any needed reservations (for minimum size) are taken - * taken when the subpool is created. + * when the subpool is created. 
*/ if (ctx->max_hpages != -1 || ctx->min_hpages != -1) { sbinfo->spool = hugepage_new_subpool(ctx->hstate, @@ -1555,7 +1552,7 @@ static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h) } if (IS_ERR(mnt)) pr_err("Cannot mount internal hugetlbfs for page size %luK", - huge_page_size(h) >> 10); + huge_page_size(h) / SZ_1K); return mnt; } diff --git a/fs/libfs.c b/fs/libfs.c index 31b0ddf01c31d..6911749b4da79 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1217,6 +1217,15 @@ void kfree_link(void *p) } EXPORT_SYMBOL(kfree_link); +static const struct address_space_operations anon_aops = { + .dirty_folio = noop_dirty_folio, +}; + +bool is_anon_inode(struct inode *inode) +{ + return inode->i_mapping->a_ops == &anon_aops; +} + struct inode *alloc_anon_inode(struct super_block *s) { static const struct address_space_operations anon_aops = { diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index 9099d8fc75999..22da768e65b7c 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -2,12 +2,13 @@ /* * heartbeat.c * - * Register ourselves with the heartbaet service, keep our node maps + * Register ourselves with the heartbeat service, keep our node maps * up to date, and fire off recovery when needed. * * Copyright (C) 2002, 2004 Oracle. All rights reserved. */ +#include #include #include #include @@ -24,18 +25,12 @@ #include "buffer_head_io.h" -static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, - int bit); -static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, - int bit); - /* special case -1 for now * TODO: should *really* make sure the calling func never passes -1!! */ static void ocfs2_node_map_init(struct ocfs2_node_map *map) { map->num_nodes = OCFS2_NODE_MAP_MAX_NODES; - memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) * - sizeof(unsigned long)); + bitmap_zero(map->map, OCFS2_NODE_MAP_MAX_NODES); } void ocfs2_init_node_maps(struct ocfs2_super *osb) @@ -65,12 +60,6 @@ void ocfs2_do_node_down(int node_num, void *data) ocfs2_recovery_thread(osb, node_num); } -static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, - int bit) -{ - set_bit(bit, map->map); -} - void ocfs2_node_map_set_bit(struct ocfs2_super *osb, struct ocfs2_node_map *map, int bit) @@ -79,16 +68,10 @@ void ocfs2_node_map_set_bit(struct ocfs2_super *osb, return; BUG_ON(bit >= map->num_nodes); spin_lock(&osb->node_map_lock); - __ocfs2_node_map_set_bit(map, bit); + set_bit(bit, map->map); spin_unlock(&osb->node_map_lock); } -static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, - int bit) -{ - clear_bit(bit, map->map); -} - void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, struct ocfs2_node_map *map, int bit) @@ -97,7 +80,7 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, return; BUG_ON(bit >= map->num_nodes); spin_lock(&osb->node_map_lock); - __ocfs2_node_map_clear_bit(map, bit); + clear_bit(bit, map->map); spin_unlock(&osb->node_map_lock); } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 961d1cf54388e..b98e1531dfcb5 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -454,8 +454,12 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns, leave: if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && new_fe_bh != NULL) + ocfs2_set_links_count((struct ocfs2_dinode *) + new_fe_bh->b_data, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) @@ -599,6 +603,8 @@ static int __ocfs2_mknod_locked(struct inode 
*dir, leave: if (status < 0) { if (*new_fe_bh) { + if (fe) + ocfs2_set_links_count(fe, 0); brelse(*new_fe_bh); *new_fe_bh = NULL; } @@ -635,7 +641,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, parent_fe_bh, handle, inode_ac, fe_blkno, suballoc_loc, suballoc_bit); - if (status < 0) { + if (status < 0 && !(OCFS2_I(inode)->ip_inode_lockres.l_flags & + OCFS2_LOCK_INITIALIZED)) { u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit); int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode, inode_ac->ac_bh, suballoc_bit, bg_blkno, 1); @@ -2028,8 +2035,12 @@ static int ocfs2_symlink(struct user_namespace *mnt_userns, ocfs2_clusters_to_bytes(osb->sb, 1)); if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && new_fe_bh != NULL) + ocfs2_set_links_count((struct ocfs2_dinode *) + new_fe_bh->b_data, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) @@ -2490,6 +2501,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir, } int ocfs2_create_inode_in_orphan(struct inode *dir, + struct buffer_head **dir_bh, int mode, struct inode **new_inode) { @@ -2598,13 +2610,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, brelse(new_di_bh); - if (!status) - *new_inode = inode; - ocfs2_free_dir_lookup_result(&orphan_insert); - ocfs2_inode_unlock(dir, 1); - brelse(parent_di_bh); + if (!status) { + *new_inode = inode; + *dir_bh = parent_di_bh; + } else { + ocfs2_inode_unlock(dir, 1); + brelse(parent_di_bh); + } + return status; } @@ -2761,11 +2776,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, } int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, struct dentry *dentry) { int status = 0; - struct buffer_head *parent_di_bh = NULL; handle_t *handle = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_dinode *dir_di, *di; @@ -2779,14 +2794,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, (unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_inode_lock(dir, &parent_di_bh, 1); - if (status < 0) { - if (status != -ENOENT) - mlog_errno(status); - return status; - } - - dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data; + dir_di = (struct ocfs2_dinode *) dir_bh->b_data; if (!dir_di->i_links_count) { /* can't make a file in a deleted directory. */ status = -ENOENT; @@ -2799,7 +2807,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto leave; /* get a spot inside the dir. 
*/ - status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh, + status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh, dentry->d_name.name, dentry->d_name.len, &lookup); if (status < 0) { @@ -2863,7 +2871,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, ocfs2_journal_dirty(handle, di_bh); status = ocfs2_add_entry(handle, dentry, inode, - OCFS2_I(inode)->ip_blkno, parent_di_bh, + OCFS2_I(inode)->ip_blkno, dir_bh, &lookup); if (status < 0) { mlog_errno(status); @@ -2887,10 +2895,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, iput(orphan_dir_inode); leave: - ocfs2_inode_unlock(dir, 1); - brelse(di_bh); - brelse(parent_di_bh); brelse(orphan_dir_bh); ocfs2_free_dir_lookup_result(&lookup); diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index 9cc891eb874e0..03a2c526e2c1b 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, struct buffer_head *orphan_dir_bh, bool dio); int ocfs2_create_inode_in_orphan(struct inode *dir, + struct buffer_head **dir_bh, int mode, struct inode **new_inode); int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *di_bh, int update_isize, loff_t end); int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct buffer_head *dir_bh, struct inode *new_inode, struct dentry *new_dentry); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 1358981e80a36..b2ebaec86ddca 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4222,7 +4222,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, { int error, had_lock; struct inode *inode = d_inode(old_dentry); - struct buffer_head *old_bh = NULL; + struct buffer_head *old_bh = NULL, *dir_bh = NULL; struct inode *new_orphan_inode = NULL; struct ocfs2_lock_holder oh; @@ -4230,7 +4230,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, return -EOPNOTSUPP; - error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, + error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode, &new_orphan_inode); if (error) { mlog_errno(error); @@ -4276,13 +4276,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { - error = ocfs2_init_security_and_acl(dir, new_orphan_inode, + error = ocfs2_init_security_and_acl(dir, dir_bh, + new_orphan_inode, &new_dentry->d_name); if (error) mlog_errno(error); } if (!error) { - error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, + error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh, + new_orphan_inode, new_dentry); if (error) mlog_errno(error); @@ -4300,6 +4302,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, iput(new_orphan_inode); } + if (dir_bh) { + ocfs2_inode_unlock(dir, 1); + brelse(dir_bh); + } + return error; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 95d0611c5fc7d..3f23e3a5018ce 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, /* * Initialize security and acl for a already created inode. * Used for reflink a non-preserve-security file. - * - * It uses common api like ocfs2_xattr_set, so the caller - * must not hold any lock expect i_rwsem. 
*/ int ocfs2_init_security_and_acl(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, const struct qstr *qstr) { int ret = 0; - struct buffer_head *dir_bh = NULL; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (ret) { @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir, goto leave; } - ret = ocfs2_inode_lock(dir, &dir_bh, 0); - if (ret) { - mlog_errno(ret); - goto leave; - } ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); if (ret) mlog_errno(ret); - ocfs2_inode_unlock(dir, 0); - brelse(dir_bh); leave: return ret; } diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 00308b57f64f1..b27fd8ba00196 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, struct buffer_head *new_bh, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, const struct qstr *qstr); #endif /* OCFS2_XATTR_H */ diff --git a/fs/proc/array.c b/fs/proc/array.c index 3fad797cdd560..49283b8103c7e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -99,6 +99,10 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape) { char tcomm[64]; + /* + * Test before PF_KTHREAD because all workqueue worker threads are + * kernel threads. + */ if (p->flags & PF_WQ_WORKER) wq_worker_comm(tcomm, sizeof(tcomm), p); else if (p->flags & PF_KTHREAD) diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 913bef0d2a36c..5bac79a2fa515 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -23,6 +23,7 @@ static int seq_show(struct seq_file *m, void *v) struct files_struct *files = NULL; int f_flags = 0, ret = -ENOENT; struct file *file = NULL; + struct inode *inode = NULL; struct task_struct *task; task = get_proc_task(m->private); @@ -54,10 +55,19 @@ static int seq_show(struct seq_file *m, void *v) if (ret) return ret; - seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\nino:\t%lu\n", - (long long)file->f_pos, f_flags, - real_mount(file->f_path.mnt)->mnt_id, - file_inode(file)->i_ino); + inode = file_inode(file); + + seq_printf(m, "pos:\t%lli\n", (long long)file->f_pos); + seq_printf(m, "flags:\t0%o\n", f_flags); + seq_printf(m, "mnt_id:\t%i\n", real_mount(file->f_path.mnt)->mnt_id); + seq_printf(m, "ino:\t%lu\n", inode->i_ino); + seq_printf(m, "size:\t%lli\n", (long long)inode->i_size); + + if (is_anon_inode(inode)) { + seq_puts(m, "path:\t"); + seq_file_path(m, file, "\n"); + seq_putc(m, '\n'); + } /* show_fd_locks() never deferences files so a stale value is safe */ show_fd_locks(m, file, files); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index fd40d60169b5a..f130499ad8432 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -212,7 +212,15 @@ static void unuse_pde(struct proc_dir_entry *pde) complete(pde->pde_unload_completion); } -/* pde is locked on entry, unlocked on exit */ +/* + * At most 2 contexts can enter this function: the one doing the last + * close on the descriptor and whoever is deleting PDE itself. + * + * First to enter calls ->proc_release hook and signals its completion + * to the second one which waits and then does nothing. + * + * PDE is locked on entry, unlocked on exit. 
+ */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) __releases(&pde->pde_unload_lock) { @@ -222,9 +230,6 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) * * rmmod (remove_proc_entry() et al) can't delete an entry and proceed: * "struct file" needs to be available at the right moment. - * - * Therefore, first process to enter this function does ->release() and - * signals its completion to the other process which does nothing. */ if (pdeo->closing) { /* somebody else is doing that, just wait */ @@ -238,10 +243,12 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); + file = pdeo->file; pde->proc_ops->proc_release(file_inode(file), file); + spin_lock(&pde->pde_unload_lock); - /* After ->release. */ + /* Strictly after ->proc_release, see above. */ list_del(&pdeo->lh); c = pdeo->c; spin_unlock(&pde->pde_unload_lock); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index bbce6fbe779c8..856839b8ae8b7 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -350,6 +350,12 @@ static __net_init int proc_net_ns_init(struct net *net) kgid_t gid; int err; + /* + * This PDE acts only as an anchor for /proc/${pid}/net hierarchy. + * Corresponding inode (PDE(inode) == net->proc_net) is never + * instantiated therefore blanket zeroing is fine. + * net->proc_net_stat inode is instantiated normally. + */ err = -ENOMEM; netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!netd) diff --git a/fs/proc/root.c b/fs/proc/root.c index 5a7d15d197f8e..3c2ee3eb1138a 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -302,6 +302,11 @@ void __init proc_root_init(void) proc_mkdir("bus", NULL); proc_sys_init(); + /* + * Last things last. It is not like userspace processes eager + * to open /proc files exist at this point but register last + * anyway. 
+ */ register_filesystem(&proc_fs_type); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a3398d0f1927f..9913f3be9fd25 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -863,7 +863,7 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %d\n", - hugepage_vma_check(vma, vma->vm_flags, true, false)); + hugepage_vma_check(vma, vma->vm_flags, true, false, true)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 7bd9b8b856d0b..477c89a519ee8 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -5,9 +5,9 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o -squashfs-y += namei.o super.o symlink.o decompressor.o +squashfs-y += namei.o super.o symlink.o decompressor.o page_actor.o squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o -squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o +squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 7f0904b203294..98e64fec75b77 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -39,6 +39,7 @@ #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "page_actor.h" /* * Locate cache slot in range [offset, index] for specified inode. If @@ -496,7 +497,137 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) return res; } +static int squashfs_readahead_fragment(struct page **page, + unsigned int pages, unsigned int expected) +{ + struct inode *inode = page[0]->mapping->host; + struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb, + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + unsigned int n, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; + + if (buffer->error) + goto out; + + expected += squashfs_i(inode)->fragment_offset; + + for (n = 0; n < pages; n++) { + unsigned int base = (page[n]->index & mask) << PAGE_SHIFT; + unsigned int offset = base + squashfs_i(inode)->fragment_offset; + + if (expected > offset) { + unsigned int avail = min_t(unsigned int, expected - + offset, PAGE_SIZE); + + squashfs_fill_page(page[n], buffer, offset, avail); + } + + unlock_page(page[n]); + put_page(page[n]); + } + +out: + squashfs_cache_put(buffer); + return buffer->error; +} + +static void squashfs_readahead(struct readahead_control *ractl) +{ + struct inode *inode = ractl->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + size_t mask = (1UL << msblk->block_log) - 1; + unsigned short shift = msblk->block_log - PAGE_SHIFT; + loff_t start = readahead_pos(ractl) & ~mask; + size_t len = readahead_length(ractl) + readahead_pos(ractl) - start; + struct squashfs_page_actor *actor; + unsigned int nr_pages = 0; + struct page **pages; + int i, file_end = i_size_read(inode) >> msblk->block_log; + unsigned int max_pages = 1UL << shift; + + readahead_expand(ractl, start, (len | mask) + 1); + + pages = kmalloc_array(max_pages, sizeof(void *), GFP_KERNEL); + if (!pages) + return; + + for (;;) { + pgoff_t index; + int res, bsize; + u64 block = 0; + unsigned int expected; + + 
nr_pages = __readahead_batch(ractl, pages, max_pages); + if (!nr_pages) + break; + + if (readahead_pos(ractl) >= i_size_read(inode)) + goto skip_pages; + + index = pages[0]->index >> shift; + if ((pages[nr_pages - 1]->index >> shift) != index) + goto skip_pages; + + expected = index == file_end ? + (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; + + if (index == file_end && squashfs_i(inode)->fragment_block != + SQUASHFS_INVALID_BLK) { + res = squashfs_readahead_fragment(pages, nr_pages, + expected); + if (res) + goto skip_pages; + continue; + } + + bsize = read_blocklist(inode, index, &block); + if (bsize == 0) + goto skip_pages; + + actor = squashfs_page_actor_init_special(msblk, pages, nr_pages, + expected); + if (!actor) + goto skip_pages; + + res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); + + kfree(actor); + + if (res == expected) { + int bytes; + + /* Last page (if present) may have trailing bytes not filled */ + bytes = res % PAGE_SIZE; + if (pages[nr_pages - 1]->index == file_end && bytes) + memzero_page(pages[nr_pages - 1], bytes, + PAGE_SIZE - bytes); + + for (i = 0; i < nr_pages; i++) { + flush_dcache_page(pages[i]); + SetPageUptodate(pages[i]); + } + } + + for (i = 0; i < nr_pages; i++) { + unlock_page(pages[i]); + put_page(pages[i]); + } + } + + kfree(pages); + return; + +skip_pages: + for (i = 0; i < nr_pages; i++) { + unlock_page(pages[i]); + put_page(pages[i]); + } + kfree(pages); +} const struct address_space_operations squashfs_aops = { - .read_folio = squashfs_read_folio + .read_folio = squashfs_read_folio, + .readahead = squashfs_readahead }; diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 37523c54256fa..24841d28bc0fb 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -6,51 +6,6 @@ * Phillip Lougher */ -#ifndef CONFIG_SQUASHFS_FILE_DIRECT -struct squashfs_page_actor { - void **page; - int pages; - int length; - int next_page; -}; - -static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page, - int pages, int length) -{ - struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); - - if (actor == NULL) - return NULL; - - actor->length = length ? : pages * PAGE_SIZE; - actor->page = page; - actor->pages = pages; - actor->next_page = 0; - return actor; -} - -static inline void *squashfs_first_page(struct squashfs_page_actor *actor) -{ - actor->next_page = 1; - return actor->page[0]; -} - -static inline void *squashfs_next_page(struct squashfs_page_actor *actor) -{ - return actor->next_page == actor->pages ? 
NULL : - actor->page[actor->next_page++]; -} - -static inline void squashfs_finish_page(struct squashfs_page_actor *actor) -{ - /* empty */ -} - -static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor) -{ - /* empty */ -} -#else struct squashfs_page_actor { union { void **buffer; @@ -91,4 +46,3 @@ static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor) actor->alloc_buffer = 0; } #endif -#endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 6d594ba2ed28f..32565dafa7f3b 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -29,7 +29,6 @@ #include #include #include -#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -113,24 +112,6 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem( return decompressor; } -static int squashfs_bdi_init(struct super_block *sb) -{ - int err; - unsigned int major = MAJOR(sb->s_dev); - unsigned int minor = MINOR(sb->s_dev); - - bdi_put(sb->s_bdi); - sb->s_bdi = &noop_backing_dev_info; - - err = super_setup_bdi_name(sb, "squashfs_%u_%u", major, minor); - if (err) - return err; - - sb->s_bdi->ra_pages = 0; - sb->s_bdi->io_pages = 0; - - return 0; -} static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) { @@ -146,20 +127,6 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) TRACE("Entered squashfs_fill_superblock\n"); - /* - * squashfs provides 'backing_dev_info' in order to disable read-ahead. For - * squashfs, I/O is not deferred, it is done immediately in read_folio, - * which means the user would always have to wait their own I/O. So the effect - * of readahead is very weak for squashfs. squashfs_bdi_init will set - * sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0 and close readahead for - * squashfs. 
- */ - err = squashfs_bdi_init(sb); - if (err) { - errorf(fc, "squashfs init bdi failed"); - return err; - } - sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); if (sb->s_fs_info == NULL) { ERROR("Failed to allocate squashfs_sb_info\n"); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index de86f5b2859f9..1c44bf75f9160 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1925,10 +1925,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = -EFAULT; if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; - features = uffdio_api.features; - ret = -EINVAL; - if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) - goto err_out; + /* Ignore unsupported features (userspace built against newer kernel) */ + features = uffdio_api.features & UFFD_API_FEATURES; ret = -EPERM; if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) goto err_out; diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index aa8dc27c599ce..69d9c83ea4b21 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -18,6 +18,7 @@ #include "xfs_rmap_btree.h" #include "xfs_rtalloc.h" #include "xfs_trans.h" +#include "xfs_ag.h" #include #include @@ -122,12 +123,16 @@ xfs_dax_notify_ddev_failure( struct failure_info notify; struct xfs_agf *agf; xfs_agblock_t agend; + struct xfs_perag *pag; - error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp); - if (error) + pag = xfs_perag_get(mp, agno); + error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); + if (error) { + xfs_perag_put(pag); break; + } - cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, agf_bp->b_pag); + cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag); /* * Set the rmap range from ri_low to ri_high, which represents @@ -148,6 +153,7 @@ xfs_dax_notify_ddev_failure( xfs_dax_failure_fn, ¬ify); xfs_btree_del_cursor(cur, error); xfs_trans_brelse(tp, agf_bp); + xfs_perag_put(pag); if (error) break; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 3d27ba1295c92..9ac59814bbb6c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -351,8 +351,10 @@ xfs_setup_dax_always( goto disable_dax; } - if (xfs_has_reflink(mp)) { - xfs_alert(mp, "DAX and reflink cannot be used together!"); + if (xfs_has_reflink(mp) && + bdev_is_partition(mp->m_ddev_targp->bt_bdev)) { + xfs_alert(mp, + "DAX and reflink cannot work with multi-partitions!"); return -EINVAL; } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index e863c88df95f9..ae12696ec492c 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -28,11 +28,6 @@ enum wb_state { WB_start_all, /* nr_pages == 0 (all) work pending */ }; -enum wb_congested_state { - WB_async_congested, /* The async (write) queue is getting full */ - WB_sync_congested, /* The sync queue is getting full */ -}; - enum wb_stat_item { WB_RECLAIMABLE, WB_WRITEBACK, @@ -122,8 +117,6 @@ struct bdi_writeback { atomic_t writeback_inodes; /* number of inodes under writeback */ struct percpu_counter stat[NR_WB_STAT_ITEMS]; - unsigned long congested; /* WB_[a]sync_congested flags */ - unsigned long bw_time_stamp; /* last time write bw is updated */ unsigned long dirtied_stamp; unsigned long written_stamp; /* pages written at bw_time_stamp */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e84b745a68119..439815cc1ab96 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -229,18 +229,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return wb; } -/** 
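/*
 * Illustrative sketch (hypothetical userspace caller, not taken from the
 * patch): with the fs/userfaultfd.c hunk above, UFFDIO_API no longer fails
 * with -EINVAL when userspace requests feature bits the running kernel does
 * not know about; unknown bits are masked off against UFFD_API_FEATURES
 * instead. A forward-compatible handshake can therefore ask for everything
 * it could use and read back what was granted. The helper name
 * uffd_handshake() and the wanted_features parameter are assumptions made
 * only for this example.
 */
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

static int uffd_handshake(int uffd, unsigned long long wanted_features)
{
	struct uffdio_api api = {
		.api = UFFD_API,
		.features = wanted_features,
	};

	if (ioctl(uffd, UFFDIO_API, &api) != 0)
		return -1;

	/* On return the kernel has updated api.features to the granted set. */
	return 0;
}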
- * inode_to_wb_is_valid - test whether an inode has a wb associated - * @inode: inode of interest - * - * Returns %true if @inode has a wb associated. May be called without any - * locking. - */ -static inline bool inode_to_wb_is_valid(struct inode *inode) -{ - return inode->i_wb; -} - /** * inode_to_wb - determine the wb of an inode * @inode: inode of interest @@ -339,11 +327,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return &bdi->wb; } -static inline bool inode_to_wb_is_valid(struct inode *inode) -{ - return true; -} - static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { return &inode_to_bdi(inode)->wb; diff --git a/include/linux/fs.h b/include/linux/fs.h index 5113f65c786ff..ac7742a37cb9d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3254,6 +3254,7 @@ extern void page_put_link(void *); extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; extern void kfree_link(void *); +extern bool is_anon_inode(struct inode *inode); void generic_fillattr(struct user_namespace *, struct inode *, struct kstat *); void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int); diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index d88c46ca82e17..5088637fe5c2d 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -31,7 +31,7 @@ typedef unsigned int __bitwise gfp_t; #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u #define ___GFP_ZERO 0x100u -#define ___GFP_ATOMIC 0x200u +/* 0x200u unused */ #define ___GFP_DIRECT_RECLAIM 0x400u #define ___GFP_KSWAPD_RECLAIM 0x800u #define ___GFP_WRITE 0x1000u @@ -116,11 +116,8 @@ typedef unsigned int __bitwise gfp_t; * * %__GFP_HIGH indicates that the caller is high-priority and that granting * the request is necessary before the system can make forward progress. - * For example, creating an IO context to clean pages. - * - * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is - * high priority. Users are typically interrupt handlers. This may be - * used in conjunction with %__GFP_HIGH + * For example creating an IO context to clean pages and requests + * from atomic context. * * %__GFP_MEMALLOC allows access to all memory. This should only be used when * the caller guarantees the allocation will allow more memory to be freed @@ -135,7 +132,6 @@ typedef unsigned int __bitwise gfp_t; * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. * This takes precedence over the %__GFP_MEMALLOC flag if both are set. */ -#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) #define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) #define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC) #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) @@ -329,7 +325,7 @@ typedef unsigned int __bitwise gfp_t; * version does not attempt reclaim/compaction at all and is by default used * in page fault path, while the non-light is used by khugepaged. 
*/ -#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) +#define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 768e5261fdaed..38265f9f782e9 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -168,9 +168,8 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } -bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, - bool smaps, bool in_pf); +bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, + bool smaps, bool in_pf, bool enforce_sysfs); #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ @@ -219,6 +218,9 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice); +int madvise_collapse(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end); void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); @@ -321,8 +323,8 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, } static inline bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, - bool smaps, bool in_pf) + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs) { return false; } @@ -362,9 +364,16 @@ static inline void split_huge_pmd_address(struct vm_area_struct *vma, static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { - BUG(); - return 0; + return -EINVAL; } + +static inline int madvise_collapse(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + return -EINVAL; +} + static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 4cdfce9766446..3ec981a0d8b3a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -42,6 +42,9 @@ enum { SUBPAGE_INDEX_CGROUP, /* reuse page->private */ SUBPAGE_INDEX_CGROUP_RSVD, /* reuse page->private */ __MAX_CGROUP_SUBPAGE_INDEX = SUBPAGE_INDEX_CGROUP_RSVD, +#endif +#ifdef CONFIG_MEMORY_FAILURE + SUBPAGE_INDEX_HWPOISON, #endif __NR_USED_SUBPAGE, }; @@ -551,7 +554,7 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, * Synchronization: Initially set after new page allocation with no * locking. When examined and modified during migration processing * (isolate, migrate, putback) the hugetlb_lock is held. - * HPG_temporary - - Set on a page that is temporarily allocated from the buddy + * HPG_temporary - Set on a page that is temporarily allocated from the buddy * allocator. Typically used for migration target pages when no pages * are available in the pool. The hugetlb free page path will * immediately free pages with this flag set to the buddy allocator. @@ -561,6 +564,8 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, * HPG_freed - Set when page is on the free lists. * Synchronization: hugetlb_lock held for examination and modification. 
* HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed. + * HPG_raw_hwp_unreliable - Set when the hugetlb page has a hwpoison sub-page + * that is not tracked by raw_hwp_page list. */ enum hugetlb_page_flags { HPG_restore_reserve = 0, @@ -568,6 +573,7 @@ enum hugetlb_page_flags { HPG_temporary, HPG_freed, HPG_vmemmap_optimized, + HPG_raw_hwp_unreliable, __NR_HPAGEFLAGS, }; @@ -614,6 +620,7 @@ HPAGEFLAG(Migratable, migratable) HPAGEFLAG(Temporary, temporary) HPAGEFLAG(Freed, freed) HPAGEFLAG(VmemmapOptimized, vmemmap_optimized) +HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable) #ifdef CONFIG_HUGETLB_PAGE @@ -638,9 +645,6 @@ struct hstate { unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; -#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP - unsigned int optimize_vmemmap_pages; -#endif #ifdef CONFIG_CGROUP_HUGETLB /* cgroup control files */ struct cftype cgroup_files_dfl[8]; @@ -716,7 +720,7 @@ static inline struct hstate *hstate_vma(struct vm_area_struct *vma) return hstate_file(vma->vm_file); } -static inline unsigned long huge_page_size(struct hstate *h) +static inline unsigned long huge_page_size(const struct hstate *h) { return (unsigned long)PAGE_SIZE << h->order; } @@ -745,7 +749,7 @@ static inline bool hstate_is_gigantic(struct hstate *h) return huge_page_order(h) >= MAX_ORDER; } -static inline unsigned int pages_per_huge_page(struct hstate *h) +static inline unsigned int pages_per_huge_page(const struct hstate *h) { return 1 << h->order; } @@ -799,6 +803,14 @@ extern int dissolve_free_huge_page(struct page *page); extern int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); +#ifdef CONFIG_MEMORY_FAILURE +extern void hugetlb_clear_page_hwpoison(struct page *hpage); +#else +static inline void hugetlb_clear_page_hwpoison(struct page *hpage) +{ +} +#endif + #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION #ifndef arch_hugetlb_migration_supported static inline bool arch_hugetlb_migration_supported(struct hstate *h) diff --git a/include/linux/mm.h b/include/linux/mm.h index 18e01474cf6b6..3bedc449c14d8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3142,13 +3142,6 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) } #endif -#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP -int vmemmap_remap_free(unsigned long start, unsigned long end, - unsigned long reuse); -int vmemmap_remap_alloc(unsigned long start, unsigned long end, - unsigned long reuse, gfp_t gfp_mask); -#endif - void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, @@ -3183,6 +3176,7 @@ enum mf_flags { MF_SOFT_OFFLINE = 1 << 3, MF_UNPOISON = 1 << 4, MF_SW_SIMULATED = 1 << 5, + MF_NO_RETRY = 1 << 6, }; int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, unsigned long count, int mf_flags); @@ -3235,7 +3229,6 @@ enum mf_action_page_type { MF_MSG_DIFFERENT_COMPOUND, MF_MSG_HUGE, MF_MSG_FREE_HUGE, - MF_MSG_NON_PMD_HUGE, MF_MSG_UNMAP_FAILED, MF_MSG_DIRTY_SWAPCACHE, MF_MSG_CLEAN_SWAPCACHE, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 578247a341b23..e24b40c52468a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1137,15 +1137,6 @@ static inline int is_highmem_idx(enum zone_type idx) #endif } -#ifdef CONFIG_ZONE_DMA -bool has_managed_dma(void); -#else -static inline bool has_managed_dma(void) -{ - return false; -} 
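/*
 * Illustrative sketch (hypothetical code, not part of the patch): the
 * gfp_types.h hunk above removes ___GFP_ATOMIC entirely, but callers need no
 * change. GFP_ATOMIC itself remains and now expands to
 * __GFP_HIGH | __GFP_KSWAPD_RECLAIM, so an atomic-context allocation still
 * looks like the sketch below. struct my_irq_state and my_irq_state_alloc()
 * are invented names used only for illustration.
 */
#include <linux/slab.h>
#include <linux/gfp.h>

struct my_irq_state {
	int pending;
};

static struct my_irq_state *my_irq_state_alloc(void)
{
	/* May not sleep here; GFP_ATOMIC can dip into reserves via __GFP_HIGH. */
	return kzalloc(sizeof(struct my_irq_state), GFP_ATOMIC);
}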
-#endif - /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references @@ -1155,12 +1146,17 @@ static inline bool has_managed_dma(void) */ static inline int is_highmem(struct zone *zone) { -#ifdef CONFIG_HIGHMEM return is_highmem_idx(zone_idx(zone)); +} + +#ifdef CONFIG_ZONE_DMA +bool has_managed_dma(void); #else - return 0; -#endif +static inline bool has_managed_dma(void) +{ + return false; } +#endif /* These two functions are used to setup the per zone pages min values */ struct ctl_table; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index ea19528564d1b..465ff35a8c00a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -205,34 +205,15 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP -DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, - hugetlb_optimize_vmemmap_key); - -static __always_inline bool hugetlb_optimize_vmemmap_enabled(void) -{ - return static_branch_maybe(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, - &hugetlb_optimize_vmemmap_key); -} +DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); /* - * If the feature of optimizing vmemmap pages associated with each HugeTLB - * page is enabled, the head vmemmap page frame is reused and all of the tail - * vmemmap addresses map to the head vmemmap page frame (furture details can - * refer to the figure at the head of the mm/hugetlb_vmemmap.c). In other - * words, there are more than one page struct with PG_head associated with each - * HugeTLB page. We __know__ that there is only one head page struct, the tail - * page structs with PG_head are fake head page structs. We need an approach - * to distinguish between those two different types of page structs so that - * compound_head() can return the real head page struct when the parameter is - * the tail page struct but with PG_head. - * - * The page_fixed_fake_head() returns the real head page struct if the @page is - * fake page head, otherwise, returns @page which can either be a true page - * head or tail. + * Return the real head page struct iff the @page is a fake head page, otherwise + * return the @page itself. See Documentation/mm/vmemmap_dedup.rst. */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { - if (!hugetlb_optimize_vmemmap_enabled()) + if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) return page; /* @@ -260,11 +241,6 @@ static inline const struct page *page_fixed_fake_head(const struct page *page) { return page; } - -static inline bool hugetlb_optimize_vmemmap_enabled(void) -{ - return false; -} #endif static __always_inline int page_is_fake_head(struct page *page) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a68f982f22d16..1b6c4013f691b 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -25,9 +25,20 @@ struct shmem_inode_info { struct simple_xattrs xattrs; /* list of xattrs */ atomic_t stop_eviction; /* hold when working on inode */ struct timespec64 i_crtime; /* file creation time */ + unsigned int fsflags; /* flags for FS_IOC_[SG]ETFLAGS */ struct inode vfs_inode; }; +#define SHMEM_FL_USER_VISIBLE FS_FL_USER_VISIBLE +#define SHMEM_FL_USER_MODIFIABLE FS_FL_USER_MODIFIABLE +#define SHMEM_FL_INHERITED FS_FL_USER_MODIFIABLE + +/* Flags that are appropriate for regular files (all but dir-specific ones). 
*/ +#define SHMEM_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define SHMEM_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) + struct shmem_sb_info { unsigned long max_blocks; /* How many blocks are allowed */ struct percpu_counter used_blocks; /* How many are allocated */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 4e84f8b553090..43150b9bbc5ca 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -411,10 +411,13 @@ extern void lru_cache_add_inactive_or_unevictable(struct page *page, extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); + +#define MEMCG_RECLAIM_MAY_SWAP (1 << 1) +#define MEMCG_RECLAIM_PROACTIVE (1 << 2) extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - bool may_swap); + unsigned int reclaim_options); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, diff --git a/include/linux/swapops.h b/include/linux/swapops.h index bb7afd03a324f..a3d435bf9f972 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -490,6 +490,11 @@ static inline void num_poisoned_pages_dec(void) atomic_long_dec(&num_poisoned_pages); } +static inline void num_poisoned_pages_sub(long i) +{ + atomic_long_sub(i, &num_poisoned_pages); +} + #else static inline swp_entry_t make_hwpoison_entry(struct page *page) @@ -505,6 +510,10 @@ static inline int is_hwpoison_entry(swp_entry_t swp) static inline void num_poisoned_pages_inc(void) { } + +static inline void num_poisoned_pages_sub(long i) +{ +} #endif static inline int non_swap_entry(swp_entry_t entry) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 17b42ce89d3ec..780690dc08cda 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -268,6 +268,10 @@ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * return NULL; } +static inline void register_sysctl_init(const char *path, struct ctl_table *table) +{ +} + static inline struct ctl_table_header *register_sysctl_mount_point(const char *path) { return NULL; diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index d0337a41141c8..cbd3ddd7c33d4 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -360,7 +360,6 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \ EM ( MF_MSG_HUGE, "huge page" ) \ EM ( MF_MSG_FREE_HUGE, "free huge page" ) \ - EM ( MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page" ) \ EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \ EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \ EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \ diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index d651f3437367d..55392bf30a034 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -11,6 +11,7 @@ EM( SCAN_FAIL, "failed") \ EM( SCAN_SUCCEED, "succeeded") \ EM( SCAN_PMD_NULL, "pmd_null") \ + EM( SCAN_PMD_MAPPED, "page_pmd_mapped") \ EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \ EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \ EM( SCAN_EXCEED_SHARED_PTE, "exceed_shared_pte") \ diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index e87cb2b80ed3c..11524cda4a955 100644 --- a/include/trace/events/mmflags.h +++ 
b/include/trace/events/mmflags.h @@ -31,7 +31,6 @@ gfpflag_string(__GFP_HIGHMEM), \ gfpflag_string(GFP_DMA32), \ gfpflag_string(__GFP_HIGH), \ - gfpflag_string(__GFP_ATOMIC), \ gfpflag_string(__GFP_IO), \ gfpflag_string(__GFP_FS), \ gfpflag_string(__GFP_NOWARN), \ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 6c1aa92a92e44..6ce1f1ceb432c 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -77,6 +77,8 @@ #define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */ +#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/init/main.c b/init/main.c index 91642a4e69be6..2d538f0e0e68d 100644 --- a/init/main.c +++ b/init/main.c @@ -182,7 +182,7 @@ EXPORT_SYMBOL_GPL(static_key_initialized); unsigned int reset_devices; EXPORT_SYMBOL(reset_devices); -static int __init set_reset_devices(char *str) +static int __init set_reset_devices(char *str __always_unused) { reset_devices = 1; return 1; @@ -232,13 +232,13 @@ static bool __init obsolete_checksetup(char *line) unsigned long loops_per_jiffy = (1<<12); EXPORT_SYMBOL(loops_per_jiffy); -static int __init debug_kernel(char *str) +static int __init debug_kernel(char *str __always_unused) { console_loglevel = CONSOLE_LOGLEVEL_DEBUG; return 0; } -static int __init quiet_kernel(char *str) +static int __init quiet_kernel(char *str __always_unused) { console_loglevel = CONSOLE_LOGLEVEL_QUIET; return 0; @@ -475,7 +475,7 @@ static void __init setup_boot_config(void) get_boot_config_from_initrd(NULL); } -static int __init warn_bootconfig(char *str) +static int __init warn_bootconfig(char *str __always_unused) { pr_warn("WARNING: 'bootconfig' found on the kernel command line but CONFIG_BOOT_CONFIG is not set.\n"); return 0; @@ -504,7 +504,8 @@ static void __init repair_env_string(char *param, char *val) /* Anything after -- gets handed straight to init. */ static int __init set_init_arg(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { unsigned int i; @@ -529,7 +530,8 @@ static int __init set_init_arg(char *param, char *val, * unused parameters (modprobe will find them in /proc/cmdline). */ static int __init unknown_bootoption(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { size_t len = strlen(param); @@ -729,7 +731,8 @@ noinline void __ref rest_init(void) /* Check for early params. 
*/ static int __init do_early_param(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { const struct obs_kernel_param *p; @@ -1348,8 +1351,10 @@ static const char *initcall_level_names[] __initdata = { "late", }; -static int __init ignore_unknown_bootoption(char *param, char *val, - const char *unused, void *arg) +static int __init ignore_unknown_bootoption(char *param __always_unused, + char *val __always_unused, + const char *unused __always_unused, + void *arg __always_unused) { return 0; } @@ -1488,7 +1493,7 @@ void __weak free_initmem(void) free_initmem_default(POISON_FREE_INITMEM); } -static int __ref kernel_init(void *unused) +static int __ref kernel_init(void *unused __always_unused) { int ret; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 2956bc5a90b2f..b3f71955c57f2 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -244,7 +244,7 @@ static long hung_timeout_jiffies(unsigned long last_checked, * Process updating of timeout sysctl */ static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/profile.c b/kernel/profile.c index ae82ddfc6a684..7ea01ba30e757 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -425,6 +425,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) return read; } +/* default is to not implement this call */ +int __weak setup_profiling_timer(unsigned mult) +{ + return -EINVAL; +} + /* * Writing to /proc/profile resets the counters * @@ -435,8 +441,6 @@ static ssize_t write_profile(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { #ifdef CONFIG_SMP - extern int setup_profiling_timer(unsigned int multiplier); - if (count == sizeof(int)) { unsigned int multiplier; diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 337d797a71416..6f8e5dd1dcd0c 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -437,6 +437,7 @@ static int object_cpu_offline(unsigned int cpu) struct debug_percpu_free *percpu_pool; struct hlist_node *tmp; struct debug_obj *obj; + unsigned long flags; /* Remote access is safe as the CPU is dead already */ percpu_pool = per_cpu_ptr(&percpu_obj_pool, cpu); @@ -444,6 +445,12 @@ static int object_cpu_offline(unsigned int cpu) hlist_del(&obj->node); kmem_cache_free(obj_cache, obj); } + + raw_spin_lock_irqsave(&pool_lock, flags); + obj_pool_used -= percpu_pool->obj_free; + debug_objects_freed += percpu_pool->obj_free; + raw_spin_unlock_irqrestore(&pool_lock, flags); + percpu_pool->obj_free = 0; return 0; @@ -1318,6 +1325,8 @@ static int __init debug_objects_replace_static_objects(void) hlist_add_head(&obj->node, &objects); } + debug_objects_allocated += i; + /* * debug_objects_mem_init() is now called early that only one CPU is up * and interrupts have been disabled, so it is safe to replace the @@ -1386,6 +1395,7 @@ void __init debug_objects_mem_init(void) debug_objects_enabled = 0; kmem_cache_destroy(obj_cache); pr_warn("out of memory.\n"); + return; } else debug_objects_selftest(); diff --git a/lib/lzo/lzo1x_compress.c b/lib/lzo/lzo1x_compress.c index 76758e9296ba6..9d31e7126606a 100644 --- a/lib/lzo/lzo1x_compress.c +++ b/lib/lzo/lzo1x_compress.c @@ -50,9 +50,7 @@ lzo1x_1_do_compress(const unsigned char *in, size_t in_len, if (dv == 0 && bitstream_version) { const unsigned char *ir = ip + 4; - const unsigned char *limit = ip_end - < (ip + MAX_ZERO_RUN_LENGTH + 1) - ? 
ip_end : ip + MAX_ZERO_RUN_LENGTH + 1; + const unsigned char *limit = min(ip_end, ip + MAX_ZERO_RUN_LENGTH + 1); #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && \ defined(LZO_FAST_64BIT_MEMORY_ACCESS) u64 dv64; @@ -326,7 +324,7 @@ static int lzogeneric1x_1_compress(const unsigned char *in, size_t in_len, data_start = op; while (l > 20) { - size_t ll = l <= (m4_max_offset + 1) ? l : (m4_max_offset + 1); + size_t ll = min_t(size_t, l, m4_max_offset + 1); uintptr_t ll_end = (uintptr_t) ip + ll; if ((ll_end + ((t + ll) >> 5)) <= ll_end) break; diff --git a/lib/mpi/mpiutil.c b/lib/mpi/mpiutil.c index bc81419f400c5..aa8c46544af8e 100644 --- a/lib/mpi/mpiutil.c +++ b/lib/mpi/mpiutil.c @@ -272,7 +272,7 @@ MPI mpi_set_ui(MPI w, unsigned long u) if (!w) w = mpi_alloc(1); /* FIXME: If U is 0 we have no need to resize and thus possible - * allocating the the limbs. + * allocating the limbs. */ RESIZE_IF_NEEDED(w, 1); w->d[0] = u; diff --git a/lib/test_printf.c b/lib/test_printf.c index 3b017566bbd21..e27d1a85521e7 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -678,17 +678,17 @@ flags(void) gfp = GFP_ATOMIC|__GFP_DMA; test("GFP_ATOMIC|GFP_DMA", "%pGg", &gfp); - gfp = __GFP_ATOMIC; - test("__GFP_ATOMIC", "%pGg", &gfp); + gfp = __GFP_HIGH; + test("__GFP_HIGH", "%pGg", &gfp); /* Any flags not translated by the table should remain numeric */ gfp = ~__GFP_BITS_MASK; snprintf(cmp_buffer, BUF_SIZE, "%#lx", (unsigned long) gfp); test(cmp_buffer, "%pGg", &gfp); - snprintf(cmp_buffer, BUF_SIZE, "__GFP_ATOMIC|%#lx", + snprintf(cmp_buffer, BUF_SIZE, "__GFP_HIGH|%#lx", (unsigned long) gfp); - gfp |= __GFP_ATOMIC; + gfp |= __GFP_HIGH; test(cmp_buffer, "%pGg", &gfp); kfree(cmp_buffer); diff --git a/mm/cma_debug.c b/mm/cma_debug.c index 2e7704955f4f3..c3ffe253e0552 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -163,7 +163,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry) { struct dentry *tmp; - char name[16]; + char name[CMA_MAX_NAME]; scnprintf(name, sizeof(name), "cma-%s", cma->name); diff --git a/mm/compaction.c b/mm/compaction.c index 962d05d1e187d..640fa76228dd9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -613,6 +613,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, break; set_page_private(page, order); + nr_scanned += isolated - 1; total_isolated += isolated; cc->nr_freepages += isolated; list_add_tail(&page->lru, freelist); @@ -1099,6 +1100,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, isolate_success_no_list: cc->nr_migratepages += compound_nr(page); nr_isolated += compound_nr(page); + nr_scanned += compound_nr(page) - 1; /* * Avoid isolating too much unless this block is being @@ -1502,6 +1504,7 @@ fast_isolate_freepages(struct compact_control *cc) if (__isolate_free_page(page, order)) { set_page_private(page, order); nr_isolated = 1 << order; + nr_scanned += nr_isolated - 1; cc->nr_freepages += nr_isolated; list_add_tail(&page->lru, &cc->freepages); count_compact_events(COMPACTISOLATED, nr_isolated); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index e69b807fefe43..a7faf51b4bd4a 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -435,8 +435,10 @@ static int __init damon_reclaim_init(void) if (!ctx) return -ENOMEM; - if (damon_select_ops(ctx, DAMON_OPS_PADDR)) + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); return -EINVAL; + } ctx->callback.after_wmarks_check = 
damon_reclaim_after_wmarks_check; ctx->callback.after_aggregation = damon_reclaim_after_aggregation; diff --git a/mm/filemap.c b/mm/filemap.c index 0dec96ea66887..15800334147b3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -667,7 +667,7 @@ EXPORT_SYMBOL_GPL(filemap_range_has_writeback); int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) { - int err = 0; + int err = 0, err2; if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, @@ -678,18 +678,12 @@ int filemap_write_and_wait_range(struct address_space *mapping, * But the -EIO is special case, it may indicate the worst * thing (e.g. bug) happened, so we avoid waiting for it. */ - if (err != -EIO) { - int err2 = filemap_fdatawait_range(mapping, - lstart, lend); - if (!err) - err = err2; - } else { - /* Clear any previously stored errors */ - filemap_check_errors(mapping); - } - } else { - err = filemap_check_errors(mapping); + if (err != -EIO) + __filemap_fdatawait_range(mapping, lstart, lend); } + err2 = filemap_check_errors(mapping); + if (!err) + err = err2; return err; } EXPORT_SYMBOL(filemap_write_and_wait_range); diff --git a/mm/gup.c b/mm/gup.c index 79cb9fb1d889f..7328251574307 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1982,8 +1982,8 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, folio_nr_pages(folio)); } - if (!list_empty(&movable_page_list) || isolation_error_count - || coherent_pages) + if (!list_empty(&movable_page_list) || isolation_error_count || + coherent_pages) goto unpin_pages; /* diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8a7c1b344abef..0611b2fd145a4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -70,9 +70,8 @@ static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; -bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, - bool smaps, bool in_pf) +bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, + bool smaps, bool in_pf, bool enforce_sysfs) { if (!vma->vm_mm) /* vdso */ return false; @@ -121,11 +120,10 @@ bool hugepage_vma_check(struct vm_area_struct *vma, if (!in_pf && shmem_file(vma->vm_file)) return shmem_huge_enabled(vma); - if (!hugepage_flags_enabled()) - return false; - - /* THP settings require madvise. 
*/ - if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always()) + /* Enforce sysfs THP requirements as necessary */ + if (enforce_sysfs && + (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) && + !hugepage_flags_always()))) return false; /* Only regular file is valid */ @@ -2266,25 +2264,11 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct folio *folio) { - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - - pgd = pgd_offset(vma->vm_mm, address); - if (!pgd_present(*pgd)) - return; + pmd_t *pmd = mm_find_pmd(vma->vm_mm, address); - p4d = p4d_offset(pgd, address); - if (!p4d_present(*p4d)) + if (!pmd) return; - pud = pud_offset(p4d, address); - if (!pud_present(*pud)) - return; - - pmd = pmd_offset(pud, address); - __split_huge_pmd(vma, pmd, address, freeze, folio); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index caf5559651391..28516881a1b2f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1535,7 +1535,14 @@ static void __update_and_free_page(struct hstate *h, struct page *page) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; - if (hugetlb_vmemmap_alloc(h, page)) { + /* + * If we don't know which subpages are hwpoisoned, we can't free + * the hugepage, so it's leaked intentionally. + */ + if (HPageRawHwpUnreliable(page)) + return; + + if (hugetlb_vmemmap_restore(h, page)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1547,6 +1554,13 @@ static void __update_and_free_page(struct hstate *h, struct page *page) return; } + /* + * Move PageHWPoison flag from head page to the raw error pages, + * which makes any healthy subpages reusable. + */ + if (unlikely(PageHWPoison(page))) + hugetlb_clear_page_hwpoison(page); + for (i = 0; i < pages_per_huge_page(h); i++, subpage = mem_map_next(subpage, page, i)) { subpage->flags &= ~(1 << PG_locked | 1 << PG_error | @@ -1612,7 +1626,7 @@ static DECLARE_WORK(free_hpage_work, free_hpage_workfn); static inline void flush_free_hpage_work(struct hstate *h) { - if (hugetlb_optimize_vmemmap_pages(h)) + if (hugetlb_vmemmap_optimizable(h)) flush_work(&free_hpage_work); } @@ -1734,7 +1748,7 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) static void __prep_new_huge_page(struct hstate *h, struct page *page) { - hugetlb_vmemmap_free(h, page); + hugetlb_vmemmap_optimize(h, page); INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); hugetlb_set_page_subpool(page, NULL); @@ -2107,17 +2121,8 @@ int dissolve_free_huge_page(struct page *page) * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. */ - rc = hugetlb_vmemmap_alloc(h, head); + rc = hugetlb_vmemmap_restore(h, head); if (!rc) { - /* - * Move PageHWPoison flag from head page to the raw - * error page, which makes any subpages rather than - * the error page reusable. 
- */ - if (PageHWPoison(head) && page != head) { - SetPageHWPoison(page); - ClearPageHWPoison(head); - } update_and_free_page(h, head, false); } else { spin_lock_irq(&hugetlb_lock); @@ -2432,8 +2437,7 @@ static void return_unused_surplus_pages(struct hstate *h, /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; - /* Cannot return gigantic pages currently */ - if (hstate_is_gigantic(h)) + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) goto out; /* @@ -3182,8 +3186,10 @@ static void __init report_hugepages(void) char buf[32]; string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); - pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n", + pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", buf, h->free_huge_pages); + pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", + hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf); } } @@ -3421,7 +3427,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) remove_hugetlb_page_for_demote(h, page, false); spin_unlock_irq(&hugetlb_lock); - rc = hugetlb_vmemmap_alloc(h, page); + rc = hugetlb_vmemmap_restore(h, page); if (rc) { /* Allocation of vmemmmap failed, we can not demote page */ spin_lock_irq(&hugetlb_lock); @@ -4111,7 +4117,6 @@ void __init hugetlb_add_hstate(unsigned int order) h->next_nid_to_free = first_memory_node; snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); - hugetlb_vmemmap_init(h); parsed_hstate = h; } @@ -6985,10 +6990,38 @@ struct page * __weak follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud, int flags) { - if (flags & (FOLL_GET | FOLL_PIN)) + struct page *page = NULL; + spinlock_t *ptl; + pte_t pte; + + if (WARN_ON_ONCE(flags & FOLL_PIN)) return NULL; - return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); +retry: + ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud); + if (!pud_huge(*pud)) + goto out; + pte = huge_ptep_get((pte_t *)pud); + if (pte_present(pte)) { + page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(!try_grab_page(page, flags))) { + page = NULL; + goto out; + } + } else { + if (is_hugetlb_entry_migration(pte)) { + spin_unlock(ptl); + __migration_entry_wait(mm, (pte_t *)pud, ptl); + goto retry; + } + /* + * hwpoisoned entry is treated as no_page_table in + * follow_page_mask(). + */ + } +out: + spin_unlock(ptl); + return page; } struct page * __weak diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index f9942841df18b..c86691c431fd7 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -772,6 +772,7 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx) /* Add the numa stat file */ cft = &h->cgroup_files_dfl[6]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); + cft->private = MEMFILE_PRIVATE(idx, 0); cft->seq_show = hugetlb_cgroup_read_numa_stat; cft->flags = CFTYPE_NOT_ON_ROOT; diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 1362feb3c6c98..20f414c0379f9 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Optimize vmemmap pages associated with HugeTLB + * HugeTLB Vmemmap Optimization (HVO) * - * Copyright (c) 2020, Bytedance. All rights reserved. + * Copyright (c) 2020, ByteDance. All rights reserved. 
* * Author: Muchun Song * @@ -10,84 +10,443 @@ */ #define pr_fmt(fmt) "HugeTLB: " fmt -#include +#include +#include +#include +#include #include "hugetlb_vmemmap.h" -/* - * There are a lot of struct page structures associated with each HugeTLB page. - * For tail pages, the value of compound_head is the same. So we can reuse first - * page of head page structures. We map the virtual addresses of all the pages - * of tail page structures to the head page struct, and then free these page - * frames. Therefore, we need to reserve one pages as vmemmap areas. +/** + * struct vmemmap_remap_walk - walk vmemmap page table + * + * @remap_pte: called for each lowest-level entry (PTE). + * @nr_walked: the number of walked pte. + * @reuse_page: the page which is reused for the tail vmemmap pages. + * @reuse_addr: the virtual address of the @reuse_page page. + * @vmemmap_pages: the list head of the vmemmap pages that can be freed + * or is mapped from. */ -#define RESERVE_VMEMMAP_NR 1U -#define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) - -enum vmemmap_optimize_mode { - VMEMMAP_OPTIMIZE_OFF, - VMEMMAP_OPTIMIZE_ON, +struct vmemmap_remap_walk { + void (*remap_pte)(pte_t *pte, unsigned long addr, + struct vmemmap_remap_walk *walk); + unsigned long nr_walked; + struct page *reuse_page; + unsigned long reuse_addr; + struct list_head *vmemmap_pages; }; -DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, - hugetlb_optimize_vmemmap_key); -EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); +static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +{ + pmd_t __pmd; + int i; + unsigned long addr = start; + struct page *page = pmd_page(*pmd); + pte_t *pgtable = pte_alloc_one_kernel(&init_mm); + + if (!pgtable) + return -ENOMEM; + + pmd_populate_kernel(&init_mm, &__pmd, pgtable); + + for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { + pte_t entry, *pte; + pgprot_t pgprot = PAGE_KERNEL; + + entry = mk_pte(page + i, pgprot); + pte = pte_offset_kernel(&__pmd, addr); + set_pte_at(&init_mm, addr, pte, entry); + } -static enum vmemmap_optimize_mode vmemmap_optimize_mode = - IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); + spin_lock(&init_mm.page_table_lock); + if (likely(pmd_leaf(*pmd))) { + /* + * Higher order allocations from buddy allocator must be able to + * be treated as indepdenent small pages (as they can be freed + * individually). + */ + if (!PageReserved(page)) + split_page(page, get_order(PMD_SIZE)); + + /* Make pte visible before pmd. See comment in pmd_install(). 
*/ + smp_wmb(); + pmd_populate_kernel(&init_mm, pmd, pgtable); + flush_tlb_kernel_range(start, start + PMD_SIZE); + } else { + pte_free_kernel(&init_mm, pgtable); + } + spin_unlock(&init_mm.page_table_lock); + + return 0; +} -static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to) +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) { - if (vmemmap_optimize_mode == to) - return; + int leaf; - if (to == VMEMMAP_OPTIMIZE_OFF) - static_branch_dec(&hugetlb_optimize_vmemmap_key); - else - static_branch_inc(&hugetlb_optimize_vmemmap_key); - WRITE_ONCE(vmemmap_optimize_mode, to); + spin_lock(&init_mm.page_table_lock); + leaf = pmd_leaf(*pmd); + spin_unlock(&init_mm.page_table_lock); + + if (!leaf) + return 0; + + return __split_vmemmap_huge_pmd(pmd, start); +} + +static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + pte_t *pte = pte_offset_kernel(pmd, addr); + + /* + * The reuse_page is found 'first' in table walk before we start + * remapping (which is calling @walk->remap_pte). + */ + if (!walk->reuse_page) { + walk->reuse_page = pte_page(*pte); + /* + * Because the reuse address is part of the range that we are + * walking, skip the reuse address range. + */ + addr += PAGE_SIZE; + pte++; + walk->nr_walked++; + } + + for (; addr != end; addr += PAGE_SIZE, pte++) { + walk->remap_pte(pte, addr, walk); + walk->nr_walked++; + } } -static int __init hugetlb_vmemmap_early_param(char *buf) +static int vmemmap_pmd_range(pud_t *pud, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) { - bool enable; - enum vmemmap_optimize_mode mode; + pmd_t *pmd; + unsigned long next; - if (kstrtobool(buf, &enable)) - return -EINVAL; + pmd = pmd_offset(pud, addr); + do { + int ret; - mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF; - vmemmap_optimize_mode_switch(mode); + ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK); + if (ret) + return ret; + + next = pmd_addr_end(addr, end); + vmemmap_pte_range(pmd, addr, next, walk); + } while (pmd++, addr = next, addr != end); + + return 0; +} + +static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(p4d, addr); + do { + int ret; + + next = pud_addr_end(addr, end); + ret = vmemmap_pmd_range(pud, addr, next, walk); + if (ret) + return ret; + } while (pud++, addr = next, addr != end); + + return 0; +} + +static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + p4d_t *p4d; + unsigned long next; + + p4d = p4d_offset(pgd, addr); + do { + int ret; + + next = p4d_addr_end(addr, end); + ret = vmemmap_pud_range(p4d, addr, next, walk); + if (ret) + return ret; + } while (p4d++, addr = next, addr != end); + + return 0; +} + +static int vmemmap_remap_range(unsigned long start, unsigned long end, + struct vmemmap_remap_walk *walk) +{ + unsigned long addr = start; + unsigned long next; + pgd_t *pgd; + + VM_BUG_ON(!PAGE_ALIGNED(start)); + VM_BUG_ON(!PAGE_ALIGNED(end)); + + pgd = pgd_offset_k(addr); + do { + int ret; + + next = pgd_addr_end(addr, end); + ret = vmemmap_p4d_range(pgd, addr, next, walk); + if (ret) + return ret; + } while (pgd++, addr = next, addr != end); + + /* + * We only change the mapping of the vmemmap virtual address range + * [@start + PAGE_SIZE, end), so we only need to flush the TLB which + * belongs to the range. 
+ */ + flush_tlb_kernel_range(start + PAGE_SIZE, end); return 0; } -early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param); /* - * Previously discarded vmemmap pages will be allocated and remapping - * after this function returns zero. + * Free a vmemmap page. A vmemmap page can be allocated from the memblock + * allocator or buddy allocator. If the PG_reserved flag is set, it means + * that it allocated from the memblock allocator, just free it via the + * free_bootmem_page(). Otherwise, use __free_page(). */ -int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) +static inline void free_vmemmap_page(struct page *page) +{ + if (PageReserved(page)) + free_bootmem_page(page); + else + __free_page(page); +} + +/* Free a list of the vmemmap pages */ +static void free_vmemmap_page_list(struct list_head *list) +{ + struct page *page, *next; + + list_for_each_entry_safe(page, next, list, lru) { + list_del(&page->lru); + free_vmemmap_page(page); + } +} + +static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, + struct vmemmap_remap_walk *walk) +{ + /* + * Remap the tail pages as read-only to catch illegal write operation + * to the tail pages. + */ + pgprot_t pgprot = PAGE_KERNEL_RO; + pte_t entry = mk_pte(walk->reuse_page, pgprot); + struct page *page = pte_page(*pte); + + list_add_tail(&page->lru, walk->vmemmap_pages); + set_pte_at(&init_mm, addr, pte, entry); +} + +/* + * How many struct page structs need to be reset. When we reuse the head + * struct page, the special metadata (e.g. page->flags or page->mapping) + * cannot copy to the tail struct page structs. The invalid value will be + * checked in the free_tail_pages_check(). In order to avoid the message + * of "corrupted mapping in tail page". We need to reset at least 3 (one + * head struct page struct and two tail struct page structs) struct page + * structs. + */ +#define NR_RESET_STRUCT_PAGE 3 + +static inline void reset_struct_pages(struct page *start) +{ + int i; + struct page *from = start + NR_RESET_STRUCT_PAGE; + + for (i = 0; i < NR_RESET_STRUCT_PAGE; i++) + memcpy(start + i, from, sizeof(*from)); +} + +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, + struct vmemmap_remap_walk *walk) +{ + pgprot_t pgprot = PAGE_KERNEL; + struct page *page; + void *to; + + BUG_ON(pte_page(*pte) != walk->reuse_page); + + page = list_first_entry(walk->vmemmap_pages, struct page, lru); + list_del(&page->lru); + to = page_to_virt(page); + copy_page(to, (void *)walk->reuse_addr); + reset_struct_pages(to); + + set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); +} + +/** + * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) + * to the page which @reuse is mapped to, then free vmemmap + * which the range are mapped to. + * @start: start address of the vmemmap virtual address range that we want + * to remap. + * @end: end address of the vmemmap virtual address range that we want to + * remap. + * @reuse: reuse address. + * + * Return: %0 on success, negative error code otherwise. 
+ */ +static int vmemmap_remap_free(unsigned long start, unsigned long end, + unsigned long reuse) +{ + int ret; + LIST_HEAD(vmemmap_pages); + struct vmemmap_remap_walk walk = { + .remap_pte = vmemmap_remap_pte, + .reuse_addr = reuse, + .vmemmap_pages = &vmemmap_pages, + }; + + /* + * In order to make remapping routine most efficient for the huge pages, + * the routine of vmemmap page table walking has the following rules + * (see more details from the vmemmap_pte_range()): + * + * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE) + * should be continuous. + * - The @reuse address is part of the range [@reuse, @end) that we are + * walking which is passed to vmemmap_remap_range(). + * - The @reuse address is the first in the complete range. + * + * So we need to make sure that @start and @reuse meet the above rules. + */ + BUG_ON(start - reuse != PAGE_SIZE); + + mmap_read_lock(&init_mm); + ret = vmemmap_remap_range(reuse, end, &walk); + if (ret && walk.nr_walked) { + end = reuse + walk.nr_walked * PAGE_SIZE; + /* + * vmemmap_pages contains pages from the previous + * vmemmap_remap_range call which failed. These + * are pages which were removed from the vmemmap. + * They will be restored in the following call. + */ + walk = (struct vmemmap_remap_walk) { + .remap_pte = vmemmap_restore_pte, + .reuse_addr = reuse, + .vmemmap_pages = &vmemmap_pages, + }; + + vmemmap_remap_range(reuse, end, &walk); + } + mmap_read_unlock(&init_mm); + + free_vmemmap_page_list(&vmemmap_pages); + + return ret; +} + +static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, + gfp_t gfp_mask, struct list_head *list) +{ + unsigned long nr_pages = (end - start) >> PAGE_SHIFT; + int nid = page_to_nid((struct page *)start); + struct page *page, *next; + + while (nr_pages--) { + page = alloc_pages_node(nid, gfp_mask, 0); + if (!page) + goto out; + list_add_tail(&page->lru, list); + } + + return 0; +out: + list_for_each_entry_safe(page, next, list, lru) + __free_pages(page, 0); + return -ENOMEM; +} + +/** + * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end) + * to the page which is from the @vmemmap_pages + * respectively. + * @start: start address of the vmemmap virtual address range that we want + * to remap. + * @end: end address of the vmemmap virtual address range that we want to + * remap. + * @reuse: reuse address. + * @gfp_mask: GFP flag for allocating vmemmap pages. + * + * Return: %0 on success, negative error code otherwise. + */ +static int vmemmap_remap_alloc(unsigned long start, unsigned long end, + unsigned long reuse, gfp_t gfp_mask) +{ + LIST_HEAD(vmemmap_pages); + struct vmemmap_remap_walk walk = { + .remap_pte = vmemmap_restore_pte, + .reuse_addr = reuse, + .vmemmap_pages = &vmemmap_pages, + }; + + /* See the comment in the vmemmap_remap_free(). */ + BUG_ON(start - reuse != PAGE_SIZE); + + if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages)) + return -ENOMEM; + + mmap_read_lock(&init_mm); + vmemmap_remap_range(reuse, end, &walk); + mmap_read_unlock(&init_mm); + + return 0; +} + +DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); +EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); + +static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); +core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0); + +/** + * hugetlb_vmemmap_restore - restore previously optimized (by + * hugetlb_vmemmap_optimize()) vmemmap pages which + * will be reallocated and remapped. + * @h: struct hstate. 
+ * @head: the head page whose vmemmap pages will be restored. + * + * Return: %0 if @head's vmemmap pages have been reallocated and remapped, + * negative error code otherwise. + */ +int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) { int ret; - unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; + unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; + unsigned long vmemmap_reuse; if (!HPageVmemmapOptimized(head)) return 0; - vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); - vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); + vmemmap_reuse = vmemmap_start; + vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* - * The pages which the vmemmap virtual address range [@vmemmap_addr, + * The pages which the vmemmap virtual address range [@vmemmap_start, * @vmemmap_end) are mapped to are freed to the buddy allocator, and * the range is mapped to the page which @vmemmap_reuse is mapped to. * When a HugeTLB page is freed to the buddy allocator, previously * discarded vmemmap pages must be allocated and remapping. */ - ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, + ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); if (!ret) { ClearHPageVmemmapOptimized(head); @@ -97,11 +456,14 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) return ret; } -static unsigned int vmemmap_optimizable_pages(struct hstate *h, - struct page *head) +/* Return true iff a HugeTLB whose vmemmap should and can be optimized. */ +static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head) { - if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF) - return 0; + if (!READ_ONCE(vmemmap_optimize_enabled)) + return false; + + if (!hugetlb_vmemmap_optimizable(h)) + return false; if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) { pmd_t *pmdp, pmd; @@ -144,118 +506,73 @@ static unsigned int vmemmap_optimizable_pages(struct hstate *h, * +-------------------------------------------+ */ if (PageVmemmapSelfHosted(vmemmap_page)) - return 0; + return false; } - return hugetlb_optimize_vmemmap_pages(h); + return true; } -void hugetlb_vmemmap_free(struct hstate *h, struct page *head) +/** + * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages. + * @h: struct hstate. + * @head: the head page whose vmemmap pages will be optimized. + * + * This function only tries to optimize @head's vmemmap pages and does not + * guarantee that the optimization will succeed after it returns. The caller + * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages + * have been optimized. 
+ */ +void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) { - unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; + unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; + unsigned long vmemmap_reuse; - vmemmap_pages = vmemmap_optimizable_pages(h, head); - if (!vmemmap_pages) + if (!vmemmap_should_optimize(h, head)) return; static_branch_inc(&hugetlb_optimize_vmemmap_key); - vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); + vmemmap_reuse = vmemmap_start; + vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* - * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end) + * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end) * to the page which @vmemmap_reuse is mapped to, then free the pages - * which the range [@vmemmap_addr, @vmemmap_end] is mapped to. + * which the range [@vmemmap_start, @vmemmap_end] is mapped to. */ - if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse)) + if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse)) static_branch_dec(&hugetlb_optimize_vmemmap_key); else SetHPageVmemmapOptimized(head); } -void __init hugetlb_vmemmap_init(struct hstate *h) -{ - unsigned int nr_pages = pages_per_huge_page(h); - unsigned int vmemmap_pages; - - /* - * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct - * page structs that can be used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP, - * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page. - */ - BUILD_BUG_ON(__NR_USED_SUBPAGE >= - RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - - if (!is_power_of_2(sizeof(struct page))) { - pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n"); - static_branch_disable(&hugetlb_optimize_vmemmap_key); - return; - } - - vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; - /* - * The head page is not to be freed to buddy allocator, the other tail - * pages will map to the head page, so they can be freed. - * - * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true - * on some architectures (e.g. aarch64). See Documentation/arm64/ - * hugetlbpage.rst for more details. 
- */ - if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR)) - h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR; - - pr_info("can optimize %d vmemmap pages for %s\n", - h->optimize_vmemmap_pages, h->name); -} - -#ifdef CONFIG_PROC_SYSCTL -static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, - loff_t *ppos) -{ - int ret; - enum vmemmap_optimize_mode mode; - static DEFINE_MUTEX(sysctl_mutex); - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&sysctl_mutex); - mode = vmemmap_optimize_mode; - table->data = &mode; - ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (write && !ret) - vmemmap_optimize_mode_switch(mode); - mutex_unlock(&sysctl_mutex); - - return ret; -} - static struct ctl_table hugetlb_vmemmap_sysctls[] = { { .procname = "hugetlb_optimize_vmemmap", - .maxlen = sizeof(enum vmemmap_optimize_mode), + .data = &vmemmap_optimize_enabled, + .maxlen = sizeof(int), .mode = 0644, - .proc_handler = hugetlb_optimize_vmemmap_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .proc_handler = proc_dobool, }, { } }; -static __init int hugetlb_vmemmap_sysctls_init(void) +static int __init hugetlb_vmemmap_init(void) { - /* - * If "struct page" crosses page boundaries, the vmemmap pages cannot - * be optimized. - */ - if (is_power_of_2(sizeof(struct page))) - register_sysctl_init("vm", hugetlb_vmemmap_sysctls); - + /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ + BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE); + + if (IS_ENABLED(CONFIG_PROC_SYSCTL)) { + const struct hstate *h; + + for_each_hstate(h) { + if (hugetlb_vmemmap_optimizable(h)) { + register_sysctl_init("vm", hugetlb_vmemmap_sysctls); + break; + } + } + } return 0; } -late_initcall(hugetlb_vmemmap_sysctls_init); -#endif /* CONFIG_PROC_SYSCTL */ +late_initcall(hugetlb_vmemmap_init); diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 109b0a53b6fe9..25bd0e0024314 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Optimize vmemmap pages associated with HugeTLB + * HugeTLB Vmemmap Optimization (HVO) * - * Copyright (c) 2020, Bytedance. All rights reserved. + * Copyright (c) 2020, ByteDance. All rights reserved. * * Author: Muchun Song */ @@ -11,35 +11,50 @@ #include #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP -int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head); -void hugetlb_vmemmap_free(struct hstate *h, struct page *head); -void hugetlb_vmemmap_init(struct hstate *h); +int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head); +void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head); /* - * How many vmemmap pages associated with a HugeTLB page that can be - * optimized and freed to the buddy allocator. + * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See + * Documentation/vm/vmemmap_dedup.rst. */ -static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) +#define HUGETLB_VMEMMAP_RESERVE_SIZE PAGE_SIZE + +static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h) { - return h->optimize_vmemmap_pages; + return pages_per_huge_page(h) * sizeof(struct page); +} + +/* + * Return how many vmemmap size associated with a HugeTLB page that can be + * optimized and can be freed to the buddy allocator. 
+ */ +static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h) +{ + int size = hugetlb_vmemmap_size(h) - HUGETLB_VMEMMAP_RESERVE_SIZE; + + if (!is_power_of_2(sizeof(struct page))) + return 0; + return size > 0 ? size : 0; } #else -static inline int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) +static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) { return 0; } -static inline void hugetlb_vmemmap_free(struct hstate *h, struct page *head) +static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) { } -static inline void hugetlb_vmemmap_init(struct hstate *h) +static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h) { + return 0; } +#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ -static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) +static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h) { - return 0; + return hugetlb_vmemmap_optimizable_size(h) != 0; } -#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ #endif /* _LINUX_HUGETLB_VMEMMAP_H */ diff --git a/mm/internal.h b/mm/internal.h index b73385df867df..4df67b6b8cced 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -24,7 +24,7 @@ struct folio_batch; #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC|__GFP_NOLOCKDEP) + __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) @@ -187,7 +187,7 @@ extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason /* * in mm/rmap.c: */ -extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); +pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* * in mm/page_alloc.c @@ -864,4 +864,22 @@ DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); extern bool mirrored_kernelcore; +static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) +{ + /* + * NOTE: we must check this before VM_SOFTDIRTY on soft-dirty + * enablements, because when without soft-dirty being compiled in, + * VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY) + * will be constantly true. + */ + if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) + return false; + + /* + * Soft-dirty is kind of special: its tracking is enabled when the + * vma flags not set. + */ + return !(vma->vm_flags & VM_SOFTDIRTY); +} + #endif /* __MM_INTERNAL_H */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 01f71786d5303..3e64105398c3d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -28,6 +28,7 @@ enum scan_result { SCAN_FAIL, SCAN_SUCCEED, SCAN_PMD_NULL, + SCAN_PMD_MAPPED, SCAN_EXCEED_NONE_PTE, SCAN_EXCEED_SWAP_PTE, SCAN_EXCEED_SHARED_PTE, @@ -73,6 +74,8 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); * default collapse hugepages if there is at least one pte mapped like * it would have happened if the vma was large enough during page * fault. + * + * Note that these are only respected if collapse was initiated by khugepaged. 
*/ static unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; @@ -85,6 +88,21 @@ static struct kmem_cache *mm_slot_cache __read_mostly; #define MAX_PTE_MAPPED_THP 8 +struct collapse_control { + bool is_khugepaged; + + /* Num pages scanned per node */ +#if defined(CONFIG_PPC64) + u32 node_load[MAX_NUMNODES]; +#elif HPAGE_PMD_ORDER < 16 + u16 node_load[MAX_NUMNODES]; +#else + u32 node_load[MAX_NUMNODES]; +#endif + /* Last target selected in hpage_collapse_find_target_node() */ + int last_target_node; +}; + /** * struct mm_slot - hash lookup from mm to mm_slot * @hash: hash collision list @@ -425,7 +443,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); } -static inline int khugepaged_test_exit(struct mm_struct *mm) +static inline int hpage_collapse_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } @@ -440,7 +458,7 @@ void __khugepaged_enter(struct mm_struct *mm) return; /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); + VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { free_mm_slot(mm_slot); return; @@ -466,7 +484,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && hugepage_flags_enabled()) { - if (hugepage_vma_check(vma, vm_flags, false, false)) + if (hugepage_vma_check(vma, vm_flags, false, false, true)) __khugepaged_enter(vma->vm_mm); } } @@ -492,11 +510,10 @@ void __khugepaged_exit(struct mm_struct *mm) } else if (mm_slot) { /* * This is required to serialize against - * khugepaged_test_exit() (which is guaranteed to run - * under mmap sem read mode). Stop here (after we - * return all pagetables will be destroyed) until - * khugepaged has finished working on the pagetables - * under the mmap_lock. + * hpage_collapse_test_exit() (which is guaranteed to run + * under mmap sem read mode). Stop here (after we return all + * pagetables will be destroyed) until khugepaged has finished + * working on the pagetables under the mmap_lock. 
*/ mmap_write_lock(mm); mmap_write_unlock(mm); @@ -546,11 +563,12 @@ static bool is_refcount_suitable(struct page *page) static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte, + struct collapse_control *cc, struct list_head *compound_pagelist) { struct page *page = NULL; pte_t *_pte; - int none_or_zero = 0, shared = 0, result = 0, referenced = 0; + int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; bool writable = false; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; @@ -558,8 +576,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, pte_t pteval = *_pte; if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { + ++none_or_zero; if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { + (!cc->is_khugepaged || + none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -579,11 +599,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageAnon(page), page); - if (page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { - result = SCAN_EXCEED_SHARED_PTE; - count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); - goto out; + if (page_mapcount(page) > 1) { + ++shared; + if (cc->is_khugepaged && + shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); + goto out; + } } if (PageCompound(page)) { @@ -646,10 +669,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (PageCompound(page)) list_add_tail(&page->lru, compound_pagelist); next: - /* There should be enough young pte to collapse the page */ - if (pte_young(pteval) || - page_is_young(page) || PageReferenced(page) || - mmu_notifier_test_young(vma->vm_mm, address)) + /* + * If collapse was initiated by khugepaged, check that there is + * enough young pte to justify collapsing the page + */ + if (cc->is_khugepaged && + (pte_young(pteval) || page_is_young(page) || + PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, + address))) referenced++; if (pte_write(pteval)) @@ -658,19 +685,19 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (unlikely(!writable)) { result = SCAN_PAGE_RO; - } else if (unlikely(!referenced)) { + } else if (unlikely(cc->is_khugepaged && !referenced)) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); - return 1; + return result; } out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); - return 0; + return result; } static void __collapse_huge_page_copy(pte_t *pte, struct page *page, @@ -735,9 +762,12 @@ static void khugepaged_alloc_sleep(void) remove_wait_queue(&khugepaged_wait, &wait); } -static int khugepaged_node_load[MAX_NUMNODES]; +struct collapse_control khugepaged_collapse_control = { + .is_khugepaged = true, + .last_target_node = NUMA_NO_NODE, +}; -static bool khugepaged_scan_abort(int nid) +static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc) { int i; @@ -749,11 +779,11 @@ static bool khugepaged_scan_abort(int nid) return false; /* If there is a count for this node already, it must be acceptable */ - if (khugepaged_node_load[nid]) + if (cc->node_load[nid]) return false; for (i = 0; i < MAX_NUMNODES; i++) { - if (!khugepaged_node_load[i]) + if (!cc->node_load[i]) continue; if 
(node_distance(nid, i) > node_reclaim_distance) return true; @@ -772,146 +802,62 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) } #ifdef CONFIG_NUMA -static int khugepaged_find_target_node(void) +static int hpage_collapse_find_target_node(struct collapse_control *cc) { - static int last_khugepaged_target_node = NUMA_NO_NODE; int nid, target_node = 0, max_value = 0; /* find first node with max normal pages hit */ for (nid = 0; nid < MAX_NUMNODES; nid++) - if (khugepaged_node_load[nid] > max_value) { - max_value = khugepaged_node_load[nid]; + if (cc->node_load[nid] > max_value) { + max_value = cc->node_load[nid]; target_node = nid; } /* do some balance if several nodes have the same hit record */ - if (target_node <= last_khugepaged_target_node) - for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; - nid++) - if (max_value == khugepaged_node_load[nid]) { + if (target_node <= cc->last_target_node) + for (nid = cc->last_target_node + 1; nid < MAX_NUMNODES; + nid++) + if (max_value == cc->node_load[nid]) { target_node = nid; break; } - last_khugepaged_target_node = target_node; + cc->last_target_node = target_node; return target_node; } - -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +#else +static int hpage_collapse_find_target_node(struct collapse_control *cc) { - if (IS_ERR(*hpage)) { - if (!*wait) - return false; - - *wait = false; - *hpage = NULL; - khugepaged_alloc_sleep(); - } else if (*hpage) { - put_page(*hpage); - *hpage = NULL; - } - - return true; + return 0; } +#endif -static struct page * -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) +static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node) { - VM_BUG_ON_PAGE(*hpage, *hpage); - *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - *hpage = ERR_PTR(-ENOMEM); - return NULL; + return false; } prep_transhuge_page(*hpage); count_vm_event(THP_COLLAPSE_ALLOC); - return *hpage; -} -#else -static int khugepaged_find_target_node(void) -{ - return 0; -} - -static inline struct page *alloc_khugepaged_hugepage(void) -{ - struct page *page; - - page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), - HPAGE_PMD_ORDER); - if (page) - prep_transhuge_page(page); - return page; -} - -static struct page *khugepaged_alloc_hugepage(bool *wait) -{ - struct page *hpage; - - do { - hpage = alloc_khugepaged_hugepage(); - if (!hpage) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - if (!*wait) - return NULL; - - *wait = false; - khugepaged_alloc_sleep(); - } else - count_vm_event(THP_COLLAPSE_ALLOC); - } while (unlikely(!hpage) && likely(hugepage_flags_enabled())); - - return hpage; -} - -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) -{ - /* - * If the hpage allocated earlier was briefly exposed in page cache - * before collapse_file() failed, it is possible that racing lookups - * have not yet completed, and would then be unpleasantly surprised by - * finding the hpage reused for the same mapping at a different offset. - * Just release the previous allocation if there is any danger of that. 
- */ - if (*hpage && page_count(*hpage) > 1) { - put_page(*hpage); - *hpage = NULL; - } - - if (!*hpage) - *hpage = khugepaged_alloc_hugepage(wait); - - if (unlikely(!*hpage)) - return false; - return true; } -static struct page * -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) -{ - VM_BUG_ON(!*hpage); - - return *hpage; -} -#endif - /* * If mmap_lock temporarily dropped, revalidate vma * before taking mmap_lock. - * Return 0 if succeeds, otherwise return none-zero - * value (scan code). + * Returns enum scan_result value. */ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, - struct vm_area_struct **vmap) + struct vm_area_struct **vmap, + struct collapse_control *cc) { struct vm_area_struct *vma; - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit(mm))) return SCAN_ANY_PROCESS; *vmap = vma = find_vma(mm, address); @@ -920,7 +866,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!transhuge_vma_suitable(vma, address)) return SCAN_ADDRESS_RANGE; - if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, + cc->is_khugepaged)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -931,21 +878,60 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, */ if (!vma->anon_vma || !vma_is_anonymous(vma)) return SCAN_VMA_CHECK; - return 0; + return SCAN_SUCCEED; +} + +static int find_pmd_or_thp_or_none(struct mm_struct *mm, + unsigned long address, + pmd_t **pmd) +{ + pmd_t pmde; + + *pmd = mm_find_pmd(mm, address); + if (!*pmd) + return SCAN_PMD_NULL; + + pmde = pmd_read_atomic(*pmd); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* See comments in pmd_none_or_trans_huge_or_clear_bad() */ + barrier(); +#endif + if (!pmd_present(pmde)) + return SCAN_PMD_NULL; + if (pmd_trans_huge(pmde)) + return SCAN_PMD_MAPPED; + if (pmd_bad(pmde)) + return SCAN_PMD_NULL; + return SCAN_SUCCEED; +} + +static int check_pmd_still_valid(struct mm_struct *mm, + unsigned long address, + pmd_t *pmd) +{ + pmd_t *new_pmd; + int result = find_pmd_or_thp_or_none(mm, address, &new_pmd); + + if (result != SCAN_SUCCEED) + return result; + if (new_pmd != pmd) + return SCAN_FAIL; + return SCAN_SUCCEED; } /* * Bring missing pages in from swap, to complete THP collapse. - * Only done if khugepaged_scan_pmd believes it is worthwhile. + * Only done if hpage_collapse_scan_pmd believes it is worthwhile. * * Called and returns without pte mapped or spinlocks held. * Note that if false is returned, mmap_lock will be released. 
*/ -static bool __collapse_huge_page_swapin(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long haddr, pmd_t *pmd, - int referenced) +static int __collapse_huge_page_swapin(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long haddr, pmd_t *pmd, + int referenced) { int swapped_in = 0; vm_fault_t ret = 0; @@ -976,12 +962,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, */ if (ret & VM_FAULT_RETRY) { trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; + /* Likely, but not guaranteed, that page lock failed */ + return SCAN_PAGE_LOCK; } if (ret & VM_FAULT_ERROR) { mmap_read_unlock(mm); trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; + return SCAN_FAIL; } swapped_in++; } @@ -991,30 +978,41 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, lru_add_drain(); trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); - return true; + return SCAN_SUCCEED; } -static void collapse_huge_page(struct mm_struct *mm, - unsigned long address, - struct page **hpage, - int node, int referenced, int unmapped) +static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, + struct collapse_control *cc) +{ + /* Only allocate from the target node */ + gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() : + GFP_TRANSHUGE) | __GFP_THISNODE; + int node = hpage_collapse_find_target_node(cc); + + if (!hpage_collapse_alloc_page(hpage, gfp, node)) + return SCAN_ALLOC_HUGE_PAGE_FAIL; + if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp))) + return SCAN_CGROUP_CHARGE_FAIL; + count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC); + return SCAN_SUCCEED; +} + +static int collapse_huge_page(struct mm_struct *mm, unsigned long address, + int referenced, int unmapped, + struct collapse_control *cc) { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; pte_t *pte; pgtable_t pgtable; - struct page *new_page; + struct page *hpage; spinlock_t *pmd_ptl, *pte_ptl; - int isolated = 0, result = 0; + int result = SCAN_FAIL; struct vm_area_struct *vma; struct mmu_notifier_range range; - gfp_t gfp; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; - /* * Before allocating the hugepage, release the mmap_lock read lock. * The allocation can take potentially a long time if it involves @@ -1022,40 +1020,34 @@ static void collapse_huge_page(struct mm_struct *mm, * that. We will recheck the vma after taking it again in write mode. */ mmap_read_unlock(mm); - new_page = khugepaged_alloc_page(hpage, gfp, node); - if (!new_page) { - result = SCAN_ALLOC_HUGE_PAGE_FAIL; - goto out_nolock; - } - if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) { - result = SCAN_CGROUP_CHARGE_FAIL; + result = alloc_charge_hpage(&hpage, mm, cc); + if (result != SCAN_SUCCEED) goto out_nolock; - } - count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); mmap_read_lock(mm); - result = hugepage_vma_revalidate(mm, address, &vma); - if (result) { + result = hugepage_vma_revalidate(mm, address, &vma, cc); + if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } - pmd = mm_find_pmd(mm, address); - if (!pmd) { - result = SCAN_PMD_NULL; + result = find_pmd_or_thp_or_none(mm, address, &pmd); + if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } - /* - * __collapse_huge_page_swapin will return with mmap_lock released - * when it fails. 
So we jump out_nolock directly in that case. - * Continuing to collapse causes inconsistency. - */ - if (unmapped && !__collapse_huge_page_swapin(mm, vma, address, - pmd, referenced)) { - goto out_nolock; + if (unmapped) { + /* + * __collapse_huge_page_swapin will return with mmap_lock + * released when it fails. So we jump out_nolock directly in + * that case. Continuing to collapse causes inconsistency. + */ + result = __collapse_huge_page_swapin(mm, vma, address, pmd, + referenced); + if (result != SCAN_SUCCEED) + goto out_nolock; } mmap_read_unlock(mm); @@ -1065,11 +1057,12 @@ static void collapse_huge_page(struct mm_struct *mm, * handled by the anon_vma lock + PG_lock. */ mmap_write_lock(mm); - result = hugepage_vma_revalidate(mm, address, &vma); - if (result) + result = hugepage_vma_revalidate(mm, address, &vma, cc); + if (result != SCAN_SUCCEED) goto out_up_write; /* check if the pmd is still valid */ - if (mm_find_pmd(mm, address) != pmd) + result = check_pmd_still_valid(mm, address, pmd); + if (result != SCAN_SUCCEED) goto out_up_write; anon_vma_lock_write(vma->anon_vma); @@ -1093,11 +1086,11 @@ static void collapse_huge_page(struct mm_struct *mm, mmu_notifier_invalidate_range_end(&range); spin_lock(pte_ptl); - isolated = __collapse_huge_page_isolate(vma, address, pte, - &compound_pagelist); + result = __collapse_huge_page_isolate(vma, address, pte, cc, + &compound_pagelist); spin_unlock(pte_ptl); - if (unlikely(!isolated)) { + if (unlikely(result != SCAN_SUCCEED)) { pte_unmap(pte); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); @@ -1109,7 +1102,6 @@ static void collapse_huge_page(struct mm_struct *mm, pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); - result = SCAN_FAIL; goto out_up_write; } @@ -1119,8 +1111,8 @@ static void collapse_huge_page(struct mm_struct *mm, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, - &compound_pagelist); + __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl, + &compound_pagelist); pte_unmap(pte); /* * spin_lock() below is not the equivalent of smp_wmb(), but @@ -1128,42 +1120,43 @@ static void collapse_huge_page(struct mm_struct *mm, * avoid the copy_huge_page writes to become visible after * the set_pmd_at() write. 
*/ - __SetPageUptodate(new_page); + __SetPageUptodate(hpage); pgtable = pmd_pgtable(_pmd); - _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); + _pmd = mk_huge_pmd(hpage, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - page_add_new_anon_rmap(new_page, vma, address); - lru_cache_add_inactive_or_unevictable(new_page, vma); + page_add_new_anon_rmap(hpage, vma, address); + lru_cache_add_inactive_or_unevictable(hpage, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); spin_unlock(pmd_ptl); - *hpage = NULL; + hpage = NULL; - khugepaged_pages_collapsed++; result = SCAN_SUCCEED; out_up_write: mmap_write_unlock(mm); out_nolock: - if (!IS_ERR_OR_NULL(*hpage)) - mem_cgroup_uncharge(page_folio(*hpage)); - trace_mm_collapse_huge_page(mm, isolated, result); - return; + if (hpage) { + mem_cgroup_uncharge(page_folio(hpage)); + put_page(hpage); + } + trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); + return result; } -static int khugepaged_scan_pmd(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, - struct page **hpage) +static int hpage_collapse_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, bool *mmap_locked, + struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, result = 0, referenced = 0; + int result = SCAN_FAIL, referenced = 0; int none_or_zero = 0, shared = 0; struct page *page = NULL; unsigned long _address; @@ -1173,19 +1166,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); - pmd = mm_find_pmd(mm, address); - if (!pmd) { - result = SCAN_PMD_NULL; + result = find_pmd_or_thp_or_none(mm, address, &pmd); + if (result != SCAN_SUCCEED) goto out; - } - memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + memset(cc->node_load, 0, sizeof(cc->node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; if (is_swap_pte(pteval)) { - if (++unmapped <= khugepaged_max_ptes_swap) { + ++unmapped; + if (!cc->is_khugepaged || + unmapped <= khugepaged_max_ptes_swap) { /* * Always be strict with uffd-wp * enabled swap entries. Please see @@ -1203,8 +1196,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, } } if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + ++none_or_zero; if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { + (!cc->is_khugepaged || + none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -1234,27 +1229,30 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, goto out_unmap; } - if (page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { - result = SCAN_EXCEED_SHARED_PTE; - count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); - goto out_unmap; + if (page_mapcount(page) > 1) { + ++shared; + if (cc->is_khugepaged && + shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); + goto out_unmap; + } } page = compound_head(page); /* * Record which node the original page is from and save this - * information to khugepaged_node_load[]. + * information to cc->node_load[]. * Khugepaged will allocate hugepage from the node has the max * hit record. 
*/ node = page_to_nid(page); - if (khugepaged_scan_abort(node)) { + if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } - khugepaged_node_load[node]++; + cc->node_load[node]++; if (!PageLRU(page)) { result = SCAN_PAGE_LRU; goto out_unmap; @@ -1289,31 +1287,38 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, result = SCAN_PAGE_COUNT; goto out_unmap; } - if (pte_young(pteval) || - page_is_young(page) || PageReferenced(page) || - mmu_notifier_test_young(vma->vm_mm, address)) + + /* + * If collapse was initiated by khugepaged, check that there is + * enough young pte to justify collapsing the page + */ + if (cc->is_khugepaged && + (pte_young(pteval) || page_is_young(page) || + PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, + address))) referenced++; } if (!writable) { result = SCAN_PAGE_RO; - } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) { + } else if (cc->is_khugepaged && + (!referenced || + (unmapped && referenced < HPAGE_PMD_NR / 2))) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; - ret = 1; } out_unmap: pte_unmap_unlock(pte, ptl); - if (ret) { - node = khugepaged_find_target_node(); + if (result == SCAN_SUCCEED) { + result = collapse_huge_page(mm, address, referenced, + unmapped, cc); /* collapse_huge_page will return with the mmap_lock released */ - collapse_huge_page(mm, address, hpage, node, - referenced, unmapped); + *mmap_locked = false; } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, none_or_zero, result, unmapped); - return ret; + return result; } static void collect_mm_slot(struct mm_slot *mm_slot) @@ -1322,7 +1327,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) lockdep_assert_held(&khugepaged_mm_lock); - if (khugepaged_test_exit(mm)) { + if (hpage_collapse_test_exit(mm)) { /* free mm_slot */ hash_del(&mm_slot->hash); list_del(&mm_slot->mm_node); @@ -1400,12 +1405,13 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) return; /* - * This vm_flags may not have VM_HUGEPAGE if the page was not - * collapsed by this mm. But we can still collapse if the page is - * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check() - * will not fail the vma for missing VM_HUGEPAGE + * If we are here, we've succeeded in replacing all the native pages + * in the page cache with a single hugepage. If a mm were to fault-in + * this memory (mapped by a suitably aligned VMA), we'd get the hugepage + * and map it by a PMD, regardless of sysfs THP settings. As such, let's + * analogously elide sysfs THP settings here. 
*/ - if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false)) + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) return; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -1420,8 +1426,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (!PageHead(hpage)) goto drop_hpage; - pmd = mm_find_pmd(mm, haddr); - if (!pmd) + if (find_pmd_or_thp_or_none(mm, haddr, &pmd) != SCAN_SUCCEED) goto drop_hpage; start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); @@ -1495,7 +1500,7 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) if (!mmap_write_trylock(mm)) return; - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit(mm))) goto out; for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++) @@ -1539,8 +1544,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) if (vma->vm_end < addr + HPAGE_PMD_SIZE) continue; mm = vma->vm_mm; - pmd = mm_find_pmd(mm, addr); - if (!pmd) + if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) continue; /* * We need exclusive mmap_lock to retract page table. @@ -1558,7 +1562,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * it'll always mapped in small page size for uffd-wp * registered ranges. */ - if (!khugepaged_test_exit(mm) && !userfaultfd_wp(vma)) + if (!hpage_collapse_test_exit(mm) && + !userfaultfd_wp(vma)) collapse_and_free_pmd(mm, vma, addr, pmd); mmap_write_unlock(mm); } else { @@ -1575,8 +1580,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * @mm: process address space where collapse happens * @file: file that collapse on * @start: collapse start address - * @hpage: new allocated huge page for collapse - * @node: appointed node the new huge page allocate from + * @cc: collapse context and scratchpad * * Basic scheme is simple, details are more complex: * - allocate and lock a new huge page; @@ -1593,13 +1597,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * + restore gaps in the page cache; * + unlock and free huge page; */ -static void collapse_file(struct mm_struct *mm, - struct file *file, pgoff_t start, - struct page **hpage, int node) +static int collapse_file(struct mm_struct *mm, struct file *file, + pgoff_t start, struct collapse_control *cc) { struct address_space *mapping = file->f_mapping; - gfp_t gfp; - struct page *new_page; + struct page *hpage; pgoff_t index, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); @@ -1610,20 +1612,9 @@ static void collapse_file(struct mm_struct *mm, VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); - /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; - - new_page = khugepaged_alloc_page(hpage, gfp, node); - if (!new_page) { - result = SCAN_ALLOC_HUGE_PAGE_FAIL; + result = alloc_charge_hpage(&hpage, mm, cc); + if (result != SCAN_SUCCEED) goto out; - } - - if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) { - result = SCAN_CGROUP_CHARGE_FAIL; - goto out; - } - count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); /* * Ensure we have slots for all the pages in the range. 
This is @@ -1641,14 +1632,14 @@ static void collapse_file(struct mm_struct *mm, } } while (1); - __SetPageLocked(new_page); + __SetPageLocked(hpage); if (is_shmem) - __SetPageSwapBacked(new_page); - new_page->index = start; - new_page->mapping = mapping; + __SetPageSwapBacked(hpage); + hpage->index = start; + hpage->mapping = mapping; /* - * At this point the new_page is locked and not up-to-date. + * At this point the hpage is locked and not up-to-date. * It's safe to insert it into the page cache, because nobody would * be able to map it or use it in another way until we unlock it. */ @@ -1676,7 +1667,7 @@ static void collapse_file(struct mm_struct *mm, result = SCAN_FAIL; goto xa_locked; } - xas_store(&xas, new_page); + xas_store(&xas, hpage); nr_none++; continue; } @@ -1818,19 +1809,19 @@ static void collapse_file(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - xas_store(&xas, new_page); + xas_store(&xas, hpage); continue; out_unlock: unlock_page(page); put_page(page); goto xa_unlocked; } - nr = thp_nr_pages(new_page); + nr = thp_nr_pages(hpage); if (is_shmem) - __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr); + __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr); else { - __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr); + __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr); filemap_nr_thps_inc(mapping); /* * Paired with smp_mb() in do_dentry_open() to ensure @@ -1841,21 +1832,21 @@ static void collapse_file(struct mm_struct *mm, smp_mb(); if (inode_is_open_for_write(mapping->host)) { result = SCAN_FAIL; - __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr); + __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); goto xa_locked; } } if (nr_none) { - __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none); + __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none); /* nr_none is always 0 for non-shmem. */ - __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); + __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none); } /* Join all the small entries into a single multi-index entry */ xas_set_order(&xas, start, HPAGE_PMD_ORDER); - xas_store(&xas, new_page); + xas_store(&xas, hpage); xa_locked: xas_unlock_irq(&xas); xa_unlocked: @@ -1877,11 +1868,11 @@ static void collapse_file(struct mm_struct *mm, index = start; list_for_each_entry_safe(page, tmp, &pagelist, lru) { while (index < page->index) { - clear_highpage(new_page + (index % HPAGE_PMD_NR)); + clear_highpage(hpage + (index % HPAGE_PMD_NR)); index++; } - copy_highpage(new_page + (page->index % HPAGE_PMD_NR), - page); + copy_highpage(hpage + (page->index % HPAGE_PMD_NR), + page); list_del(&page->lru); page->mapping = NULL; page_ref_unfreeze(page, 1); @@ -1892,23 +1883,22 @@ static void collapse_file(struct mm_struct *mm, index++; } while (index < end) { - clear_highpage(new_page + (index % HPAGE_PMD_NR)); + clear_highpage(hpage + (index % HPAGE_PMD_NR)); index++; } - SetPageUptodate(new_page); - page_ref_add(new_page, HPAGE_PMD_NR - 1); + SetPageUptodate(hpage); + page_ref_add(hpage, HPAGE_PMD_NR - 1); if (is_shmem) - set_page_dirty(new_page); - lru_cache_add(new_page); + set_page_dirty(hpage); + lru_cache_add(hpage); /* * Remove pte page tables, so we can re-fault the page as huge. 
*/ retract_page_tables(mapping, start); - *hpage = NULL; - - khugepaged_pages_collapsed++; + unlock_page(hpage); + hpage = NULL; } else { struct page *page; @@ -1947,19 +1937,23 @@ static void collapse_file(struct mm_struct *mm, VM_BUG_ON(nr_none); xas_unlock_irq(&xas); - new_page->mapping = NULL; + hpage->mapping = NULL; } - unlock_page(new_page); + if (hpage) + unlock_page(hpage); out: VM_BUG_ON(!list_empty(&pagelist)); - if (!IS_ERR_OR_NULL(*hpage)) - mem_cgroup_uncharge(page_folio(*hpage)); + if (hpage) { + mem_cgroup_uncharge(page_folio(hpage)); + put_page(hpage); + } /* TODO: tracepoints */ + return result; } -static void khugepaged_scan_file(struct mm_struct *mm, - struct file *file, pgoff_t start, struct page **hpage) +static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, + pgoff_t start, struct collapse_control *cc) { struct page *page = NULL; struct address_space *mapping = file->f_mapping; @@ -1970,14 +1964,16 @@ static void khugepaged_scan_file(struct mm_struct *mm, present = 0; swap = 0; - memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + memset(cc->node_load, 0, sizeof(cc->node_load)); rcu_read_lock(); xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { if (xas_retry(&xas, page)) continue; if (xa_is_value(page)) { - if (++swap > khugepaged_max_ptes_swap) { + ++swap; + if (cc->is_khugepaged && + swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; @@ -1995,11 +1991,11 @@ static void khugepaged_scan_file(struct mm_struct *mm, } node = page_to_nid(page); - if (khugepaged_scan_abort(node)) { + if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; break; } - khugepaged_node_load[node]++; + cc->node_load[node]++; if (!PageLRU(page)) { result = SCAN_PAGE_LRU; @@ -2028,20 +2024,21 @@ static void khugepaged_scan_file(struct mm_struct *mm, rcu_read_unlock(); if (result == SCAN_SUCCEED) { - if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { + if (cc->is_khugepaged && + present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { - node = khugepaged_find_target_node(); - collapse_file(mm, file, start, hpage, node); + result = collapse_file(mm, file, start, cc); } } /* TODO: tracepoints */ + return result; } #else -static void khugepaged_scan_file(struct mm_struct *mm, - struct file *file, pgoff_t start, struct page **hpage) +static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, + pgoff_t start, struct collapse_control *cc) { BUILD_BUG(); } @@ -2051,8 +2048,8 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) } #endif -static unsigned int khugepaged_scan_mm_slot(unsigned int pages, - struct page **hpage) +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, + struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) { @@ -2063,6 +2060,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, VM_BUG_ON(!pages); lockdep_assert_held(&khugepaged_mm_lock); + *result = SCAN_FAIL; if (khugepaged_scan.mm_slot) mm_slot = khugepaged_scan.mm_slot; @@ -2083,7 +2081,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, vma = NULL; if (unlikely(!mmap_read_trylock(mm))) goto breakouterloop_mmap_lock; - if (likely(!khugepaged_test_exit(mm))) + if (likely(!hpage_collapse_test_exit(mm))) vma = find_vma(mm, khugepaged_scan.address); progress++; @@ -2091,11 +2089,11 @@ static unsigned int 
khugepaged_scan_mm_slot(unsigned int pages, unsigned long hstart, hend; cond_resched(); - if (unlikely(khugepaged_test_exit(mm))) { + if (unlikely(hpage_collapse_test_exit(mm))) { progress++; break; } - if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) { + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) { skip: progress++; continue; @@ -2109,9 +2107,10 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); while (khugepaged_scan.address < hend) { - int ret; + bool mmap_locked = true; + cond_resched(); - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit(mm))) goto breakouterloop; VM_BUG_ON(khugepaged_scan.address < hstart || @@ -2123,19 +2122,29 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, khugepaged_scan.address); mmap_read_unlock(mm); - ret = 1; - khugepaged_scan_file(mm, file, pgoff, hpage); + *result = khugepaged_scan_file(mm, file, pgoff, + cc); + mmap_locked = false; fput(file); } else { - ret = khugepaged_scan_pmd(mm, vma, - khugepaged_scan.address, - hpage); + *result = hpage_collapse_scan_pmd(mm, vma, + khugepaged_scan.address, + &mmap_locked, + cc); } + if (*result == SCAN_SUCCEED) + ++khugepaged_pages_collapsed; /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; progress += HPAGE_PMD_NR; - if (ret) - /* we released mmap_lock so break loop */ + if (!mmap_locked) + /* + * We released mmap_lock so break loop. Note + * that we drop mmap_lock before all hugepage + * allocations, so if allocation fails, we are + * guaranteed to break here and report the + * correct result back to caller. + */ goto breakouterloop_mmap_lock; if (progress >= pages) goto breakouterloop; @@ -2151,7 +2160,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. */ - if (khugepaged_test_exit(mm) || !vma) { + if (hpage_collapse_test_exit(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find @@ -2185,19 +2194,16 @@ static int khugepaged_wait_event(void) kthread_should_stop(); } -static void khugepaged_do_scan(void) +static void khugepaged_do_scan(struct collapse_control *cc) { - struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); bool wait = true; + int result = SCAN_SUCCEED; lru_add_drain_all(); - while (progress < pages) { - if (!khugepaged_prealloc_page(&hpage, &wait)) - break; - + while (true) { cond_resched(); if (unlikely(kthread_should_stop() || try_to_freeze())) @@ -2209,14 +2215,25 @@ static void khugepaged_do_scan(void) if (khugepaged_has_work() && pass_through_head < 2) progress += khugepaged_scan_mm_slot(pages - progress, - &hpage); + &result, cc); else progress = pages; spin_unlock(&khugepaged_mm_lock); - } - if (!IS_ERR_OR_NULL(hpage)) - put_page(hpage); + if (progress >= pages) + break; + + if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { + /* + * If fail to allocate the first time, try to sleep for + * a while. When hit again, cancel the scan. 
+ */ + if (!wait) + break; + wait = false; + khugepaged_alloc_sleep(); + } + } } static bool khugepaged_should_wakeup(void) @@ -2253,7 +2270,7 @@ static int khugepaged(void *none) set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { - khugepaged_do_scan(); + khugepaged_do_scan(&khugepaged_collapse_control); khugepaged_wait_work(); } @@ -2352,3 +2369,120 @@ void khugepaged_min_free_kbytes_update(void) set_recommended_min_free_kbytes(); mutex_unlock(&khugepaged_mutex); } + +static int madvise_collapse_errno(enum scan_result r) +{ + /* + * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide + * actionable feedback to caller, so they may take an appropriate + * fallback measure depending on the nature of the failure. + */ + switch (r) { + case SCAN_ALLOC_HUGE_PAGE_FAIL: + return -ENOMEM; + case SCAN_CGROUP_CHARGE_FAIL: + return -EBUSY; + /* Resource temporary unavailable - trying again might succeed */ + case SCAN_PAGE_LOCK: + case SCAN_PAGE_LRU: + return -EAGAIN; + /* + * Other: Trying again likely not to succeed / error intrinsic to + * specified memory range. khugepaged likely won't be able to collapse + * either. + */ + default: + return -EINVAL; + } +} + +int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + struct collapse_control *cc; + struct mm_struct *mm = vma->vm_mm; + unsigned long hstart, hend, addr; + int thps = 0, last_fail = SCAN_FAIL; + bool mmap_locked = true; + + BUG_ON(vma->vm_start > start); + BUG_ON(vma->vm_end < end); + + *prev = vma; + + /* TODO: Support file/shmem */ + if (!vma->anon_vma || !vma_is_anonymous(vma)) + return -EINVAL; + + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) + return -EINVAL; + + cc = kmalloc(sizeof(*cc), GFP_KERNEL); + if (!cc) + return -ENOMEM; + cc->is_khugepaged = false; + cc->last_target_node = NUMA_NO_NODE; + + mmgrab(mm); + lru_add_drain_all(); + + hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = end & HPAGE_PMD_MASK; + + for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { + int result = SCAN_FAIL; + + if (!mmap_locked) { + cond_resched(); + mmap_read_lock(mm); + mmap_locked = true; + result = hugepage_vma_revalidate(mm, addr, &vma, cc); + if (result != SCAN_SUCCEED) { + last_fail = result; + goto out_nolock; + } + } + mmap_assert_locked(mm); + memset(cc->node_load, 0, sizeof(cc->node_load)); + result = hpage_collapse_scan_pmd(mm, vma, addr, &mmap_locked, + cc); + if (!mmap_locked) + *prev = NULL; /* Tell caller we dropped mmap_lock */ + + switch (result) { + case SCAN_SUCCEED: + case SCAN_PMD_MAPPED: + ++thps; + break; + /* Whitelisted set of results where continuing OK */ + case SCAN_PMD_NULL: + case SCAN_PTE_NON_PRESENT: + case SCAN_PTE_UFFD_WP: + case SCAN_PAGE_RO: + case SCAN_LACK_REFERENCED_PAGE: + case SCAN_PAGE_NULL: + case SCAN_PAGE_COUNT: + case SCAN_PAGE_LOCK: + case SCAN_PAGE_COMPOUND: + case SCAN_PAGE_LRU: + last_fail = result; + break; + default: + last_fail = result; + /* Other error, exit */ + goto out_maybelock; + } + } + +out_maybelock: + /* Caller expects us to hold mmap_lock on return */ + if (!mmap_locked) + mmap_read_lock(mm); +out_nolock: + mmap_assert_locked(mm); + mmdrop(mm); + kfree(cc); + + return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 
0 + : madvise_collapse_errno(last_fail); +} diff --git a/mm/ksm.c b/mm/ksm.c index 42ab153335a2d..2f315c69fa2c9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1134,6 +1134,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, { struct mm_struct *mm = vma->vm_mm; pmd_t *pmd; + pmd_t pmde; pte_t *ptep; pte_t newpte; spinlock_t *ptl; @@ -1148,6 +1149,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, pmd = mm_find_pmd(mm, addr); if (!pmd) goto out; + /* + * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() + * without holding anon_vma lock for write. So when looking for a + * genuine pmde (in which to find pte), test present and !THP together. + */ + pmde = *pmd; + barrier(); + if (!pmd_present(pmde) || pmd_trans_huge(pmde)) + goto out; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); diff --git a/mm/madvise.c b/mm/madvise.c index 5f0f0948a50e4..f9e11b6c99165 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -59,6 +59,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_FREE: case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: + case MADV_COLLAPSE: return 0; default: /* be safe, default to 1. list exceptions explicitly */ @@ -1057,6 +1058,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, if (error) goto out; break; + case MADV_COLLAPSE: + return madvise_collapse(vma, prev, start, end); } anon_name = anon_vma_name(vma); @@ -1150,6 +1153,7 @@ madvise_behavior_valid(int behavior) #ifdef CONFIG_TRANSPARENT_HUGEPAGE case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: + case MADV_COLLAPSE: #endif case MADV_DONTDUMP: case MADV_DODUMP: @@ -1167,13 +1171,15 @@ madvise_behavior_valid(int behavior) } static bool -process_madvise_behavior_valid(int behavior) +process_madvise_behavior_valid(int behavior, struct task_struct *task) { switch (behavior) { case MADV_COLD: case MADV_PAGEOUT: case MADV_WILLNEED: return true; + case MADV_COLLAPSE: + return task == current || capable(CAP_SYS_ADMIN); default: return false; } @@ -1339,6 +1345,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, * MADV_NOHUGEPAGE - mark the given range as not worth being backed by * transparent huge pages so the existing pages will not be * coalesced into THP and new pages will not be allocated as THP. + * MADV_COLLAPSE - synchronously coalesce pages into new THP. * MADV_DONTDUMP - the application wants to prevent pages in the given range * from being included in its core dump. * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. @@ -1450,7 +1457,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto free_iov; } - if (!process_madvise_behavior_valid(behavior)) { + if (!process_madvise_behavior_valid(behavior, task)) { ret = -EINVAL; goto release_task; } diff --git a/mm/memblock.c b/mm/memblock.c index 5b835bc04b098..b5d3026979fcc 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -29,6 +29,10 @@ # define INIT_MEMBLOCK_RESERVED_REGIONS INIT_MEMBLOCK_REGIONS #endif +#ifndef INIT_MEMBLOCK_MEMORY_REGIONS +#define INIT_MEMBLOCK_MEMORY_REGIONS INIT_MEMBLOCK_REGIONS +#endif + /** * DOC: memblock overview * @@ -55,9 +59,9 @@ * the allocator metadata. The "memory" and "reserved" types are nicely * wrapped with struct memblock. This structure is statically * initialized at build time. The region arrays are initially sized to - * %INIT_MEMBLOCK_REGIONS for "memory" and %INIT_MEMBLOCK_RESERVED_REGIONS - * for "reserved". 
The region array for "physmem" is initially sized to - * %INIT_PHYSMEM_REGIONS. + * %INIT_MEMBLOCK_MEMORY_REGIONS for "memory" and + * %INIT_MEMBLOCK_RESERVED_REGIONS for "reserved". The region array + * for "physmem" is initially sized to %INIT_PHYSMEM_REGIONS. * The memblock_allow_resize() enables automatic resizing of the region * arrays during addition of new regions. This feature should be used * with care so that memory allocated for the region array will not @@ -102,7 +106,7 @@ unsigned long min_low_pfn; unsigned long max_pfn; unsigned long long max_possible_pfn; -static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; +static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock; #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS]; @@ -111,7 +115,7 @@ static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS struct memblock memblock __initdata_memblock = { .memory.regions = memblock_memory_init_regions, .memory.cnt = 1, /* empty dummy entry */ - .memory.max = INIT_MEMBLOCK_REGIONS, + .memory.max = INIT_MEMBLOCK_MEMORY_REGIONS, .memory.name = "memory", .reserved.regions = memblock_reserved_init_regions, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c5bfb3eacd082..f7949b014883a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -626,7 +626,14 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) x = __this_cpu_add_return(stats_updates, abs(val)); if (x > MEMCG_CHARGE_BATCH) { - atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold); + /* + * If stats_flush_threshold exceeds the threshold + * (>num_online_cpus()), cgroup stats update will be triggered + * in __mem_cgroup_flush_stats(). Increasing this var further + * is redundant and simply adds overhead in atomic update. + */ + if (atomic_read(&stats_flush_threshold) <= num_online_cpus()) + atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold); __this_cpu_write(stats_updates, 0); } } @@ -1483,14 +1490,12 @@ static const unsigned int memcg_vm_event_stat[] = { #endif }; -static char *memory_stat_format(struct mem_cgroup *memcg) +static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) { struct seq_buf s; int i; - seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE); - if (!s.buffer) - return NULL; + seq_buf_init(&s, buf, bufsize); /* * Provide statistics on the state of the memory subsystem as @@ -1532,8 +1537,6 @@ static char *memory_stat_format(struct mem_cgroup *memcg) /* The above should easily fit into one page */ WARN_ON_ONCE(seq_buf_has_overflowed(&s)); - - return s.buffer; } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -1569,7 +1572,10 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct * */ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { - char *buf; + /* Use static buffer, for the caller is holding oom_lock. 
*/ + static char buf[PAGE_SIZE]; + + lockdep_assert_held(&oom_lock); pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memory)), @@ -1590,11 +1596,8 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) pr_info("Memory cgroup stats for "); pr_cont_cgroup_path(memcg->css.cgroup); pr_cont(":"); - buf = memory_stat_format(memcg); - if (!buf) - return; + memory_stat_format(memcg, buf, sizeof(buf)); pr_info("%s", buf); - kfree(buf); } /* @@ -2330,7 +2333,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, psi_memstall_enter(&pflags); nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, - gfp_mask, true); + gfp_mask, + MEMCG_RECLAIM_MAY_SWAP); psi_memstall_leave(&pflags); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); @@ -2575,8 +2579,9 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, struct page_counter *counter; unsigned long nr_reclaimed; bool passed_oom = false; - bool may_swap = true; + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP; bool drained = false; + bool raised_max_event = false; unsigned long pflags; retry: @@ -2592,7 +2597,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, mem_over_limit = mem_cgroup_from_counter(counter, memory); } else { mem_over_limit = mem_cgroup_from_counter(counter, memsw); - may_swap = false; + reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP; } if (batch > nr_pages) { @@ -2616,10 +2621,11 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, goto nomem; memcg_memory_event(mem_over_limit, MEMCG_MAX); + raised_max_event = true; psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, - gfp_mask, may_swap); + gfp_mask, reclaim_options); psi_memstall_leave(&pflags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) @@ -2682,6 +2688,13 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH))) return -ENOMEM; force: + /* + * If the allocation has to be enforced, don't forget to raise + * a MEMCG_MAX event. + */ + if (!raised_max_event) + memcg_memory_event(mem_over_limit, MEMCG_MAX); + /* * The allocation either can't fail or will lead to more memory * being freed very soon. Allow memory usage go over the limit @@ -3430,8 +3443,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, continue; } - if (!try_to_free_mem_cgroup_pages(memcg, 1, - GFP_KERNEL, !memsw)) { + if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, + memsw ? 
0 : MEMCG_RECLAIM_MAY_SWAP)) { ret = -EBUSY; break; } @@ -3541,7 +3554,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) if (signal_pending(current)) return -EINTR; - if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true)) + if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, + MEMCG_RECLAIM_MAY_SWAP)) nr_retries--; } @@ -5175,6 +5189,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void) 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL); if (memcg->id.id < 0) { error = memcg->id.id; + if (error == -ENOSPC) + pr_notice_ratelimited("mem_cgroup_id space is exhausted\n"); goto fail; } @@ -6293,7 +6309,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, } reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, true); + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); if (!reclaimed && !nr_retries--) break; @@ -6342,7 +6358,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (nr_reclaims) { if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, - GFP_KERNEL, true)) + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) nr_reclaims--; continue; } @@ -6387,11 +6403,11 @@ static int memory_events_local_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - char *buf; + char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - buf = memory_stat_format(memcg); if (!buf) return -ENOMEM; + memory_stat_format(memcg, buf, PAGE_SIZE); seq_puts(m, buf); kfree(buf); return 0; @@ -6471,6 +6487,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned long nr_to_reclaim, nr_reclaimed = 0; + unsigned int reclaim_options; int err; buf = strstrip(buf); @@ -6478,6 +6495,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, if (err) return err; + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; while (nr_reclaimed < nr_to_reclaim) { unsigned long reclaimed; @@ -6494,7 +6512,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_to_reclaim - nr_reclaimed, - GFP_KERNEL, true); + GFP_KERNEL, reclaim_options); if (!reclaimed && !nr_retries--) return -EAGAIN; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3f7083eafd4b6..f0e1961d4482a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -33,6 +33,9 @@ * are rare we hope to get away with this. This avoids impacting the core * VM. */ + +#define pr_fmt(fmt) "Memory failure: " fmt + #include #include #include @@ -71,7 +74,13 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); static bool hw_memory_failure __read_mostly = false; -static bool __page_handle_poison(struct page *page) +/* + * Return values: + * 1: the page is dissolved (if needed) and taken off from buddy, + * 0: the page is dissolved (if needed) and not taken off from buddy, + * < 0: failed to dissolve. 
+ */ +static int __page_handle_poison(struct page *page) { int ret; @@ -81,7 +90,7 @@ static bool __page_handle_poison(struct page *page) ret = take_page_off_buddy(page); zone_pcp_enable(page_zone(page)); - return ret > 0; + return ret; } static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release) @@ -91,7 +100,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo * Doing this check for free pages is also fine since dissolve_free_huge_page * returns 0 for non-hugetlb pages as well. */ - if (!__page_handle_poison(page)) + if (__page_handle_poison(page) <= 0) /* * We could fail to take off the target page from buddy * for example due to racy page allocation, but that's @@ -252,7 +261,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) short addr_lsb = tk->size_shift; int ret = 0; - pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n", + pr_err("%#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n", pfn, t->comm, t->pid); if ((flags & MF_ACTION_REQUIRED) && (t == current)) @@ -270,7 +279,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr, addr_lsb, t); /* synchronous? */ if (ret < 0) - pr_info("Memory failure: Error sending signal to %s:%d: %d\n", + pr_info("Error sending signal to %s:%d: %d\n", t->comm, t->pid, ret); return ret; } @@ -352,7 +361,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); if (!tk) { - pr_err("Memory failure: Out of memory while machine check handling\n"); + pr_err("Out of memory while machine check handling\n"); return; } @@ -379,7 +388,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * has a mapping for the page. */ if (tk->addr == -EFAULT) { - pr_info("Memory failure: Unable to find user space address %lx in %s\n", + pr_info("Unable to find user space address %lx in %s\n", page_to_pfn(p), tsk->comm); } else if (tk->size_shift == 0) { kfree(tk); @@ -412,7 +421,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail, * signal and then access the memory. Just kill it. */ if (fail || tk->addr == -EFAULT) { - pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", + pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", pfn, tk->tsk->comm, tk->tsk->pid); do_send_sig_info(SIGKILL, SEND_SIG_PRIV, tk->tsk, PIDTYPE_PID); @@ -425,7 +434,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail, * process anyways. 
*/ else if (kill_proc(tk, pfn, flags) < 0) - pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n", + pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n", pfn, tk->tsk->comm, tk->tsk->pid); } put_task_struct(tk->tsk); @@ -759,7 +768,6 @@ static const char * const action_page_types[] = { [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking", [MF_MSG_HUGE] = "huge page", [MF_MSG_FREE_HUGE] = "free huge page", - [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page", [MF_MSG_UNMAP_FAILED] = "unmapping failed page", [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page", @@ -816,12 +824,10 @@ static int truncate_error_page(struct page *p, unsigned long pfn, int err = mapping->a_ops->error_remove_page(mapping, p); if (err != 0) { - pr_info("Memory failure: %#lx: Failed to punch page: %d\n", - pfn, err); + pr_info("%#lx: Failed to punch page: %d\n", pfn, err); } else if (page_has_private(p) && !try_to_release_page(p, GFP_NOIO)) { - pr_info("Memory failure: %#lx: failed to release buffers\n", - pfn); + pr_info("%#lx: failed to release buffers\n", pfn); } else { ret = MF_RECOVERED; } @@ -833,8 +839,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn, if (invalidate_inode_page(p)) ret = MF_RECOVERED; else - pr_info("Memory failure: %#lx: Failed to invalidate\n", - pfn); + pr_info("%#lx: Failed to invalidate\n", pfn); } return ret; @@ -864,7 +869,7 @@ static bool has_extra_refcount(struct page_state *ps, struct page *p, count -= 1; if (count > 0) { - pr_err("Memory failure: %#lx: %s still referenced by %d users\n", + pr_err("%#lx: %s still referenced by %d users\n", page_to_pfn(p), action_page_types[ps->type], count); return true; } @@ -888,7 +893,7 @@ static int me_kernel(struct page_state *ps, struct page *p) */ static int me_unknown(struct page_state *ps, struct page *p) { - pr_err("Memory failure: %#lx: Unknown page state\n", page_to_pfn(p)); + pr_err("%#lx: Unknown page state\n", page_to_pfn(p)); unlock_page(p); return MF_FAILED; } @@ -1078,7 +1083,6 @@ static int me_huge_page(struct page_state *ps, struct page *p) res = truncate_error_page(hpage, page_to_pfn(p), mapping); unlock_page(hpage); } else { - res = MF_FAILED; unlock_page(hpage); /* * migration entry prevents later access on error hugepage, @@ -1086,9 +1090,11 @@ static int me_huge_page(struct page_state *ps, struct page *p) * subpages. 
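The memory-failure.c hunks define pr_fmt as "Memory failure: " and drop that now-redundant prefix from each pr_err()/pr_info() call site. A standalone sketch of how the pr_fmt convention yields the same output, using fprintf as a stand-in for the kernel's printk machinery:

#include <stdio.h>

/* Must be defined before the logging macro is used, as at the top of a .c file. */
#define pr_fmt(fmt) "Memory failure: " fmt

/* Simplified stand-in for pr_err(), which expands pr_fmt() around the format. */
#define pr_err(fmt, ...) fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	unsigned long pfn = 0x1234;

	/* Prints: "Memory failure: 0x1234: recovery action for huge page: Failed" */
	pr_err("%#lx: recovery action for %s: %s\n", pfn, "huge page", "Failed");
	return 0;
}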
*/ put_page(hpage); - if (__page_handle_poison(p)) { + if (__page_handle_poison(p) >= 0) { page_ref_inc(p); res = MF_RECOVERED; + } else { + res = MF_FAILED; } } @@ -1173,7 +1179,7 @@ static void action_result(unsigned long pfn, enum mf_action_page_type type, trace_memory_failure_event(pfn, type, result); num_poisoned_pages_inc(); - pr_err("Memory failure: %#lx: recovery action for %s: %s\n", + pr_err("%#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); } @@ -1248,8 +1254,7 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags) if (head == compound_head(page)) return 1; - pr_info("Memory failure: %#lx cannot catch tail\n", - page_to_pfn(page)); + pr_info("%#lx cannot catch tail\n", page_to_pfn(page)); put_page(head); } @@ -1312,7 +1317,7 @@ static int get_any_page(struct page *p, unsigned long flags) } out: if (ret == -EIO) - pr_err("Memory failure: %#lx: unhandlable page.\n", page_to_pfn(p)); + pr_err("%#lx: unhandlable page.\n", page_to_pfn(p)); return ret; } @@ -1411,13 +1416,12 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, return true; if (PageKsm(p)) { - pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn); + pr_err("%#lx: can't handle KSM pages.\n", pfn); return false; } if (PageSwapCache(p)) { - pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n", - pfn); + pr_err("%#lx: keeping poisoned page in swap cache\n", pfn); ttu |= TTU_IGNORE_HWPOISON; } @@ -1435,7 +1439,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, } else { kill = 0; ttu |= TTU_IGNORE_HWPOISON; - pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n", + pr_info("%#lx: corrupted page was clean: dropped without side effects\n", pfn); } } @@ -1464,14 +1468,14 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); i_mmap_unlock_write(mapping); } else - pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn); + pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn); } else { try_to_unmap(folio, ttu); } unmap_success = !page_mapped(hpage); if (!unmap_success) - pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", + pr_err("%#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); /* @@ -1664,6 +1668,113 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, EXPORT_SYMBOL_GPL(mf_dax_kill_procs); #endif /* CONFIG_FS_DAX */ +#ifdef CONFIG_HUGETLB_PAGE +/* + * Struct raw_hwp_page represents information about "raw error page", + * constructing singly linked list originated from ->private field of + * SUBPAGE_INDEX_HWPOISON-th tail page. 
+ */ +struct raw_hwp_page { + struct llist_node node; + struct page *page; +}; + +static inline struct llist_head *raw_hwp_list_head(struct page *hpage) +{ + return (struct llist_head *)&page_private(hpage + SUBPAGE_INDEX_HWPOISON); +} + +static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag) +{ + struct llist_head *head; + struct llist_node *t, *tnode; + unsigned long count = 0; + + head = raw_hwp_list_head(hpage); + llist_for_each_safe(tnode, t, head->first) { + struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node); + + if (move_flag) + SetPageHWPoison(p->page); + kfree(p); + count++; + } + llist_del_all(head); + return count; +} + +static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) +{ + struct llist_head *head; + struct raw_hwp_page *raw_hwp; + struct llist_node *t, *tnode; + int ret = TestSetPageHWPoison(hpage) ? -EHWPOISON : 0; + + /* + * Once the hwpoison hugepage has lost reliable raw error info, + * there is little meaning to keep additional error info precisely, + * so skip to add additional raw error info. + */ + if (HPageRawHwpUnreliable(hpage)) + return -EHWPOISON; + head = raw_hwp_list_head(hpage); + llist_for_each_safe(tnode, t, head->first) { + struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node); + + if (p->page == page) + return -EHWPOISON; + } + + raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC); + if (raw_hwp) { + raw_hwp->page = page; + llist_add(&raw_hwp->node, head); + /* the first error event will be counted in action_result(). */ + if (ret) + num_poisoned_pages_inc(); + } else { + /* + * Failed to save raw error info. We no longer trace all + * hwpoisoned subpages, and we need refuse to free/dissolve + * this hwpoisoned hugepage. + */ + SetHPageRawHwpUnreliable(hpage); + /* + * Once HPageRawHwpUnreliable is set, raw_hwp_page is not + * used any more, so free it. + */ + __free_raw_hwp_pages(hpage, false); + } + return ret; +} + +static unsigned long free_raw_hwp_pages(struct page *hpage, bool move_flag) +{ + /* + * HPageVmemmapOptimized hugepages can't be freed because struct + * pages for tail pages are required but they don't exist. + */ + if (move_flag && HPageVmemmapOptimized(hpage)) + return 0; + + /* + * HPageRawHwpUnreliable hugepages shouldn't be unpoisoned by + * definition. + */ + if (HPageRawHwpUnreliable(hpage)) + return 0; + + return __free_raw_hwp_pages(hpage, move_flag); +} + +void hugetlb_clear_page_hwpoison(struct page *hpage) +{ + if (HPageRawHwpUnreliable(hpage)) + return; + ClearPageHWPoison(hpage); + free_raw_hwp_pages(hpage, true); +} + /* * Called from hugetlb code with hugetlb_lock held. * @@ -1695,10 +1806,11 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) count_increased = true; } else { ret = -EBUSY; - goto out; + if (!(flags & MF_NO_RETRY)) + goto out; } - if (TestSetPageHWPoison(head)) { + if (hugetlb_set_page_hwpoison(head, page)) { ret = -EHWPOISON; goto out; } @@ -1710,7 +1822,6 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) return ret; } -#ifdef CONFIG_HUGETLB_PAGE /* * Taking refcount of hugetlb pages needs extra care about race conditions * with basic operations like hugepage allocation/free/demotion. 
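hugetlb_set_page_hwpoison() above records every poisoned subpage of a huge page in a list hanging off a tail page, skips duplicate reports, and falls back to the HPageRawHwpUnreliable state when the tracking allocation fails. A simplified userspace model of that bookkeeping (plain pointers instead of the kernel's llist, integer pfns instead of struct page, and without the list teardown the kernel performs once tracking becomes unreliable):

#include <stdbool.h>
#include <stdlib.h>

struct raw_hwp_rec {			/* models struct raw_hwp_page */
	struct raw_hwp_rec *next;
	unsigned long pfn;		/* models the raw error page pointer */
};

struct hugepage_state {
	struct raw_hwp_rec *raw_hwp_list;
	bool hwpoison;			/* models PageHWPoison on the head page */
	bool raw_hwp_unreliable;	/* models HPageRawHwpUnreliable */
};

/* Returns 0 only for the first error recorded on this huge page. */
static int record_hwpoison(struct hugepage_state *hp, unsigned long pfn)
{
	struct raw_hwp_rec *rec;
	int ret = hp->hwpoison ? -1 : 0;	/* -1 stands in for -EHWPOISON */

	hp->hwpoison = true;
	if (hp->raw_hwp_unreliable)
		return -1;
	for (rec = hp->raw_hwp_list; rec; rec = rec->next)
		if (rec->pfn == pfn)		/* same subpage reported again */
			return -1;
	rec = malloc(sizeof(*rec));
	if (!rec) {
		/* Lost precise per-subpage info; stop tracking from now on. */
		hp->raw_hwp_unreliable = true;
		return ret;
	}
	rec->pfn = pfn;
	rec->next = hp->raw_hwp_list;
	hp->raw_hwp_list = rec;
	return ret;
}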
@@ -1723,7 +1834,6 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb struct page *p = pfn_to_page(pfn); struct page *head; unsigned long page_flags; - bool retry = true; *hugetlb = 1; retry: @@ -1732,15 +1842,15 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb *hugetlb = 0; return 0; } else if (res == -EHWPOISON) { - pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); + pr_err("%#lx: already hardware poisoned\n", pfn); if (flags & MF_ACTION_REQUIRED) { head = compound_head(p); res = kill_accessing_process(current, page_to_pfn(head), flags); } return res; } else if (res == -EBUSY) { - if (retry) { - retry = false; + if (!(flags & MF_NO_RETRY)) { + flags |= MF_NO_RETRY; goto retry; } action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); @@ -1751,7 +1861,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb lock_page(head); if (hwpoison_filter(p)) { - ClearPageHWPoison(head); + hugetlb_clear_page_hwpoison(head); res = -EOPNOTSUPP; goto out; } @@ -1762,10 +1872,11 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb */ if (res == 0) { unlock_page(head); - res = MF_FAILED; - if (__page_handle_poison(p)) { + if (__page_handle_poison(p) >= 0) { page_ref_inc(p); res = MF_RECOVERED; + } else { + res = MF_FAILED; } action_result(pfn, MF_MSG_FREE_HUGE, res); return res == MF_RECOVERED ? 0 : -EBUSY; @@ -1773,21 +1884,6 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb page_flags = head->flags; - /* - * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so - * simply disable it. In order to make it work properly, we need - * make sure that: - * - conversion of a pud that maps an error hugetlb into hwpoison - * entry properly works, and - * - other mm code walking over page table is aware of pud-aligned - * hwpoison entries. 
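try_memory_failure_hugetlb() here drops its local retry boolean and instead carries an MF_NO_RETRY bit in the flags it passes to __get_huge_page_for_hwpoison(), so both sides of the call agree on whether a -EBUSY result may be retried. A tiny sketch of that pattern, with a placeholder flag value and a stub resource that is busy exactly once:

#include <stdio.h>

#define EBUSY_ERR	(-16)
#define MF_NO_RETRY	(1 << 0)	/* placeholder value for illustration */

/* Stub resource that reports busy on the first attempt only. */
static int try_get(void)
{
	static int first = 1;

	if (first) {
		first = 0;
		return EBUSY_ERR;
	}
	return 0;
}

static int handle(int flags)
{
retry:
	if (try_get() == EBUSY_ERR) {
		if (!(flags & MF_NO_RETRY)) {
			flags |= MF_NO_RETRY;	/* permit exactly one more pass */
			goto retry;
		}
		return EBUSY_ERR;
	}
	return 0;
}

int main(void)
{
	printf("%d\n", handle(0));	/* 0: the single retry succeeded */
	return 0;
}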
- */ - if (huge_page_size(page_hstate(head)) > PMD_SIZE) { - action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED); - res = -EBUSY; - goto out; - } - if (!hwpoison_user_mappings(p, pfn, flags, head)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; @@ -1806,6 +1902,10 @@ static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int * return 0; } +static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag) +{ + return 0; +} #endif /* CONFIG_HUGETLB_PAGE */ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, @@ -1901,8 +2001,7 @@ int memory_failure(unsigned long pfn, int flags) goto unlock_mutex; } } - pr_err("Memory failure: %#lx: memory outside kernel control\n", - pfn); + pr_err("%#lx: memory outside kernel control\n", pfn); res = -ENXIO; goto unlock_mutex; } @@ -1913,8 +2012,7 @@ int memory_failure(unsigned long pfn, int flags) goto unlock_mutex; if (TestSetPageHWPoison(p)) { - pr_err("Memory failure: %#lx: already hardware poisoned\n", - pfn); + pr_err("%#lx: already hardware poisoned\n", pfn); res = -EHWPOISON; if (flags & MF_ACTION_REQUIRED) res = kill_accessing_process(current, pfn, flags); @@ -2130,7 +2228,7 @@ void memory_failure_queue(unsigned long pfn, int flags) if (kfifo_put(&mf_cpu->fifo, entry)) schedule_work_on(smp_processor_id(), &mf_cpu->work); else - pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", + pr_err("buffer overflow when queuing memory failure at %#lx\n", pfn); spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); put_cpu_var(memory_failure_cpu); @@ -2211,6 +2309,7 @@ int unpoison_memory(unsigned long pfn) struct page *p; int ret = -EBUSY; int freeit = 0; + unsigned long count = 1; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -2258,6 +2357,13 @@ int unpoison_memory(unsigned long pfn) ret = get_hwpoison_page(p, MF_UNPOISON); if (!ret) { + if (PageHuge(p)) { + count = free_raw_hwp_pages(page, false); + if (count == 0) { + ret = -EBUSY; + goto unlock_mutex; + } + } ret = TestClearPageHWPoison(page) ? 0 : -EBUSY; } else if (ret < 0) { if (ret == -EHWPOISON) { @@ -2266,6 +2372,13 @@ int unpoison_memory(unsigned long pfn) unpoison_pr_info("Unpoison: failed to grab page %#lx\n", pfn, &unpoison_rs); } else { + if (PageHuge(p)) { + count = free_raw_hwp_pages(page, false); + if (count == 0) { + ret = -EBUSY; + goto unlock_mutex; + } + } freeit = !!TestClearPageHWPoison(p); put_page(page); @@ -2278,7 +2391,7 @@ int unpoison_memory(unsigned long pfn) unlock_mutex: mutex_unlock(&mf_mutex); if (!ret || freeit) { - num_poisoned_pages_dec(); + num_poisoned_pages_sub(count); unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", page_to_pfn(p), &unpoison_rs); } diff --git a/mm/memory.c b/mm/memory.c index a29b5599cef94..bd8e7e79be99e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -744,7 +744,7 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, * Currently device exclusive access only supports anonymous * memory so the entry shouldn't point to a filebacked page. */ - WARN_ON_ONCE(!PageAnon(page)); + WARN_ON_ONCE(1); set_pte_at(vma->vm_mm, address, ptep, pte); @@ -4442,10 +4442,6 @@ late_initcall(fault_around_debugfs); * It uses vm_ops->map_pages() to map the pages, which skips the page if it's * not ready to be mapped: not up-to-date, locked, etc. * - * This function is called with the page table lock taken. 
In the split ptlock - * case the page table lock only protects only those entries which belong to - * the page table corresponding to the fault address. - * * This function doesn't cross the VMA boundaries, in order to call map_pages() * only once. * @@ -4989,7 +4985,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - hugepage_vma_check(vma, vm_flags, false, true)) { + hugepage_vma_check(vma, vm_flags, false, true, true)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5023,7 +5019,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, goto retry_pud; if (pmd_none(*vmf.pmd) && - hugepage_vma_check(vma, vm_flags, false, true)) { + hugepage_vma_check(vma, vm_flags, false, true, true)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 99ecb2b3ff53e..fad6d1f2262af 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -932,7 +932,7 @@ static struct zone *auto_movable_zone_for_pfn(int nid, if (!page) continue; /* If anything is !MOVABLE online the rest !MOVABLE. */ - if (page_zonenum(page) != ZONE_MOVABLE) + if (!is_zone_movable_page(page)) goto kernel_zone; online_pages += PAGES_PER_SECTION; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 88a5173c6ff07..b73d3248d976a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -465,9 +465,8 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, } page = pmd_page(*pmd); if (is_huge_zero_page(page)) { - spin_unlock(ptl); walk->action = ACTION_CONTINUE; - goto out; + goto unlock; } if (!queue_pages_required(page, qp)) goto unlock; @@ -484,7 +483,6 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, ret = -EIO; unlock: spin_unlock(ptl); -out: return ret; } diff --git a/mm/mmap.c b/mm/mmap.c index d529837bc8c3b..c035020d0c896 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1647,7 +1647,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) return 0; /* Do we need to track softdirty? */ - if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY)) + if (vma_soft_dirty_enabled(vma)) return 1; /* Specialty mapping? */ @@ -1847,7 +1847,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Undo any partial mapping done by a device driver. */ unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); - charged = 0; if (vm_flags & VM_SHARED) mapping_unmap_writable(file->f_mapping); free_vma: diff --git a/mm/mprotect.c b/mm/mprotect.c index 8250c1315d9c7..3a23dde73723b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -49,7 +49,7 @@ static inline bool can_change_pte_writable(struct vm_area_struct *vma, return false; /* Do we need write faults for softdirty tracking? */ - if ((vma->vm_flags & VM_SOFTDIRTY) && !pte_soft_dirty(pte)) + if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte)) return false; /* Do we need write faults for uffd-wp tracking? */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c4430235f3536..cfaf926bebcee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4065,12 +4065,12 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, free_pages)) return true; /* - * Ignore watermark boosting for GFP_ATOMIC order-0 allocations + * Ignore watermark boosting for GFP_HIGH order-0 allocations * when checking the min watermark. 
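vma_wants_writenotify() and can_change_pte_writable() above switch their open-coded soft-dirty checks to a vma_soft_dirty_enabled() helper whose definition is not part of this section. Judging from the call sites it replaces, it presumably reduces to something like the following (hypothetical reconstruction, kernel-internal types assumed):

/*
 * Hypothetical reconstruction: soft-dirty tracking applies when the kernel
 * is built with CONFIG_MEM_SOFT_DIRTY and the VMA has not been marked
 * entirely soft-dirty via VM_SOFTDIRTY.
 */
static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
{
	if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
		return false;
	return !(vma->vm_flags & VM_SOFTDIRTY);
}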
The min watermark is the * point where boosting is ignored so that kswapd is woken up * when below the low watermark. */ - if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost + if (unlikely(!order && (alloc_flags & ALLOC_HARDER) && z->watermark_boost && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { mark = z->_watermark[WMARK_MIN]; return __zone_watermark_ok(z, order, mark, highest_zoneidx, @@ -4175,7 +4175,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, retry: /* * Scan zonelist, looking for a zone with enough free. - * See also __cpuset_node_allowed() comment in kernel/cpuset.c. + * See also __cpuset_node_allowed() comment in kernel/cgroup/cpuset.c. */ no_fallback = alloc_flags & ALLOC_NOFRAGMENT; z = ac->preferred_zoneref; @@ -4805,12 +4805,12 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). + * set both ALLOC_HARDER (unless __GFP_NOMEMALLOC) and ALLOC_HIGH. */ alloc_flags |= (__force int) (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); - if (gfp_mask & __GFP_ATOMIC) { + if (gfp_mask & __GFP_HIGH) { /* * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. @@ -5003,14 +5003,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int cpuset_mems_cookie; int reserve_flags; - /* - * We also sanity check to catch abuse of atomic reserves being used by - * callers that are not in atomic context. - */ - if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == - (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) - gfp_mask &= ~__GFP_ATOMIC; - retry_cpuset: compaction_retries = 0; no_progress_loops = 0; @@ -6786,13 +6778,18 @@ static void __ref memmap_init_compound(struct page *head, set_page_count(page, 0); /* - * The first tail page stores compound_mapcount_ptr() and - * compound_order() and the second tail page stores - * compound_pincount_ptr(). Call prep_compound_head() after - * the first and second tail pages have been initialized to - * not have the data overwritten. + * The first tail page stores compound_mapcount_ptr(), + * compound_order() and compound_pincount_ptr(). Call + * prep_compound_head() after the first tail page have + * been initialized to not have the data overwritten. + * + * Note the idea to make this right after we initialize + * the offending tail pages is trying to take advantage + * of the likelihood of those tail struct pages being + * cached given that we will read them right after in + * prep_compound_head(). */ - if (pfn == head_pfn + 2) + if (unlikely(pfn == head_pfn + 1)) prep_compound_head(head, order); } } diff --git a/mm/rmap.c b/mm/rmap.c index edc06c52bc82e..af775855e58f0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -767,13 +767,17 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return vma_address(page, vma); } +/* + * Returns the actual pmd_t* where we expect 'address' to be mapped from, or + * NULL if it doesn't exist. No guarantees / checks on what the pmd_t* + * represents. 
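The page_alloc.c hunks stop keying atomic-allocation treatment off a dedicated __GFP_ATOMIC bit: the watermark-boost bypass now tests ALLOC_HARDER and gfp_to_alloc_flags() tests __GFP_HIGH, matching the tools/include change later in this patch where GFP_ATOMIC becomes __GFP_HIGH | __GFP_KSWAPD_RECLAIM. A rough standalone model of that mapping; the numeric value of __GFP_HIGH is assumed here for illustration:

#include <stdio.h>

#define __GFP_HIGH		0x20u		/* assumed value */
#define __GFP_DIRECT_RECLAIM	0x400000u
#define __GFP_KSWAPD_RECLAIM	0x2000000u
#define GFP_ATOMIC		(__GFP_HIGH | __GFP_KSWAPD_RECLAIM)
#define GFP_KERNEL_LIKE		(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM)

/* Rough model of the new check: "allocate harder" follows from __GFP_HIGH. */
static int wants_harder_alloc(unsigned int gfp_mask)
{
	return !!(gfp_mask & __GFP_HIGH);
}

int main(void)
{
	printf("GFP_ATOMIC      -> harder=%d\n", wants_harder_alloc(GFP_ATOMIC));	/* 1 */
	printf("GFP_KERNEL-like -> harder=%d\n", wants_harder_alloc(GFP_KERNEL_LIKE));	/* 0 */
	return 0;
}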
+ */ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd = NULL; - pmd_t pmde; pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -788,15 +792,6 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) goto out; pmd = pmd_offset(pud, address); - /* - * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() - * without holding anon_vma lock for write. So when looking for a - * genuine pmde (in which to find pte), test present and !THP together. - */ - pmde = *pmd; - barrier(); - if (!pmd_present(pmde) || pmd_trans_huge(pmde)) - pmd = NULL; out: return pmd; } diff --git a/mm/shmem.c b/mm/shmem.c index fcab2be46f721..8baf26eda9891 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -1057,6 +1058,15 @@ static int shmem_getattr(struct user_namespace *mnt_userns, shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); } + if (info->fsflags & FS_APPEND_FL) + stat->attributes |= STATX_ATTR_APPEND; + if (info->fsflags & FS_IMMUTABLE_FL) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (info->fsflags & FS_NODUMP_FL) + stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); generic_fillattr(&init_user_ns, inode, stat); if (shmem_is_huge(NULL, inode, 0)) @@ -2271,7 +2281,18 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, +/* Mask out flags that are inappropriate for the given type of inode. */ +static unsigned shmem_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & SHMEM_REG_FLMASK; + else + return flags & SHMEM_OTHER_FLMASK; +} + +static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, unsigned long flags) { struct inode *inode; @@ -2296,6 +2317,9 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; info->i_crtime = inode->i_mtime; + info->fsflags = (dir == NULL) ? 
0 : + SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; + info->fsflags = shmem_mask_flags(mode, info->fsflags); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); @@ -3137,6 +3161,40 @@ static const char *shmem_get_link(struct dentry *dentry, } #ifdef CONFIG_TMPFS_XATTR + +static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); + + fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE); + + return 0; +} + +static int shmem_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct shmem_inode_info *info = SHMEM_I(inode); + + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; + + info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | + (fa->flags & SHMEM_FL_USER_MODIFIABLE); + + inode->i_flags &= ~(S_APPEND | S_IMMUTABLE | S_NOATIME); + if (info->fsflags & FS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (info->fsflags & FS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (info->fsflags & FS_NOATIME_FL) + inode->i_flags |= S_NOATIME; + + inode->i_ctime = current_time(inode); + return 0; +} + /* * Superblocks without xattr inode operations may get some security.* xattr * support from the LSM "for free". As soon as we have any other xattrs @@ -3824,6 +3882,8 @@ static const struct inode_operations shmem_inode_operations = { #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, .set_acl = simple_set_acl, + .fileattr_get = shmem_fileattr_get, + .fileattr_set = shmem_fileattr_set, #endif }; @@ -3843,6 +3903,8 @@ static const struct inode_operations shmem_dir_inode_operations = { #endif #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, + .fileattr_get = shmem_fileattr_get, + .fileattr_set = shmem_fileattr_set, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index e5b40c43221d0..b05295bab3222 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -251,6 +251,7 @@ void shrinker_debugfs_remove(struct shrinker *shrinker) lockdep_assert_held(&shrinker_rwsem); kfree_const(shrinker->name); + shrinker->name = NULL; if (!shrinker->debugfs_entry) return; diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 5f0ed4717ed06..46ae542118c09 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -27,408 +27,9 @@ #include #include #include -#include -#include #include #include -#include - -#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP -/** - * struct vmemmap_remap_walk - walk vmemmap page table - * - * @remap_pte: called for each lowest-level entry (PTE). - * @nr_walked: the number of walked pte. - * @reuse_page: the page which is reused for the tail vmemmap pages. - * @reuse_addr: the virtual address of the @reuse_page page. - * @vmemmap_pages: the list head of the vmemmap pages that can be freed - * or is mapped from. 
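The mm/shmem.c hunks above let tmpfs report FS_APPEND_FL, FS_IMMUTABLE_FL and FS_NODUMP_FL through statx() and accept them through the fileattr interface, the same path the FS_IOC_GETFLAGS/FS_IOC_SETFLAGS ioctls use. A minimal userspace sketch, assuming /tmp is a tmpfs mount; FS_NODUMP_FL is used because it needs no extra privileges:

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/tmp/fileattr-demo";	/* assumes /tmp is tmpfs */
	int fd = open(path, O_CREAT | O_RDWR, 0600);
	int flags = 0;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Typically fails with ENOTTY where tmpfs lacks fileattr support. */
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
		perror("FS_IOC_GETFLAGS");
		goto out;
	}
	flags |= FS_NODUMP_FL;
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
		perror("FS_IOC_SETFLAGS");
out:
	close(fd);
	unlink(path);
	return 0;
}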
- */ -struct vmemmap_remap_walk { - void (*remap_pte)(pte_t *pte, unsigned long addr, - struct vmemmap_remap_walk *walk); - unsigned long nr_walked; - struct page *reuse_page; - unsigned long reuse_addr; - struct list_head *vmemmap_pages; -}; - -static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) -{ - pmd_t __pmd; - int i; - unsigned long addr = start; - struct page *page = pmd_page(*pmd); - pte_t *pgtable = pte_alloc_one_kernel(&init_mm); - - if (!pgtable) - return -ENOMEM; - - pmd_populate_kernel(&init_mm, &__pmd, pgtable); - - for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) { - pte_t entry, *pte; - pgprot_t pgprot = PAGE_KERNEL; - - entry = mk_pte(page + i, pgprot); - pte = pte_offset_kernel(&__pmd, addr); - set_pte_at(&init_mm, addr, pte, entry); - } - - spin_lock(&init_mm.page_table_lock); - if (likely(pmd_leaf(*pmd))) { - /* - * Higher order allocations from buddy allocator must be able to - * be treated as indepdenent small pages (as they can be freed - * individually). - */ - if (!PageReserved(page)) - split_page(page, get_order(PMD_SIZE)); - - /* Make pte visible before pmd. See comment in pmd_install(). */ - smp_wmb(); - pmd_populate_kernel(&init_mm, pmd, pgtable); - flush_tlb_kernel_range(start, start + PMD_SIZE); - } else { - pte_free_kernel(&init_mm, pgtable); - } - spin_unlock(&init_mm.page_table_lock); - - return 0; -} - -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) -{ - int leaf; - - spin_lock(&init_mm.page_table_lock); - leaf = pmd_leaf(*pmd); - spin_unlock(&init_mm.page_table_lock); - - if (!leaf) - return 0; - - return __split_vmemmap_huge_pmd(pmd, start); -} - -static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, - struct vmemmap_remap_walk *walk) -{ - pte_t *pte = pte_offset_kernel(pmd, addr); - - /* - * The reuse_page is found 'first' in table walk before we start - * remapping (which is calling @walk->remap_pte). - */ - if (!walk->reuse_page) { - walk->reuse_page = pte_page(*pte); - /* - * Because the reuse address is part of the range that we are - * walking, skip the reuse address range. 
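The vmemmap remap walker being removed from sparse-vmemmap.c here is the machinery behind HVO: every tail vmemmap page of a huge page is remapped read-only to a single reuse page so the duplicates can be returned to the allocator. A back-of-the-envelope sketch of the savings, assuming 4 KiB base pages and a 64-byte struct page:

#include <stdio.h>

int main(void)
{
	const unsigned long page_size   = 4096;		/* assumed base page size */
	const unsigned long struct_page = 64;		/* assumed sizeof(struct page) */
	const unsigned long hpage_size  = 2UL << 20;	/* 2 MiB huge page */

	unsigned long nr_subpages   = hpage_size / page_size;		/* 512 */
	unsigned long vmemmap_bytes = nr_subpages * struct_page;	/* 32 KiB */
	unsigned long vmemmap_pages = vmemmap_bytes / page_size;	/* 8 */
	unsigned long freed_pages   = vmemmap_pages - 1;		/* 7: all but the reuse page */

	printf("vmemmap pages per 2 MiB hugepage: %lu, freed by HVO: %lu\n",
	       vmemmap_pages, freed_pages);
	return 0;
}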
- */ - addr += PAGE_SIZE; - pte++; - walk->nr_walked++; - } - - for (; addr != end; addr += PAGE_SIZE, pte++) { - walk->remap_pte(pte, addr, walk); - walk->nr_walked++; - } -} - -static int vmemmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, - struct vmemmap_remap_walk *walk) -{ - pmd_t *pmd; - unsigned long next; - - pmd = pmd_offset(pud, addr); - do { - int ret; - - ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK); - if (ret) - return ret; - - next = pmd_addr_end(addr, end); - vmemmap_pte_range(pmd, addr, next, walk); - } while (pmd++, addr = next, addr != end); - - return 0; -} - -static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr, - unsigned long end, - struct vmemmap_remap_walk *walk) -{ - pud_t *pud; - unsigned long next; - - pud = pud_offset(p4d, addr); - do { - int ret; - - next = pud_addr_end(addr, end); - ret = vmemmap_pmd_range(pud, addr, next, walk); - if (ret) - return ret; - } while (pud++, addr = next, addr != end); - - return 0; -} - -static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr, - unsigned long end, - struct vmemmap_remap_walk *walk) -{ - p4d_t *p4d; - unsigned long next; - - p4d = p4d_offset(pgd, addr); - do { - int ret; - - next = p4d_addr_end(addr, end); - ret = vmemmap_pud_range(p4d, addr, next, walk); - if (ret) - return ret; - } while (p4d++, addr = next, addr != end); - - return 0; -} - -static int vmemmap_remap_range(unsigned long start, unsigned long end, - struct vmemmap_remap_walk *walk) -{ - unsigned long addr = start; - unsigned long next; - pgd_t *pgd; - - VM_BUG_ON(!PAGE_ALIGNED(start)); - VM_BUG_ON(!PAGE_ALIGNED(end)); - - pgd = pgd_offset_k(addr); - do { - int ret; - - next = pgd_addr_end(addr, end); - ret = vmemmap_p4d_range(pgd, addr, next, walk); - if (ret) - return ret; - } while (pgd++, addr = next, addr != end); - - /* - * We only change the mapping of the vmemmap virtual address range - * [@start + PAGE_SIZE, end), so we only need to flush the TLB which - * belongs to the range. - */ - flush_tlb_kernel_range(start + PAGE_SIZE, end); - - return 0; -} - -/* - * Free a vmemmap page. A vmemmap page can be allocated from the memblock - * allocator or buddy allocator. If the PG_reserved flag is set, it means - * that it allocated from the memblock allocator, just free it via the - * free_bootmem_page(). Otherwise, use __free_page(). - */ -static inline void free_vmemmap_page(struct page *page) -{ - if (PageReserved(page)) - free_bootmem_page(page); - else - __free_page(page); -} - -/* Free a list of the vmemmap pages */ -static void free_vmemmap_page_list(struct list_head *list) -{ - struct page *page, *next; - - list_for_each_entry_safe(page, next, list, lru) { - list_del(&page->lru); - free_vmemmap_page(page); - } -} - -static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, - struct vmemmap_remap_walk *walk) -{ - /* - * Remap the tail pages as read-only to catch illegal write operation - * to the tail pages. - */ - pgprot_t pgprot = PAGE_KERNEL_RO; - pte_t entry = mk_pte(walk->reuse_page, pgprot); - struct page *page = pte_page(*pte); - - list_add_tail(&page->lru, walk->vmemmap_pages); - set_pte_at(&init_mm, addr, pte, entry); -} - -/* - * How many struct page structs need to be reset. When we reuse the head - * struct page, the special metadata (e.g. page->flags or page->mapping) - * cannot copy to the tail struct page structs. The invalid value will be - * checked in the free_tail_pages_check(). In order to avoid the message - * of "corrupted mapping in tail page". 
We need to reset at least 3 (one - * head struct page struct and two tail struct page structs) struct page - * structs. - */ -#define NR_RESET_STRUCT_PAGE 3 - -static inline void reset_struct_pages(struct page *start) -{ - int i; - struct page *from = start + NR_RESET_STRUCT_PAGE; - - for (i = 0; i < NR_RESET_STRUCT_PAGE; i++) - memcpy(start + i, from, sizeof(*from)); -} - -static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, - struct vmemmap_remap_walk *walk) -{ - pgprot_t pgprot = PAGE_KERNEL; - struct page *page; - void *to; - - BUG_ON(pte_page(*pte) != walk->reuse_page); - - page = list_first_entry(walk->vmemmap_pages, struct page, lru); - list_del(&page->lru); - to = page_to_virt(page); - copy_page(to, (void *)walk->reuse_addr); - reset_struct_pages(to); - - set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); -} - -/** - * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) - * to the page which @reuse is mapped to, then free vmemmap - * which the range are mapped to. - * @start: start address of the vmemmap virtual address range that we want - * to remap. - * @end: end address of the vmemmap virtual address range that we want to - * remap. - * @reuse: reuse address. - * - * Return: %0 on success, negative error code otherwise. - */ -int vmemmap_remap_free(unsigned long start, unsigned long end, - unsigned long reuse) -{ - int ret; - LIST_HEAD(vmemmap_pages); - struct vmemmap_remap_walk walk = { - .remap_pte = vmemmap_remap_pte, - .reuse_addr = reuse, - .vmemmap_pages = &vmemmap_pages, - }; - - /* - * In order to make remapping routine most efficient for the huge pages, - * the routine of vmemmap page table walking has the following rules - * (see more details from the vmemmap_pte_range()): - * - * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE) - * should be continuous. - * - The @reuse address is part of the range [@reuse, @end) that we are - * walking which is passed to vmemmap_remap_range(). - * - The @reuse address is the first in the complete range. - * - * So we need to make sure that @start and @reuse meet the above rules. - */ - BUG_ON(start - reuse != PAGE_SIZE); - - mmap_read_lock(&init_mm); - ret = vmemmap_remap_range(reuse, end, &walk); - if (ret && walk.nr_walked) { - end = reuse + walk.nr_walked * PAGE_SIZE; - /* - * vmemmap_pages contains pages from the previous - * vmemmap_remap_range call which failed. These - * are pages which were removed from the vmemmap. - * They will be restored in the following call. - */ - walk = (struct vmemmap_remap_walk) { - .remap_pte = vmemmap_restore_pte, - .reuse_addr = reuse, - .vmemmap_pages = &vmemmap_pages, - }; - - vmemmap_remap_range(reuse, end, &walk); - } - mmap_read_unlock(&init_mm); - - free_vmemmap_page_list(&vmemmap_pages); - - return ret; -} - -static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, - gfp_t gfp_mask, struct list_head *list) -{ - unsigned long nr_pages = (end - start) >> PAGE_SHIFT; - int nid = page_to_nid((struct page *)start); - struct page *page, *next; - - while (nr_pages--) { - page = alloc_pages_node(nid, gfp_mask, 0); - if (!page) - goto out; - list_add_tail(&page->lru, list); - } - - return 0; -out: - list_for_each_entry_safe(page, next, list, lru) - __free_pages(page, 0); - return -ENOMEM; -} - -/** - * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end) - * to the page which is from the @vmemmap_pages - * respectively. 
- * @start: start address of the vmemmap virtual address range that we want - * to remap. - * @end: end address of the vmemmap virtual address range that we want to - * remap. - * @reuse: reuse address. - * @gfp_mask: GFP flag for allocating vmemmap pages. - * - * Return: %0 on success, negative error code otherwise. - */ -int vmemmap_remap_alloc(unsigned long start, unsigned long end, - unsigned long reuse, gfp_t gfp_mask) -{ - LIST_HEAD(vmemmap_pages); - struct vmemmap_remap_walk walk = { - .remap_pte = vmemmap_restore_pte, - .reuse_addr = reuse, - .vmemmap_pages = &vmemmap_pages, - }; - - /* See the comment in the vmemmap_remap_free(). */ - BUG_ON(start - reuse != PAGE_SIZE); - - if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages)) - return -ENOMEM; - - mmap_read_lock(&init_mm); - vmemmap_remap_range(reuse, end, &walk); - mmap_read_unlock(&init_mm); - - return 0; -} -#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ /* * Allocate a block of memory to be used to back the virtual memory map diff --git a/mm/vmscan.c b/mm/vmscan.c index 29846d10209ad..b2b1431352dcd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -101,6 +101,9 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ unsigned int may_swap:1; + /* Proactive reclaim invoked by userspace through memory.reclaim */ + unsigned int proactive:1; + /* * Cgroup memory below memory.low is protected as long as we * don't threaten to OOM. If any cgroup is reclaimed at @@ -644,8 +647,10 @@ int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) return -ENOMEM; err = __prealloc_shrinker(shrinker); - if (err) + if (err) { kfree_const(shrinker->name); + shrinker->name = NULL; + } return err; } @@ -660,6 +665,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker) { #ifdef CONFIG_SHRINKER_DEBUG kfree_const(shrinker->name); + shrinker->name = NULL; #endif if (shrinker->flags & SHRINKER_MEMCG_AWARE) { down_write(&shrinker_rwsem); @@ -704,8 +710,10 @@ int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) 
return -ENOMEM; err = __register_shrinker(shrinker); - if (err) + if (err) { kfree_const(shrinker->name); + shrinker->name = NULL; + } return err; } #else @@ -3175,9 +3183,10 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) sc->priority); /* Record the group's reclaim efficiency */ - vmpressure(sc->gfp_mask, memcg, false, - sc->nr_scanned - scanned, - sc->nr_reclaimed - reclaimed); + if (!sc->proactive) + vmpressure(sc->gfp_mask, memcg, false, + sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); } @@ -3300,9 +3309,10 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) } /* Record the subtree's reclaim efficiency */ - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, - sc->nr_scanned - nr_scanned, - sc->nr_reclaimed - nr_reclaimed); + if (!sc->proactive) + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; @@ -3584,8 +3594,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); do { - vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, - sc->priority); + if (!sc->proactive) + vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, + sc->priority); sc->nr_scanned = 0; shrink_zones(zonelist, sc); @@ -3875,7 +3886,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - bool may_swap) + unsigned int reclaim_options) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; @@ -3888,7 +3899,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, .may_unmap = 1, - .may_swap = may_swap, + .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), + .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), }; /* * Traverse the ZONELIST_FALLBACK zonelist of the current node to put diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index a7bad79cbbed8..34f784a1604b7 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -386,7 +386,10 @@ static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, unsigned long *handle) { *handle = zs_malloc(pool, size, gfp); - return *handle ? 0 : -1; + + if (IS_ERR((void *)(*handle))) + return PTR_ERR((void *)*handle); + return 0; } static void zs_zpool_free(void *pool, unsigned long handle) { @@ -1388,7 +1391,7 @@ static unsigned long obj_malloc(struct zs_pool *pool, * @gfp: gfp flags when allocating object * * On success, handle to the allocated object is returned, - * otherwise 0. + * otherwise an ERR_PTR(). * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. 
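zs_malloc() now reports failure by encoding an errno with ERR_PTR() into the returned unsigned long handle instead of returning 0, and zs_zpool_malloc() decodes it with IS_ERR()/PTR_ERR(). A standalone model of that convention, with userspace stand-ins for the three kernel helpers:

#include <stdio.h>

#define MAX_ERRNO 4095

/* Userspace stand-ins for the kernel's ERR_PTR()/IS_ERR()/PTR_ERR(). */
static inline void *ERR_PTR(long error)	{ return (void *)error; }
static inline long PTR_ERR(const void *ptr)	{ return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Models zs_malloc(): the handle is either a real value or an encoded errno. */
static unsigned long fake_zs_malloc(unsigned long size)
{
	if (!size || size > 16384)
		return (unsigned long)ERR_PTR(-22);	/* -EINVAL */
	return 0x1000;					/* pretend handle */
}

int main(void)
{
	unsigned long handle = fake_zs_malloc(0);

	if (IS_ERR((void *)handle))		/* same check as zs_zpool_malloc() */
		printf("allocation failed: %ld\n", PTR_ERR((void *)handle));
	else
		printf("handle: %#lx\n", handle);
	return 0;
}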
*/ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) @@ -1399,11 +1402,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) struct zspage *zspage; if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) - return 0; + return (unsigned long)ERR_PTR(-EINVAL); handle = cache_alloc_handle(pool, gfp); if (!handle) - return 0; + return (unsigned long)ERR_PTR(-ENOMEM); /* extra space in chunk to keep the handle */ size += ZS_HANDLE_SIZE; @@ -1428,7 +1431,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) zspage = alloc_zspage(pool, class, gfp); if (!zspage) { cache_free_handle(pool, handle); - return 0; + return (unsigned long)ERR_PTR(-ENOMEM); } spin_lock(&class->lock); diff --git a/tools/include/linux/gfp.h b/tools/include/linux/gfp.h index b238dbc9eb858..56eec4445bc9e 100644 --- a/tools/include/linux/gfp.h +++ b/tools/include/linux/gfp.h @@ -12,7 +12,6 @@ #define __GFP_FS 0x80u #define __GFP_NOWARN 0x200u #define __GFP_ZERO 0x8000u -#define __GFP_ATOMIC 0x80000u #define __GFP_ACCOUNT 0x100000u #define __GFP_DIRECT_RECLAIM 0x400000u #define __GFP_KSWAPD_RECLAIM 0x2000000u @@ -20,7 +19,7 @@ #define __GFP_RECLAIM (__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM) #define GFP_ZONEMASK 0x0fu -#define GFP_ATOMIC (__GFP_HIGH | __GFP_ATOMIC | __GFP_KSWAPD_RECLAIM) +#define GFP_ATOMIC (__GFP_HIGH | __GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h index 6c1aa92a92e44..6ce1f1ceb432c 100644 --- a/tools/include/uapi/asm-generic/mman-common.h +++ b/tools/include/uapi/asm-generic/mman-common.h @@ -77,6 +77,8 @@ #define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */ +#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index ebfab2ca17024..4a06d83f2ac5a 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -640,7 +640,6 @@ static const struct { { "__GFP_HIGHMEM", "HM" }, { "GFP_DMA32", "D32" }, { "__GFP_HIGH", "H" }, - { "__GFP_ATOMIC", "_A" }, { "__GFP_IO", "I" }, { "__GFP_FS", "F" }, { "__GFP_NOWARN", "NWR" }, diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index 716b62c05e3d3..939a33dc5dc6b 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -1603,9 +1603,19 @@ TEST_F(hmm2, double_map) for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) ASSERT_EQ(ptr[i], i); - /* Punch a hole after the first page address. */ - ret = munmap(buffer->ptr + self->page_size, self->page_size); + /* Migrate pages to device 1 and try to read from device 0. */ + ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages); ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what device 0 read. 
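MADV_COLLAPSE (25) is added to the tools copy of mman-common.h above and exercised by the khugepaged selftest changes below. A minimal userspace sketch of a synchronous collapse request, defining the value locally in case the installed headers predate it (the same fallback the selftest uses); the 2 MiB PMD size is assumed:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* synchronous hugepage collapse */
#endif

int main(void)
{
	size_t len = 2UL << 20;		/* one PMD-sized range (assumed 2 MiB) */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 1, len);		/* populate the range with small pages */
	/* May fail on kernels without MADV_COLLAPSE, with THP disabled, or if
	 * the range covers no PMD-aligned region. */
	if (madvise(p, len, MADV_COLLAPSE))
		perror("madvise(MADV_COLLAPSE)");
	munmap(p, len);
	return 0;
}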
*/ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); hmm_buffer_free(buffer); } diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c index 585978f181ed1..e63a0214f6399 100644 --- a/tools/testing/selftests/vm/hugepage-mremap.c +++ b/tools/testing/selftests/vm/hugepage-mremap.c @@ -107,7 +107,7 @@ static void register_region_with_uffd(char *addr, size_t len) int main(int argc, char *argv[]) { - size_t length; + size_t length = 0; if (argc != 2 && argc != 3) { printf("Usage: %s [length_in_MB] \n", argv[0]); diff --git a/tools/testing/selftests/vm/hugetlb-madvise.c b/tools/testing/selftests/vm/hugetlb-madvise.c index 6c6af40f57478..3c9943131881e 100644 --- a/tools/testing/selftests/vm/hugetlb-madvise.c +++ b/tools/testing/selftests/vm/hugetlb-madvise.c @@ -89,10 +89,11 @@ void write_fault_pages(void *addr, unsigned long nr_pages) void read_fault_pages(void *addr, unsigned long nr_pages) { - unsigned long i, tmp; + unsigned long dummy = 0; + unsigned long i; for (i = 0; i < nr_pages; i++) - tmp += *((unsigned long *)(addr + (i * huge_page_size))); + dummy += *((unsigned long *)(addr + (i * huge_page_size))); } int main(int argc, char **argv) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index 155120b67a165..b77b1e28cdb38 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -14,6 +14,9 @@ #ifndef MADV_PAGEOUT #define MADV_PAGEOUT 21 #endif +#ifndef MADV_COLLAPSE +#define MADV_COLLAPSE 25 +#endif #define BASE_ADDR ((void *)(1UL << 30)) static unsigned long hpage_pmd_size; @@ -23,6 +26,11 @@ static int hpage_pmd_nr; #define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" #define PID_SMAPS "/proc/self/smaps" +struct collapse_context { + void (*collapse)(const char *msg, char *p, int nr_hpages, bool expect); + bool enforce_pte_scan_limits; +}; + enum thp_enabled { THP_ALWAYS, THP_MADVISE, @@ -90,18 +98,6 @@ struct settings { struct khugepaged_settings khugepaged; }; -static struct settings default_settings = { - .thp_enabled = THP_MADVISE, - .thp_defrag = THP_DEFRAG_ALWAYS, - .shmem_enabled = SHMEM_NEVER, - .use_zero_page = 0, - .khugepaged = { - .defrag = 1, - .alloc_sleep_millisecs = 10, - .scan_sleep_millisecs = 10, - }, -}; - static struct settings saved_settings; static bool skip_settings_restore; @@ -279,6 +275,39 @@ static void write_settings(struct settings *settings) write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); } +#define MAX_SETTINGS_DEPTH 4 +static struct settings settings_stack[MAX_SETTINGS_DEPTH]; +static int settings_index; + +static struct settings *current_settings(void) +{ + if (!settings_index) { + printf("Fail: No settings set"); + exit(EXIT_FAILURE); + } + return settings_stack + settings_index - 1; +} + +static void push_settings(struct settings *settings) +{ + if (settings_index >= MAX_SETTINGS_DEPTH) { + printf("Fail: Settings stack exceeded"); + exit(EXIT_FAILURE); + } + settings_stack[settings_index++] = *settings; + write_settings(current_settings()); +} + +static void pop_settings(void) +{ + if (settings_index <= 0) { + printf("Fail: Settings stack empty"); + exit(EXIT_FAILURE); + } + --settings_index; + write_settings(current_settings()); +} + static void restore_settings(int sig) { if (skip_settings_restore) @@ -322,14 +351,6 @@ static void save_settings(void) signal(SIGQUIT, restore_settings); } -static void adjust_settings(void) -{ - - printf("Adjust 
settings..."); - write_settings(&default_settings); - success("OK"); -} - #define MAX_LINE_LENGTH 500 static bool check_for_pattern(FILE *fp, char *pattern, char *buf) @@ -341,7 +362,7 @@ static bool check_for_pattern(FILE *fp, char *pattern, char *buf) return false; } -static bool check_huge(void *addr) +static bool check_huge(void *addr, int nr_hpages) { bool thp = false; int ret; @@ -366,7 +387,7 @@ static bool check_huge(void *addr) goto err_out; ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "AnonHugePages:%10ld kB", - hpage_pmd_size >> 10); + nr_hpages * (hpage_pmd_size >> 10)); if (ret >= MAX_LINE_LENGTH) { printf("%s: Pattern is too long\n", __func__); exit(EXIT_FAILURE); @@ -434,12 +455,12 @@ static bool check_swap(void *addr, unsigned long size) return swap; } -static void *alloc_mapping(void) +static void *alloc_mapping(int nr) { void *p; - p = mmap(BASE_ADDR, hpage_pmd_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (p != BASE_ADDR) { printf("Failed to allocate VMA at %p\n", BASE_ADDR); exit(EXIT_FAILURE); @@ -456,6 +477,25 @@ static void fill_memory(int *p, unsigned long start, unsigned long end) p[i * page_size / sizeof(*p)] = i + 0xdead0000; } +/* + * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with + * validate_memory()'able contents. + */ +static void *alloc_hpage(void) +{ + void *p; + + p = alloc_mapping(1); + printf("Allocate huge page..."); + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(p, 0, hpage_pmd_size); + if (check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + return p; +} + static void validate_memory(int *p, unsigned long start, unsigned long end) { int i; @@ -469,26 +509,59 @@ static void validate_memory(int *p, unsigned long start, unsigned long end) } } +static void madvise_collapse(const char *msg, char *p, int nr_hpages, + bool expect) +{ + int ret; + struct settings settings = *current_settings(); + + printf("%s...", msg); + /* Sanity check */ + if (!check_huge(p, 0)) { + printf("Unexpected huge page\n"); + exit(EXIT_FAILURE); + } + + /* + * Prevent khugepaged interference and tests that MADV_COLLAPSE + * ignores /sys/kernel/mm/transparent_hugepage/enabled + */ + settings.thp_enabled = THP_NEVER; + push_settings(&settings); + + /* Clear VM_NOHUGEPAGE */ + madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); + ret = madvise(p, nr_hpages * hpage_pmd_size, MADV_COLLAPSE); + if (((bool)ret) == expect) + fail("Fail: Bad return value"); + else if (check_huge(p, nr_hpages) != expect) + fail("Fail: check_huge()"); + else + success("OK"); + + pop_settings(); +} + #define TICK 500000 -static bool wait_for_scan(const char *msg, char *p) +static bool wait_for_scan(const char *msg, char *p, int nr_hpages) { int full_scans; int timeout = 6; /* 3 seconds */ /* Sanity check */ - if (check_huge(p)) { + if (!check_huge(p, 0)) { printf("Unexpected huge page\n"); exit(EXIT_FAILURE); } - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); /* Wait until the second full_scan completed */ full_scans = read_num("khugepaged/full_scans") + 2; printf("%s...", msg); while (timeout--) { - if (check_huge(p)) + if (check_huge(p, nr_hpages)) break; if (read_num("khugepaged/full_scans") >= full_scans) break; @@ -496,121 +569,121 @@ static bool wait_for_scan(const char *msg, char *p) usleep(TICK); } - madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + madvise(p, nr_hpages * 
hpage_pmd_size, MADV_NOHUGEPAGE); return timeout == -1; } +static void khugepaged_collapse(const char *msg, char *p, int nr_hpages, + bool expect) +{ + if (wait_for_scan(msg, p, nr_hpages)) { + if (expect) + fail("Timeout"); + else + success("OK"); + return; + } else if (check_huge(p, nr_hpages) == expect) { + success("OK"); + } else { + fail("Fail"); + } +} + static void alloc_at_fault(void) { - struct settings settings = default_settings; + struct settings settings = *current_settings(); char *p; settings.thp_enabled = THP_ALWAYS; - write_settings(&settings); + push_settings(&settings); - p = alloc_mapping(); + p = alloc_mapping(1); *p = 1; printf("Allocate huge page on fault..."); - if (check_huge(p)) + if (check_huge(p, 1)) success("OK"); else fail("Fail"); - write_settings(&default_settings); + pop_settings(); madvise(p, page_size, MADV_DONTNEED); printf("Split huge PMD on MADV_DONTNEED..."); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); munmap(p, hpage_pmd_size); } -static void collapse_full(void) +static void collapse_full(struct collapse_context *c) { void *p; - - p = alloc_mapping(); - fill_memory(p, 0, hpage_pmd_size); - if (wait_for_scan("Collapse fully populated PTE table", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + int nr_hpages = 4; + unsigned long size = nr_hpages * hpage_pmd_size; + + p = alloc_mapping(nr_hpages); + fill_memory(p, 0, size); + c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages, + true); + validate_memory(p, 0, size); + munmap(p, size); } -static void collapse_empty(void) +static void collapse_empty(struct collapse_context *c) { void *p; - p = alloc_mapping(); - if (wait_for_scan("Do not collapse empty PTE table", p)) - fail("Timeout"); - else if (check_huge(p)) - fail("Fail"); - else - success("OK"); + p = alloc_mapping(1); + c->collapse("Do not collapse empty PTE table", p, 1, false); munmap(p, hpage_pmd_size); } -static void collapse_single_pte_entry(void) +static void collapse_single_pte_entry(struct collapse_context *c) { void *p; - p = alloc_mapping(); + p = alloc_mapping(1); fill_memory(p, 0, page_size); - if (wait_for_scan("Collapse PTE table with single PTE entry present", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table with single PTE entry present", p, + 1, true); validate_memory(p, 0, page_size); munmap(p, hpage_pmd_size); } -static void collapse_max_ptes_none(void) +static void collapse_max_ptes_none(struct collapse_context *c) { int max_ptes_none = hpage_pmd_nr / 2; - struct settings settings = default_settings; + struct settings settings = *current_settings(); void *p; settings.khugepaged.max_ptes_none = max_ptes_none; - write_settings(&settings); + push_settings(&settings); - p = alloc_mapping(); + p = alloc_mapping(1); fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); - if (wait_for_scan("Do not collapse with max_ptes_none exceeded", p)) - fail("Timeout"); - else if (check_huge(p)) - fail("Fail"); - else - success("OK"); + c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, + !c->enforce_pte_scan_limits); validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); - if (wait_for_scan("Collapse with max_ptes_none PTEs empty", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else 
- fail("Fail"); - validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + if (c->enforce_pte_scan_limits) { + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, + true); + validate_memory(p, 0, + (hpage_pmd_nr - max_ptes_none) * page_size); + } munmap(p, hpage_pmd_size); - write_settings(&default_settings); + pop_settings(); } -static void collapse_swapin_single_pte(void) +static void collapse_swapin_single_pte(struct collapse_context *c) { void *p; - p = alloc_mapping(); + p = alloc_mapping(1); fill_memory(p, 0, hpage_pmd_size); printf("Swapout one page..."); @@ -625,23 +698,18 @@ static void collapse_swapin_single_pte(void) goto out; } - if (wait_for_scan("Collapse with swapping in single PTE entry", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse with swapping in single PTE entry", p, 1, true); validate_memory(p, 0, hpage_pmd_size); out: munmap(p, hpage_pmd_size); } -static void collapse_max_ptes_swap(void) +static void collapse_max_ptes_swap(struct collapse_context *c) { int max_ptes_swap = read_num("khugepaged/max_ptes_swap"); void *p; - p = alloc_mapping(); + p = alloc_mapping(1); fill_memory(p, 0, hpage_pmd_size); printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); @@ -656,115 +724,83 @@ static void collapse_max_ptes_swap(void) goto out; } - if (wait_for_scan("Do not collapse with max_ptes_swap exceeded", p)) - fail("Timeout"); - else if (check_huge(p)) - fail("Fail"); - else - success("OK"); + c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, + !c->enforce_pte_scan_limits); validate_memory(p, 0, hpage_pmd_size); - fill_memory(p, 0, hpage_pmd_size); - printf("Swapout %d of %d pages...", max_ptes_swap, hpage_pmd_nr); - if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { - perror("madvise(MADV_PAGEOUT)"); - exit(EXIT_FAILURE); - } - if (check_swap(p, max_ptes_swap * page_size)) { - success("OK"); - } else { - fail("Fail"); - goto out; - } + if (c->enforce_pte_scan_limits) { + fill_memory(p, 0, hpage_pmd_size); + printf("Swapout %d of %d pages...", max_ptes_swap, + hpage_pmd_nr); + if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, max_ptes_swap * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } - if (wait_for_scan("Collapse with max_ptes_swap pages swapped out", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, hpage_pmd_size); + c->collapse("Collapse with max_ptes_swap pages swapped out", p, + 1, true); + validate_memory(p, 0, hpage_pmd_size); + } out: munmap(p, hpage_pmd_size); } -static void collapse_single_pte_entry_compound(void) +static void collapse_single_pte_entry_compound(struct collapse_context *c) { void *p; - p = alloc_mapping(); - - printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p)) - success("OK"); - else - fail("Fail"); + p = alloc_hpage(); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - printf("Split huge page leaving single PTE mapping compound page..."); madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); - if (wait_for_scan("Collapse PTE table with single PTE mapping compound page", p)) - fail("Timeout"); - else if 
(check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table with single PTE mapping compound page", + p, 1, true); validate_memory(p, 0, page_size); munmap(p, hpage_pmd_size); } -static void collapse_full_of_compound(void) +static void collapse_full_of_compound(struct collapse_context *c) { void *p; - p = alloc_mapping(); - - printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p)) - success("OK"); - else - fail("Fail"); - + p = alloc_hpage(); printf("Split huge page leaving single PTE page table full of compound pages..."); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); - if (wait_for_scan("Collapse PTE table full of compound pages", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table full of compound pages", p, 1, true); validate_memory(p, 0, hpage_pmd_size); munmap(p, hpage_pmd_size); } -static void collapse_compound_extreme(void) +static void collapse_compound_extreme(struct collapse_context *c) { void *p; int i; - p = alloc_mapping(); + p = alloc_mapping(1); for (i = 0; i < hpage_pmd_nr; i++) { printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", i + 1, hpage_pmd_nr); madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); fill_memory(BASE_ADDR, 0, hpage_pmd_size); - if (!check_huge(BASE_ADDR)) { + if (!check_huge(BASE_ADDR, 1)) { printf("Failed to allocate huge page\n"); exit(EXIT_FAILURE); } @@ -793,32 +829,28 @@ static void collapse_compound_extreme(void) munmap(BASE_ADDR, hpage_pmd_size); fill_memory(p, 0, hpage_pmd_size); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); - if (wait_for_scan("Collapse PTE table full of different compound pages", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table full of different compound pages", p, 1, + true); validate_memory(p, 0, hpage_pmd_size); munmap(p, hpage_pmd_size); } -static void collapse_fork(void) +static void collapse_fork(struct collapse_context *c) { int wstatus; void *p; - p = alloc_mapping(); + p = alloc_mapping(1); printf("Allocate small page..."); fill_memory(p, 0, page_size); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); @@ -829,19 +861,14 @@ static void collapse_fork(void) skip_settings_restore = true; exit_status = 0; - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); fill_memory(p, page_size, 2 * page_size); - - if (wait_for_scan("Collapse PTE table with single page shared with parent process", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table with single page shared with parent process", + p, 1, true); validate_memory(p, 0, page_size); munmap(p, hpage_pmd_size); @@ -852,7 +879,7 @@ static void collapse_fork(void) exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has small page..."); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); @@ -860,28 +887,19 @@ static void collapse_fork(void) munmap(p, hpage_pmd_size); } -static void collapse_fork_compound(void) +static void collapse_fork_compound(struct collapse_context *c) { int wstatus; void *p; - p = alloc_mapping(); - - printf("Allocate huge page..."); - madvise(p, 
hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p)) - success("OK"); - else - fail("Fail"); - + p = alloc_hpage(); printf("Share huge page over fork()..."); if (!fork()) { /* Do not touch settings on child exit */ skip_settings_restore = true; exit_status = 0; - if (check_huge(p)) + if (check_huge(p, 1)) success("OK"); else fail("Fail"); @@ -889,21 +907,17 @@ static void collapse_fork_compound(void) printf("Split huge page PMD in child process..."); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); fill_memory(p, 0, page_size); write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); - if (wait_for_scan("Collapse PTE table full of compound pages in child", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table full of compound pages in child", + p, 1, true); write_num("khugepaged/max_ptes_shared", - default_settings.khugepaged.max_ptes_shared); + current_settings()->khugepaged.max_ptes_shared); validate_memory(p, 0, hpage_pmd_size); munmap(p, hpage_pmd_size); @@ -914,7 +928,7 @@ static void collapse_fork_compound(void) exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has huge page..."); - if (check_huge(p)) + if (check_huge(p, 1)) success("OK"); else fail("Fail"); @@ -922,29 +936,20 @@ static void collapse_fork_compound(void) munmap(p, hpage_pmd_size); } -static void collapse_max_ptes_shared() +static void collapse_max_ptes_shared(struct collapse_context *c) { int max_ptes_shared = read_num("khugepaged/max_ptes_shared"); int wstatus; void *p; - p = alloc_mapping(); - - printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p)) - success("OK"); - else - fail("Fail"); - + p = alloc_hpage(); printf("Share huge page over fork()..."); if (!fork()) { /* Do not touch settings on child exit */ skip_settings_restore = true; exit_status = 0; - if (check_huge(p)) + if (check_huge(p, 1)) success("OK"); else fail("Fail"); @@ -952,33 +957,27 @@ static void collapse_max_ptes_shared() printf("Trigger CoW on page %d of %d...", hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); - if (wait_for_scan("Do not collapse with max_ptes_shared exceeded", p)) - fail("Timeout"); - else if (!check_huge(p)) - success("OK"); - else - fail("Fail"); - - printf("Trigger CoW on page %d of %d...", - hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size); - if (!check_huge(p)) - success("OK"); - else - fail("Fail"); - - - if (wait_for_scan("Collapse with max_ptes_shared PTEs shared", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Maybe collapse with max_ptes_shared exceeded", p, + 1, !c->enforce_pte_scan_limits); + + if (c->enforce_pte_scan_limits) { + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * + page_size); + if (check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + c->collapse("Collapse with max_ptes_shared PTEs shared", + p, 1, true); + } validate_memory(p, 0, hpage_pmd_size); munmap(p, hpage_pmd_size); @@ -989,7 +988,7 @@ static void 
collapse_max_ptes_shared() exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has huge page..."); - if (check_huge(p)) + if (check_huge(p, 1)) success("OK"); else fail("Fail"); @@ -997,8 +996,52 @@ static void collapse_max_ptes_shared() munmap(p, hpage_pmd_size); } -int main(void) +static void madvise_collapse_existing_thps(void) { + void *p; + int err; + + p = alloc_mapping(1); + fill_memory(p, 0, hpage_pmd_size); + + printf("Collapse fully populated PTE table..."); + /* + * Note that we don't set MADV_HUGEPAGE here, which + * also tests that VM_HUGEPAGE isn't required for + * MADV_COLLAPSE in "madvise" mode. + */ + err = madvise(p, hpage_pmd_size, MADV_COLLAPSE); + if (err == 0 && check_huge(p, 1)) { + success("OK"); + printf("Re-collapse PMD-mapped hugepage"); + err = madvise(p, hpage_pmd_size, MADV_COLLAPSE); + if (err == 0 && check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + } else { + fail("Fail"); + } + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +int main(int argc, const char **argv) +{ + struct collapse_context c; + struct settings default_settings = { + .thp_enabled = THP_MADVISE, + .thp_defrag = THP_DEFRAG_ALWAYS, + .shmem_enabled = SHMEM_NEVER, + .use_zero_page = 0, + .khugepaged = { + .defrag = 1, + .alloc_sleep_millisecs = 10, + .scan_sleep_millisecs = 10, + }, + }; + const char *tests = argc == 1 ? "all" : argv[1]; + setbuf(stdout, NULL); page_size = getpagesize(); @@ -1011,21 +1054,47 @@ int main(void) default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; save_settings(); - adjust_settings(); + push_settings(&default_settings); alloc_at_fault(); - collapse_full(); - collapse_empty(); - collapse_single_pte_entry(); - collapse_max_ptes_none(); - collapse_swapin_single_pte(); - collapse_max_ptes_swap(); - collapse_single_pte_entry_compound(); - collapse_full_of_compound(); - collapse_compound_extreme(); - collapse_fork(); - collapse_fork_compound(); - collapse_max_ptes_shared(); + + if (!strcmp(tests, "khugepaged") || !strcmp(tests, "all")) { + printf("\n*** Testing context: khugepaged ***\n"); + c.collapse = &khugepaged_collapse; + c.enforce_pte_scan_limits = true; + + collapse_full(&c); + collapse_empty(&c); + collapse_single_pte_entry(&c); + collapse_max_ptes_none(&c); + collapse_swapin_single_pte(&c); + collapse_max_ptes_swap(&c); + collapse_single_pte_entry_compound(&c); + collapse_full_of_compound(&c); + collapse_compound_extreme(&c); + collapse_fork(&c); + collapse_fork_compound(&c); + collapse_max_ptes_shared(&c); + } + if (!strcmp(tests, "madvise") || !strcmp(tests, "all")) { + printf("\n*** Testing context: madvise ***\n"); + c.collapse = &madvise_collapse; + c.enforce_pte_scan_limits = false; + + collapse_full(&c); + collapse_empty(&c); + collapse_single_pte_entry(&c); + collapse_max_ptes_none(&c); + collapse_swapin_single_pte(&c); + collapse_max_ptes_swap(&c); + collapse_single_pte_entry_compound(&c); + collapse_full_of_compound(&c); + collapse_compound_extreme(&c); + collapse_fork(&c); + collapse_fork_compound(&c); + collapse_max_ptes_shared(&c); + madvise_collapse_existing_thps(); + } restore_settings(0); } diff --git a/tools/testing/selftests/vm/mrelease_test.c b/tools/testing/selftests/vm/mrelease_test.c index 96671c2f7d485..6c62966ab5dbc 100644 --- a/tools/testing/selftests/vm/mrelease_test.c +++ b/tools/testing/selftests/vm/mrelease_test.c @@ -62,19 +62,22 @@ static int alloc_noexit(unsigned long nr_pages, int pipefd) /* The process_mrelease calls in this test are expected to fail */ static void 
run_negative_tests(int pidfd) { + int res; /* Test invalid flags. Expect to fail with EINVAL error code. */ if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) || errno != EINVAL) { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); perror("process_mrelease with wrong flags"); - exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + exit(res); } /* * Test reaping while process is alive with no pending SIGKILL. * Expect to fail with EINVAL error code. */ if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); perror("process_mrelease on a live process"); - exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + exit(res); } } @@ -100,8 +103,9 @@ int main(void) /* Test a wrong pidfd */ if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); perror("process_mrelease with wrong pidfd"); - exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + exit(res); } /* Start the test with 1MB child memory allocation */ @@ -156,8 +160,9 @@ int main(void) run_negative_tests(pidfd); if (kill(pid, SIGKILL)) { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); perror("kill"); - exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + exit(res); } success = (syscall(__NR_process_mrelease, pidfd, 0) == 0); @@ -172,9 +177,10 @@ int main(void) if (errno == ESRCH) { retry = (size <= MAX_SIZE_MB); } else { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); perror("process_mrelease"); waitpid(pid, NULL, 0); - exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + exit(res); } } diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 2af563a9652e0..de86983b8a0f3 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -190,4 +190,6 @@ then run_test ./protection_keys_64 fi +run_test ./soft-dirty + exit $exitcode diff --git a/tools/testing/selftests/vm/soft-dirty.c b/tools/testing/selftests/vm/soft-dirty.c index 08ab62a4a9d07..e3a43f5d4fa2b 100644 --- a/tools/testing/selftests/vm/soft-dirty.c +++ b/tools/testing/selftests/vm/soft-dirty.c @@ -121,13 +121,76 @@ static void test_hugepage(int pagemap_fd, int pagesize) free(map); } +static void test_mprotect(int pagemap_fd, int pagesize, bool anon) +{ + const char *type[] = {"file", "anon"}; + const char *fname = "./soft-dirty-test-file"; + int test_fd; + char *map; + + if (anon) { + map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("anon mmap failed\n"); + } else { + test_fd = open(fname, O_RDWR | O_CREAT, 0644); + if (test_fd < 0) { + ksft_test_result_skip("Test %s open() file failed\n", __func__); + return; + } + unlink(fname); + ftruncate(test_fd, pagesize); + map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, + MAP_SHARED, test_fd, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("file mmap failed\n"); + } + + *map = 1; + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s-%s dirty bit of new written page\n", + __func__, type[anon]); + clear_softdirty(); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, + "Test %s-%s soft-dirty clear after clear_refs\n", + __func__, type[anon]); + mprotect(map, pagesize, PROT_READ); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, + "Test %s-%s soft-dirty clear after marking RO\n", + __func__, type[anon]); + mprotect(map, pagesize, PROT_READ|PROT_WRITE); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, + "Test %s-%s soft-dirty clear
after marking RW\n", + __func__, type[anon]); + *map = 2; + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s-%s soft-dirty after rewritten\n", + __func__, type[anon]); + + munmap(map, pagesize); + + if (!anon) + close(test_fd); +} + +static void test_mprotect_anon(int pagemap_fd, int pagesize) +{ + test_mprotect(pagemap_fd, pagesize, true); +} + +static void test_mprotect_file(int pagemap_fd, int pagesize) +{ + test_mprotect(pagemap_fd, pagesize, false); +} + int main(int argc, char **argv) { int pagemap_fd; int pagesize; ksft_print_header(); - ksft_set_plan(5); + ksft_set_plan(15); pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY); if (pagemap_fd < 0) @@ -138,6 +201,8 @@ int main(int argc, char **argv) test_simple(pagemap_fd, pagesize); test_vma_reuse(pagemap_fd, pagesize); test_hugepage(pagemap_fd, pagesize); + test_mprotect_anon(pagemap_fd, pagesize); + test_mprotect_file(pagemap_fd, pagesize); close(pagemap_fd); diff --git a/tools/testing/selftests/vm/va_128TBswitch.c b/tools/testing/selftests/vm/va_128TBswitch.c index da6ec3b53ea8d..1d20689898839 100644 --- a/tools/testing/selftests/vm/va_128TBswitch.c +++ b/tools/testing/selftests/vm/va_128TBswitch.c @@ -231,7 +231,7 @@ static struct testcase hugetlb_testcases[] = { static int run_test(struct testcase *test, int count) { void *p; - int i, ret = 0; + int i, ret = KSFT_PASS; for (i = 0; i < count; i++) { struct testcase *t = test + i; @@ -242,13 +242,13 @@ static int run_test(struct testcase *test, int count) if (p == MAP_FAILED) { printf("FAILED\n"); - ret = 1; + ret = KSFT_FAIL; continue; } if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) { printf("FAILED\n"); - ret = 1; + ret = KSFT_FAIL; } else { /* * Do a dereference of the address returned so that we catch @@ -280,7 +280,7 @@ int main(int argc, char **argv) int ret; if (!supported_arch()) - return 0; + return KSFT_SKIP; ret = run_test(testcases, ARRAY_SIZE(testcases)); if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 74c3dcecf64d9..ec2e67c85b849 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -470,23 +470,23 @@ static bool match_str_list(const char *str, char **list, int list_size) static bool is_need(char *buf) { - if ((filter & FILTER_UNRELEASE) && get_free_ts_nsec(buf) != 0) - return false; - if ((filter & FILTER_PID) && !match_num_list(get_pid(buf), fc.pids, fc.pids_size)) - return false; - if ((filter & FILTER_TGID) && - !match_num_list(get_tgid(buf), fc.tgids, fc.tgids_size)) - return false; - - char *comm = get_comm(buf); - - if ((filter & FILTER_COMM) && - !match_str_list(comm, fc.comms, fc.comms_size)) { - free(comm); - return false; - } + if ((filter & FILTER_UNRELEASE) && get_free_ts_nsec(buf) != 0) + return false; + if ((filter & FILTER_PID) && !match_num_list(get_pid(buf), fc.pids, fc.pids_size)) + return false; + if ((filter & FILTER_TGID) && + !match_num_list(get_tgid(buf), fc.tgids, fc.tgids_size)) + return false; + + char *comm = get_comm(buf); + + if ((filter & FILTER_COMM) && + !match_str_list(comm, fc.comms, fc.comms_size)) { free(comm); - return true; + return false; + } + free(comm); + return true; } static void add_list(char *buf, int len, char *ext_buf)