diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node index 8db67aa472f16..402af4b2b905f 100644 --- a/Documentation/ABI/stable/sysfs-devices-node +++ b/Documentation/ABI/stable/sysfs-devices-node @@ -182,3 +182,42 @@ Date: November 2021 Contact: Jarkko Sakkinen Description: The total amount of SGX physical memory in bytes. + +What: /sys/devices/system/node/nodeX/memory_failure/total +Date: January 2023 +Contact: Jiaqi Yan +Description: + The total number of raw poisoned pages (pages containing + corrupted data due to memory errors) on a NUMA node. + +What: /sys/devices/system/node/nodeX/memory_failure/ignored +Date: January 2023 +Contact: Jiaqi Yan +Description: + Of the raw poisoned pages on a NUMA node, how many pages are + ignored by memory error recovery attempt, usually because + support for this type of pages is unavailable, and kernel + gives up the recovery. + +What: /sys/devices/system/node/nodeX/memory_failure/failed +Date: January 2023 +Contact: Jiaqi Yan +Description: + Of the raw poisoned pages on a NUMA node, how many pages are + failed by memory error recovery attempt. This usually means + a key recovery operation failed. + +What: /sys/devices/system/node/nodeX/memory_failure/delayed +Date: January 2023 +Contact: Jiaqi Yan +Description: + Of the raw poisoned pages on a NUMA node, how many pages are + delayed by memory error recovery attempt. Delayed poisoned + pages usually will be retried by kernel. + +What: /sys/devices/system/node/nodeX/memory_failure/recovered +Date: January 2023 +Contact: Jiaqi Yan +Description: + Of the raw poisoned pages on a NUMA node, how many pages are + recovered by memory error recovery attempt. diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 3d82ca6a17ffd..9237d6a258971 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -279,14 +279,25 @@ The ``action`` file is for setting and getting what action you want to apply to memory regions having specific access pattern of the interest. The keywords that can be written to and read from the file and their meaning are as below. - - ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED`` - - ``cold``: Call ``madvise()`` for the region with ``MADV_COLD`` - - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT`` - - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE`` - - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` +Note that support of each action depends on the running DAMON operations set +`implementation `. + + - ``willneed``: Call ``madvise()`` for the region with ``MADV_WILLNEED``. + Supported by ``vaddr`` and ``fvaddr`` operations set. + - ``cold``: Call ``madvise()`` for the region with ``MADV_COLD``. + Supported by ``vaddr`` and ``fvaddr`` operations set. + - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT``. + Supported by ``vaddr``, ``fvaddr`` and ``paddr`` operations set. + - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``. + Supported by ``vaddr`` and ``fvaddr`` operations set. + - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``. + Supported by ``vaddr`` and ``fvaddr`` operations set. - ``lru_prio``: Prioritize the region on its LRU lists. + Supported by ``paddr`` operations set. - ``lru_deprio``: Deprioritize the region on its LRU lists. - - ``stat``: Do nothing but count the statistics + Supported by ``paddr`` operations set. + - ``stat``: Do nothing but count the statistics. + Supported by all operations sets. schemes//access_pattern/ --------------------------- @@ -388,8 +399,8 @@ pages of all memory cgroups except ``/having_care_already``.:: echo /having_care_already > 1/memcg_path echo N > 1/matching -Note that filters could be ignored depend on the running DAMON operations set -`implementation `. +Note that filters are currently supported only when ``paddr`` +`implementation ` is being used. .. _sysfs_schemes_stats: @@ -618,11 +629,15 @@ The ```` is a predefined integer for memory management actions, which DAMON will apply to the regions having the target access pattern. The supported numbers and their meanings are as below. - - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED`` - - 1: Call ``madvise()`` for the region with ``MADV_COLD`` - - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT`` - - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE`` - - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` + - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``. Ignored if + ``target`` is ``paddr``. + - 1: Call ``madvise()`` for the region with ``MADV_COLD``. Ignored if + ``target`` is ``paddr``. + - 2: Call ``madvise()`` for the region with ``MADV_PAGEOUT``. + - 3: Call ``madvise()`` for the region with ``MADV_HUGEPAGE``. Ignored if + ``target`` is ``paddr``. + - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``. Ignored if + ``target`` is ``paddr``. - 5: Do nothing but count the statistics Quota diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst index fb6ba2002a4b2..f160f9487a90f 100644 --- a/Documentation/admin-guide/mm/ksm.rst +++ b/Documentation/admin-guide/mm/ksm.rst @@ -173,6 +173,13 @@ stable_node_chains the number of KSM pages that hit the ``max_page_sharing`` limit stable_node_dups number of duplicated KSM pages +zero_pages_sharing + how many empty pages are sharing kernel zero page(s) instead of + with each other as it would happen normally. Only effective when + enabling ``use_zero_pages`` knob. + +When enabling ``use_zero_pages``, the sum of ``pages_sharing`` + +``zero_pages_sharing`` represents how much really saved by KSM. A high ratio of ``pages_sharing`` to ``pages_shared`` indicates good sharing, but a high ratio of ``pages_unshared`` to ``pages_sharing`` diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 129c2843e4f08..7a9ef7143e91d 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -453,9 +453,10 @@ this allows system administrators to override the kexec_load_disabled =================== -A toggle indicating if the ``kexec_load`` syscall has been disabled. -This value defaults to 0 (false: ``kexec_load`` enabled), but can be -set to 1 (true: ``kexec_load`` disabled). +A toggle indicating if the syscalls ``kexec_load`` and +``kexec_file_load`` have been disabled. +This value defaults to 0 (false: ``kexec_*load`` enabled), but can be +set to 1 (true: ``kexec_*load`` disabled). Once true, kexec can no longer be used, and the toggle cannot be set back to false. This allows a kexec image to be loaded before disabling the syscall, @@ -463,6 +464,24 @@ allowing a system to set up (and later use) an image without it being altered. Generally used together with the `modules_disabled`_ sysctl. +kexec_load_limit_panic +====================== + +This parameter specifies a limit to the number of times the syscalls +``kexec_load`` and ``kexec_file_load`` can be called with a crash +image. It can only be set with a more restrictive value than the +current one. + +== ====================================================== +-1 Unlimited calls to kexec. This is the default setting. +N Number of calls left. +== ====================================================== + +kexec_load_limit_reboot +======================= + +Similar functionality as ``kexec_load_limit_panic``, but for a normal +image. kptr_restrict ============= diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index facafbdecb952..9fb0b1080d3b8 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -55,18 +55,17 @@ flags the caller provides. The caller is required to pass in a non-null struct pages* array, and the function then pins pages by incrementing each by a special value: GUP_PIN_COUNTING_BIAS. -For compound pages, the GUP_PIN_COUNTING_BIAS scheme is not used. Instead, -an exact form of pin counting is achieved, by using the 2nd struct page -in the compound page. A new struct page field, compound_pincount, has -been added in order to support this. - -This approach for compound pages avoids the counting upper limit problems that -are discussed below. Those limitations would have been aggravated severely by -huge pages, because each tail page adds a refcount to the head page. And in -fact, testing revealed that, without a separate compound_pincount field, -page overflows were seen in some huge page stress tests. - -This also means that huge pages and compound pages do not suffer +For large folios, the GUP_PIN_COUNTING_BIAS scheme is not used. Instead, +the extra space available in the struct folio is used to store the +pincount directly. + +This approach for large folios avoids the counting upper limit problems +that are discussed below. Those limitations would have been aggravated +severely by huge pages, because each tail page adds a refcount to the +head page. And in fact, testing revealed that, without a separate pincount +field, refcount overflows were seen in some huge page stress tests. + +This also means that huge pages and large folios do not suffer from the false positives problem that is mentioned below.:: Function @@ -264,9 +263,9 @@ place.) Other diagnostics ================= -dump_page() has been enhanced slightly, to handle these new counting -fields, and to better report on compound pages in general. Specifically, -for compound pages, the exact (compound_pincount) pincount is reported. +dump_page() has been enhanced slightly to handle these new counting +fields, and to better report on large folios in general. Specifically, +for large folios, the exact pincount is reported. References ========== diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst index 5f6454b9dbd4d..08e420e109739 100644 --- a/Documentation/fault-injection/fault-injection.rst +++ b/Documentation/fault-injection/fault-injection.rst @@ -231,6 +231,71 @@ proc entries This feature is intended for systematic testing of faults in a single system call. See an example below. + +Error Injectable Functions +-------------------------- + +This part is for the kenrel developers considering to add a function to +ALLOW_ERROR_INJECTION() macro. + +Requirements for the Error Injectable Functions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Since the function-level error injection forcibly changes the code path +and returns an error even if the input and conditions are proper, this can +cause unexpected kernel crash if you allow error injection on the function +which is NOT error injectable. Thus, you (and reviewers) must ensure; + +- The function returns an error code if it fails, and the callers must check + it correctly (need to recover from it). + +- The function does not execute any code which can change any state before + the first error return. The state includes global or local, or input + variable. For example, clear output address storage (e.g. `*ret = NULL`), + increments/decrements counter, set a flag, preempt/irq disable or get + a lock (if those are recovered before returning error, that will be OK.) + +The first requirement is important, and it will result in that the release +(free objects) functions are usually harder to inject errors than allocate +functions. If errors of such release functions are not correctly handled +it will cause a memory leak easily (the caller will confuse that the object +has been released or corrupted.) + +The second one is for the caller which expects the function should always +does something. Thus if the function error injection skips whole of the +function, the expectation is betrayed and causes an unexpected error. + +Type of the Error Injectable Functions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Each error injectable functions will have the error type specified by the +ALLOW_ERROR_INJECTION() macro. You have to choose it carefully if you add +a new error injectable function. If the wrong error type is chosen, the +kernel may crash because it may not be able to handle the error. +There are 4 types of errors defined in include/asm-generic/error-injection.h + +EI_ETYPE_NULL + This function will return `NULL` if it fails. e.g. return an allocateed + object address. + +EI_ETYPE_ERRNO + This function will return an `-errno` error code if it fails. e.g. return + -EINVAL if the input is wrong. This will include the functions which will + return an address which encodes `-errno` by ERR_PTR() macro. + +EI_ETYPE_ERRNO_NULL + This function will return an `-errno` or `NULL` if it fails. If the caller + of this function checks the return value with IS_ERR_OR_NULL() macro, this + type will be appropriate. + +EI_ETYPE_TRUE + This function will return `true` (non-zero positive value) if it fails. + +If you specifies a wrong type, for example, EI_TYPE_ERRNO for the function +which returns an allocated object, it may cause a problem because the returned +value is not an object address and the caller can not access to the address. + + How to add new fault injection capability ----------------------------------------- diff --git a/Documentation/mm/balance.rst b/Documentation/mm/balance.rst index 6a1fadf3e1735..e38e9d83c1c72 100644 --- a/Documentation/mm/balance.rst +++ b/Documentation/mm/balance.rst @@ -6,7 +6,7 @@ Memory Balancing Started Jan 2000 by Kanoj Sarcar -Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as +Memory balancing is needed for !__GFP_HIGH and !__GFP_KSWAPD_RECLAIM as well as for non __GFP_IO allocations. The first reason why a caller may avoid reclaim is that the caller can not diff --git a/Documentation/mm/damon/index.rst b/Documentation/mm/damon/index.rst index 48c0bbff98b2f..5e0a505835005 100644 --- a/Documentation/mm/damon/index.rst +++ b/Documentation/mm/damon/index.rst @@ -4,8 +4,9 @@ DAMON: Data Access MONitor ========================== -DAMON is a data access monitoring framework subsystem for the Linux kernel. -The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it +DAMON is a Linux kernel subsystem that provides a framework for data access +monitoring and the monitoring results based system operations. The core +monitoring mechanisms of DAMON (refer to :doc:`design` for the detail) make it - *accurate* (the monitoring output is useful enough for DRAM level memory management; It might not appropriate for CPU Cache levels, though), @@ -14,12 +15,16 @@ The core mechanisms of DAMON (refer to :doc:`design` for the detail) make it - *scalable* (the upper-bound of the overhead is in constant range regardless of the size of target workloads). -Using this framework, therefore, the kernel's memory management mechanisms can -make advanced decisions. Experimental memory management optimization works -that incurring high data accesses monitoring overhead could implemented again. -In user space, meanwhile, users who have some special workloads can write -personalized applications for better understanding and optimizations of their -workloads and systems. +Using this framework, therefore, the kernel can operate system in an +access-aware fashion. Because the features are also exposed to the user space, +users who have special information about their workloads can write personalized +applications for better understanding and optimizations of their workloads and +systems. + +For easier development of such systems, DAMON provides a feature called DAMOS +(DAMon-based Operation Schemes) in addition to the monitoring. Using the +feature, DAMON users in both kernel and user spaces can do access-aware system +operations with no code but simple configurations. .. toctree:: :maxdepth: 2 @@ -27,3 +32,4 @@ workloads and systems. faq design api + maintainer-profile diff --git a/Documentation/mm/damon/maintainer-profile.rst b/Documentation/mm/damon/maintainer-profile.rst new file mode 100644 index 0000000000000..24a202f03de82 --- /dev/null +++ b/Documentation/mm/damon/maintainer-profile.rst @@ -0,0 +1,62 @@ +.. SPDX-License-Identifier: GPL-2.0 + +DAMON Maintainer Entry Profile +============================== + +The DAMON subsystem covers the files that listed in 'DATA ACCESS MONITOR' +section of 'MAINTAINERS' file. + +The mailing lists for the subsystem are damon@lists.linux.dev and +linux-mm@kvack.org. Patches should be made against the mm-unstable tree [1]_ +whenever possible and posted to the mailing lists. + +SCM Trees +--------- + +There are multiple Linux trees for DAMON development. Patches under +development or testing are queued in damon/next [2]_ by the DAMON maintainer. +Suffieicntly reviewed patches will be queued in mm-unstable [1]_ by the memory +management subsystem maintainer. After more sufficient tests, the patches will +be queued in mm-stable [3]_ , and finally pull-requested to the mainline by the +memory management subsystem maintainer. + +Note again the patches for review should be made against the mm-unstable +tree[1] whenever possible. damon/next is only for preview of others' works in +progress. + +Submit checklist addendum +------------------------- + +When making DAMON changes, you should do below. + +- Build changes related outputs including kernel and documents. +- Ensure the builds introduce no new errors or warnings. +- Run and ensure no new failures for DAMON selftests [4]_ and kunittests [5]_ . + +Further doing below and putting the results will be helpful. + +- Run damon-tests/corr [6]_ for normal changes. +- Run damon-tests/perf [7]_ for performance changes. + +Key cycle dates +--------------- + +Patches can be sent anytime. Key cycle dates of the mm-unstable[1] and +mm-stable[3] trees depend on the memory management subsystem maintainer. + +Review cadence +-------------- + +The DAMON maintainer does the work on the usual work hour (09:00 to 17:00, +Mon-Fri) in PST. The response to patches will occasionally be slow. Do not +hesitate to send a ping if you have not heard back within a week of sending a +patch. + + +.. [1] https://git.kernel.org/akpm/mm/h/mm-unstable +.. [2] https://git.kernel.org/sj/h/damon/next +.. [3] https://git.kernel.org/akpm/mm/h/mm-stable +.. [4] https://github.com/awslabs/damon-tests/blob/master/corr/run.sh#L49 +.. [5] https://github.com/awslabs/damon-tests/blob/master/corr/tests/kunit.sh +.. [6] https://github.com/awslabs/damon-tests/tree/master/corr +.. [7] https://github.com/awslabs/damon-tests/tree/master/perf diff --git a/Documentation/mm/highmem.rst b/Documentation/mm/highmem.rst index 0f731d9196b01..4503868b0865f 100644 --- a/Documentation/mm/highmem.rst +++ b/Documentation/mm/highmem.rst @@ -57,7 +57,8 @@ list shows them in order of preference of use. It can be invoked from any context (including interrupts) but the mappings can only be used in the context which acquired them. - This function should be preferred, where feasible, over all the others. + This function should always be used, whereas kmap_atomic() and kmap() have + been deprecated. These mappings are thread-local and CPU-local, meaning that the mapping can only be accessed from within this thread and the thread is bound to the @@ -82,7 +83,7 @@ list shows them in order of preference of use. for pages which are known to not come from ZONE_HIGHMEM. However, it is always safe to use kmap_local_page() / kunmap_local(). - While it is significantly faster than kmap(), for the higmem case it + While it is significantly faster than kmap(), for the highmem case it comes with restrictions about the pointers validity. Contrary to kmap() mappings, the local mappings are only valid in the context of the caller and cannot be handed to other contexts. This implies that users must @@ -100,10 +101,21 @@ list shows them in order of preference of use. (included in the "Functions" section) for details on how to manage nested mappings. -* kmap_atomic(). This permits a very short duration mapping of a single - page. Since the mapping is restricted to the CPU that issued it, it - performs well, but the issuing task is therefore required to stay on that - CPU until it has finished, lest some other task displace its mappings. +* kmap_atomic(). This function has been deprecated; use kmap_local_page(). + + NOTE: Conversions to kmap_local_page() must take care to follow the mapping + restrictions imposed on kmap_local_page(). Furthermore, the code between + calls to kmap_atomic() and kunmap_atomic() may implicitly depend on the side + effects of atomic mappings, i.e. disabling page faults or preemption, or both. + In that case, explicit calls to pagefault_disable() or preempt_disable() or + both must be made in conjunction with the use of kmap_local_page(). + + [Legacy documentation] + + This permits a very short duration mapping of a single page. Since the + mapping is restricted to the CPU that issued it, it performs well, but + the issuing task is therefore required to stay on that CPU until it has + finished, lest some other task displace its mappings. kmap_atomic() may also be used by interrupt contexts, since it does not sleep and the callers too may not sleep until after kunmap_atomic() is @@ -115,11 +127,20 @@ list shows them in order of preference of use. It is assumed that k[un]map_atomic() won't fail. -* kmap(). This should be used to make short duration mapping of a single - page with no restrictions on preemption or migration. It comes with an - overhead as mapping space is restricted and protected by a global lock - for synchronization. When mapping is no longer needed, the address that - the page was mapped to must be released with kunmap(). +* kmap(). This function has been deprecated; use kmap_local_page(). + + NOTE: Conversions to kmap_local_page() must take care to follow the mapping + restrictions imposed on kmap_local_page(). In particular, it is necessary to + make sure that the kernel virtual memory pointer is only valid in the thread + that obtained it. + + [Legacy documentation] + + This should be used to make short duration mapping of a single page with no + restrictions on preemption or migration. It comes with an overhead as mapping + space is restricted and protected by a global lock for synchronization. When + mapping is no longer needed, the address that the page was mapped to must be + released with kunmap(). Mapping changes must be propagated across all the CPUs. kmap() also requires global TLB invalidation when the kmap's pool wraps and it might diff --git a/Documentation/mm/hugetlbfs_reserv.rst b/Documentation/mm/hugetlbfs_reserv.rst index f143954e0d056..611728c49bff1 100644 --- a/Documentation/mm/hugetlbfs_reserv.rst +++ b/Documentation/mm/hugetlbfs_reserv.rst @@ -181,14 +181,14 @@ Consuming Reservations/Allocating a Huge Page Reservations are consumed when huge pages associated with the reservations are allocated and instantiated in the corresponding mapping. The allocation -is performed within the routine alloc_huge_page():: +is performed within the routine alloc_hugetlb_folio():: - struct page *alloc_huge_page(struct vm_area_struct *vma, + struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) -alloc_huge_page is passed a VMA pointer and a virtual address, so it can +alloc_hugetlb_folio is passed a VMA pointer and a virtual address, so it can consult the reservation map to determine if a reservation exists. In addition, -alloc_huge_page takes the argument avoid_reserve which indicates reserves +alloc_hugetlb_folio takes the argument avoid_reserve which indicates reserves should not be used even if it appears they have been set aside for the specified address. The avoid_reserve argument is most often used in the case of Copy on Write and Page Migration where additional copies of an existing @@ -208,7 +208,8 @@ a reservation for the allocation. After determining whether a reservation exists and can be used for the allocation, the routine dequeue_huge_page_vma() is called. This routine takes two arguments related to reservations: -- avoid_reserve, this is the same value/argument passed to alloc_huge_page() +- avoid_reserve, this is the same value/argument passed to + alloc_hugetlb_folio(). - chg, even though this argument is of type long only the values 0 or 1 are passed to dequeue_huge_page_vma. If the value is 0, it indicates a reservation exists (see the section "Memory Policy and Reservations" for @@ -233,9 +234,9 @@ the scope reservations. Even if a surplus page is allocated, the same reservation based adjustments as above will be made: SetPagePrivate(page) and resv_huge_pages--. -After obtaining a new huge page, (page)->private is set to the value of -the subpool associated with the page if it exists. This will be used for -subpool accounting when the page is freed. +After obtaining a new hugetlb folio, (folio)->_hugetlb_subpool is set to the +value of the subpool associated with the page if it exists. This will be used +for subpool accounting when the folio is freed. The routine vma_commit_reservation() is then called to adjust the reserve map based on the consumption of the reservation. In general, this involves @@ -246,8 +247,8 @@ was no reservation in a shared mapping or this was a private mapping a new entry must be created. It is possible that the reserve map could have been changed between the call -to vma_needs_reservation() at the beginning of alloc_huge_page() and the -call to vma_commit_reservation() after the page was allocated. This would +to vma_needs_reservation() at the beginning of alloc_hugetlb_folio() and the +call to vma_commit_reservation() after the folio was allocated. This would be possible if hugetlb_reserve_pages was called for the same page in a shared mapping. In such cases, the reservation count and subpool free page count will be off by one. This rare condition can be identified by comparing the diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst index d8f721f98868a..5f1f6ecbb79b9 100644 --- a/Documentation/mm/multigen_lru.rst +++ b/Documentation/mm/multigen_lru.rst @@ -141,9 +141,85 @@ loop has detected outlying refaults from the tier this page is in. To this end, the feedback loop uses the first tier as the baseline, for the reason stated earlier. +Working set protection +---------------------- +Each generation is timestamped at birth. If ``lru_gen_min_ttl`` is +set, an ``lruvec`` is protected from the eviction when its oldest +generation was born within ``lru_gen_min_ttl`` milliseconds. In other +words, it prevents the working set of ``lru_gen_min_ttl`` milliseconds +from getting evicted. The OOM killer is triggered if this working set +cannot be kept in memory. + +This time-based approach has the following advantages: + +1. It is easier to configure because it is agnostic to applications + and memory sizes. +2. It is more reliable because it is directly wired to the OOM killer. + +Rmap/PT walk feedback +--------------------- +Searching the rmap for PTEs mapping each page on an LRU list (to test +and clear the accessed bit) can be expensive because pages from +different VMAs (PA space) are not cache friendly to the rmap (VA +space). For workloads mostly using mapped pages, searching the rmap +can incur the highest CPU cost in the reclaim path. + +``lru_gen_look_around()`` exploits spatial locality to reduce the +trips into the rmap. It scans the adjacent PTEs of a young PTE and +promotes hot pages. If the scan was done cacheline efficiently, it +adds the PMD entry pointing to the PTE table to the Bloom filter. This +forms a feedback loop between the eviction and the aging. + +Bloom Filters +------------- +Bloom filters are a space and memory efficient data structure for set +membership test, i.e., test if an element is not in the set or may be +in the set. + +In the eviction path, specifically, in ``lru_gen_look_around()``, if a +PMD has a sufficient number of hot pages, its address is placed in the +filter. In the aging path, set membership means that the PTE range +will be scanned for young pages. + +Note that Bloom filters are probabilistic on set membership. If a test +is false positive, the cost is an additional scan of a range of PTEs, +which may yield hot pages anyway. Parameters of the filter itself can +control the false positive rate in the limit. + +Memcg LRU +--------- +An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs, +since each node and memcg combination has an LRU of folios (see +``mem_cgroup_lruvec()``). Its goal is to improve the scalability of +global reclaim, which is critical to system-wide memory overcommit in +data centers. Note that memcg LRU only applies to global reclaim. + +The basic structure of an memcg LRU can be understood by an analogy to +the active/inactive LRU (of folios): + +1. It has the young and the old (generations), i.e., the counterparts + to the active and the inactive; +2. The increment of ``max_seq`` triggers promotion, i.e., the + counterpart to activation; +3. Other events trigger similar operations, e.g., offlining an memcg + triggers demotion, i.e., the counterpart to deactivation. + +In terms of global reclaim, it has two distinct features: + +1. Sharding, which allows each thread to start at a random memcg (in + the old generation) and improves parallelism; +2. Eventual fairness, which allows direct reclaim to bail out at will + and reduces latency without affecting fairness over some time. + +In terms of traversing memcgs during global reclaim, it improves the +best-case complexity from O(n) to O(1) and does not affect the +worst-case complexity O(n). Therefore, on average, it has a sublinear +complexity. + Summary ------- -The multi-gen LRU can be disassembled into the following parts: +The multi-gen LRU (of folios) can be disassembled into the following +parts: * Generations * Rmap walks diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst index ec3dc5b042260..a9608fe516499 100644 --- a/Documentation/mm/transhuge.rst +++ b/Documentation/mm/transhuge.rst @@ -112,20 +112,20 @@ Refcounts and transparent huge pages Refcounting on THP is mostly consistent with refcounting on other compound pages: - - get_page()/put_page() and GUP operate on head page's ->_refcount. + - get_page()/put_page() and GUP operate on the folio->_refcount. - ->_refcount in tail pages is always zero: get_page_unless_zero() never succeeds on tail pages. - - map/unmap of PMD entry for the whole compound page increment/decrement - ->compound_mapcount, stored in the first tail page of the compound page; - and also increment/decrement ->subpages_mapcount (also in the first tail) - by COMPOUND_MAPPED when compound_mapcount goes from -1 to 0 or 0 to -1. + - map/unmap of a PMD entry for the whole THP increment/decrement + folio->_entire_mapcount and also increment/decrement + folio->_nr_pages_mapped by COMPOUND_MAPPED when _entire_mapcount + goes from -1 to 0 or 0 to -1. - - map/unmap of sub-pages with PTE entry increment/decrement ->_mapcount - on relevant sub-page of the compound page, and also increment/decrement - ->subpages_mapcount, stored in first tail page of the compound page, when - _mapcount goes from -1 to 0 or 0 to -1: counting sub-pages mapped by PTE. + - map/unmap of individual pages with PTE entry increment/decrement + page->_mapcount and also increment/decrement folio->_nr_pages_mapped + when page->_mapcount goes from -1 to 0 or 0 to -1 as this counts + the number of pages mapped by PTE. split_huge_page internally has to distribute the refcounts in the head page to the tail pages before clearing all PG_head/tail bits from the page @@ -153,8 +153,8 @@ clear where references should go after split: it will stay on the head page. Note that split_huge_pmd() doesn't have any limitations on refcounting: pmd can be split at any point and never fails. -Partial unmap and deferred_split_huge_page() -============================================ +Partial unmap and deferred_split_folio() +======================================== Unmapping part of THP (with munmap() or other way) is not going to free memory immediately. Instead, we detect that a subpage of THP is not in use @@ -166,6 +166,6 @@ the place where we can detect partial unmap. It also might be counterproductive since in many cases partial unmap happens during exit(2) if a THP crosses a VMA boundary. -The function deferred_split_huge_page() is used to queue a page for splitting. +The function deferred_split_folio() is used to queue a folio for splitting. The splitting itself will happen when we get memory pressure via shrinker interface. diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst index 4a0e158aa9ce1..53e59433497a8 100644 --- a/Documentation/mm/unevictable-lru.rst +++ b/Documentation/mm/unevictable-lru.rst @@ -12,7 +12,7 @@ Introduction This document describes the Linux memory manager's "Unevictable LRU" infrastructure and the use of this to manage several types of "unevictable" -pages. +folios. The document attempts to provide the overall rationale behind this mechanism and the rationale for some of the design decisions that drove the @@ -27,8 +27,8 @@ The Unevictable LRU =================== The Unevictable LRU facility adds an additional LRU list to track unevictable -pages and to hide these pages from vmscan. This mechanism is based on a patch -by Larry Woodman of Red Hat to address several scalability problems with page +folios and to hide these folios from vmscan. This mechanism is based on a patch +by Larry Woodman of Red Hat to address several scalability problems with folio reclaim in Linux. The problems have been observed at customer sites on large memory x86_64 systems. @@ -52,40 +52,41 @@ The infrastructure may also be able to handle other conditions that make pages unevictable, either by definition or by circumstance, in the future. -The Unevictable LRU Page List ------------------------------ +The Unevictable LRU Folio List +------------------------------ -The Unevictable LRU page list is a lie. It was never an LRU-ordered list, but a -companion to the LRU-ordered anonymous and file, active and inactive page lists; -and now it is not even a page list. But following familiar convention, here in -this document and in the source, we often imagine it as a fifth LRU page list. +The Unevictable LRU folio list is a lie. It was never an LRU-ordered +list, but a companion to the LRU-ordered anonymous and file, active and +inactive folio lists; and now it is not even a folio list. But following +familiar convention, here in this document and in the source, we often +imagine it as a fifth LRU folio list. The Unevictable LRU infrastructure consists of an additional, per-node, LRU list -called the "unevictable" list and an associated page flag, PG_unevictable, to -indicate that the page is being managed on the unevictable list. +called the "unevictable" list and an associated folio flag, PG_unevictable, to +indicate that the folio is being managed on the unevictable list. The PG_unevictable flag is analogous to, and mutually exclusive with, the -PG_active flag in that it indicates on which LRU list a page resides when +PG_active flag in that it indicates on which LRU list a folio resides when PG_lru is set. -The Unevictable LRU infrastructure maintains unevictable pages as if they were +The Unevictable LRU infrastructure maintains unevictable folios as if they were on an additional LRU list for a few reasons: - (1) We get to "treat unevictable pages just like we treat other pages in the + (1) We get to "treat unevictable folios just like we treat other folios in the system - which means we get to use the same code to manipulate them, the same code to isolate them (for migrate, etc.), the same code to keep track of the statistics, etc..." [Rik van Riel] - (2) We want to be able to migrate unevictable pages between nodes for memory + (2) We want to be able to migrate unevictable folios between nodes for memory defragmentation, workload management and memory hotplug. The Linux kernel - can only migrate pages that it can successfully isolate from the LRU + can only migrate folios that it can successfully isolate from the LRU lists (or "Movable" pages: outside of consideration here). If we were to - maintain pages elsewhere than on an LRU-like list, where they can be - detected by isolate_lru_page(), we would prevent their migration. + maintain folios elsewhere than on an LRU-like list, where they can be + detected by folio_isolate_lru(), we would prevent their migration. -The unevictable list does not differentiate between file-backed and anonymous, -swap-backed pages. This differentiation is only important while the pages are, -in fact, evictable. +The unevictable list does not differentiate between file-backed and +anonymous, swap-backed folios. This differentiation is only important +while the folios are, in fact, evictable. The unevictable list benefits from the "arrayification" of the per-node LRU lists and statistics originally proposed and posted by Christoph Lameter. @@ -158,7 +159,7 @@ These are currently used in three places in the kernel: Detecting Unevictable Pages --------------------------- -The function page_evictable() in mm/internal.h determines whether a page is +The function folio_evictable() in mm/internal.h determines whether a folio is evictable or not using the query function outlined above [see section :ref:`Marking address spaces unevictable `] to check the AS_UNEVICTABLE flag. @@ -167,7 +168,7 @@ For address spaces that are so marked after being populated (as SHM regions might be), the lock action (e.g. SHM_LOCK) can be lazy, and need not populate the page tables for the region as does, for example, mlock(), nor need it make any special effort to push any pages in the SHM_LOCK'd area to the unevictable -list. Instead, vmscan will do this if and when it encounters the pages during +list. Instead, vmscan will do this if and when it encounters the folios during a reclamation scan. On an unlock action (such as SHM_UNLOCK), the unlocker (e.g. shmctl()) must scan @@ -176,41 +177,43 @@ condition is keeping them unevictable. If an unevictable region is destroyed, the pages are also "rescued" from the unevictable list in the process of freeing them. -page_evictable() also checks for mlocked pages by testing an additional page -flag, PG_mlocked (as wrapped by PageMlocked()), which is set when a page is -faulted into a VM_LOCKED VMA, or found in a VMA being VM_LOCKED. +folio_evictable() also checks for mlocked folios by calling +folio_test_mlocked(), which is set when a folio is faulted into a +VM_LOCKED VMA, or found in a VMA being VM_LOCKED. -Vmscan's Handling of Unevictable Pages --------------------------------------- +Vmscan's Handling of Unevictable Folios +--------------------------------------- -If unevictable pages are culled in the fault path, or moved to the unevictable -list at mlock() or mmap() time, vmscan will not encounter the pages until they +If unevictable folios are culled in the fault path, or moved to the unevictable +list at mlock() or mmap() time, vmscan will not encounter the folios until they have become evictable again (via munlock() for example) and have been "rescued" from the unevictable list. However, there may be situations where we decide, -for the sake of expediency, to leave an unevictable page on one of the regular +for the sake of expediency, to leave an unevictable folio on one of the regular active/inactive LRU lists for vmscan to deal with. vmscan checks for such -pages in all of the shrink_{active|inactive|page}_list() functions and will -"cull" such pages that it encounters: that is, it diverts those pages to the +folios in all of the shrink_{active|inactive|page}_list() functions and will +"cull" such folios that it encounters: that is, it diverts those folios to the unevictable list for the memory cgroup and node being scanned. -There may be situations where a page is mapped into a VM_LOCKED VMA, but the -page is not marked as PG_mlocked. Such pages will make it all the way to -shrink_active_list() or shrink_page_list() where they will be detected when -vmscan walks the reverse map in folio_referenced() or try_to_unmap(). The page -is culled to the unevictable list when it is released by the shrinker. +There may be situations where a folio is mapped into a VM_LOCKED VMA, +but the folio does not have the mlocked flag set. Such folios will make +it all the way to shrink_active_list() or shrink_page_list() where they +will be detected when vmscan walks the reverse map in folio_referenced() +or try_to_unmap(). The folio is culled to the unevictable list when it +is released by the shrinker. -To "cull" an unevictable page, vmscan simply puts the page back on the LRU list -using putback_lru_page() - the inverse operation to isolate_lru_page() - after -dropping the page lock. Because the condition which makes the page unevictable -may change once the page is unlocked, __pagevec_lru_add_fn() will recheck the -unevictable state of a page before placing it on the unevictable list. +To "cull" an unevictable folio, vmscan simply puts the folio back on +the LRU list using folio_putback_lru() - the inverse operation to +folio_isolate_lru() - after dropping the folio lock. Because the +condition which makes the folio unevictable may change once the folio +is unlocked, __pagevec_lru_add_fn() will recheck the unevictable state +of a folio before placing it on the unevictable list. MLOCKED Pages ============= -The unevictable page list is also useful for mlock(), in addition to ramfs and +The unevictable folio list is also useful for mlock(), in addition to ramfs and SYSV SHM. Note that mlock() is only available in CONFIG_MMU=y situations; in NOMMU situations, all mappings are effectively mlocked. @@ -295,7 +298,7 @@ treated as a no-op and mlock_fixup() simply returns. If the VMA passes some filtering as described in "Filtering Special VMAs" below, mlock_fixup() will attempt to merge the VMA with its neighbors or split off a subset of the VMA if the range does not cover the entire VMA. Any pages -already present in the VMA are then marked as mlocked by mlock_page() via +already present in the VMA are then marked as mlocked by mlock_folio() via mlock_pte_range() via walk_page_range() via mlock_vma_pages_range(). Before returning from the system call, do_mlock() or mlockall() will call @@ -308,22 +311,22 @@ do end up getting faulted into this VM_LOCKED VMA, they will be handled in the fault path - which is also how mlock2()'s MLOCK_ONFAULT areas are handled. For each PTE (or PMD) being faulted into a VMA, the page add rmap function -calls mlock_vma_page(), which calls mlock_page() when the VMA is VM_LOCKED +calls mlock_vma_folio(), which calls mlock_folio() when the VMA is VM_LOCKED (unless it is a PTE mapping of a part of a transparent huge page). Or when -it is a newly allocated anonymous page, lru_cache_add_inactive_or_unevictable() -calls mlock_new_page() instead: similar to mlock_page(), but can make better +it is a newly allocated anonymous page, folio_add_lru_vma() calls +mlock_new_folio() instead: similar to mlock_folio(), but can make better judgments, since this page is held exclusively and known not to be on LRU yet. -mlock_page() sets PageMlocked immediately, then places the page on the CPU's -mlock pagevec, to batch up the rest of the work to be done under lru_lock by -__mlock_page(). __mlock_page() sets PageUnevictable, initializes mlock_count +mlock_folio() sets PG_mlocked immediately, then places the page on the CPU's +mlock folio batch, to batch up the rest of the work to be done under lru_lock by +__mlock_folio(). __mlock_folio() sets PG_unevictable, initializes mlock_count and moves the page to unevictable state ("the unevictable LRU", but with -mlock_count in place of LRU threading). Or if the page was already PageLRU -and PageUnevictable and PageMlocked, it simply increments the mlock_count. +mlock_count in place of LRU threading). Or if the page was already PG_lru +and PG_unevictable and PG_mlocked, it simply increments the mlock_count. But in practice that may not work ideally: the page may not yet be on an LRU, or it may have been temporarily isolated from LRU. In such cases the mlock_count -field cannot be touched, but will be set to 0 later when __pagevec_lru_add_fn() +field cannot be touched, but will be set to 0 later when __munlock_folio() returns the page to "LRU". Races prohibit mlock_count from being set to 1 then: rather than risk stranding a page indefinitely as unevictable, always err with mlock_count on the low side, so that when munlocked the page will be rescued to @@ -370,20 +373,21 @@ Because of the VMA filtering discussed above, VM_LOCKED will not be set in any "special" VMAs. So, those VMAs will be ignored for munlock. If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the -specified range. All pages in the VMA are then munlocked by munlock_page() via +specified range. All pages in the VMA are then munlocked by munlock_folio() via mlock_pte_range() via walk_page_range() via mlock_vma_pages_range() - the same function used when mlocking a VMA range, with new flags for the VMA indicating that it is munlock() being performed. -munlock_page() uses the mlock pagevec to batch up work to be done under -lru_lock by __munlock_page(). __munlock_page() decrements the page's -mlock_count, and when that reaches 0 it clears PageMlocked and clears -PageUnevictable, moving the page from unevictable state to inactive LRU. +munlock_folio() uses the mlock pagevec to batch up work to be done +under lru_lock by __munlock_folio(). __munlock_folio() decrements the +folio's mlock_count, and when that reaches 0 it clears the mlocked flag +and clears the unevictable flag, moving the folio from unevictable state +to the inactive LRU. -But in practice that may not work ideally: the page may not yet have reached +But in practice that may not work ideally: the folio may not yet have reached "the unevictable LRU", or it may have been temporarily isolated from it. In those cases its mlock_count field is unusable and must be assumed to be 0: so -that the page will be rescued to an evictable LRU, then perhaps be mlocked +that the folio will be rescued to an evictable LRU, then perhaps be mlocked again later if vmscan finds it in a VM_LOCKED VMA. @@ -410,7 +414,7 @@ However, since mlock_vma_pages_range() starts by setting VM_LOCKED on a VMA, before mlocking any pages already present, if one of those pages were migrated before mlock_pte_range() reached it, it would get counted twice in mlock_count. To prevent that, mlock_vma_pages_range() temporarily marks the VMA as VM_IO, -so that mlock_vma_page() will skip it. +so that mlock_vma_folio() will skip it. To complete page migration, we place the old and new pages back onto the LRU afterwards. The "unneeded" page - old page on success, new page on failure - @@ -483,18 +487,19 @@ Before the unevictable/mlock changes, mlocking did not mark the pages in any way, so unmapping them required no processing. For each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls -munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED +munlock_vma_folio(), which calls munlock_folio() when the VMA is VM_LOCKED (unless it was a PTE mapping of a part of a transparent huge page). -munlock_page() uses the mlock pagevec to batch up work to be done under -lru_lock by __munlock_page(). __munlock_page() decrements the page's -mlock_count, and when that reaches 0 it clears PageMlocked and clears -PageUnevictable, moving the page from unevictable state to inactive LRU. +munlock_folio() uses the mlock pagevec to batch up work to be done +under lru_lock by __munlock_folio(). __munlock_folio() decrements the +folio's mlock_count, and when that reaches 0 it clears the mlocked flag +and clears the unevictable flag, moving the folio from unevictable state +to the inactive LRU. -But in practice that may not work ideally: the page may not yet have reached +But in practice that may not work ideally: the folio may not yet have reached "the unevictable LRU", or it may have been temporarily isolated from it. In those cases its mlock_count field is unusable and must be assumed to be 0: so -that the page will be rescued to an evictable LRU, then perhaps be mlocked +that the folio will be rescued to an evictable LRU, then perhaps be mlocked again later if vmscan finds it in a VM_LOCKED VMA. @@ -507,7 +512,7 @@ which had been Copied-On-Write from the file pages now being truncated. Mlocked pages can be munlocked and deleted in this way: like with munmap(), for each PTE (or PMD) being unmapped from a VMA, page_remove_rmap() calls -munlock_vma_page(), which calls munlock_page() when the VMA is VM_LOCKED +munlock_vma_folio(), which calls munlock_folio() when the VMA is VM_LOCKED (unless it was a PTE mapping of a part of a transparent huge page). However, if there is a racing munlock(), since mlock_vma_pages_range() starts @@ -515,7 +520,7 @@ munlocking by clearing VM_LOCKED from a VMA, before munlocking all the pages present, if one of those pages were unmapped by truncation or hole punch before mlock_pte_range() reached it, it would not be recognized as mlocked by this VMA, and would not be counted out of mlock_count. In this rare case, a page may -still appear as PageMlocked after it has been fully unmapped: and it is left to +still appear as PG_mlocked after it has been fully unmapped: and it is left to release_pages() (or __page_cache_release()) to clear it and update statistics before freeing (this event is counted in /proc/vmstat unevictable_pgs_cleared, which is usually 0). @@ -527,7 +532,7 @@ Page Reclaim in shrink_*_list() vmscan's shrink_active_list() culls any obviously unevictable pages - i.e. !page_evictable(page) pages - diverting those to the unevictable list. However, shrink_active_list() only sees unevictable pages that made it onto the -active/inactive LRU lists. Note that these pages do not have PageUnevictable +active/inactive LRU lists. Note that these pages do not have PG_unevictable set - otherwise they would be on the unevictable list and shrink_active_list() would never see them. @@ -549,6 +554,6 @@ and node unevictable list. rmap's folio_referenced_one(), called via vmscan's shrink_active_list() or shrink_page_list(), and rmap's try_to_unmap_one() called via shrink_page_list(), -check for (3) pages still mapped into VM_LOCKED VMAs, and call mlock_vma_page() +check for (3) pages still mapped into VM_LOCKED VMAs, and call mlock_vma_folio() to correct them. Such pages are culled to the unevictable list when released by the shrinker. diff --git a/Documentation/mm/zsmalloc.rst b/Documentation/mm/zsmalloc.rst index 6e79893d61326..40323c9b39d88 100644 --- a/Documentation/mm/zsmalloc.rst +++ b/Documentation/mm/zsmalloc.rst @@ -80,3 +80,171 @@ Similarly, we assign zspage to: * ZS_ALMOST_FULL when n > N / f * ZS_EMPTY when n == 0 * ZS_FULL when n == N + + +Internals +========= + +zsmalloc has 255 size classes, each of which can hold a number of zspages. +Each zspage can contain up to ZSMALLOC_CHAIN_SIZE physical (0-order) pages. +The optimal zspage chain size for each size class is calculated during the +creation of the zsmalloc pool (see calculate_zspage_chain_size()). + +As an optimization, zsmalloc merges size classes that have similar +characteristics in terms of the number of pages per zspage and the number +of objects that each zspage can store. + +For instance, consider the following size classes::: + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + ... + 94 1536 0 0 0 0 0 3 0 + 100 1632 0 0 0 0 0 2 0 + ... + + +Size classes #95-99 are merged with size class #100. This means that when we +need to store an object of size, say, 1568 bytes, we end up using size class +#100 instead of size class #96. Size class #100 is meant for objects of size +1632 bytes, so each object of size 1568 bytes wastes 1632-1568=64 bytes. + +Size class #100 consists of zspages with 2 physical pages each, which can +hold a total of 5 objects. If we need to store 13 objects of size 1568, we +end up allocating three zspages, or 6 physical pages. + +However, if we take a closer look at size class #96 (which is meant for +objects of size 1568 bytes) and trace `calculate_zspage_chain_size()`, we +find that the most optimal zspage configuration for this class is a chain +of 5 physical pages::: + + pages per zspage wasted bytes used% + 1 960 76 + 2 352 95 + 3 1312 89 + 4 704 95 + 5 96 99 + +This means that a class #96 configuration with 5 physical pages can store 13 +objects of size 1568 in a single zspage, using a total of 5 physical pages. +This is more efficient than the class #100 configuration, which would use 6 +physical pages to store the same number of objects. + +As the zspage chain size for class #96 increases, its key characteristics +such as pages per-zspage and objects per-zspage also change. This leads to +dewer class mergers, resulting in a more compact grouping of classes, which +reduces memory wastage. + +Let's take a closer look at the bottom of `/sys/kernel/debug/zsmalloc/zramX/classes`::: + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + ... + 202 3264 0 0 0 0 0 4 0 + 254 4096 0 0 0 0 0 1 0 + ... + +Size class #202 stores objects of size 3264 bytes and has a maximum of 4 pages +per zspage. Any object larger than 3264 bytes is considered huge and belongs +to size class #254, which stores each object in its own physical page (objects +in huge classes do not share pages). + +Increasing the size of the chain of zspages also results in a higher watermark +for the huge size class and fewer huge classes overall. This allows for more +efficient storage of large objects. + +For zspage chain size of 8, huge class watermark becomes 3632 bytes::: + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + ... + 202 3264 0 0 0 0 0 4 0 + 211 3408 0 0 0 0 0 5 0 + 217 3504 0 0 0 0 0 6 0 + 222 3584 0 0 0 0 0 7 0 + 225 3632 0 0 0 0 0 8 0 + 254 4096 0 0 0 0 0 1 0 + ... + +For zspage chain size of 16, huge class watermark becomes 3840 bytes::: + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + ... + 202 3264 0 0 0 0 0 4 0 + 206 3328 0 0 0 0 0 13 0 + 207 3344 0 0 0 0 0 9 0 + 208 3360 0 0 0 0 0 14 0 + 211 3408 0 0 0 0 0 5 0 + 212 3424 0 0 0 0 0 16 0 + 214 3456 0 0 0 0 0 11 0 + 217 3504 0 0 0 0 0 6 0 + 219 3536 0 0 0 0 0 13 0 + 222 3584 0 0 0 0 0 7 0 + 223 3600 0 0 0 0 0 15 0 + 225 3632 0 0 0 0 0 8 0 + 228 3680 0 0 0 0 0 9 0 + 230 3712 0 0 0 0 0 10 0 + 232 3744 0 0 0 0 0 11 0 + 234 3776 0 0 0 0 0 12 0 + 235 3792 0 0 0 0 0 13 0 + 236 3808 0 0 0 0 0 14 0 + 238 3840 0 0 0 0 0 15 0 + 254 4096 0 0 0 0 0 1 0 + ... + +Overall the combined zspage chain size effect on zsmalloc pool configuration::: + + pages per zspage number of size classes (clusters) huge size class watermark + 4 69 3264 + 5 86 3408 + 6 93 3504 + 7 112 3584 + 8 123 3632 + 9 140 3680 + 10 143 3712 + 11 159 3744 + 12 164 3776 + 13 180 3792 + 14 183 3808 + 15 188 3840 + 16 191 3840 + + +A synthetic test +---------------- + +zram as a build artifacts storage (Linux kernel compilation). + +* `CONFIG_ZSMALLOC_CHAIN_SIZE=4` + + zsmalloc classes stats::: + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + ... + Total 13 51 413836 412973 159955 3 + + zram mm_stat::: + + 1691783168 628083717 655175680 0 655175680 60 0 34048 34049 + + +* `CONFIG_ZSMALLOC_CHAIN_SIZE=8` + + zsmalloc classes stats::: + + class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable + ... + Total 18 87 414852 412978 156666 0 + + zram mm_stat::: + + 1691803648 627793930 641703936 0 641703936 60 0 33591 33591 + +Using larger zspage chains may result in using fewer physical pages, as seen +in the example where the number of physical pages used decreased from 159955 +to 156666, at the same time maximum zsmalloc pool memory usage went down from +655175680 to 641703936 bytes. + +However, this advantage may be offset by the potential for increased system +memory pressure (as some zspages have larger chain sizes) in cases where there +is heavy internal fragmentation and zspool compaction is unable to relocate +objects and release zspages. In these cases, it is recommended to decrease +the limit on the size of the zspage chains (as specified by the +CONFIG_ZSMALLOC_CHAIN_SIZE option). diff --git a/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst index 752e5696cd476..826a50c47389e 100644 --- a/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst +++ b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst @@ -142,14 +142,14 @@ HPAGE_RESV_OWNER标志被设置,以表明该VMA拥有预留。 消耗预留/分配一个巨页 =========================== -当与预留相关的巨页在相应的映射中被分配和实例化时,预留就被消耗了。该分配是在函数alloc_huge_page() +当与预留相关的巨页在相应的映射中被分配和实例化时,预留就被消耗了。该分配是在函数alloc_hugetlb_folio() 中进行的:: - struct page *alloc_huge_page(struct vm_area_struct *vma, + struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) -alloc_huge_page被传递给一个VMA指针和一个虚拟地址,因此它可以查阅预留映射以确定是否存在预留。 -此外,alloc_huge_page需要一个参数avoid_reserve,该参数表示即使看起来已经为指定的地址预留了 +alloc_hugetlb_folio被传递给一个VMA指针和一个虚拟地址,因此它可以查阅预留映射以确定是否存在预留。 +此外,alloc_hugetlb_folio需要一个参数avoid_reserve,该参数表示即使看起来已经为指定的地址预留了 预留,也不应该使用预留。avoid_reserve参数最常被用于写时拷贝和页面迁移的情况下,即现有页面的额 外拷贝被分配。 @@ -162,7 +162,7 @@ vma_needs_reservation()返回的值通常为0或1。如果该地址存在预留 确定预留是否存在并可用于分配后,调用dequeue_huge_page_vma()函数。这个函数需要两个与预留有关 的参数: -- avoid_reserve,这是传递给alloc_huge_page()的同一个值/参数。 +- avoid_reserve,这是传递给alloc_hugetlb_folio()的同一个值/参数。 - chg,尽管这个参数的类型是long,但只有0或1的值被传递给dequeue_huge_page_vma。如果该值为0, 则表明存在预留(关于可能的问题,请参见 “预留和内存策略” 一节)。如果值 为1,则表示不存在预留,如果可能的话,必须从全局空闲池中取出该页。 @@ -179,7 +179,7 @@ free_huge_pages的值被递减。如果有一个与该页相关的预留,将 的剩余巨页和超额分配的问题。即使分配了一个多余的页面,也会进行与上面一样的基于预留的调整: SetPagePrivate(page) 和 resv_huge_pages--. -在获得一个新的巨页后,(page)->private被设置为与该页面相关的子池的值,如果它存在的话。当页 +在获得一个新的巨页后,(folio)->_hugetlb_subpool被设置为与该页面相关的子池的值,如果它存在的话。当页 面被释放时,这将被用于子池的计数。 然后调用函数vma_commit_reservation(),根据预留的消耗情况调整预留映射。一般来说,这涉及 @@ -199,7 +199,7 @@ SetPagePrivate(page)和resv_huge_pages-。 已经存在,所以不做任何改变。然而,如果共享映射中没有预留,或者这是一个私有映射,则必须创建 一个新的条目。 -在alloc_huge_page()开始调用vma_needs_reservation()和页面分配后调用 +在alloc_hugetlb_folio()开始调用vma_needs_reservation()和页面分配后调用 vma_commit_reservation()之间,预留映射有可能被改变。如果hugetlb_reserve_pages在共 享映射中为同一页面被调用,这将是可能的。在这种情况下,预留计数和子池空闲页计数会有一个偏差。 这种罕见的情况可以通过比较vma_needs_reservation和vma_commit_reservation的返回值来 diff --git a/MAINTAINERS b/MAINTAINERS index 4398f1c8ce9e1..abed40db41f06 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5643,6 +5643,11 @@ M: SeongJae Park L: damon@lists.linux.dev L: linux-mm@kvack.org S: Maintained +W: https://damonitor.github.io +P: Documentation/mm/damon/maintainer-profile.rst +T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +T: quilt git://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new +T: git git://git.kernel.org/pub/scm/linux/kernel/git/sj/linux.git damon/next F: Documentation/ABI/testing/sysfs-kernel-mm-damon F: Documentation/admin-guide/mm/damon/ F: Documentation/mm/damon/ diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h index 8f3f5eecba28b..4db1ebc0ed99c 100644 --- a/arch/alpha/include/asm/page.h +++ b/arch/alpha/include/asm/page.h @@ -17,9 +17,8 @@ extern void clear_page(void *page); #define clear_user_page(page, vaddr, pg) clear_page(page) -#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) extern void copy_page(void * _to, void * _from); #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) @@ -87,10 +86,6 @@ typedef struct page *pgtable_t; #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define virt_addr_valid(kaddr) pfn_valid((__pa(kaddr) >> PAGE_SHIFT)) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif /* CONFIG_FLATMEM */ - #include #include diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 9e45f6735d5d2..ba43cb841d19c 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -74,6 +74,9 @@ struct vm_area_struct; #define _PAGE_DIRTY 0x20000 #define _PAGE_ACCESSED 0x40000 +/* We borrow bit 39 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x8000000000UL + /* * NOTE! The "accessed" bit isn't necessarily exact: it can be kept exactly * by software (use the KRE/URE/KWE/UWE bits appropriately), but I'll fake it. @@ -301,18 +304,47 @@ extern inline void update_mmu_cache(struct vm_area_struct * vma, } /* - * Non-present pages: high 24 bits are offset, next 8 bits type, - * low 32 bits zero. + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * <------------------- offset ------------------> E <--- type --> + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <--------------------------- zeroes --------------------------> + * + * E is the exclusive marker that is not stored in swap entries. */ extern inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) -{ pte_t pte; pte_val(pte) = (type << 32) | (offset << 40); return pte; } +{ pte_t pte; pte_val(pte) = ((type & 0x7f) << 32) | (offset << 40); return pte; } -#define __swp_type(x) (((x).val >> 32) & 0xff) +#define __swp_type(x) (((x).val >> 32) & 0x7f) #define __swp_offset(x) ((x).val >> 40) #define __swp_entry(type, off) ((swp_entry_t) { pte_val(mk_swap_pte((type), (off))) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + #define pte_ERROR(e) \ printk("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e)) #define pmd_ERROR(e) \ diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index ce20c31828a01..0eddd22c6212f 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -73,7 +73,7 @@ struct halt_info { static void common_shutdown_1(void *generic_ptr) { - struct halt_info *how = (struct halt_info *)generic_ptr; + struct halt_info *how = generic_ptr; struct percpu_struct *cpup; unsigned long *pflags, flags; int cpuid = smp_processor_id(); diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index f4e20f75438f8..0ede4b044e869 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -628,7 +628,7 @@ flush_tlb_all(void) static void ipi_flush_tlb_mm(void *x) { - struct mm_struct *mm = (struct mm_struct *) x; + struct mm_struct *mm = x; if (mm == current->active_mm && !asn_locked()) flush_tlb_current(mm); else @@ -670,7 +670,7 @@ struct flush_tlb_page_struct { static void ipi_flush_tlb_page(void *x) { - struct flush_tlb_page_struct *data = (struct flush_tlb_page_struct *)x; + struct flush_tlb_page_struct *data = x; struct mm_struct * mm = data->mm; if (mm == current->active_mm && !asn_locked()) diff --git a/arch/arc/include/asm/page.h b/arch/arc/include/asm/page.h index 9a62e1d87967f..e43fe27ec54d5 100644 --- a/arch/arc/include/asm/page.h +++ b/arch/arc/include/asm/page.h @@ -109,7 +109,6 @@ extern int pfn_valid(unsigned long pfn); #else /* CONFIG_HIGHMEM */ #define ARCH_PFN_OFFSET virt_to_pfn(CONFIG_LINUX_RAM_BASE) -#define pfn_valid(pfn) (((pfn) - ARCH_PFN_OFFSET) < max_mapnr) #endif /* CONFIG_HIGHMEM */ diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h index 515e82db519fe..6e9f8ca6d6a16 100644 --- a/arch/arc/include/asm/pgtable-bits-arcv2.h +++ b/arch/arc/include/asm/pgtable-bits-arcv2.h @@ -26,6 +26,9 @@ #define _PAGE_GLOBAL (1 << 8) /* ASID agnostic (H) */ #define _PAGE_PRESENT (1 << 9) /* PTE/TLB Valid (H) */ +/* We borrow bit 5 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_DIRTY + #ifdef CONFIG_ARC_MMU_V4 #define _PAGE_HW_SZ (1 << 10) /* Normal/super (H) */ #else @@ -106,9 +109,18 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); -/* Encode swap {type,off} tuple into PTE - * We reserve 13 bits for 5-bit @type, keeping bits 12-5 zero, ensuring that - * PAGE_PRESENT is zero in a PTE holding swap "identifier" +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <-------------- offset -------------> <--- zero --> E < type -> + * + * E is the exclusive marker that is not stored in swap entries. + * The zero'ed bits include _PAGE_PRESENT. */ #define __swp_entry(type, off) ((swp_entry_t) \ { ((type) & 0x1f) | ((off) << 13) }) @@ -120,6 +132,14 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +PTE_BIT_FUNC(swp_mkexclusive, |= (_PAGE_SWP_EXCLUSIVE)); +PTE_BIT_FUNC(swp_clear_exclusive, &= ~(_PAGE_SWP_EXCLUSIVE)); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #include #endif diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index d8eef4bd8c711..62e9df0244457 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -386,6 +386,4 @@ static inline unsigned long __virt_to_idmap(unsigned long x) #endif -#include - #endif diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h index 5fcc8a600e36d..74bb5947b387a 100644 --- a/arch/arm/include/asm/page.h +++ b/arch/arm/include/asm/page.h @@ -158,6 +158,7 @@ typedef struct page *pgtable_t; #ifdef CONFIG_HAVE_ARCH_PFN_VALID extern int pfn_valid(unsigned long); +#define pfn_valid pfn_valid #endif #include @@ -167,5 +168,6 @@ extern int pfn_valid(unsigned long); #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC #include +#include #endif diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h index 92abd4cd8ca2d..ce543cd9380cd 100644 --- a/arch/arm/include/asm/pgtable-2level.h +++ b/arch/arm/include/asm/pgtable-2level.h @@ -126,6 +126,9 @@ #define L_PTE_SHARED (_AT(pteval_t, 1) << 10) /* shared(v6), coherent(xsc3) */ #define L_PTE_NONE (_AT(pteval_t, 1) << 11) +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define L_PTE_SWP_EXCLUSIVE L_PTE_RDONLY + /* * These are the memory types, defined to be compatible with * pre-ARMv6 CPUs cacheable and bufferable bits: n/a,n/a,C,B diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index eabe72ff73815..1060497915000 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -76,6 +76,9 @@ #define L_PTE_NONE (_AT(pteval_t, 1) << 57) /* PROT_NONE */ #define L_PTE_RDONLY (_AT(pteval_t, 1) << 58) /* READ ONLY */ +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define L_PTE_SWP_EXCLUSIVE (_AT(pteval_t, 1) << 7) + #define L_PMD_SECT_VALID (_AT(pmdval_t, 1) << 0) #define L_PMD_SECT_DIRTY (_AT(pmdval_t, 1) << 55) #define L_PMD_SECT_NONE (_AT(pmdval_t, 1) << 57) diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index f049072b2e858..2e626e6da9a34 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -271,27 +271,47 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) } /* - * Encode and decode a swap entry. Swap entries are stored in the Linux - * page tables as follows: + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: * * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 - * <--------------- offset ------------------------> < type -> 0 0 + * <------------------- offset ------------------> E < type -> 0 0 + * + * E is the exclusive marker that is not stored in swap entries. * - * This gives us up to 31 swap files and 128GB per swap file. Note that + * This gives us up to 31 swap files and 64GB per swap file. Note that * the offset field is always non-zero. */ #define __SWP_TYPE_SHIFT 2 #define __SWP_TYPE_BITS 5 #define __SWP_TYPE_MASK ((1 << __SWP_TYPE_BITS) - 1) -#define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT) +#define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT + 1) #define __swp_type(x) (((x).val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK) #define __swp_offset(x) ((x).val >> __SWP_OFFSET_SHIFT) -#define __swp_entry(type,offset) ((swp_entry_t) { ((type) << __SWP_TYPE_SHIFT) | ((offset) << __SWP_OFFSET_SHIFT) }) +#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & __SWP_TYPE_BITS) << __SWP_TYPE_SHIFT) | \ + ((offset) << __SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -#define __swp_entry_to_pte(swp) __pte((swp).val | PTE_TYPE_FAULT) +#define __swp_entry_to_pte(swp) __pte((swp).val) + +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_isset(pte, L_PTE_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return set_pte_bit(pte, __pgprot(L_PTE_SWP_EXCLUSIVE)); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return clear_pte_bit(pte, __pgprot(L_PTE_SWP_EXCLUSIVE)); +} /* * It is an error for the kernel to have more swap files than we can diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index c81e7be2b4eab..0e8ff85890ade 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -315,7 +315,7 @@ static int __init gate_vma_init(void) gate_vma.vm_page_prot = PAGE_READONLY_EXEC; gate_vma.vm_start = 0xffff0000; gate_vma.vm_end = 0xffff0000 + PAGE_SIZE; - gate_vma.vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC; + vm_flags_init(&gate_vma, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC); return 0; } arch_initcall(gate_vma_init); diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 993a27ea6f543..2312e6ee595fd 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -29,9 +29,9 @@ void copy_user_highpage(struct page *to, struct page *from, void copy_highpage(struct page *to, struct page *from); #define __HAVE_ARCH_COPY_HIGHPAGE -struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, +struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr); -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#define vma_alloc_zeroed_movable_folio vma_alloc_zeroed_movable_folio void tag_clear_highpage(struct page *to); #define __HAVE_ARCH_TAG_CLEAR_HIGHPAGE diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 27455bfd64bc3..b6ba466e2e8a3 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -421,7 +421,6 @@ static inline pgprot_t mk_pmd_sect_prot(pgprot_t prot) return __pgprot((pgprot_val(prot) & ~PMD_TABLE_BIT) | PMD_TYPE_SECT); } -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline pte_t pte_swp_mkexclusive(pte_t pte) { return set_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE)); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 596f46dabe4ef..f4cb0f85ccf49 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -925,7 +925,7 @@ NOKPROBE_SYMBOL(do_debug_exception); /* * Used during anonymous page fault handling. */ -struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, +struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO; @@ -938,7 +938,7 @@ struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, if (vma->vm_flags & VM_MTE) flags |= __GFP_ZEROTAGS; - return alloc_page_vma(flags, vma, vaddr); + return vma_alloc_folio(flags, 0, vma, vaddr, false); } void tag_clear_highpage(struct page *page) diff --git a/arch/csky/abiv1/inc/abi/pgtable-bits.h b/arch/csky/abiv1/inc/abi/pgtable-bits.h index 752c8b3f91945..ae7a2f76dd424 100644 --- a/arch/csky/abiv1/inc/abi/pgtable-bits.h +++ b/arch/csky/abiv1/inc/abi/pgtable-bits.h @@ -10,6 +10,9 @@ #define _PAGE_ACCESSED (1<<3) #define _PAGE_MODIFIED (1<<4) +/* We borrow bit 9 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1<<9) + /* implemented in hardware */ #define _PAGE_GLOBAL (1<<6) #define _PAGE_VALID (1<<7) @@ -26,7 +29,8 @@ #define _PAGE_PROT_NONE _PAGE_READ /* - * Encode and decode a swap entry + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). * * Format of swap PTE: * bit 0: _PAGE_PRESENT (zero) @@ -35,15 +39,16 @@ * bit 6: _PAGE_GLOBAL (zero) * bit 7: _PAGE_VALID (zero) * bit 8: swap type[4] - * bit 9 - 31: swap offset + * bit 9: exclusive marker + * bit 10 - 31: swap offset */ #define __swp_type(x) ((((x).val >> 2) & 0xf) | \ (((x).val >> 4) & 0x10)) -#define __swp_offset(x) ((x).val >> 9) +#define __swp_offset(x) ((x).val >> 10) #define __swp_entry(type, offset) ((swp_entry_t) { \ ((type & 0xf) << 2) | \ ((type & 0x10) << 4) | \ - ((offset) << 9)}) + ((offset) << 10)}) #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/csky/abiv2/inc/abi/pgtable-bits.h b/arch/csky/abiv2/inc/abi/pgtable-bits.h index 7e7f389f546fb..526152bd21562 100644 --- a/arch/csky/abiv2/inc/abi/pgtable-bits.h +++ b/arch/csky/abiv2/inc/abi/pgtable-bits.h @@ -10,6 +10,9 @@ #define _PAGE_PRESENT (1<<10) #define _PAGE_MODIFIED (1<<11) +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1<<7) + /* implemented in hardware */ #define _PAGE_GLOBAL (1<<0) #define _PAGE_VALID (1<<1) @@ -26,23 +29,25 @@ #define _PAGE_PROT_NONE _PAGE_WRITE /* - * Encode and decode a swap entry + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). * * Format of swap PTE: * bit 0: _PAGE_GLOBAL (zero) * bit 1: _PAGE_VALID (zero) * bit 2 - 6: swap type - * bit 7 - 8: swap offset[0 - 1] + * bit 7: exclusive marker + * bit 8: swap offset[0] * bit 9: _PAGE_WRITE (zero) * bit 10: _PAGE_PRESENT (zero) - * bit 11 - 31: swap offset[2 - 22] + * bit 11 - 31: swap offset[1 - 21] */ #define __swp_type(x) (((x).val >> 2) & 0x1f) -#define __swp_offset(x) ((((x).val >> 7) & 0x3) | \ - (((x).val >> 9) & 0x7ffffc)) +#define __swp_offset(x) ((((x).val >> 8) & 0x1) | \ + (((x).val >> 10) & 0x3ffffe)) #define __swp_entry(type, offset) ((swp_entry_t) { \ ((type & 0x1f) << 2) | \ - ((offset & 0x3) << 7) | \ - ((offset & 0x7ffffc) << 9)}) + ((offset & 0x1) << 8) | \ + ((offset & 0x3ffffe) << 10)}) #endif /* __ASM_CSKY_PGTABLE_BITS_H */ diff --git a/arch/csky/include/asm/page.h b/arch/csky/include/asm/page.h index ed7451478b1b0..b23e3006a9e06 100644 --- a/arch/csky/include/asm/page.h +++ b/arch/csky/include/asm/page.h @@ -39,7 +39,6 @@ #define virt_addr_valid(kaddr) ((void *)(kaddr) >= (void *)PAGE_OFFSET && \ (void *)(kaddr) < high_memory) -#define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && ((pfn) - ARCH_PFN_OFFSET) < max_mapnr) extern void *memset(void *dest, int c, size_t l); extern void *memcpy(void *to, const void *from, size_t l); diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index 77bc6caff2d23..d4042495febc0 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -200,6 +200,23 @@ static inline pte_t pte_mkyoung(pte_t pte) return pte; } +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + #define __HAVE_PHYS_MEM_ACCESS_PROT struct file; extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, diff --git a/arch/hexagon/include/asm/page.h b/arch/hexagon/include/asm/page.h index d7d4f9fca3279..9c03b9965f07a 100644 --- a/arch/hexagon/include/asm/page.h +++ b/arch/hexagon/include/asm/page.h @@ -95,7 +95,6 @@ struct page; /* Default vm area behavior is non-executable. */ #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_NON_EXEC -#define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) /* Need to not use a define for linesize; may move this to another file. */ diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index f7048c18b6f97..59393613d0862 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -61,6 +61,9 @@ extern unsigned long empty_zero_page; * So we'll put up with a bit of inefficiency for now... */ +/* We borrow bit 6 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1<<6) + /* * Top "FOURTH" level (pgd), which for the Hexagon VM is really * only the second from the bottom, pgd and pud both being collapsed. @@ -359,9 +362,12 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define ZERO_PAGE(vaddr) (virt_to_page(&empty_zero_page)) /* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * * Swap/file PTE definitions. If _PAGE_PRESENT is zero, the rest of the PTE is * interpreted as swap information. The remaining free bits are interpreted as - * swap type/offset tuple. Rather than have the TLB fill handler test + * listed below. Rather than have the TLB fill handler test * _PAGE_PRESENT, we're going to reserve the permissions bits and set them to * all zeros for swap entries, which speeds up the miss handler at the cost of * 3 bits of offset. That trade-off can be revisited if necessary, but Hexagon @@ -371,9 +377,10 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * Format of swap PTE: * bit 0: Present (zero) * bits 1-5: swap type (arch independent layer uses 5 bits max) - * bits 6-9: bits 3:0 of offset + * bit 6: exclusive marker + * bits 7-9: bits 2:0 of offset * bits 10-12: effectively _PAGE_PROTNONE (all zero) - * bits 13-31: bits 22:4 of swap offset + * bits 13-31: bits 21:3 of swap offset * * The split offset makes some of the following macros a little gnarly, * but there's plenty of precedent for this sort of thing. @@ -383,11 +390,28 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define __swp_type(swp_pte) (((swp_pte).val >> 1) & 0x1f) #define __swp_offset(swp_pte) \ - ((((swp_pte).val >> 6) & 0xf) | (((swp_pte).val >> 9) & 0x7ffff0)) + ((((swp_pte).val >> 7) & 0x7) | (((swp_pte).val >> 10) & 0x3ffff8)) #define __swp_entry(type, offset) \ ((swp_entry_t) { \ - ((type << 1) | \ - ((offset & 0x7ffff0) << 9) | ((offset & 0xf) << 6)) }) + (((type & 0x1f) << 1) | \ + ((offset & 0x3ffff8) << 10) | ((offset & 0x7) << 7)) }) + +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} #endif diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h index 1b990466d5404..310b09c3342d6 100644 --- a/arch/ia64/include/asm/page.h +++ b/arch/ia64/include/asm/page.h @@ -82,25 +82,19 @@ do { \ } while (0) -#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ ({ \ - struct page *page = alloc_page_vma( \ - GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr); \ - if (page) \ - flush_dcache_page(page); \ - page; \ + struct folio *folio = vma_alloc_folio( \ + GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false); \ + if (folio) \ + flush_dcache_folio(folio); \ + folio; \ }) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE - #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) #include -#ifdef CONFIG_FLATMEM -# define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif - #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 01517a5e67789..21c97e31a28ae 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -58,6 +58,9 @@ #define _PAGE_ED (__IA64_UL(1) << 52) /* exception deferral */ #define _PAGE_PROTNONE (__IA64_UL(1) << 63) +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1 << 7) + #define _PFN_MASK _PAGE_PPN_MASK /* Mask of bits which may be changed by pte_modify(); the odd bits are there for _PAGE_PROTNONE */ #define _PAGE_CHG_MASK (_PAGE_P | _PAGE_PROTNONE | _PAGE_PL_MASK | _PAGE_AR_MASK | _PAGE_ED) @@ -399,6 +402,9 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern void paging_init (void); /* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * * Note: The macros below rely on the fact that MAX_SWAPFILES_SHIFT <= number of * bits in the swap-type field of the swap pte. It would be nice to * enforce that, but we can't easily include here. @@ -406,16 +412,35 @@ extern void paging_init (void); * * Format of swap pte: * bit 0 : present bit (must be zero) - * bits 1- 7: swap-type + * bits 1- 6: swap type + * bit 7 : exclusive marker * bits 8-62: swap offset * bit 63 : _PAGE_PROTNONE bit */ -#define __swp_type(entry) (((entry).val >> 1) & 0x7f) +#define __swp_type(entry) (((entry).val >> 1) & 0x3f) #define __swp_offset(entry) (((entry).val << 1) >> 9) -#define __swp_entry(type,offset) ((swp_entry_t) { ((type) << 1) | ((long) (offset) << 8) }) +#define __swp_entry(type, offset) ((swp_entry_t) { ((type & 0x3f) << 1) | \ + ((long) (offset) << 8) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index fc4e4217e87ff..7f5353e28516f 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -109,7 +109,7 @@ ia64_init_addr_space (void) vma_set_anonymous(vma); vma->vm_start = current->thread.rbs_bot & PAGE_MASK; vma->vm_end = vma->vm_start + PAGE_SIZE; - vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT; + vm_flags_init(vma, VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); mmap_write_lock(current->mm); if (insert_vm_struct(current->mm, vma)) { @@ -127,8 +127,8 @@ ia64_init_addr_space (void) vma_set_anonymous(vma); vma->vm_end = PAGE_SIZE; vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); - vma->vm_flags = VM_READ | VM_MAYREAD | VM_IO | - VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_init(vma, VM_READ | VM_MAYREAD | VM_IO | + VM_DONTEXPAND | VM_DONTDUMP); mmap_write_lock(current->mm); if (insert_vm_struct(current->mm, vma)) { mmap_write_unlock(current->mm); @@ -272,7 +272,7 @@ static int __init gate_vma_init(void) vma_init(&gate_vma, NULL); gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; - gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + vm_flags_init(&gate_vma, VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC); gate_vma.vm_page_prot = __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RX); return 0; diff --git a/arch/loongarch/include/asm/page.h b/arch/loongarch/include/asm/page.h index 53f284a961823..fb5338b352e65 100644 --- a/arch/loongarch/include/asm/page.h +++ b/arch/loongarch/include/asm/page.h @@ -82,19 +82,6 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#ifdef CONFIG_FLATMEM - -static inline int pfn_valid(unsigned long pfn) -{ - /* avoid include hell */ - extern unsigned long max_mapnr; - unsigned long pfn_offset = ARCH_PFN_OFFSET; - - return pfn >= pfn_offset && pfn < max_mapnr; -} - -#endif - #define virt_to_pfn(kaddr) PFN_DOWN(PHYSADDR(kaddr)) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) diff --git a/arch/loongarch/include/asm/pgtable-bits.h b/arch/loongarch/include/asm/pgtable-bits.h index 3d1e0a69975a5..8b98d22a145b1 100644 --- a/arch/loongarch/include/asm/pgtable-bits.h +++ b/arch/loongarch/include/asm/pgtable-bits.h @@ -20,6 +20,7 @@ #define _PAGE_SPECIAL_SHIFT 11 #define _PAGE_HGLOBAL_SHIFT 12 /* HGlobal is a PMD bit */ #define _PAGE_PFN_SHIFT 12 +#define _PAGE_SWP_EXCLUSIVE_SHIFT 23 #define _PAGE_PFN_END_SHIFT 48 #define _PAGE_NO_READ_SHIFT 61 #define _PAGE_NO_EXEC_SHIFT 62 @@ -33,6 +34,9 @@ #define _PAGE_PROTNONE (_ULCAST_(1) << _PAGE_PROTNONE_SHIFT) #define _PAGE_SPECIAL (_ULCAST_(1) << _PAGE_SPECIAL_SHIFT) +/* We borrow bit 23 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (_ULCAST_(1) << _PAGE_SWP_EXCLUSIVE_SHIFT) + /* Used by TLB hardware (placed in EntryLo*) */ #define _PAGE_VALID (_ULCAST_(1) << _PAGE_VALID_SHIFT) #define _PAGE_DIRTY (_ULCAST_(1) << _PAGE_DIRTY_SHIFT) diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index 7a34e900d8c18..d28fb9dbec596 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -249,13 +249,26 @@ extern void pud_init(void *addr); extern void pmd_init(void *addr); /* - * Non-present pages: high 40 bits are offset, next 8 bits type, - * low 16 bits zero. + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * <--------------------------- offset --------------------------- + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * --------------> E <--- type ---> <---------- zeroes ----------> + * + * E is the exclusive marker that is not stored in swap entries. + * The zero'ed bits include _PAGE_PRESENT and _PAGE_PROTNONE. */ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) -{ pte_t pte; pte_val(pte) = (type << 16) | (offset << 24); return pte; } +{ pte_t pte; pte_val(pte) = ((type & 0x7f) << 16) | (offset << 24); return pte; } -#define __swp_type(x) (((x).val >> 16) & 0xff) +#define __swp_type(x) (((x).val >> 16) & 0x7f) #define __swp_offset(x) ((x).val >> 24) #define __swp_entry(type, offset) ((swp_entry_t) { pte_val(mk_swap_pte((type), (offset))) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) @@ -263,6 +276,23 @@ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) #define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val(pmd) }) #define __swp_entry_to_pmd(x) ((pmd_t) { (x).val | _PAGE_HUGE }) +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + extern void paging_init(void); #define pte_none(pte) (!(pte_val(pte) & ~_PAGE_GLOBAL)) diff --git a/arch/loongarch/include/asm/tlb.h b/arch/loongarch/include/asm/tlb.h index dd24f5898f651..f5e4deb97402f 100644 --- a/arch/loongarch/include/asm/tlb.h +++ b/arch/loongarch/include/asm/tlb.h @@ -149,7 +149,7 @@ static inline void tlb_flush(struct mmu_gather *tlb) struct vm_area_struct vma; vma.vm_mm = tlb->mm; - vma.vm_flags = 0; + vm_flags_init(&vma, 0); if (tlb->fullmm) { flush_tlb_mm(tlb->mm); return; diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h index b619b22823f84..13741c1245e1a 100644 --- a/arch/m68k/include/asm/mcf_pgtable.h +++ b/arch/m68k/include/asm/mcf_pgtable.h @@ -46,6 +46,9 @@ #define _CACHEMASK040 (~0x060) #define _PAGE_GLOBAL040 0x400 /* 68040 global bit, used for kva descs */ +/* We borrow bit 24 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE CF_PAGE_NOCACHE + /* * Externally used page protection values. */ @@ -254,15 +257,41 @@ static inline pte_t pte_mkcache(pte_t pte) extern pgd_t kernel_pg_dir[PTRS_PER_PGD]; /* - * Encode and de-code a swap entry (must be !pte_none(e) && !pte_present(e)) + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <------------------ offset -------------> 0 0 0 E <-- type ---> + * + * E is the exclusive marker that is not stored in swap entries. */ -#define __swp_type(x) ((x).val & 0xFF) +#define __swp_type(x) ((x).val & 0x7f) #define __swp_offset(x) ((x).val >> 11) -#define __swp_entry(typ, off) ((swp_entry_t) { (typ) | \ +#define __swp_entry(typ, off) ((swp_entry_t) { ((typ) & 0x7f) | \ (off << 11) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) (__pte((x).val)) +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + #define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT) #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h index 7ac3d64c6b33e..ec0dc19ab8343 100644 --- a/arch/m68k/include/asm/motorola_pgtable.h +++ b/arch/m68k/include/asm/motorola_pgtable.h @@ -41,6 +41,9 @@ #define _PAGE_PROTNONE 0x004 +/* We borrow bit 11 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x800 + #ifndef __ASSEMBLY__ /* This is the cache mode to be used for pages containing page descriptors for @@ -124,7 +127,7 @@ static inline void pud_set(pud_t *pudp, pmd_t *pmdp) * expects pmd_page() to exists, only to then DCE it all. Provide a dummy to * make the compiler happy. */ -#define pmd_page(pmd) NULL +#define pmd_page(pmd) ((struct page *)NULL) #define pud_none(pud) (!pud_val(pud)) @@ -169,12 +172,40 @@ static inline pte_t pte_mkcache(pte_t pte) #define swapper_pg_dir kernel_pg_dir extern pgd_t kernel_pg_dir[128]; -/* Encode and de-code a swap entry (must be !pte_none(e) && !pte_present(e)) */ -#define __swp_type(x) (((x).val >> 4) & 0xff) +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <----------------- offset ------------> E <-- type ---> 0 0 0 0 + * + * E is the exclusive marker that is not stored in swap entries. + */ +#define __swp_type(x) (((x).val >> 4) & 0x7f) #define __swp_offset(x) ((x).val >> 12) -#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 4) | ((offset) << 12) }) +#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & 0x7f) << 4) | ((offset) << 12) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + #endif /* !__ASSEMBLY__ */ #endif /* _MOTOROLA_PGTABLE_H */ diff --git a/arch/m68k/include/asm/page.h b/arch/m68k/include/asm/page.h index 2f1c54e4725d4..a5993ad83ed8a 100644 --- a/arch/m68k/include/asm/page.h +++ b/arch/m68k/include/asm/page.h @@ -62,11 +62,7 @@ extern unsigned long _ramend; #include #endif -#ifndef CONFIG_MMU -#define __phys_to_pfn(paddr) ((unsigned long)((paddr) >> PAGE_SHIFT)) -#define __pfn_to_phys(pfn) PFN_PHYS(pfn) -#endif - #include +#include #endif /* _M68K_PAGE_H */ diff --git a/arch/m68k/include/asm/page_mm.h b/arch/m68k/include/asm/page_mm.h index a5b459bcb7d81..3903db2e8da77 100644 --- a/arch/m68k/include/asm/page_mm.h +++ b/arch/m68k/include/asm/page_mm.h @@ -134,7 +134,6 @@ extern int m68k_virt_to_node_shift; }) #define ARCH_PFN_OFFSET (m68k_memory[0].addr >> PAGE_SHIFT) -#include #define virt_addr_valid(kaddr) ((unsigned long)(kaddr) >= PAGE_OFFSET && (unsigned long)(kaddr) < (unsigned long)high_memory) #define pfn_valid(pfn) virt_addr_valid(pfn_to_virt(pfn)) diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index c9d0d84158a4e..43ff6b109ebbb 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -13,9 +13,8 @@ extern unsigned long memory_end; #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) -#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) #define __pa(vaddr) ((unsigned long)(vaddr)) #define __va(paddr) ((void *)((unsigned long)(paddr))) @@ -26,10 +25,6 @@ extern unsigned long memory_end; #define virt_to_page(addr) (mem_map + (((unsigned long)(addr)-PAGE_OFFSET) >> PAGE_SHIFT)) #define page_to_virt(page) __va(((((page) - mem_map) << PAGE_SHIFT) + PAGE_OFFSET)) -#define pfn_to_page(pfn) virt_to_page(pfn_to_virt(pfn)) -#define page_to_pfn(page) virt_to_pfn(page_to_virt(page)) -#define pfn_valid(pfn) ((pfn) < max_mapnr) - #define virt_addr_valid(kaddr) (((unsigned long)(kaddr) >= PAGE_OFFSET) && \ ((unsigned long)(kaddr) < memory_end)) diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h index fed58da3a6b65..fc044df52b96c 100644 --- a/arch/m68k/include/asm/pgtable_no.h +++ b/arch/m68k/include/asm/pgtable_no.h @@ -31,12 +31,6 @@ extern void paging_init(void); #define swapper_pg_dir ((pgd_t *) 0) -#define __swp_type(x) (0) -#define __swp_offset(x) (0) -#define __swp_entry(typ,off) ((swp_entry_t) { ((typ) | ((off) << 7)) }) -#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -#define __swp_entry_to_pte(x) ((pte_t) { (x).val }) - /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. diff --git a/arch/m68k/include/asm/sun3_pgtable.h b/arch/m68k/include/asm/sun3_pgtable.h index 90d57e537eb14..e582b0484a55c 100644 --- a/arch/m68k/include/asm/sun3_pgtable.h +++ b/arch/m68k/include/asm/sun3_pgtable.h @@ -71,6 +71,9 @@ #define SUN3_PMD_MASK (0x0000003F) #define SUN3_PMD_MAGIC (0x0000002B) +/* We borrow bit 6 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x040 + #ifndef __ASSEMBLY__ /* @@ -152,12 +155,41 @@ static inline pte_t pte_mkcache(pte_t pte) { return pte; } extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern pgd_t kernel_pg_dir[PTRS_PER_PGD]; -/* Macros to (de)construct the fake PTEs representing swap pages. */ -#define __swp_type(x) ((x).val & 0x7F) +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * 0 <--------------------- offset ----------------> E <- type --> + * + * E is the exclusive marker that is not stored in swap entries. + */ +#define __swp_type(x) ((x).val & 0x3f) #define __swp_offset(x) (((x).val) >> 7) -#define __swp_entry(type,offset) ((swp_entry_t) { ((type) | ((offset) << 7)) }) +#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & 0x3f) | \ + (((offset) << 7) & ~SUN3_PAGE_VALID)) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + #endif /* !__ASSEMBLY__ */ #endif /* !_SUN3_PGTABLE_H */ diff --git a/arch/microblaze/include/asm/page.h b/arch/microblaze/include/asm/page.h index 4b8b2fa78fc5f..7b9861bcd4581 100644 --- a/arch/microblaze/include/asm/page.h +++ b/arch/microblaze/include/asm/page.h @@ -112,7 +112,6 @@ extern int page_is_ram(unsigned long pfn); # define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) # define ARCH_PFN_OFFSET (memory_start >> PAGE_SHIFT) -# define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && (pfn) < (max_mapnr + ARCH_PFN_OFFSET)) # endif /* __ASSEMBLY__ */ #define virt_addr_valid(vaddr) (pfn_valid(virt_to_pfn(vaddr))) diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index 42f5988e998b8..d1b8272abcd9b 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -131,10 +131,10 @@ extern pte_t *va_to_pte(unsigned long address); * of the 16 available. Bit 24-26 of the TLB are cleared in the TLB * miss handler. Bit 27 is PAGE_USER, thus selecting the correct * zone. - * - PRESENT *must* be in the bottom two bits because swap cache - * entries use the top 30 bits. Because 4xx doesn't support SMP - * anyway, M is irrelevant so we borrow it for PAGE_PRESENT. Bit 30 - * is cleared in the TLB miss handler before the TLB entry is loaded. + * - PRESENT *must* be in the bottom two bits because swap PTEs use the top + * 30 bits. Because 4xx doesn't support SMP anyway, M is irrelevant so we + * borrow it for PAGE_PRESENT. Bit 30 is cleared in the TLB miss handler + * before the TLB entry is loaded. * - All other bits of the PTE are loaded into TLBLO without * * modification, leaving us only the bits 20, 21, 24, 25, 26, 30 for * software PTE bits. We actually use bits 21, 24, 25, and @@ -155,6 +155,9 @@ extern pte_t *va_to_pte(unsigned long address); #define _PAGE_ACCESSED 0x400 /* software: R: page referenced */ #define _PMD_PRESENT PAGE_MASK +/* We borrow bit 24 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_DIRTY + /* * Some bits are unused... */ @@ -393,18 +396,39 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* - * Encode and decode a swap entry. - * Note that the bits we use in a PTE for representing a swap entry - * must not include the _PAGE_PRESENT bit, or the _PAGE_HASHPTE bit - * (if used). -- paulus + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * <------------------ offset -------------------> E < type -> 0 0 + * + * E is the exclusive marker that is not stored in swap entries. */ -#define __swp_type(entry) ((entry).val & 0x3f) +#define __swp_type(entry) ((entry).val & 0x1f) #define __swp_offset(entry) ((entry).val >> 6) #define __swp_entry(type, offset) \ - ((swp_entry_t) { (type) | ((offset) << 6) }) + ((swp_entry_t) { ((type) & 0x1f) | ((offset) << 6) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 2 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 2 }) +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + extern unsigned long iopa(unsigned long addr); /* Values for nocacheflag and cmode */ diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h index 96bc798c1ec1c..5978a8dfb917b 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -224,34 +224,6 @@ extern phys_addr_t __phys_addr_symbol(unsigned long x); #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#ifdef CONFIG_FLATMEM - -static inline int pfn_valid(unsigned long pfn) -{ - /* avoid include hell */ - extern unsigned long max_mapnr; - unsigned long pfn_offset = ARCH_PFN_OFFSET; - - return pfn >= pfn_offset && pfn < max_mapnr; -} - -#elif defined(CONFIG_SPARSEMEM) - -/* pfn_valid is defined in linux/mmzone.h */ - -#elif defined(CONFIG_NUMA) - -#define pfn_valid(pfn) \ -({ \ - unsigned long __pfn = (pfn); \ - int __n = pfn_to_nid(__pfn); \ - ((__n >= 0) ? (__pfn < NODE_DATA(__n)->node_start_pfn + \ - NODE_DATA(__n)->node_spanned_pages) \ - : 0); \ -}) - -#endif - #define virt_to_pfn(kaddr) PFN_DOWN(virt_to_phys((void *)(kaddr))) #define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h index b40a0e69fccc8..ba0016709a1a1 100644 --- a/arch/mips/include/asm/pgtable-32.h +++ b/arch/mips/include/asm/pgtable-32.h @@ -191,49 +191,113 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) #define pte_page(x) pfn_to_page(pte_pfn(x)) +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + */ #if defined(CONFIG_CPU_R3K_TLB) -/* Swap entries must have VALID bit cleared. */ +/* + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <----------- offset ------------> < type -> V G E 0 0 0 0 0 0 P + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_PRESENT (P), _PAGE_VALID (V) and_PAGE_GLOBAL (G) have to remain + * unused. + */ #define __swp_type(x) (((x).val >> 10) & 0x1f) #define __swp_offset(x) ((x).val >> 15) -#define __swp_entry(type,offset) ((swp_entry_t) { ((type) << 10) | ((offset) << 15) }) +#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & 0x1f) << 10) | ((offset) << 15) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1 << 7) + #else #if defined(CONFIG_XPA) -/* Swap entries must have VALID and GLOBAL bits cleared. */ +/* + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * 0 0 0 0 0 0 E P <------------------ zeroes -------------------> + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <----------------- offset ------------------> < type -> V G 0 0 + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_PRESENT (P), _PAGE_VALID (V) and_PAGE_GLOBAL (G) have to remain + * unused. + */ #define __swp_type(x) (((x).val >> 4) & 0x1f) #define __swp_offset(x) ((x).val >> 9) -#define __swp_entry(type,offset) ((swp_entry_t) { ((type) << 4) | ((offset) << 9) }) +#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & 0x1f) << 4) | ((offset) << 9) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_high }) #define __swp_entry_to_pte(x) ((pte_t) { 0, (x).val }) +/* + * We borrow bit 57 (bit 25 in the low PTE) to store the exclusive marker in + * swap PTEs. + */ +#define _PAGE_SWP_EXCLUSIVE (1 << 25) + #elif defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) -/* Swap entries must have VALID and GLOBAL bits cleared. */ +/* + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * <------------------ zeroes -------------------> E P 0 0 0 0 0 0 + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <------------------- offset --------------------> < type -> V G + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_PRESENT (P), _PAGE_VALID (V) and_PAGE_GLOBAL (G) have to remain + * unused. + */ #define __swp_type(x) (((x).val >> 2) & 0x1f) #define __swp_offset(x) ((x).val >> 7) -#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 2) | ((offset) << 7) }) +#define __swp_entry(type, offset) ((swp_entry_t) { (((type) & 0x1f) << 2) | ((offset) << 7) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_high }) #define __swp_entry_to_pte(x) ((pte_t) { 0, (x).val }) +/* + * We borrow bit 39 (bit 7 in the low PTE) to store the exclusive marker in swap + * PTEs. + */ +#define _PAGE_SWP_EXCLUSIVE (1 << 7) + #else /* - * Constraints: - * _PAGE_PRESENT at bit 0 - * _PAGE_MODIFIED at bit 4 - * _PAGE_GLOBAL at bit 6 - * _PAGE_VALID at bit 7 + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <------------- offset --------------> < type -> 0 0 0 0 0 0 E P + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_PRESENT (P), _PAGE_VALID (V) and_PAGE_GLOBAL (G) have to remain + * unused. The location of V and G varies. */ #define __swp_type(x) (((x).val >> 8) & 0x1f) #define __swp_offset(x) ((x).val >> 13) -#define __swp_entry(type,offset) ((swp_entry_t) { ((type) << 8) | ((offset) << 13) }) +#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 8) | ((offset) << 13) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +/* We borrow bit 1 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1 << 1) + #endif /* defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) */ #endif /* defined(CONFIG_CPU_R3K_TLB) */ diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index c6310192b654d..98e24e3e7f2ba 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -320,16 +320,31 @@ extern void pud_init(void *addr); extern void pmd_init(void *addr); /* - * Non-present pages: high 40 bits are offset, next 8 bits type, - * low 16 bits zero. + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * <--------------------------- offset --------------------------- + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * --------------> E <-- type ---> <---------- zeroes -----------> + * + * E is the exclusive marker that is not stored in swap entries. */ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) -{ pte_t pte; pte_val(pte) = (type << 16) | (offset << 24); return pte; } +{ pte_t pte; pte_val(pte) = ((type & 0x7f) << 16) | (offset << 24); return pte; } -#define __swp_type(x) (((x).val >> 16) & 0xff) +#define __swp_type(x) (((x).val >> 16) & 0x7f) #define __swp_offset(x) ((x).val >> 24) #define __swp_entry(type, offset) ((swp_entry_t) { pte_val(mk_swap_pte((type), (offset))) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +/* We borrow bit 23 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1 << 23) + #endif /* _ASM_PGTABLE_64_H */ diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index a68c0b01d8cdc..791389bf3c124 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -528,6 +528,41 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) } #endif +#if defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte.pte_low & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte.pte_low |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte.pte_low &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} +#else +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} +#endif extern void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t pte); diff --git a/arch/nios2/include/asm/page.h b/arch/nios2/include/asm/page.h index 6a989819a7c1d..0ae7d9ce369b9 100644 --- a/arch/nios2/include/asm/page.h +++ b/arch/nios2/include/asm/page.h @@ -86,15 +86,6 @@ extern struct page *mem_map; # define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -static inline bool pfn_valid(unsigned long pfn) -{ - /* avoid include hell */ - extern unsigned long max_mapnr; - unsigned long pfn_offset = ARCH_PFN_OFFSET; - - return pfn >= pfn_offset && pfn < max_mapnr; -} - # define virt_to_page(vaddr) pfn_to_page(PFN_DOWN(virt_to_phys(vaddr))) # define virt_addr_valid(vaddr) pfn_valid(PFN_DOWN(virt_to_phys(vaddr))) diff --git a/arch/nios2/include/asm/pgtable-bits.h b/arch/nios2/include/asm/pgtable-bits.h index bfddff383e89a..724f9b08b1d18 100644 --- a/arch/nios2/include/asm/pgtable-bits.h +++ b/arch/nios2/include/asm/pgtable-bits.h @@ -31,4 +31,7 @@ #define _PAGE_ACCESSED (1<<26) /* page referenced */ #define _PAGE_DIRTY (1<<27) /* dirty page */ +/* We borrow bit 31 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1<<31) + #endif /* _ASM_NIOS2_PGTABLE_BITS_H */ diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index ab793bc517f5c..0f5c2564e9f59 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -232,23 +232,44 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) __FILE__, __LINE__, pgd_val(e)) /* - * Encode and decode a swap entry (must be !pte_none(pte) && !pte_present(pte): + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). * - * 31 30 29 28 27 26 25 24 23 22 21 20 19 18 ... 1 0 - * 0 0 0 0 type. 0 0 0 0 0 0 offset......... + * Format of swap PTEs: * - * This gives us up to 2**2 = 4 swap files and 2**20 * 4K = 4G per swap file. + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * E < type -> 0 0 0 0 0 0 <-------------- offset ---------------> * - * Note that the offset field is always non-zero, thus !pte_none(pte) is always - * true. + * E is the exclusive marker that is not stored in swap entries. + * + * Note that the offset field is always non-zero if the swap type is 0, thus + * !pte_none() is always true. */ -#define __swp_type(swp) (((swp).val >> 26) & 0x3) +#define __swp_type(swp) (((swp).val >> 26) & 0x1f) #define __swp_offset(swp) ((swp).val & 0xfffff) -#define __swp_entry(type, off) ((swp_entry_t) { (((type) & 0x3) << 26) \ +#define __swp_entry(type, off) ((swp_entry_t) { (((type) & 0x1f) << 26) \ | ((off) & 0xfffff) }) #define __swp_entry_to_pte(swp) ((pte_t) { (swp).val }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + extern void __init paging_init(void); extern void __init mmu_init(void); diff --git a/arch/openrisc/include/asm/page.h b/arch/openrisc/include/asm/page.h index aab6e64d6db42..52b0d7e764461 100644 --- a/arch/openrisc/include/asm/page.h +++ b/arch/openrisc/include/asm/page.h @@ -80,8 +80,6 @@ typedef struct page *pgtable_t; #define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) -#define pfn_valid(pfn) ((pfn) < max_mapnr) - #define virt_addr_valid(kaddr) (pfn_valid(virt_to_pfn(kaddr))) #endif /* __ASSEMBLY__ */ diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 6477c17b3062d..3eb9b9555d0df 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -154,6 +154,9 @@ extern void paging_init(void); #define _KERNPG_TABLE \ (_PAGE_BASE | _PAGE_SRE | _PAGE_SWE | _PAGE_ACCESSED | _PAGE_DIRTY) +/* We borrow bit 11 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_U_SHARED + #define PAGE_NONE __pgprot(_PAGE_ALL) #define PAGE_READONLY __pgprot(_PAGE_ALL | _PAGE_URE | _PAGE_SRE) #define PAGE_READONLY_X __pgprot(_PAGE_ALL | _PAGE_URE | _PAGE_SRE | _PAGE_EXEC) @@ -385,16 +388,43 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, /* __PHX__ FIXME, SWAP, this probably doesn't work */ -/* Encode and de-code a swap entry (must be !pte_none(e) && !pte_present(e)) */ -/* Since the PAGE_PRESENT bit is bit 4, we can use the bits above */ - -#define __swp_type(x) (((x).val >> 5) & 0x7f) +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <-------------- offset ---------------> E <- type --> 0 0 0 0 0 + * + * E is the exclusive marker that is not stored in swap entries. + * The zero'ed bits include _PAGE_PRESENT. + */ +#define __swp_type(x) (((x).val >> 5) & 0x3f) #define __swp_offset(x) ((x).val >> 12) #define __swp_entry(type, offset) \ - ((swp_entry_t) { ((type) << 5) | ((offset) << 12) }) + ((swp_entry_t) { (((type) & 0x3f) << 5) | ((offset) << 12) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + typedef pte_t *pte_addr_t; #endif /* __ASSEMBLY__ */ diff --git a/arch/parisc/include/asm/page.h b/arch/parisc/include/asm/page.h index 6faaaa3ebe9b8..667e703c0e8f6 100644 --- a/arch/parisc/include/asm/page.h +++ b/arch/parisc/include/asm/page.h @@ -155,10 +155,6 @@ extern int npmem_ranges; #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -#ifndef CONFIG_SPARSEMEM -#define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif - #ifdef CONFIG_HUGETLB_PAGE #define HPAGE_SHIFT PMD_SHIFT /* fixed for transparent huge pages */ #define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index ea357430aafeb..e2950f5db7c9c 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -218,6 +218,9 @@ extern void __update_cache(pte_t pte); #define _PAGE_KERNEL_RWX (_PAGE_KERNEL_EXEC | _PAGE_WRITE) #define _PAGE_KERNEL (_PAGE_KERNEL_RO | _PAGE_WRITE) +/* We borrow bit 23 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_ACCESSED + /* The pgd/pmd contains a ptr (in phys addr space); since all pgds/pmds * are page-aligned, we don't care about the PAGE_OFFSET bits, except * for a few meta-information bits, so we shift the address to be @@ -394,17 +397,48 @@ extern void paging_init (void); #define update_mmu_cache(vms,addr,ptep) __update_cache(*ptep) -/* Encode and de-code a swap entry */ - +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs (32bit): + * + * 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * <---------------- offset -----------------> P E < type -> + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_PRESENT (P) must be 0. + * + * For the 64bit version, the offset is extended by 32bit. + */ #define __swp_type(x) ((x).val & 0x1f) #define __swp_offset(x) ( (((x).val >> 6) & 0x7) | \ (((x).val >> 8) & ~0x7) ) -#define __swp_entry(type, offset) ((swp_entry_t) { (type) | \ +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((type) & 0x1f) | \ ((offset & 0x7) << 6) | \ ((offset & ~0x7) << 8) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte; diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 75823f39e0424..7bf1fe7297c63 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -42,6 +42,9 @@ #define _PMD_PRESENT_MASK (PAGE_MASK) #define _PMD_BAD (~PAGE_MASK) +/* We borrow the _PAGE_USER bit to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_USER + /* And here we include common definitions */ #define _PAGE_KERNEL_RO 0 @@ -363,17 +366,41 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) /* - * Encode and decode a swap entry. - * Note that the bits we use in a PTE for representing a swap entry - * must not include the _PAGE_PRESENT bit or the _PAGE_HASHPTE bit (if used). - * -- paulus + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs (32bit PTEs): + * + * 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * <----------------- offset --------------------> < type -> E H P + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_PRESENT (P) and __PAGE_HASHPTE (H) must be 0. + * + * For 64bit PTEs, the offset is extended by 32bit. */ #define __swp_type(entry) ((entry).val & 0x1f) #define __swp_offset(entry) ((entry).val >> 5) -#define __swp_entry(type, offset) ((swp_entry_t) { (type) | ((offset) << 5) }) +#define __swp_entry(type, offset) ((swp_entry_t) { ((type) & 0x1f) | ((offset) << 5) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 3 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 3 }) +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_SWP_EXCLUSIVE); +} + /* Generic accessors to PTE bits */ static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_RW);} static inline int pte_read(pte_t pte) { return 1; } diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index cb4c67bf45d71..4acc9690f5999 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -717,7 +717,6 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) } #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline pte_t pte_swp_mkexclusive(pte_t pte) { return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SWP_EXCLUSIVE)); diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 70edad44dff6f..fec56d965f00d 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -360,18 +360,30 @@ static inline int pte_young(pte_t pte) #endif #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) + /* - * Encode and decode a swap entry. - * Note that the bits we use in a PTE for representing a swap entry - * must not include the _PAGE_PRESENT bit. - * -- paulus + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs (32bit PTEs): + * + * 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * <------------------ offset -------------------> < type -> E 0 0 + * + * E is the exclusive marker that is not stored in swap entries. + * + * For 64bit PTEs, the offset is extended by 32bit. */ #define __swp_type(entry) ((entry).val & 0x1f) #define __swp_offset(entry) ((entry).val >> 5) -#define __swp_entry(type, offset) ((swp_entry_t) { (type) | ((offset) << 5) }) +#define __swp_entry(type, offset) ((swp_entry_t) { ((type) & 0x1f) | ((offset) << 5) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 3 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 3 }) +/* We borrow LSB 2 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x000004 + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_POWERPC_NOHASH_32_PGTABLE_H */ diff --git a/arch/powerpc/include/asm/nohash/32/pte-40x.h b/arch/powerpc/include/asm/nohash/32/pte-40x.h index 2d3153cfc0d79..6fe46e7545566 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-40x.h +++ b/arch/powerpc/include/asm/nohash/32/pte-40x.h @@ -27,9 +27,9 @@ * of the 16 available. Bit 24-26 of the TLB are cleared in the TLB * miss handler. Bit 27 is PAGE_USER, thus selecting the correct * zone. - * - PRESENT *must* be in the bottom two bits because swap cache - * entries use the top 30 bits. Because 40x doesn't support SMP - * anyway, M is irrelevant so we borrow it for PAGE_PRESENT. Bit 30 + * - PRESENT *must* be in the bottom two bits because swap PTEs + * use the top 30 bits. Because 40x doesn't support SMP anyway, M is + * irrelevant so we borrow it for PAGE_PRESENT. Bit 30 * is cleared in the TLB miss handler before the TLB entry is loaded. * - All other bits of the PTE are loaded into TLBLO without * modification, leaving us only the bits 20, 21, 24, 25, 26, 30 for diff --git a/arch/powerpc/include/asm/nohash/32/pte-44x.h b/arch/powerpc/include/asm/nohash/32/pte-44x.h index 78bc304f750ed..b7ed13cee1378 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-44x.h +++ b/arch/powerpc/include/asm/nohash/32/pte-44x.h @@ -56,20 +56,10 @@ * above bits. Note that the bit values are CPU specific, not architecture * specific. * - * The kernel PTE entry holds an arch-dependent swp_entry structure under - * certain situations. In other words, in such situations some portion of - * the PTE bits are used as a swp_entry. In the PPC implementation, the - * 3-24th LSB are shared with swp_entry, however the 0-2nd three LSB still - * hold protection values. That means the three protection bits are - * reserved for both PTE and SWAP entry at the most significant three - * LSBs. - * - * There are three protection bits available for SWAP entry: - * _PAGE_PRESENT - * _PAGE_HASHPTE (if HW has) - * - * So those three bits have to be inside of 0-2nd LSB of PTE. - * + * The kernel PTE entry can be an ordinary PTE mapping a page or a special swap + * PTE. In case of a swap PTE, LSB 2-24 are used to store information regarding + * the swap entry. However LSB 0-1 still hold protection values, for example, + * to distinguish swap PTEs from ordinary PTEs, and must be used with care. */ #define _PAGE_PRESENT 0x00000001 /* S: PTE valid */ diff --git a/arch/powerpc/include/asm/nohash/32/pte-85xx.h b/arch/powerpc/include/asm/nohash/32/pte-85xx.h index 93fb8e11a3f12..16451df5ddb05 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-85xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-85xx.h @@ -11,8 +11,8 @@ 32 33 34 35 36 ... 50 51 52 53 54 55 56 57 58 59 60 61 62 63 RPN...................... 0 0 U0 U1 U2 U3 UX SX UW SW UR SR - - PRESENT *must* be in the bottom three bits because swap cache - entries use the top 29 bits. + - PRESENT *must* be in the bottom two bits because swap PTEs use + the top 30 bits. */ diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 879e9a6e5a870..287e25864ffae 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -276,22 +276,40 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) -/* Encode and de-code a swap entry */ +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * <-------------------------- offset ---------------------------- + * + * 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 6 6 6 6 + * 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 + * --------------> <----------- zero ------------> E < type -> 0 0 + * + * E is the exclusive marker that is not stored in swap entries. + */ #define MAX_SWAPFILES_CHECK() do { \ BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \ } while (0) #define SWP_TYPE_BITS 5 -#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ +#define __swp_type(x) (((x).val >> 2) \ & ((1UL << SWP_TYPE_BITS) - 1)) #define __swp_offset(x) ((x).val >> PTE_RPN_SHIFT) #define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << _PAGE_BIT_SWAP_TYPE) \ + (((type) & 0x1f) << 2) \ | ((offset) << PTE_RPN_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __swp_entry_to_pte(x) __pte((x).val) +/* We borrow MSB 56 (LSB 7) to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x80 + int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot); void unmap_kernel_page(unsigned long va); extern int __meminit vmemmap_create_mapping(unsigned long start, diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 69c3a050a3d85..a6caaaab6f922 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -151,6 +151,21 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); } +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_SWP_EXCLUSIVE); +} + /* Insert a PTE, top-level function is out of line. It uses an inline * low level function in the respective pgtable-* files */ diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h b/arch/powerpc/include/asm/nohash/pte-e500.h index 0934e8965e4ed..d8924cbd61e4a 100644 --- a/arch/powerpc/include/asm/nohash/pte-e500.h +++ b/arch/powerpc/include/asm/nohash/pte-e500.h @@ -12,7 +12,6 @@ /* Architected bits */ #define _PAGE_PRESENT 0x000001 /* software: pte contains a translation */ #define _PAGE_SW1 0x000002 -#define _PAGE_BIT_SWAP_TYPE 2 #define _PAGE_BAP_SR 0x000004 #define _PAGE_BAP_UR 0x000008 #define _PAGE_BAP_SW 0x000010 diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index edf1dd1b0ca99..f2b6bf5687d0e 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -117,15 +117,6 @@ extern long long virt_phys_offset; #ifdef CONFIG_FLATMEM #define ARCH_PFN_OFFSET ((unsigned long)(MEMORY_START >> PAGE_SHIFT)) -#ifndef __ASSEMBLY__ -extern unsigned long max_mapnr; -static inline bool pfn_valid(unsigned long pfn) -{ - unsigned long min_pfn = ARCH_PFN_OFFSET; - - return pfn >= min_pfn && pfn < max_mapnr; -} -#endif #endif #define virt_to_pfn(kaddr) (__pa(kaddr) >> PAGE_SHIFT) diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 0c7cfb9fab04d..fd42059ae2a58 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -132,7 +132,7 @@ void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size) * address decoding but I'd rather not deal with those outside of the * reserved 64K legacy region. */ - area = __get_vm_area_caller(size, 0, PHB_IO_BASE, PHB_IO_END, + area = __get_vm_area_caller(size, VM_IOREMAP, PHB_IO_BASE, PHB_IO_END, __builtin_return_address(0)); if (!area) return NULL; diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 1d67baa5557a2..709ebd578394b 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -393,6 +393,7 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm, { unsigned long gfn = memslot->base_gfn; unsigned long end, start = gfn_to_hva(kvm, gfn); + unsigned long vm_flags; int ret = 0; struct vm_area_struct *vma; int merge_flag = (merge) ? MADV_MERGEABLE : MADV_UNMERGEABLE; @@ -409,12 +410,15 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm, ret = H_STATE; break; } + /* Copy vm_flags to avoid partial modifications in ksm_madvise */ + vm_flags = vma->vm_flags; ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, - merge_flag, &vma->vm_flags); + merge_flag, &vm_flags); if (ret) { ret = H_STATE; break; } + vm_flags_reset(vma, vm_flags); start = vma->vm_end; } while (end > vma->vm_end); diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 4f566bea5e10f..712ab91ced398 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -324,7 +324,7 @@ static int kvmppc_xive_native_mmap(struct kvm_device *dev, return -EINVAL; } - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot); /* diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index d73b3b4176e81..b75a9fb99599a 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -156,7 +156,7 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, * VM_NOHUGEPAGE and split them. */ for_each_vma_range(vmi, vma, addr + len) { - vma->vm_flags |= VM_NOHUGEPAGE; + vm_flags_set(vma, VM_NOHUGEPAGE); walk_page_vma(vma, &subpage_walk_ops, NULL); } } diff --git a/arch/powerpc/platforms/book3s/vas-api.c b/arch/powerpc/platforms/book3s/vas-api.c index 9580e8e12165c..36c21648d19a3 100644 --- a/arch/powerpc/platforms/book3s/vas-api.c +++ b/arch/powerpc/platforms/book3s/vas-api.c @@ -525,7 +525,7 @@ static int coproc_mmap(struct file *fp, struct vm_area_struct *vma) pfn = paste_addr >> PAGE_SHIFT; /* flags, page_prot from cxl_mmap(), except we want cachable */ - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_cached(vma->vm_page_prot); prot = __pgprot(pgprot_val(vma->vm_page_prot) | _PAGE_DIRTY); diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 62d90a5e23d1e..02a8158c469dd 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -291,7 +291,7 @@ static int spufs_mem_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot); vma->vm_ops = &spufs_mem_mmap_vmops; @@ -381,7 +381,7 @@ static int spufs_cntl_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_cntl_mmap_vmops; @@ -1043,7 +1043,7 @@ static int spufs_signal1_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_signal1_mmap_vmops; @@ -1179,7 +1179,7 @@ static int spufs_signal2_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_signal2_mmap_vmops; @@ -1302,7 +1302,7 @@ static int spufs_mss_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_mss_mmap_vmops; @@ -1364,7 +1364,7 @@ static int spufs_psmap_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_psmap_mmap_vmops; @@ -1424,7 +1424,7 @@ static int spufs_mfc_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &spufs_mfc_mmap_vmops; diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 9f432c1b52899..7fed7c431928b 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -171,11 +171,6 @@ extern phys_addr_t __phys_addr_symbol(unsigned long x); #define sym_to_pfn(x) __phys_to_pfn(__pa_symbol(x)) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) \ - (((pfn) >= ARCH_PFN_OFFSET) && (((pfn) - ARCH_PFN_OFFSET) < max_mapnr)) -#endif - #endif /* __ASSEMBLY__ */ #define virt_addr_valid(vaddr) ({ \ diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h index b9e13a8fe2b74..f896708e83312 100644 --- a/arch/riscv/include/asm/pgtable-bits.h +++ b/arch/riscv/include/asm/pgtable-bits.h @@ -27,6 +27,9 @@ */ #define _PAGE_PROT_NONE _PAGE_GLOBAL +/* Used for swap PTEs only. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_ACCESSED + #define _PAGE_PFN_SHIFT 10 /* diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 2a88362dffa57..ebee56d47003e 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -724,16 +724,18 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * Encode and decode a swap entry + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). * * Format of swap PTE: * bit 0: _PAGE_PRESENT (zero) * bit 1 to 3: _PAGE_LEAF (zero) * bit 5: _PAGE_PROT_NONE (zero) - * bits 6 to 10: swap type - * bits 10 to XLEN-1: swap offset + * bit 6: exclusive marker + * bits 7 to 11: swap type + * bits 11 to XLEN-1: swap offset */ -#define __SWP_TYPE_SHIFT 6 +#define __SWP_TYPE_SHIFT 7 #define __SWP_TYPE_BITS 5 #define __SWP_TYPE_MASK ((1UL << __SWP_TYPE_BITS) - 1) #define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT) @@ -744,11 +746,27 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #define __swp_type(x) (((x).val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK) #define __swp_offset(x) ((x).val >> __SWP_OFFSET_SHIFT) #define __swp_entry(type, offset) ((swp_entry_t) \ - { ((type) << __SWP_TYPE_SHIFT) | ((offset) << __SWP_OFFSET_SHIFT) }) + { (((type) & __SWP_TYPE_MASK) << __SWP_TYPE_SHIFT) | \ + ((offset) << __SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_SWP_EXCLUSIVE); +} + #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION #define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val(pmd) }) #define __swp_entry_to_pmd(swp) __pmd((swp).val) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 61dea67bb9c74..8a2a3b5d1e293 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -73,9 +73,8 @@ static inline void copy_page(void *to, void *from) #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) -#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) /* * These are used to make use of C type-checking.. diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 0f1eba005f6d4..89850522924e7 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -819,7 +819,6 @@ static inline int pmd_protnone(pmd_t pmd) } #endif -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline int pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 69af6cdf1a2ad..5a716bdcba05b 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2522,8 +2522,7 @@ static inline void thp_split_mm(struct mm_struct *mm) VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { - vma->vm_flags &= ~VM_HUGEPAGE; - vma->vm_flags |= VM_NOHUGEPAGE; + vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE); walk_page_vma(vma, &thp_split_walk_ops, NULL); } mm->def_flags |= VM_NOHUGEPAGE; @@ -2588,14 +2587,18 @@ int gmap_mark_unmergeable(void) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; + unsigned long vm_flags; int ret; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) { + /* Copy vm_flags to avoid partial modifications in ksm_madvise */ + vm_flags = vma->vm_flags; ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, - MADV_UNMERGEABLE, &vma->vm_flags); + MADV_UNMERGEABLE, &vm_flags); if (ret) return ret; + vm_flags_reset(vma, vm_flags); } mm->def_flags &= ~VM_MERGEABLE; return 0; diff --git a/arch/sh/include/asm/page.h b/arch/sh/include/asm/page.h index eca5daa43b93d..09ac6c7faee0f 100644 --- a/arch/sh/include/asm/page.h +++ b/arch/sh/include/asm/page.h @@ -169,9 +169,6 @@ typedef struct page *pgtable_t; #define PFN_START (__MEMORY_START >> PAGE_SHIFT) #define ARCH_PFN_OFFSET (PFN_START) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) >= min_low_pfn && (pfn) < max_low_pfn) -#endif #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) #include diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h index d0240decacca7..21952b0946509 100644 --- a/arch/sh/include/asm/pgtable_32.h +++ b/arch/sh/include/asm/pgtable_32.h @@ -423,40 +423,69 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #endif /* - * Encode and de-code a swap entry + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). * * Constraints: * _PAGE_PRESENT at bit 8 * _PAGE_PROTNONE at bit 9 * - * For the normal case, we encode the swap type into bits 0:7 and the - * swap offset into bits 10:30. For the 64-bit PTE case, we keep the - * preserved bits in the low 32-bits and use the upper 32 as the swap - * offset (along with a 5-bit type), following the same approach as x86 - * PAE. This keeps the logic quite simple. + * For the normal case, we encode the swap type and offset into the swap PTE + * such that bits 8 and 9 stay zero. For the 64-bit PTE case, we use the + * upper 32 for the swap offset and swap type, following the same approach as + * x86 PAE. This keeps the logic quite simple. * * As is evident by the Alpha code, if we ever get a 64-bit unsigned * long (swp_entry_t) to match up with the 64-bit PTEs, this all becomes * much cleaner.. - * - * NOTE: We should set ZEROs at the position of _PAGE_PRESENT - * and _PAGE_PROTNONE bits */ + #ifdef CONFIG_X2TLB +/* + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * <--------------------- offset ----------------------> < type -> + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <------------------- zeroes --------------------> E 0 0 0 0 0 0 + */ #define __swp_type(x) ((x).val & 0x1f) #define __swp_offset(x) ((x).val >> 5) -#define __swp_entry(type, offset) ((swp_entry_t){ (type) | (offset) << 5}) +#define __swp_entry(type, offset) ((swp_entry_t){ ((type) & 0x1f) | (offset) << 5}) #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) #define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val }) #else -#define __swp_type(x) ((x).val & 0xff) +/* + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <--------------- offset ----------------> 0 0 0 0 E < type -> 0 + * + * E is the exclusive marker that is not stored in swap entries. + */ +#define __swp_type(x) ((x).val & 0x1f) #define __swp_offset(x) ((x).val >> 10) -#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) <<10}) +#define __swp_entry(type, offset) ((swp_entry_t){((type) & 0x1f) | (offset) << 10}) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 1 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 1 }) #endif +/* In both cases, we borrow bit 6 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_USER + +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte.pte_low & _PAGE_SWP_EXCLUSIVE; +} + +PTE_BIT_FUNC(low, swp_mkexclusive, |= _PAGE_SWP_EXCLUSIVE); +PTE_BIT_FUNC(low, swp_clear_exclusive, &= ~_PAGE_SWP_EXCLUSIVE); + #endif /* __ASSEMBLY__ */ #endif /* __ASM_SH_PGTABLE_32_H */ diff --git a/arch/sh/kernel/cpu/sh4/sq.c b/arch/sh/kernel/cpu/sh4/sq.c index a76b94e41e913..27f2e3da5aa22 100644 --- a/arch/sh/kernel/cpu/sh4/sq.c +++ b/arch/sh/kernel/cpu/sh4/sq.c @@ -103,7 +103,7 @@ static int __sq_remap(struct sq_mapping *map, pgprot_t prot) #if defined(CONFIG_MMU) struct vm_struct *vma; - vma = __get_vm_area_caller(map->size, VM_ALLOC, map->sq_addr, + vma = __get_vm_area_caller(map->size, VM_IOREMAP, map->sq_addr, SQ_ADDRMAX, __builtin_return_address(0)); if (!vma) return -ENOMEM; diff --git a/arch/sparc/include/asm/page_32.h b/arch/sparc/include/asm/page_32.h index fff8861df107a..6be6f683f98f2 100644 --- a/arch/sparc/include/asm/page_32.h +++ b/arch/sparc/include/asm/page_32.h @@ -130,7 +130,6 @@ extern unsigned long pfn_base; #define ARCH_PFN_OFFSET (pfn_base) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define pfn_valid(pfn) (((pfn) >= (pfn_base)) && (((pfn)-(pfn_base)) < max_mapnr)) #define virt_addr_valid(kaddr) ((((unsigned long)(kaddr)-PAGE_OFFSET)>>PAGE_SHIFT) < max_mapnr) #include diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 5acc05b572e65..d4330e3c57a6e 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -323,7 +323,16 @@ void srmmu_mapiorange(unsigned int bus, unsigned long xpa, unsigned long xva, unsigned int len); void srmmu_unmapiorange(unsigned long virt_addr, unsigned int len); -/* Encode and de-code a swap entry */ +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <-------------- offset ---------------> < type -> E 0 0 0 0 0 0 + */ static inline unsigned long __swp_type(swp_entry_t entry) { return (entry.val >> SRMMU_SWP_TYPE_SHIFT) & SRMMU_SWP_TYPE_MASK; @@ -344,6 +353,21 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & SRMMU_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return __pte(pte_val(pte) | SRMMU_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return __pte(pte_val(pte) & ~SRMMU_SWP_EXCLUSIVE); +} + static inline unsigned long __get_phys (unsigned long addr) { diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 3bc9736bddb16..2dc8d46417346 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -187,6 +187,9 @@ bool kern_addr_valid(unsigned long addr); #define _PAGE_SZHUGE_4U _PAGE_SZ4MB_4U #define _PAGE_SZHUGE_4V _PAGE_SZ4MB_4V +/* We borrow bit 20 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _AC(0x0000000000100000, UL) + #ifndef __ASSEMBLY__ pte_t mk_pte_io(unsigned long, pgprot_t, int, unsigned long); @@ -961,18 +964,46 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif -/* Encode and de-code a swap entry */ -#define __swp_type(entry) (((entry).val >> PAGE_SHIFT) & 0xffUL) +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * <--------------------------- offset --------------------------- + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * --------------------> E <-- type ---> <------- zeroes --------> + */ +#define __swp_type(entry) (((entry).val >> PAGE_SHIFT) & 0x7fUL) #define __swp_offset(entry) ((entry).val >> (PAGE_SHIFT + 8UL)) #define __swp_entry(type, offset) \ ( (swp_entry_t) \ { \ - (((long)(type) << PAGE_SHIFT) | \ + ((((long)(type) & 0x7fUL) << PAGE_SHIFT) | \ ((long)(offset) << (PAGE_SHIFT + 8UL))) \ } ) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_SWP_EXCLUSIVE); +} + int page_in_phys_avail(unsigned long paddr); /* diff --git a/arch/sparc/include/asm/pgtsrmmu.h b/arch/sparc/include/asm/pgtsrmmu.h index 6067925972d9d..18e68d43f0367 100644 --- a/arch/sparc/include/asm/pgtsrmmu.h +++ b/arch/sparc/include/asm/pgtsrmmu.h @@ -53,21 +53,13 @@ #define SRMMU_CHG_MASK (0xffffff00 | SRMMU_REF | SRMMU_DIRTY) -/* SRMMU swap entry encoding - * - * We use 5 bits for the type and 19 for the offset. This gives us - * 32 swapfiles of 4GB each. Encoding looks like: - * - * oooooooooooooooooootttttRRRRRRRR - * fedcba9876543210fedcba9876543210 - * - * The bottom 7 bits are reserved for protection and status bits, especially - * PRESENT. - */ +/* SRMMU swap entry encoding */ #define SRMMU_SWP_TYPE_MASK 0x1f #define SRMMU_SWP_TYPE_SHIFT 7 #define SRMMU_SWP_OFF_MASK 0xfffff #define SRMMU_SWP_OFF_SHIFT (SRMMU_SWP_TYPE_SHIFT + 5) +/* We borrow bit 6 to store the exclusive marker in swap PTEs. */ +#define SRMMU_SWP_EXCLUSIVE SRMMU_DIRTY /* Some day I will implement true fine grained access bits for * user pages because the SRMMU gives us the capabilities to diff --git a/arch/um/include/asm/page.h b/arch/um/include/asm/page.h index cdbd9653aa14e..84866127d0747 100644 --- a/arch/um/include/asm/page.h +++ b/arch/um/include/asm/page.h @@ -108,7 +108,6 @@ extern unsigned long uml_physmem; #define phys_to_pfn(p) ((p) >> PAGE_SHIFT) #define pfn_to_phys(pfn) PFN_PHYS(pfn) -#define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_addr_valid(v) pfn_valid(phys_to_pfn(__pa(v))) #include diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index 4e3052f2671a0..a70d1618eb35f 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -21,6 +21,9 @@ #define _PAGE_PROTNONE 0x010 /* if the user mapped it with PROT_NONE; pte_present gives true */ +/* We borrow bit 10 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE 0x400 + #ifdef CONFIG_3_LEVEL_PGTABLES #include #else @@ -288,16 +291,45 @@ extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr); #define update_mmu_cache(vma,address,ptep) do {} while (0) -/* Encode and de-code a swap entry */ +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <--------------- offset ----------------> E < type -> 0 0 0 1 0 + * + * E is the exclusive marker that is not stored in swap entries. + * _PAGE_NEWPAGE (bit 1) is always set to 1 in set_pte(). + */ #define __swp_type(x) (((x).val >> 5) & 0x1f) #define __swp_offset(x) ((x).val >> 11) #define __swp_entry(type, offset) \ - ((swp_entry_t) { ((type) << 5) | ((offset) << 11) }) + ((swp_entry_t) { (((type) & 0x1f) << 5) | ((offset) << 11) }) #define __pte_to_swp_entry(pte) \ ((swp_entry_t) { pte_val(pte_mkuptodate(pte)) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_get_bits(pte, _PAGE_SWP_EXCLUSIVE); +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_set_bits(pte, _PAGE_SWP_EXCLUSIVE); + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_clear_bits(pte, _PAGE_SWP_EXCLUSIVE); + return pte; +} + /* Clear a kernel PTE and flush it from the TLB */ #define kpte_clear_flush(ptep, vaddr) \ do { \ diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 4af81df133ee8..d234ca797e4a7 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -391,7 +391,7 @@ void __init map_vsyscall(void) } if (vsyscall_mode == XONLY) - gate_vma.vm_flags = VM_EXEC; + vm_flags_init(&gate_vma, VM_EXEC); BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 9cc82f305f4bf..d18e5c332cb9f 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -34,9 +34,8 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, copy_page(to, from); } -#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#define vma_alloc_zeroed_movable_folio(vma, vaddr) \ + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) #ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h index df42f8aa99e41..580d71aca65a4 100644 --- a/arch/x86/include/asm/page_32.h +++ b/arch/x86/include/asm/page_32.h @@ -15,10 +15,6 @@ extern unsigned long __phys_addr(unsigned long); #define __phys_addr_symbol(x) __phys_addr(x) #define __phys_reloc_hide(x) RELOC_HIDE((x), 0) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif /* CONFIG_FLATMEM */ - #include static inline void clear_page(void *page) diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 198e03e59ca19..cc6b8e087192e 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -39,10 +39,6 @@ extern unsigned long __phys_addr_symbol(unsigned long); #define __phys_reloc_hide(x) (x) -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) < max_pfn) -#endif - void clear_page_orig(void *page); void clear_page_rep(void *page); void clear_page_erms(void *page); diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 60d0f90153178..e9482a11ac52d 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -80,21 +80,37 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi return ((value >> rightshift) & mask) << leftshift; } -/* Encode and de-code a swap entry */ +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * <----------------- offset ------------------> 0 E <- type --> 0 + * + * E is the exclusive marker that is not stored in swap entries. + */ #define SWP_TYPE_BITS 5 +#define _SWP_TYPE_MASK ((1U << SWP_TYPE_BITS) - 1) +#define _SWP_TYPE_SHIFT (_PAGE_BIT_PRESENT + 1) #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) -#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) -#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ - & ((1U << SWP_TYPE_BITS) - 1)) +#define __swp_type(x) (((x).val >> _SWP_TYPE_SHIFT) \ + & _SWP_TYPE_MASK) #define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) #define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << (_PAGE_BIT_PRESENT + 1)) \ + (((type) & _SWP_TYPE_MASK) << _SWP_TYPE_SHIFT) \ | ((offset) << SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_PSE + /* No inverted PFNs on 2 level page tables */ static inline u64 protnone_mask(u64 val) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 967b135fa2c01..9e7c0b719c3c1 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -145,8 +145,24 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, } #endif -/* Encode and de-code a swap entry */ +/* + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). + * + * Format of swap PTEs: + * + * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 + * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 + * < type -> <---------------------- offset ---------------------- + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * --------------------------------------------> 0 E 0 0 0 0 0 0 0 + * + * E is the exclusive marker that is not stored in swap entries. + */ #define SWP_TYPE_BITS 5 +#define _SWP_TYPE_MASK ((1U << SWP_TYPE_BITS) - 1) #define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) @@ -154,9 +170,10 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) -#define __swp_type(x) (((x).val) & ((1UL << SWP_TYPE_BITS) - 1)) +#define __swp_type(x) (((x).val) & _SWP_TYPE_MASK) #define __swp_offset(x) ((x).val >> SWP_TYPE_BITS) -#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << SWP_TYPE_BITS}) +#define __swp_entry(type, offset) ((swp_entry_t){((type) & _SWP_TYPE_MASK) \ + | (offset) << SWP_TYPE_BITS}) /* * Normally, __swp_entry() converts from arch-independent swp_entry_t to @@ -184,6 +201,9 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \ __pteval_swp_offset(pte))) +/* We borrow bit 7 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_PSE + #include #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1c843395a8b3a..7425f32e52932 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1299,8 +1299,6 @@ static inline void update_mmu_cache_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { } -#ifdef _PAGE_SWP_EXCLUSIVE -#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE static inline pte_t pte_swp_mkexclusive(pte_t pte) { return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE); @@ -1315,7 +1313,6 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) { return pte_clear_flags(pte, _PAGE_SWP_EXCLUSIVE); } -#endif /* _PAGE_SWP_EXCLUSIVE */ #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index aa9b8b8688676..262f5fb18d74d 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -95,7 +95,7 @@ static int sgx_mmap(struct file *file, struct vm_area_struct *vma) return ret; vma->vm_ops = &sgx_vm_ops; - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO); vma->vm_private_data = encl; return 0; diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index 6a77a14eee38c..c3e37eaec8ecd 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -105,7 +105,7 @@ static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &sgx_vepc_vm_ops; /* Don't copy VMA in fork() */ - vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY; + vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY); vma->vm_private_data = vepc; return 0; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5f7d9082c835a..79c12d6795a4f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2616,8 +2616,8 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, return true; } -static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, - u16 port, u16 len) +static bool emulator_io_permitted(struct x86_emulate_ctxt *ctxt, + u16 port, u16 len) { if (ctxt->perm_ok) return true; @@ -3962,7 +3962,7 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt) static int check_perm_in(struct x86_emulate_ctxt *ctxt) { ctxt->dst.bytes = min(ctxt->dst.bytes, 4u); - if (!emulator_io_permited(ctxt, ctxt->src.val, ctxt->dst.bytes)) + if (!emulator_io_permitted(ctxt, ctxt->src.val, ctxt->dst.bytes)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; @@ -3971,7 +3971,7 @@ static int check_perm_in(struct x86_emulate_ctxt *ctxt) static int check_perm_out(struct x86_emulate_ctxt *ctxt) { ctxt->src.bytes = min(ctxt->src.bytes, 4u); - if (!emulator_io_permited(ctxt, ctxt->dst.val, ctxt->src.bytes)) + if (!emulator_io_permitted(ctxt, ctxt->dst.val, ctxt->src.bytes)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7b0d4ab894c8b..a498ae1fbe665 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -260,7 +260,7 @@ static noinline int vmalloc_fault(unsigned long address) } NOKPROBE_SYMBOL(vmalloc_fault); -static void __arch_sync_kernel_mappings(unsigned long start, unsigned long end) +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) { unsigned long addr; @@ -284,27 +284,6 @@ static void __arch_sync_kernel_mappings(unsigned long start, unsigned long end) } } -void arch_sync_kernel_mappings(unsigned long start, unsigned long end) -{ - __arch_sync_kernel_mappings(start, end); -#ifdef CONFIG_KMSAN - /* - * KMSAN maintains two additional metadata page mappings for the - * [VMALLOC_START, VMALLOC_END) range. These mappings start at - * KMSAN_VMALLOC_SHADOW_START and KMSAN_VMALLOC_ORIGIN_START and - * have to be synced together with the vmalloc memory mapping. - */ - if (start >= VMALLOC_START && end < VMALLOC_END) { - __arch_sync_kernel_mappings( - start - VMALLOC_START + KMSAN_VMALLOC_SHADOW_START, - end - VMALLOC_START + KMSAN_VMALLOC_SHADOW_START); - __arch_sync_kernel_mappings( - start - VMALLOC_START + KMSAN_VMALLOC_ORIGIN_START, - end - VMALLOC_START + KMSAN_VMALLOC_ORIGIN_START); - } -#endif -} - static bool low_pfn(unsigned long pfn) { return pfn < max_low_pfn; diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 6d1ba2dda35d6..93ea3682b25d6 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -1000,7 +1000,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, ret = reserve_pfn_range(paddr, size, prot, 0); if (ret == 0 && vma) - vma->vm_flags |= VM_PAT; + vm_flags_set(vma, VM_PAT); return ret; } @@ -1046,7 +1046,7 @@ void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn) * can be for the entire vma (in which case pfn, size are zero). */ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size) + unsigned long size, bool mm_wr_locked) { resource_size_t paddr; unsigned long prot; @@ -1065,8 +1065,12 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, size = vma->vm_end - vma->vm_start; } free_pfn_range(paddr, size); - if (vma) - vma->vm_flags &= ~VM_PAT; + if (vma) { + if (mm_wr_locked) + vm_flags_clear(vma, VM_PAT); + else + __vm_flags_mod(vma, 0, VM_PAT); + } } /* @@ -1076,7 +1080,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, */ void untrack_pfn_moved(struct vm_area_struct *vma) { - vma->vm_flags &= ~VM_PAT; + vm_flags_clear(vma, VM_PAT); } pgprot_t pgprot_writecombine(pgprot_t prot) diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c index cafd01f730dab..29b2203bc82cb 100644 --- a/arch/x86/um/mem_32.c +++ b/arch/x86/um/mem_32.c @@ -16,7 +16,7 @@ static int __init gate_vma_init(void) vma_init(&gate_vma, NULL); gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; - gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + vm_flags_init(&gate_vma, VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC); gate_vma.vm_page_prot = PAGE_READONLY; return 0; diff --git a/arch/xtensa/include/asm/page.h b/arch/xtensa/include/asm/page.h index 493eb7083b1ae..a77d04972eb92 100644 --- a/arch/xtensa/include/asm/page.h +++ b/arch/xtensa/include/asm/page.h @@ -11,6 +11,8 @@ #ifndef _XTENSA_PAGE_H #define _XTENSA_PAGE_H +#include + #include #include #include @@ -189,8 +191,6 @@ static inline unsigned long ___pa(unsigned long va) #endif #define __va(x) \ ((void *)((unsigned long) (x) - PHYS_OFFSET + PAGE_OFFSET)) -#define pfn_valid(pfn) \ - ((pfn) >= ARCH_PFN_OFFSET && ((pfn) - ARCH_PFN_OFFSET) < max_mapnr) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define page_to_virt(page) __va(page_to_pfn(page) << PAGE_SHIFT) diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 5b5484d707b2e..fc7a14884c6c3 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -96,7 +96,7 @@ * +- - - - - - - - - - - - - - - - - - - - -+ * (PAGE_NONE)| PPN | 0 | 00 | ADW | 01 | 11 | 11 | * +-----------------------------------------+ - * swap | index | type | 01 | 11 | 00 | + * swap | index | type | 01 | 11 | e0 | * +-----------------------------------------+ * * For T1050 hardware and earlier the layout differs for present and (PAGE_NONE) @@ -112,6 +112,7 @@ * RI ring (0=privileged, 1=user, 2 and 3 are unused) * CA cache attribute: 00 bypass, 01 writeback, 10 writethrough * (11 is invalid and used to mark pages that are not present) + * e exclusive marker in swap PTEs * w page is writable (hw) * x page is executable (hw) * index swap offset / PAGE_SIZE (bit 11-31: 21 bits -> 8 GB) @@ -158,6 +159,9 @@ #define _PAGE_DIRTY (1<<7) /* software: page dirty */ #define _PAGE_ACCESSED (1<<8) /* software: page accessed (read) */ +/* We borrow bit 1 to store the exclusive marker in swap PTEs. */ +#define _PAGE_SWP_EXCLUSIVE (1<<1) + #ifdef CONFIG_MMU #define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) @@ -343,19 +347,36 @@ ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) } /* - * Encode and decode a swap and file entry. + * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that + * are !pte_none() && !pte_present(). */ -#define SWP_TYPE_BITS 5 -#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(entry) (((entry).val >> 6) & 0x1f) #define __swp_offset(entry) ((entry).val >> 11) #define __swp_entry(type,offs) \ - ((swp_entry_t){((type) << 6) | ((offs) << 11) | \ + ((swp_entry_t){(((type) & 0x1f) << 6) | ((offs) << 11) | \ _PAGE_CA_INVALID | _PAGE_USER}) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + pte_val(pte) |= _PAGE_SWP_EXCLUSIVE; + return pte; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + pte_val(pte) &= ~_PAGE_SWP_EXCLUSIVE; + return pte; +} + #endif /* !defined (__ASSEMBLY__) */ diff --git a/block/bdev.c b/block/bdev.c index edc110d90df40..1795c7d4b99ef 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -304,84 +304,6 @@ int thaw_bdev(struct block_device *bdev) } EXPORT_SYMBOL(thaw_bdev); -/** - * bdev_read_page() - Start reading a page from a block device - * @bdev: The device to read the page from - * @sector: The offset on the device to read the page to (need not be aligned) - * @page: The page to read - * - * On entry, the page should be locked. It will be unlocked when the page - * has been read. If the block driver implements rw_page synchronously, - * that will be true on exit from this function, but it need not be. - * - * Errors returned by this function are usually "soft", eg out of memory, or - * queue full; callers should try a different route to read this page rather - * than propagate an error back up the stack. - * - * Return: negative errno if an error occurs, 0 if submission was successful. - */ -int bdev_read_page(struct block_device *bdev, sector_t sector, - struct page *page) -{ - const struct block_device_operations *ops = bdev->bd_disk->fops; - int result = -EOPNOTSUPP; - - if (!ops->rw_page || bdev_get_integrity(bdev)) - return result; - - result = blk_queue_enter(bdev_get_queue(bdev), 0); - if (result) - return result; - result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, - REQ_OP_READ); - blk_queue_exit(bdev_get_queue(bdev)); - return result; -} - -/** - * bdev_write_page() - Start writing a page to a block device - * @bdev: The device to write the page to - * @sector: The offset on the device to write the page to (need not be aligned) - * @page: The page to write - * @wbc: The writeback_control for the write - * - * On entry, the page should be locked and not currently under writeback. - * On exit, if the write started successfully, the page will be unlocked and - * under writeback. If the write failed already (eg the driver failed to - * queue the page to the device), the page will still be locked. If the - * caller is a ->writepage implementation, it will need to unlock the page. - * - * Errors returned by this function are usually "soft", eg out of memory, or - * queue full; callers should try a different route to write this page rather - * than propagate an error back up the stack. - * - * Return: negative errno if an error occurs, 0 if submission was successful. - */ -int bdev_write_page(struct block_device *bdev, sector_t sector, - struct page *page, struct writeback_control *wbc) -{ - int result; - const struct block_device_operations *ops = bdev->bd_disk->fops; - - if (!ops->rw_page || bdev_get_integrity(bdev)) - return -EOPNOTSUPP; - result = blk_queue_enter(bdev_get_queue(bdev), 0); - if (result) - return result; - - set_page_writeback(page); - result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, - REQ_OP_WRITE); - if (result) { - end_page_writeback(page); - } else { - clean_page_buffers(page); - unlock_page(page); - } - blk_queue_exit(bdev_get_queue(bdev)); - return result; -} - /* * pseudo-fs */ diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c index e6474d38afc49..761a47e89b005 100644 --- a/drivers/accel/habanalabs/common/memory.c +++ b/drivers/accel/habanalabs/common/memory.c @@ -2163,7 +2163,7 @@ static int hl_ts_mmap(struct hl_mmap_mem_buf *buf, struct vm_area_struct *vma, v { struct hl_ts_buff *ts_buff = buf->private; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP | VM_DONTCOPY | VM_NORESERVE; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP | VM_DONTCOPY | VM_NORESERVE); return remap_vmalloc_range(vma, ts_buff->user_buff_address, 0); } diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c index 71debe862c865..bb858b94e1e81 100644 --- a/drivers/accel/habanalabs/gaudi/gaudi.c +++ b/drivers/accel/habanalabs/gaudi/gaudi.c @@ -4238,8 +4238,8 @@ static int gaudi_mmap(struct hl_device *hdev, struct vm_area_struct *vma, { int rc; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | - VM_DONTCOPY | VM_NORESERVE; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | + VM_DONTCOPY | VM_NORESERVE); rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr, (dma_addr - HOST_PHYS_BASE), size); diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index f1f2a58ee68c2..6f415fa94eee9 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -5731,8 +5731,8 @@ static int gaudi2_mmap(struct hl_device *hdev, struct vm_area_struct *vma, { int rc; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | - VM_DONTCOPY | VM_NORESERVE; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | + VM_DONTCOPY | VM_NORESERVE); #ifdef _HAS_DMA_MMAP_COHERENT @@ -10375,8 +10375,8 @@ static int gaudi2_block_mmap(struct hl_device *hdev, struct vm_area_struct *vma, address = pci_resource_start(hdev->pdev, SRAM_CFG_BAR_ID) + offset_in_bar; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | - VM_DONTCOPY | VM_NORESERVE; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | + VM_DONTCOPY | VM_NORESERVE); rc = remap_pfn_range(vma, vma->vm_start, address >> PAGE_SHIFT, block_size, vma->vm_page_prot); diff --git a/drivers/accel/habanalabs/goya/goya.c b/drivers/accel/habanalabs/goya/goya.c index 2b135e856607d..df65e9bdc18aa 100644 --- a/drivers/accel/habanalabs/goya/goya.c +++ b/drivers/accel/habanalabs/goya/goya.c @@ -2880,8 +2880,8 @@ static int goya_mmap(struct hl_device *hdev, struct vm_area_struct *vma, { int rc; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | - VM_DONTCOPY | VM_NORESERVE; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | + VM_DONTCOPY | VM_NORESERVE); rc = dma_mmap_coherent(hdev->dev, vma, cpu_addr, (dma_addr - HOST_PHYS_BASE), size); diff --git a/drivers/accel/ivpu/ivpu_gem.c b/drivers/accel/ivpu/ivpu_gem.c index d1f923971b4c6..12b219dd4f366 100644 --- a/drivers/accel/ivpu/ivpu_gem.c +++ b/drivers/accel/ivpu/ivpu_gem.c @@ -446,7 +446,7 @@ static int ivpu_bo_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma) return dma_buf_mmap(obj->dma_buf, vma, 0); } - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND); vma->vm_page_prot = ivpu_bo_pgprot(bo, vm_get_page_prot(vma->vm_flags)); return 0; diff --git a/drivers/acpi/pfr_telemetry.c b/drivers/acpi/pfr_telemetry.c index 27fb6cdad75f9..843f678ade0c2 100644 --- a/drivers/acpi/pfr_telemetry.c +++ b/drivers/acpi/pfr_telemetry.c @@ -310,7 +310,7 @@ pfrt_log_mmap(struct file *file, struct vm_area_struct *vma) return -EROFS; /* changing from read to write with mprotect is not allowed */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); pfrt_log_dev = to_pfrt_log_dev(file); diff --git a/drivers/android/binder.c b/drivers/android/binder.c index ca3034afd1dc0..fb56bfc45096d 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -5580,8 +5580,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) proc->pid, vma->vm_start, vma->vm_end, "bad vm_flags", -EPERM); return -EPERM; } - vma->vm_flags |= VM_DONTCOPY | VM_MIXEDMAP; - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_mod(vma, VM_DONTCOPY | VM_MIXEDMAP, VM_MAYWRITE); vma->vm_ops = &binder_vm_ops; vma->vm_private_data = proc; diff --git a/drivers/base/node.c b/drivers/base/node.c index faf3597a96da9..b46db17124f34 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -586,6 +586,9 @@ static const struct attribute_group *node_dev_groups[] = { &node_dev_group, #ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP &arch_node_dev_group, +#endif +#ifdef CONFIG_MEMORY_FAILURE + &memory_failure_attr_group, #endif NULL }; diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 20acc4a1fd6de..37dce184eb56c 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -309,23 +309,9 @@ static void brd_submit_bio(struct bio *bio) bio_endio(bio); } -static int brd_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, enum req_op op) -{ - struct brd_device *brd = bdev->bd_disk->private_data; - int err; - - if (PageTransHuge(page)) - return -ENOTSUPP; - err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector); - page_endio(page, op_is_write(op), err); - return err; -} - static const struct block_device_operations brd_fops = { .owner = THIS_MODULE, .submit_bio = brd_submit_bio, - .rw_page = brd_rw_page, }; /* @@ -411,6 +397,7 @@ static int brd_alloc(int i) /* Tell the block layer that this is not a rotational device */ blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); + blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); err = add_disk(disk); if (err) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 5d1088a645e39..25526707f607c 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1453,10 +1453,6 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, /* Slot should be unlocked before the function call */ zram_slot_unlock(zram, index); - /* A null bio means rw_page was used, we must fallback to bio */ - if (!bio) - return -EOPNOTSUPP; - ret = zram_bvec_read_from_bdev(zram, page, index, bio, partial_io); } @@ -2081,61 +2077,6 @@ static void zram_slot_free_notify(struct block_device *bdev, zram_slot_unlock(zram, index); } -static int zram_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, enum req_op op) -{ - int offset, ret; - u32 index; - struct zram *zram; - struct bio_vec bv; - unsigned long start_time; - - if (PageTransHuge(page)) - return -ENOTSUPP; - zram = bdev->bd_disk->private_data; - - if (!valid_io_request(zram, sector, PAGE_SIZE)) { - atomic64_inc(&zram->stats.invalid_io); - ret = -EINVAL; - goto out; - } - - index = sector >> SECTORS_PER_PAGE_SHIFT; - offset = (sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; - - bv.bv_page = page; - bv.bv_len = PAGE_SIZE; - bv.bv_offset = 0; - - start_time = bdev_start_io_acct(bdev->bd_disk->part0, - SECTORS_PER_PAGE, op, jiffies); - ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL); - bdev_end_io_acct(bdev->bd_disk->part0, op, start_time); -out: - /* - * If I/O fails, just return error(ie, non-zero) without - * calling page_endio. - * It causes resubmit the I/O with bio request by upper functions - * of rw_page(e.g., swap_readpage, __swap_writepage) and - * bio->bi_end_io does things to handle the error - * (e.g., SetPageError, set_page_dirty and extra works). - */ - if (unlikely(ret < 0)) - return ret; - - switch (ret) { - case 0: - page_endio(page, op_is_write(op), 0); - break; - case 1: - ret = 0; - break; - default: - WARN_ON(1); - } - return ret; -} - static void zram_destroy_comps(struct zram *zram) { u32 prio; @@ -2290,7 +2231,6 @@ static const struct block_device_operations zram_devops = { .open = zram_open, .submit_bio = zram_submit_bio, .swap_slot_free_notify = zram_slot_free_notify, - .rw_page = zram_rw_page, .owner = THIS_MODULE }; @@ -2389,6 +2329,7 @@ static int zram_add(void) set_capacity(zram->disk, 0); /* zram devices sort of resembles non-rotational disks */ blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue); + blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue); /* diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index f8231e2e84bec..b35f651837c83 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c @@ -206,7 +206,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma, refcount_set(&vdata->refcnt, 1); vma->vm_private_data = vdata; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); if (vdata->type == MSPEC_UNCACHED) vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &mspec_vm_ops; diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 58d023f6fba39..6d93d61254752 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -2373,7 +2373,7 @@ static int hisi_qm_uacce_mmap(struct uacce_queue *q, return -EINVAL; } - vma->vm_flags |= VM_IO; + vm_flags_set(vma, VM_IO); return remap_pfn_range(vma, vma->vm_start, phys_base >> PAGE_SHIFT, diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 5494d745ced58..223e4e233d19f 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -308,7 +308,7 @@ static int dax_mmap(struct file *filp, struct vm_area_struct *vma) return rc; vma->vm_ops = &dax_vm_ops; - vma->vm_flags |= VM_HUGEPAGE; + vm_flags_set(vma, VM_HUGEPAGE); return 0; } diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index e13e92609943d..674bfefca088d 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -201,7 +201,7 @@ static int idxd_cdev_mmap(struct file *filp, struct vm_area_struct *vma) if (rc < 0) return rc; - vma->vm_flags |= VM_DONTCOPY; + vm_flags_set(vma, VM_DONTCOPY); pfn = (base + idxd_get_wq_portal_full_offset(wq->id, IDXD_PORTAL_LIMITED)) >> PAGE_SHIFT; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index ed1164a87fced..d8e683688daab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -258,7 +258,7 @@ static int amdgpu_gem_object_mmap(struct drm_gem_object *obj, struct vm_area_str */ if (is_cow_mapping(vma->vm_flags) && !(vma->vm_flags & VM_ACCESS_FLAGS)) - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); return drm_gem_ttm_mmap(obj, vma); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f79b8e964140e..81153a201ea43 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -2884,8 +2884,8 @@ static int kfd_mmio_mmap(struct kfd_dev *dev, struct kfd_process *process, address = dev->adev->rmmio_remap.bus_addr; - vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | - VM_DONTDUMP | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | + VM_DONTDUMP | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c index cd4e61bf04939..cbef2e147da58 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c @@ -159,8 +159,8 @@ int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, address = kfd_get_process_doorbells(pdd); if (!address) return -ENOMEM; - vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | - VM_DONTDUMP | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | + VM_DONTDUMP | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 729d26d648af3..dd0436bf349a4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -1052,8 +1052,8 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) pfn = __pa(page->kernel_address); pfn >>= PAGE_SHIFT; - vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE - | VM_DONTDUMP | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE + | VM_DONTDUMP | VM_PFNMAP); pr_debug("Mapping signal page\n"); pr_debug(" start user address == 0x%08lx\n", vma->vm_start); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 72df6286e2407..7acd55a814b2f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1986,8 +1986,8 @@ int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process, return -ENOMEM; } - vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND - | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_DONTCOPY | VM_DONTEXPAND + | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP); /* Mapping pages to user process */ return remap_pfn_range(vma, vma->vm_start, PFN_DOWN(__pa(qpd->cwsr_kaddr)), diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index 59a0bb5ebd852..7a3cb08dc942e 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -1060,7 +1060,7 @@ int drm_gem_mmap_obj(struct drm_gem_object *obj, unsigned long obj_size, goto err_drm_gem_object_put; } - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); } diff --git a/drivers/gpu/drm/drm_gem_dma_helper.c b/drivers/gpu/drm/drm_gem_dma_helper.c index df89fbd2d35cd..870b90b78bc4e 100644 --- a/drivers/gpu/drm/drm_gem_dma_helper.c +++ b/drivers/gpu/drm/drm_gem_dma_helper.c @@ -530,8 +530,7 @@ int drm_gem_dma_mmap(struct drm_gem_dma_object *dma_obj, struct vm_area_struct * * the whole buffer. */ vma->vm_pgoff -= drm_vma_node_start(&obj->vma_node); - vma->vm_flags &= ~VM_PFNMAP; - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_mod(vma, VM_DONTEXPAND, VM_PFNMAP); if (dma_obj->map_noncoherent) { vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c index 4b828ea7119c1..916bdced40c09 100644 --- a/drivers/gpu/drm/drm_gem_shmem_helper.c +++ b/drivers/gpu/drm/drm_gem_shmem_helper.c @@ -630,7 +630,7 @@ int drm_gem_shmem_mmap(struct drm_gem_shmem_object *shmem, struct vm_area_struct if (ret) return ret; - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); if (shmem->map_wc) vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c index f024dc93939ef..87c9fe55dec76 100644 --- a/drivers/gpu/drm/drm_vm.c +++ b/drivers/gpu/drm/drm_vm.c @@ -476,7 +476,7 @@ static int drm_mmap_dma(struct file *filp, struct vm_area_struct *vma) if (!capable(CAP_SYS_ADMIN) && (dma->flags & _DRM_DMA_USE_PCI_RO)) { - vma->vm_flags &= ~(VM_WRITE | VM_MAYWRITE); + vm_flags_clear(vma, VM_WRITE | VM_MAYWRITE); #if defined(__i386__) || defined(__x86_64__) pgprot_val(vma->vm_page_prot) &= ~_PAGE_RW; #else @@ -492,7 +492,7 @@ static int drm_mmap_dma(struct file *filp, struct vm_area_struct *vma) vma->vm_ops = &drm_vm_dma_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); drm_vm_open_locked(dev, vma); return 0; @@ -560,7 +560,7 @@ static int drm_mmap_locked(struct file *filp, struct vm_area_struct *vma) return -EINVAL; if (!capable(CAP_SYS_ADMIN) && (map->flags & _DRM_READ_ONLY)) { - vma->vm_flags &= ~(VM_WRITE | VM_MAYWRITE); + vm_flags_clear(vma, VM_WRITE | VM_MAYWRITE); #if defined(__i386__) || defined(__x86_64__) pgprot_val(vma->vm_page_prot) &= ~_PAGE_RW; #else @@ -628,7 +628,7 @@ static int drm_mmap_locked(struct file *filp, struct vm_area_struct *vma) default: return -EINVAL; /* This should never happen. */ } - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); drm_vm_open_locked(dev, vma); return 0; diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index c5ae5492e1af0..b5f73502e3dd4 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -130,7 +130,7 @@ static int etnaviv_gem_mmap_obj(struct etnaviv_gem_object *etnaviv_obj, { pgprot_t vm_page_prot; - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vm_page_prot = vm_get_page_prot(vma->vm_flags); diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index 3e493f48e0d44..638ca96830e9d 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c @@ -274,7 +274,7 @@ static int exynos_drm_gem_mmap_buffer(struct exynos_drm_gem *exynos_gem, unsigned long vm_size; int ret; - vma->vm_flags &= ~VM_PFNMAP; + vm_flags_clear(vma, VM_PFNMAP); vma->vm_pgoff = 0; vm_size = vma->vm_end - vma->vm_start; @@ -368,7 +368,7 @@ static int exynos_drm_gem_mmap(struct drm_gem_object *obj, struct vm_area_struct if (obj->import_attach) return dma_buf_mmap(obj->dma_buf, vma, 0); - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); DRM_DEV_DEBUG_KMS(to_dma_dev(obj->dev), "flags = 0x%x\n", exynos_gem->flags); diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c index 1f04c07ee180b..1c63283ab7e97 100644 --- a/drivers/gpu/drm/gma500/framebuffer.c +++ b/drivers/gpu/drm/gma500/framebuffer.c @@ -141,7 +141,7 @@ static int psbfb_mmap(struct fb_info *info, struct vm_area_struct *vma) */ vma->vm_ops = &psbfb_vm_ops; vma->vm_private_data = (void *)fb; - vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP); return 0; } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index 2aac6bf787403..d3c1dee16af2b 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -979,7 +979,7 @@ int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma) i915_gem_object_put(obj); return -EINVAL; } - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } anon = mmap_singleton(to_i915(dev)); @@ -988,7 +988,7 @@ int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma) return PTR_ERR(anon); } - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO); /* * We keep the ref on mmo->obj, not vm_file, but we require diff --git a/drivers/gpu/drm/mediatek/mtk_drm_gem.c b/drivers/gpu/drm/mediatek/mtk_drm_gem.c index ec0518aa93152..a25b28d3ee902 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_gem.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_gem.c @@ -163,7 +163,7 @@ static int mtk_drm_gem_object_mmap(struct drm_gem_object *obj, * dma_alloc_attrs() allocated a struct page table for mtk_gem, so clear * VM_PFNMAP flag that was set by drm_gem_mmap_obj()/drm_gem_mmap(). */ - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 1dee0d18abbb6..c2fb98a94bc3e 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -1012,7 +1012,7 @@ static int msm_gem_object_mmap(struct drm_gem_object *obj, struct vm_area_struct { struct msm_gem_object *msm_obj = to_msm_bo(obj); - vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = msm_gem_pgprot(msm_obj, vm_get_page_prot(vma->vm_flags)); return 0; diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index d6b4934fa0fd6..6b58a5bb7b44b 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c @@ -543,8 +543,7 @@ int omap_gem_mmap_obj(struct drm_gem_object *obj, { struct omap_gem_object *omap_obj = to_omap_bo(obj); - vma->vm_flags &= ~VM_PFNMAP; - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_mod(vma, VM_MIXEDMAP, VM_PFNMAP); if (omap_obj->flags & OMAP_BO_WC) { vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c index 6edb7c52cb3dc..8ea09d915c3ca 100644 --- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c +++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c @@ -251,8 +251,7 @@ static int rockchip_drm_gem_object_mmap(struct drm_gem_object *obj, * We allocated a struct page table for rk_obj, so clear * VM_PFNMAP flag that was set by drm_gem_mmap_obj()/drm_gem_mmap(). */ - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; - vma->vm_flags &= ~VM_PFNMAP; + vm_flags_mod(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP, VM_PFNMAP); vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c index 979e7bc902f6a..bce991a2ccc0b 100644 --- a/drivers/gpu/drm/tegra/gem.c +++ b/drivers/gpu/drm/tegra/gem.c @@ -574,7 +574,7 @@ int __tegra_gem_mmap(struct drm_gem_object *gem, struct vm_area_struct *vma) * and set the vm_pgoff (used as a fake buffer offset by DRM) * to 0 as we want to map the whole buffer. */ - vma->vm_flags &= ~VM_PFNMAP; + vm_flags_clear(vma, VM_PFNMAP); vma->vm_pgoff = 0; err = dma_mmap_wc(gem->dev->dev, vma, bo->vaddr, bo->iova, @@ -588,8 +588,7 @@ int __tegra_gem_mmap(struct drm_gem_object *gem, struct vm_area_struct *vma) } else { pgprot_t prot = vm_get_page_prot(vma->vm_flags); - vma->vm_flags |= VM_MIXEDMAP; - vma->vm_flags &= ~VM_PFNMAP; + vm_flags_mod(vma, VM_MIXEDMAP, VM_PFNMAP); vma->vm_page_prot = pgprot_writecombine(prot); } diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index 3ecda6db24b84..ca7744b852f5e 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -471,8 +471,7 @@ int ttm_bo_mmap_obj(struct vm_area_struct *vma, struct ttm_buffer_object *bo) vma->vm_private_data = bo; - vma->vm_flags |= VM_PFNMAP; - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTEXPAND | VM_DONTDUMP); return 0; } EXPORT_SYMBOL(ttm_bo_mmap_obj); diff --git a/drivers/gpu/drm/virtio/virtgpu_vram.c b/drivers/gpu/drm/virtio/virtgpu_vram.c index 6b45b0429fef9..25df81c027837 100644 --- a/drivers/gpu/drm/virtio/virtgpu_vram.c +++ b/drivers/gpu/drm/virtio/virtgpu_vram.c @@ -46,7 +46,7 @@ static int virtio_gpu_vram_mmap(struct drm_gem_object *obj, return -EINVAL; vma->vm_pgoff -= drm_vma_node_start(&obj->vma_node); - vma->vm_flags |= VM_MIXEDMAP | VM_DONTEXPAND; + vm_flags_set(vma, VM_MIXEDMAP | VM_DONTEXPAND); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); vma->vm_ops = &virtio_gpu_vram_vm_ops; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c index 265f7c48d856e..90097d04b45f3 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c @@ -97,7 +97,7 @@ int vmw_mmap(struct file *filp, struct vm_area_struct *vma) /* Use VM_PFNMAP rather than VM_MIXEDMAP if not a COW mapping */ if (!is_cow_mapping(vma->vm_flags)) - vma->vm_flags = (vma->vm_flags & ~VM_MIXEDMAP) | VM_PFNMAP; + vm_flags_mod(vma, VM_PFNMAP, VM_MIXEDMAP); ttm_bo_put(bo); /* release extra ref taken by ttm_bo_mmap_obj() */ diff --git a/drivers/gpu/drm/xen/xen_drm_front_gem.c b/drivers/gpu/drm/xen/xen_drm_front_gem.c index 4c95ebcdcc2d3..3ad2b4cfd1f0b 100644 --- a/drivers/gpu/drm/xen/xen_drm_front_gem.c +++ b/drivers/gpu/drm/xen/xen_drm_front_gem.c @@ -69,8 +69,7 @@ static int xen_drm_front_gem_object_mmap(struct drm_gem_object *gem_obj, * vm_pgoff (used as a fake buffer offset by DRM) to 0 as we want to map * the whole buffer. */ - vma->vm_flags &= ~VM_PFNMAP; - vma->vm_flags |= VM_MIXEDMAP | VM_DONTEXPAND; + vm_flags_mod(vma, VM_MIXEDMAP | VM_DONTEXPAND, VM_PFNMAP); vma->vm_pgoff = 0; /* diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c index 8069f795c8649..daa8e1bff5d9d 100644 --- a/drivers/hsi/clients/cmt_speech.c +++ b/drivers/hsi/clients/cmt_speech.c @@ -1264,7 +1264,7 @@ static int cs_char_mmap(struct file *file, struct vm_area_struct *vma) if (vma_pages(vma) != 1) return -EINVAL; - vma->vm_flags |= VM_IO | VM_DONTDUMP | VM_DONTEXPAND; + vm_flags_set(vma, VM_IO | VM_DONTDUMP | VM_DONTEXPAND); vma->vm_ops = &cs_char_vm_ops; vma->vm_private_data = file->private_data; diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c index 6c8215a47a601..9621efe0e95c4 100644 --- a/drivers/hwtracing/intel_th/msu.c +++ b/drivers/hwtracing/intel_th/msu.c @@ -1659,7 +1659,7 @@ static int intel_th_msc_mmap(struct file *file, struct vm_area_struct *vma) atomic_dec(&msc->user_count); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTCOPY; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTCOPY); vma->vm_ops = &msc_mmap_ops; return ret; } diff --git a/drivers/hwtracing/stm/core.c b/drivers/hwtracing/stm/core.c index 2712e699ba08c..534fbefc7f6aa 100644 --- a/drivers/hwtracing/stm/core.c +++ b/drivers/hwtracing/stm/core.c @@ -715,7 +715,7 @@ static int stm_char_mmap(struct file *file, struct vm_area_struct *vma) pm_runtime_get_sync(&stm->dev); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &stm_mmap_vmops; vm_iomap_memory(vma, phys, size); diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index e03b0207856e7..b1d6ca7e97083 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -424,7 +424,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) ret = -EPERM; goto done; } - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); /* * Mmap multiple separate allocations into a single vma. From * here, dma_mmap_coherent() calls dma_direct_mmap(), which @@ -560,7 +560,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) goto done; } - vma->vm_flags = flags; + vm_flags_reset(vma, flags); mmap_cdbg(ctxt, subctxt, type, mapio, vmf, memaddr, memvirt, memdma, memlen, vma); if (vmf) { diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index f644b2e671dd6..a6a081ccb2656 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2087,7 +2087,7 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); if (!dev->mdev->clock_info) return -EOPNOTSUPP; @@ -2311,7 +2311,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm if (vma->vm_flags & VM_WRITE) return -EPERM; - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); /* Don't expose to user-space information it shouldn't have */ if (PAGE_SIZE > 4096) diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 3937144b2ae5a..80fe92a21f960 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -733,7 +733,7 @@ static int qib_mmap_mem(struct vm_area_struct *vma, struct qib_ctxtdata *rcd, } /* don't allow them to later change with mprotect */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT; @@ -769,7 +769,7 @@ static int mmap_ureg(struct vm_area_struct *vma, struct qib_devdata *dd, phys = dd->physaddr + ureg; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND); ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT, vma->vm_end - vma->vm_start, @@ -810,8 +810,7 @@ static int mmap_piobufs(struct vm_area_struct *vma, * don't allow them to later change to readable with mprotect (for when * not initially mapped readable, as is normally the case) */ - vma->vm_flags &= ~VM_MAYREAD; - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND, VM_MAYREAD); /* We used PAT if wc_cookie == 0 */ if (!dd->wc_cookie) @@ -852,7 +851,7 @@ static int mmap_rcvegrbufs(struct vm_area_struct *vma, goto bail; } /* don't allow them to later change to writable with mprotect */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); start = vma->vm_start; @@ -944,7 +943,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, * Don't allow permission to later change to writable * with mprotect. */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } else goto bail; len = vma->vm_end - vma->vm_start; @@ -955,7 +954,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; vma->vm_ops = &qib_file_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); ret = 1; bail: diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 6e8c4fbb80834..6289238cc5af8 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -672,7 +672,7 @@ int usnic_ib_mmap(struct ib_ucontext *context, usnic_dbg("\n"); us_ibdev = to_usdev(context->device); - vma->vm_flags |= VM_IO; + vm_flags_set(vma, VM_IO); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vfid = vma->vm_pgoff; usnic_dbg("Page Offset %lu PAGE_SHIFT %u VFID %u\n", diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c index 19176583dbde6..9f54aa90a35a6 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c @@ -408,7 +408,7 @@ int pvrdma_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) } /* Map UAR to kernel space, VM_LOCKED? */ - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); if (io_remap_pfn_range(vma, start, context->uar.pfn, size, vma->vm_page_prot)) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 620ae5b2d80dc..6b76037653839 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -446,7 +446,7 @@ iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, * @is_leading: indicate if this is the session leading connection (MCS) * * Return: zero on success, $error if iscsi_conn_bind fails and - * -EINVAL in case end-point doesn't exsits anymore or iser connection + * -EINVAL in case end-point doesn't exists anymore or iser connection * state is not UP (teardown already started). */ static int iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 4c4ac22d5fb17..1cbf063ccf147 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -672,12 +672,12 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as, * allocate page in a sleeping context if GFP flags permit. Hence * spinlock needs to be unlocked and re-locked after allocation. */ - if (!(gfp & __GFP_ATOMIC)) + if (gfpflags_allow_blocking(gfp)) spin_unlock_irqrestore(&as->lock, *flags); page = alloc_page(gfp | __GFP_DMA | __GFP_ZERO); - if (!(gfp & __GFP_ATOMIC)) + if (gfpflags_allow_blocking(gfp)) spin_lock_irqsave(&as->lock, *flags); /* diff --git a/drivers/media/common/videobuf2/videobuf2-dma-contig.c b/drivers/media/common/videobuf2/videobuf2-dma-contig.c index 5f1175f8b3498..205d3cac425cf 100644 --- a/drivers/media/common/videobuf2/videobuf2-dma-contig.c +++ b/drivers/media/common/videobuf2/videobuf2-dma-contig.c @@ -293,7 +293,7 @@ static int vb2_dc_mmap(void *buf_priv, struct vm_area_struct *vma) return ret; } - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_private_data = &buf->handler; vma->vm_ops = &vb2_common_vm_ops; diff --git a/drivers/media/common/videobuf2/videobuf2-vmalloc.c b/drivers/media/common/videobuf2/videobuf2-vmalloc.c index 959b45beb1f3e..a6c6d2fcaaa46 100644 --- a/drivers/media/common/videobuf2/videobuf2-vmalloc.c +++ b/drivers/media/common/videobuf2/videobuf2-vmalloc.c @@ -185,7 +185,7 @@ static int vb2_vmalloc_mmap(void *buf_priv, struct vm_area_struct *vma) /* * Make sure that vm_areas for 2 buffers won't be merged together */ - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); /* * Use common vm_area operations to track buffer refcount. diff --git a/drivers/media/v4l2-core/videobuf-dma-contig.c b/drivers/media/v4l2-core/videobuf-dma-contig.c index f2c4393595574..4c2ec7a0d804a 100644 --- a/drivers/media/v4l2-core/videobuf-dma-contig.c +++ b/drivers/media/v4l2-core/videobuf-dma-contig.c @@ -314,7 +314,7 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q, } vma->vm_ops = &videobuf_vm_ops; - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); vma->vm_private_data = map; dev_dbg(q->dev, "mmap %p: q=%p %08lx-%08lx (%lx) pgoff %08lx buf %d\n", diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index 234e9f647c963..53001532e8e30 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -630,8 +630,8 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q, map->count = 1; map->q = q; vma->vm_ops = &videobuf_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; - vma->vm_flags &= ~VM_IO; /* using shared anonymous pages */ + /* using shared anonymous pages */ + vm_flags_mod(vma, VM_DONTEXPAND | VM_DONTDUMP, VM_IO); vma->vm_private_data = map; dprintk(1, "mmap %p: q=%p %08lx-%08lx pgoff %08lx bufs %d-%d\n", map, q, vma->vm_start, vma->vm_end, vma->vm_pgoff, first, last); diff --git a/drivers/media/v4l2-core/videobuf-vmalloc.c b/drivers/media/v4l2-core/videobuf-vmalloc.c index 9b2443720ab01..85c7090606d60 100644 --- a/drivers/media/v4l2-core/videobuf-vmalloc.c +++ b/drivers/media/v4l2-core/videobuf-vmalloc.c @@ -247,7 +247,7 @@ static int __videobuf_mmap_mapper(struct videobuf_queue *q, } vma->vm_ops = &videobuf_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_private_data = map; dprintk(1, "mmap %p: q=%p %08lx-%08lx (%lx) pgoff %08lx buf %d\n", diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index acaa44809c58a..76b5ea66dfa1f 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -220,7 +220,7 @@ int cxl_context_iomap(struct cxl_context *ctx, struct vm_area_struct *vma) pr_devel("%s: mmio physical: %llx pe: %i master:%i\n", __func__, ctx->psn_phys, ctx->pe , ctx->master); - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &cxl_mmap_vmops; return 0; diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c index 9eb0d93b01c67..7f83116ae11a6 100644 --- a/drivers/misc/ocxl/context.c +++ b/drivers/misc/ocxl/context.c @@ -180,7 +180,7 @@ static int check_mmap_afu_irq(struct ocxl_context *ctx, if ((vma->vm_flags & VM_READ) || (vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_WRITE)) return -EINVAL; - vma->vm_flags &= ~(VM_MAYREAD | VM_MAYEXEC); + vm_flags_clear(vma, VM_MAYREAD | VM_MAYEXEC); return 0; } @@ -204,7 +204,7 @@ int ocxl_context_mmap(struct ocxl_context *ctx, struct vm_area_struct *vma) if (rc) return rc; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &ocxl_vmops; return 0; diff --git a/drivers/misc/ocxl/sysfs.c b/drivers/misc/ocxl/sysfs.c index 25c78df8055d3..405180d47d9bf 100644 --- a/drivers/misc/ocxl/sysfs.c +++ b/drivers/misc/ocxl/sysfs.c @@ -134,7 +134,7 @@ static int global_mmio_mmap(struct file *filp, struct kobject *kobj, (afu->config.global_mmio_size >> PAGE_SHIFT)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &global_mmio_vmops; vma->vm_private_data = afu; diff --git a/drivers/misc/open-dice.c b/drivers/misc/open-dice.c index 9dda47b3fd709..8aea2d070a40c 100644 --- a/drivers/misc/open-dice.c +++ b/drivers/misc/open-dice.c @@ -95,12 +95,12 @@ static int open_dice_mmap(struct file *filp, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE) return -EPERM; /* Ensure userspace cannot acquire VM_WRITE later. */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } /* Create write-combine mapping so all clients observe a wipe. */ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); - vma->vm_flags |= VM_DONTCOPY | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTDUMP); return vm_iomap_memory(vma, drvdata->rmem->base, drvdata->rmem->size); } diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c index 7ffcfc0bb5872..a3d659c11cc4e 100644 --- a/drivers/misc/sgi-gru/grufile.c +++ b/drivers/misc/sgi-gru/grufile.c @@ -101,8 +101,8 @@ static int gru_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_end & (GRU_GSEG_PAGESIZE - 1)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_LOCKED | - VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_LOCKED | + VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = PAGE_SHARED; vma->vm_ops = &gru_vm_ops; diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c index d3a217929a248..07023397afc73 100644 --- a/drivers/misc/uacce/uacce.c +++ b/drivers/misc/uacce/uacce.c @@ -229,7 +229,7 @@ static int uacce_fops_mmap(struct file *filep, struct vm_area_struct *vma) if (!qfr) return -ENOMEM; - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_WIPEONFORK; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_WIPEONFORK); vma->vm_ops = &uacce_vm_ops; vma->vm_private_data = q; qfr->type = type; diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 0297b7882e33b..d5593b0dc7009 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1482,20 +1482,6 @@ static void btt_submit_bio(struct bio *bio) bio_endio(bio); } -static int btt_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, enum req_op op) -{ - struct btt *btt = bdev->bd_disk->private_data; - int rc; - - rc = btt_do_bvec(btt, NULL, page, thp_size(page), 0, op, sector); - if (rc == 0) - page_endio(page, op_is_write(op), 0); - - return rc; -} - - static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo) { /* some standard values */ @@ -1508,7 +1494,6 @@ static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo) static const struct block_device_operations btt_fops = { .owner = THIS_MODULE, .submit_bio = btt_submit_bio, - .rw_page = btt_rw_page, .getgeo = btt_getgeo, }; @@ -1530,6 +1515,7 @@ static int btt_blk_init(struct btt *btt) blk_queue_logical_block_size(btt->btt_disk->queue, btt->sector_size); blk_queue_max_hw_sectors(btt->btt_disk->queue, UINT_MAX); blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_disk->queue); + blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, btt->btt_disk->queue); if (btt_meta_size(btt)) { rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt)); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 96e6e9a5f235d..ceea55f621cc7 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -238,28 +238,6 @@ static void pmem_submit_bio(struct bio *bio) bio_endio(bio); } -static int pmem_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, enum req_op op) -{ - struct pmem_device *pmem = bdev->bd_disk->private_data; - blk_status_t rc; - - if (op_is_write(op)) - rc = pmem_do_write(pmem, page, 0, sector, thp_size(page)); - else - rc = pmem_do_read(pmem, page, 0, sector, thp_size(page)); - /* - * The ->rw_page interface is subtle and tricky. The core - * retries on any error, so we can only invoke page_endio() in - * the successful completion case. Otherwise, we'll see crashes - * caused by double completion. - */ - if (rc == 0) - page_endio(page, op_is_write(op), 0); - - return blk_status_to_errno(rc); -} - /* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, @@ -310,7 +288,6 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, static const struct block_device_operations pmem_fops = { .owner = THIS_MODULE, .submit_bio = pmem_submit_bio, - .rw_page = pmem_rw_page, }; static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, @@ -565,6 +542,7 @@ static int pmem_attach_disk(struct device *dev, blk_queue_logical_block_size(q, pmem_sector_size(ndns)); blk_queue_max_hw_sectors(q, UINT_MAX); blk_queue_flag_set(QUEUE_FLAG_NONROT, q); + blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, q); if (pmem->pfn_flags & PFN_MAP) blk_queue_flag_set(QUEUE_FLAG_DAX, q); diff --git a/drivers/sbus/char/oradax.c b/drivers/sbus/char/oradax.c index 21b7cb6e7e705..e300cf26bc2a8 100644 --- a/drivers/sbus/char/oradax.c +++ b/drivers/sbus/char/oradax.c @@ -389,7 +389,7 @@ static int dax_devmap(struct file *f, struct vm_area_struct *vma) /* completion area is mapped read-only for user */ if (vma->vm_flags & VM_WRITE) return -EPERM; - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); if (remap_pfn_range(vma, vma->vm_start, ctx->ca_buf_ra >> PAGE_SHIFT, len, vma->vm_page_prot)) diff --git a/drivers/scsi/cxlflash/ocxl_hw.c b/drivers/scsi/cxlflash/ocxl_hw.c index 631eda2d467e2..6542818e595a6 100644 --- a/drivers/scsi/cxlflash/ocxl_hw.c +++ b/drivers/scsi/cxlflash/ocxl_hw.c @@ -1167,7 +1167,7 @@ static int afu_mmap(struct file *file, struct vm_area_struct *vma) (ctx->psn_size >> PAGE_SHIFT)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP; + vm_flags_set(vma, VM_IO | VM_PFNMAP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_ops = &ocxlflash_vmops; return 0; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index ff9854f599649..a91049213203f 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1288,7 +1288,7 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma) } sfp->mmap_called = 1; - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_private_data = sfp; vma->vm_ops = &sg_mmap_vm_ops; out: diff --git a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c index 5e53eed8ae951..095cd0ba8c216 100644 --- a/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c +++ b/drivers/staging/media/atomisp/pci/hmm/hmm_bo.c @@ -1072,7 +1072,7 @@ int hmm_bo_mmap(struct vm_area_struct *vma, struct hmm_buffer_object *bo) vma->vm_private_data = bo; vma->vm_ops = &hmm_bo_vm_ops; - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); /* * call hmm_bo_vm_open explicitly. diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 2940559c30860..15ffc8d2ac7ba 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -1928,7 +1928,7 @@ static int tcmu_mmap(struct uio_info *info, struct vm_area_struct *vma) { struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &tcmu_vm_ops; vma->vm_private_data = udev; diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 43afbb7c5ab91..62082d64ece00 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -713,7 +713,7 @@ static const struct vm_operations_struct uio_logical_vm_ops = { static int uio_mmap_logical(struct vm_area_struct *vma) { - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &uio_logical_vm_ops; return 0; } diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 837f3e57f5809..e501a03d6c700 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -279,8 +279,7 @@ static int usbdev_mmap(struct file *file, struct vm_area_struct *vma) } } - vma->vm_flags |= VM_IO; - vma->vm_flags |= (VM_DONTEXPAND | VM_DONTDUMP); + vm_flags_set(vma, VM_IO | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &usbdev_vm_ops; vma->vm_private_data = usbm; diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c index 094e812e9e692..abb1cd35d8a67 100644 --- a/drivers/usb/mon/mon_bin.c +++ b/drivers/usb/mon/mon_bin.c @@ -1272,8 +1272,7 @@ static int mon_bin_mmap(struct file *filp, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE) return -EPERM; - vma->vm_flags &= ~VM_MAYWRITE; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_mod(vma, VM_DONTEXPAND | VM_DONTDUMP, VM_MAYWRITE); vma->vm_private_data = filp->private_data; mon_bin_vma_open(vma); return 0; diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c index e682bc7ee6c99..5e4a77b9bae6b 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.c +++ b/drivers/vdpa/vdpa_user/iova_domain.c @@ -512,7 +512,7 @@ static int vduse_domain_mmap(struct file *file, struct vm_area_struct *vma) { struct vduse_iova_domain *domain = file->private_data; - vma->vm_flags |= VM_DONTDUMP | VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTDUMP | VM_DONTEXPAND); vma->vm_private_data = domain; vma->vm_ops = &vduse_domain_mmap_ops; diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index a6492a25ff6a6..a5ab416cf476c 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1800,7 +1800,7 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma * See remap_pfn_range(), called from vfio_pci_fault() but we can't * change vm_flags within the fault handler. Set them now. */ - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &vfio_pci_mmap_ops; return 0; diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index d1ea79b1bc321..884612f2362c1 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -1360,7 +1360,7 @@ static int vhost_vdpa_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_end - vma->vm_start != notify.size) return -ENOTSUPP; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &vhost_vdpa_vm_ops; return 0; } diff --git a/drivers/video/fbdev/68328fb.c b/drivers/video/fbdev/68328fb.c index 7db03ed77c762..41df61b37a18d 100644 --- a/drivers/video/fbdev/68328fb.c +++ b/drivers/video/fbdev/68328fb.c @@ -391,7 +391,7 @@ static int mc68x328fb_mmap(struct fb_info *info, struct vm_area_struct *vma) #ifndef MMU /* this is uClinux (no MMU) specific code */ - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_start = videomemory; return 0; diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index 1b680742b7f3b..4475cb60ee960 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -228,9 +228,9 @@ static const struct address_space_operations fb_deferred_io_aops = { int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma) { vma->vm_ops = &fb_deferred_io_vm_ops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); if (!(info->flags & FBINFO_VIRTFB)) - vma->vm_flags |= VM_IO; + vm_flags_set(vma, VM_IO); vma->vm_private_data = info; return 0; } diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index a15729beb9d1b..26ffb8755ffb5 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -525,7 +525,7 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_private_data = vm_priv; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &gntalloc_vmops; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 4d9a3050de6a3..61faea1f06630 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -1055,10 +1055,10 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) vma->vm_ops = &gntdev_vmops; - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP | VM_MIXEDMAP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP | VM_MIXEDMAP); if (use_ptemod) - vma->vm_flags |= VM_DONTCOPY; + vm_flags_set(vma, VM_DONTCOPY); vma->vm_private_data = map; if (map->flags) { diff --git a/drivers/xen/privcmd-buf.c b/drivers/xen/privcmd-buf.c index dd5bbb6e1b6b9..2fa10ca5be14c 100644 --- a/drivers/xen/privcmd-buf.c +++ b/drivers/xen/privcmd-buf.c @@ -156,7 +156,7 @@ static int privcmd_buf_mmap(struct file *file, struct vm_area_struct *vma) vma_priv->file_priv = file_priv; vma_priv->users = 1; - vma->vm_flags |= VM_IO | VM_DONTEXPAND; + vm_flags_set(vma, VM_IO | VM_DONTEXPAND); vma->vm_ops = &privcmd_buf_vm_ops; vma->vm_private_data = vma_priv; diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 1edf45ee9890d..e2f580e30a861 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -934,8 +934,8 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) { /* DONTCOPY is essential for Xen because copy_page_range doesn't know * how to recreate these mappings */ - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY | - VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTCOPY | + VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &privcmd_vm_ops; vma->vm_private_data = NULL; diff --git a/fs/afs/write.c b/fs/afs/write.c index 19df10d63323d..2d3b08b7406ca 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -704,85 +704,87 @@ static int afs_writepages_region(struct address_space *mapping, bool max_one_loop) { struct folio *folio; - struct page *head_page; + struct folio_batch fbatch; ssize_t ret; + unsigned int i; int n, skips = 0; _enter("%llx,%llx,", start, end); + folio_batch_init(&fbatch); do { pgoff_t index = start / PAGE_SIZE; - n = find_get_pages_range_tag(mapping, &index, end / PAGE_SIZE, - PAGECACHE_TAG_DIRTY, 1, &head_page); + n = filemap_get_folios_tag(mapping, &index, end / PAGE_SIZE, + PAGECACHE_TAG_DIRTY, &fbatch); + if (!n) break; + for (i = 0; i < n; i++) { + folio = fbatch.folios[i]; + start = folio_pos(folio); /* May regress with THPs */ - folio = page_folio(head_page); - start = folio_pos(folio); /* May regress with THPs */ - - _debug("wback %lx", folio_index(folio)); + _debug("wback %lx", folio_index(folio)); - /* At this point we hold neither the i_pages lock nor the - * page lock: the page may be truncated or invalidated - * (changing page->mapping to NULL), or even swizzled - * back from swapper_space to tmpfs file mapping - */ - if (wbc->sync_mode != WB_SYNC_NONE) { - ret = folio_lock_killable(folio); - if (ret < 0) { - folio_put(folio); - return ret; - } - } else { - if (!folio_trylock(folio)) { - folio_put(folio); - return 0; + /* At this point we hold neither the i_pages lock nor the + * page lock: the page may be truncated or invalidated + * (changing page->mapping to NULL), or even swizzled + * back from swapper_space to tmpfs file mapping + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + ret = folio_lock_killable(folio); + if (ret < 0) { + folio_batch_release(&fbatch); + return ret; + } + } else { + if (!folio_trylock(folio)) + continue; } - } - if (folio_mapping(folio) != mapping || - !folio_test_dirty(folio)) { - start += folio_size(folio); - folio_unlock(folio); - folio_put(folio); - continue; - } + if (folio->mapping != mapping || + !folio_test_dirty(folio)) { + start += folio_size(folio); + folio_unlock(folio); + continue; + } - if (folio_test_writeback(folio) || - folio_test_fscache(folio)) { - folio_unlock(folio); - if (wbc->sync_mode != WB_SYNC_NONE) { - folio_wait_writeback(folio); + if (folio_test_writeback(folio) || + folio_test_fscache(folio)) { + folio_unlock(folio); + if (wbc->sync_mode != WB_SYNC_NONE) { + folio_wait_writeback(folio); #ifdef CONFIG_AFS_FSCACHE - folio_wait_fscache(folio); + folio_wait_fscache(folio); #endif - } else { - start += folio_size(folio); + } else { + start += folio_size(folio); + } + if (wbc->sync_mode == WB_SYNC_NONE) { + if (skips >= 5 || need_resched()) { + *_next = start; + _leave(" = 0 [%llx]", *_next); + return 0; + } + skips++; + } + continue; } - folio_put(folio); - if (wbc->sync_mode == WB_SYNC_NONE) { - if (skips >= 5 || need_resched()) - break; - skips++; + + if (!folio_clear_dirty_for_io(folio)) + BUG(); + ret = afs_write_back_from_locked_folio(mapping, wbc, + folio, start, end); + if (ret < 0) { + _leave(" = %zd", ret); + folio_batch_release(&fbatch); + return ret; } - continue; - } - if (!folio_clear_dirty_for_io(folio)) - BUG(); - ret = afs_write_back_from_locked_folio(mapping, wbc, folio, start, end); - folio_put(folio); - if (ret < 0) { - _leave(" = %zd", ret); - return ret; + start += ret; } - start += ret; - - if (max_one_loop) - break; - + folio_batch_release(&fbatch); cond_resched(); } while (wbc->nr_to_write > 0); diff --git a/fs/aio.c b/fs/aio.c index e85ba0b77f596..b0b17bd098bbc 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -394,7 +394,7 @@ static const struct vm_operations_struct aio_ring_vm_ops = { static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) { - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); vma->vm_ops = &aio_ring_vm_ops; return 0; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 574ecbc8be6cb..6d1b7e7049cf1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2410,14 +2410,14 @@ int btree_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0; int nr_to_write_done = 0; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + unsigned int nr_folios; pgoff_t index; pgoff_t end; /* Inclusive */ int scanned = 0; xa_mark_t tag; - pagevec_init(&pvec); + folio_batch_init(&fbatch); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; @@ -2440,14 +2440,15 @@ int btree_write_cache_pages(struct address_space *mapping, if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag))) { + (nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch))) { unsigned i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; - ret = submit_eb_page(page, wbc, &bio_ctrl, &eb_context); + ret = submit_eb_page(&folio->page, wbc, &bio_ctrl, + &eb_context); if (ret == 0) continue; if (ret < 0) { @@ -2462,7 +2463,7 @@ int btree_write_cache_pages(struct address_space *mapping, */ nr_to_write_done = wbc->nr_to_write <= 0; } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } if (!scanned && !done) { @@ -2537,8 +2538,8 @@ static int extent_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0; int nr_to_write_done = 0; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + unsigned int nr_folios; pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; @@ -2558,7 +2559,7 @@ static int extent_write_cache_pages(struct address_space *mapping, if (!igrab(inode)) return 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; @@ -2596,14 +2597,14 @@ static int extent_write_cache_pages(struct address_space *mapping, tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, - &index, end, tag))) { + (nr_folios = filemap_get_folios_tag(mapping, &index, + end, tag, &fbatch))) { unsigned i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; - done_index = page->index + 1; + done_index = folio->index + folio_nr_pages(folio); /* * At this point we hold neither the i_pages lock nor * the page lock: the page may be truncated or @@ -2611,29 +2612,29 @@ static int extent_write_cache_pages(struct address_space *mapping, * or even swizzled back from swapper_space to * tmpfs file mapping */ - if (!trylock_page(page)) { + if (!folio_trylock(folio)) { submit_write_bio(bio_ctrl, 0); - lock_page(page); + folio_lock(folio); } - if (unlikely(page->mapping != mapping)) { - unlock_page(page); + if (unlikely(folio->mapping != mapping)) { + folio_unlock(folio); continue; } if (wbc->sync_mode != WB_SYNC_NONE) { - if (PageWriteback(page)) + if (folio_test_writeback(folio)) submit_write_bio(bio_ctrl, 0); - wait_on_page_writeback(page); + folio_wait_writeback(folio); } - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); + if (folio_test_writeback(folio) || + !folio_clear_dirty_for_io(folio)) { + folio_unlock(folio); continue; } - ret = __extent_writepage(page, wbc, bio_ctrl); + ret = __extent_writepage(&folio->page, wbc, bio_ctrl); if (ret < 0) { done = 1; break; @@ -2646,7 +2647,7 @@ static int extent_write_cache_pages(struct address_space *mapping, */ nr_to_write_done = wbc->nr_to_write <= 0; } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } if (!scanned && !done) { diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index cac4083e387a5..d5335f4452336 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -800,7 +800,7 @@ static int ceph_writepages_start(struct address_space *mapping, struct ceph_vino vino = ceph_vino(inode); pgoff_t index, start_index, end = -1; struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; - struct pagevec pvec; + struct folio_batch fbatch; int rc = 0; unsigned int wsize = i_blocksize(inode); struct ceph_osd_request *req = NULL; @@ -829,7 +829,7 @@ static int ceph_writepages_start(struct address_space *mapping, if (fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; - pagevec_init(&pvec); + folio_batch_init(&fbatch); start_index = wbc->range_cyclic ? mapping->writeback_index : 0; index = start_index; @@ -877,7 +877,7 @@ static int ceph_writepages_start(struct address_space *mapping, while (!done && index <= end) { int num_ops = 0, op_idx; - unsigned i, pvec_pages, max_pages, locked_pages = 0; + unsigned i, nr_folios, max_pages, locked_pages = 0; struct page **pages = NULL, **data_pages; struct page *page; pgoff_t strip_unit_end = 0; @@ -887,13 +887,13 @@ static int ceph_writepages_start(struct address_space *mapping, max_pages = wsize >> PAGE_SHIFT; get_more_pages: - pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, - end, PAGECACHE_TAG_DIRTY); - dout("pagevec_lookup_range_tag got %d\n", pvec_pages); - if (!pvec_pages && !locked_pages) + nr_folios = filemap_get_folios_tag(mapping, &index, + end, PAGECACHE_TAG_DIRTY, &fbatch); + dout("pagevec_lookup_range_tag got %d\n", nr_folios); + if (!nr_folios && !locked_pages) break; - for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { - page = pvec.pages[i]; + for (i = 0; i < nr_folios && locked_pages < max_pages; i++) { + page = &fbatch.folios[i]->page; dout("? %p idx %lu\n", page, page->index); if (locked_pages == 0) lock_page(page); /* first page */ @@ -1003,7 +1003,7 @@ static int ceph_writepages_start(struct address_space *mapping, len = 0; } - /* note position of first page in pvec */ + /* note position of first page in fbatch */ dout("%p will write page %p idx %lu\n", inode, page, page->index); @@ -1013,30 +1013,30 @@ static int ceph_writepages_start(struct address_space *mapping, fsc->write_congested = true; pages[locked_pages++] = page; - pvec.pages[i] = NULL; + fbatch.folios[i] = NULL; len += thp_size(page); } /* did we get anything? */ if (!locked_pages) - goto release_pvec_pages; + goto release_folios; if (i) { unsigned j, n = 0; - /* shift unused page to beginning of pvec */ - for (j = 0; j < pvec_pages; j++) { - if (!pvec.pages[j]) + /* shift unused page to beginning of fbatch */ + for (j = 0; j < nr_folios; j++) { + if (!fbatch.folios[j]) continue; if (n < j) - pvec.pages[n] = pvec.pages[j]; + fbatch.folios[n] = fbatch.folios[j]; n++; } - pvec.nr = n; + fbatch.nr = n; - if (pvec_pages && i == pvec_pages && + if (nr_folios && i == nr_folios && locked_pages < max_pages) { - dout("reached end pvec, trying for more\n"); - pagevec_release(&pvec); + dout("reached end fbatch, trying for more\n"); + folio_batch_release(&fbatch); goto get_more_pages; } } @@ -1172,10 +1172,10 @@ static int ceph_writepages_start(struct address_space *mapping, if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) done = true; -release_pvec_pages: - dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, - pvec.nr ? pvec.pages[0] : NULL); - pagevec_release(&pvec); +release_folios: + dout("folio_batch release on %d folios (%p)\n", (int)fbatch.nr, + fbatch.nr ? fbatch.folios[0] : NULL); + folio_batch_release(&fbatch); } if (should_loop && !done) { @@ -1192,15 +1192,17 @@ static int ceph_writepages_start(struct address_space *mapping, unsigned i, nr; index = 0; while ((index <= end) && - (nr = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK))) { + (nr = filemap_get_folios_tag(mapping, &index, + (pgoff_t)-1, + PAGECACHE_TAG_WRITEBACK, + &fbatch))) { for (i = 0; i < nr; i++) { - page = pvec.pages[i]; + page = &fbatch.folios[i]->page; if (page_snap_context(page) != snapc) continue; wait_on_page_writeback(page); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 1d9cc59d82592..5568a5f4bc5a7 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2528,14 +2528,40 @@ wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, unsigned int *found_pages) { struct cifs_writedata *wdata; - + struct folio_batch fbatch; + unsigned int i, idx, p, nr; wdata = cifs_writedata_alloc((unsigned int)tofind, cifs_writev_complete); if (!wdata) return NULL; - *found_pages = find_get_pages_range_tag(mapping, index, end, - PAGECACHE_TAG_DIRTY, tofind, wdata->pages); + folio_batch_init(&fbatch); + *found_pages = 0; + +again: + nr = filemap_get_folios_tag(mapping, index, end, + PAGECACHE_TAG_DIRTY, &fbatch); + if (!nr) + goto out; /* No dirty pages left in the range */ + + for (i = 0; i < nr; i++) { + struct folio *folio = fbatch.folios[i]; + + idx = 0; + p = folio_nr_pages(folio); +add_more: + wdata->pages[*found_pages] = folio_page(folio, idx); + folio_get(folio); + if (++*found_pages == tofind) { + folio_batch_release(&fbatch); + goto out; + } + if (++idx < p) + goto add_more; + } + folio_batch_release(&fbatch); + goto again; +out: return wdata; } @@ -2650,14 +2676,14 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages, static int cifs_writepage_locked(struct page *page, struct writeback_control *wbc); -static int cifs_write_one_page(struct page *page, struct writeback_control *wbc, - void *data) +static int cifs_write_one_page(struct folio *folio, + struct writeback_control *wbc, void *data) { struct address_space *mapping = data; int ret; - ret = cifs_writepage_locked(page, wbc); - unlock_page(page); + ret = cifs_writepage_locked(&folio->page, wbc); + folio_unlock(folio); mapping_set_error(mapping, ret); return ret; } diff --git a/fs/coredump.c b/fs/coredump.c index 68619329ec652..3e34cb44c42f7 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1111,14 +1111,14 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, * Helper function for iterating across a vma list. It ensures that the caller * will visit `gate_vma' prior to terminating the search. */ -static struct vm_area_struct *coredump_next_vma(struct ma_state *mas, +static struct vm_area_struct *coredump_next_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *gate_vma) { if (gate_vma && (vma == gate_vma)) return NULL; - vma = mas_next(mas, ULONG_MAX); + vma = vma_next(vmi); if (vma) return vma; return gate_vma; @@ -1146,7 +1146,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) { struct vm_area_struct *gate_vma, *vma = NULL; struct mm_struct *mm = current->mm; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); int i = 0; /* @@ -1167,7 +1167,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) return false; } - while ((vma = coredump_next_vma(&mas, vma, gate_vma)) != NULL) { + while ((vma = coredump_next_vma(&vmi, vma, gate_vma)) != NULL) { struct core_vma_metadata *m = cprm->vma_meta + i; m->start = vma->vm_start; diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig index d98cef0dbb6bb..4612c9bbf1022 100644 --- a/fs/cramfs/Kconfig +++ b/fs/cramfs/Kconfig @@ -38,7 +38,7 @@ config CRAMFS_MTD default y if !CRAMFS_BLOCKDEV help This option allows the CramFs driver to load data directly from - a linear adressed memory range (usually non volatile memory + a linear addressed memory range (usually non-volatile memory like flash) instead of going through the block device layer. This saves some memory since no intermediate buffering is necessary. diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 429696f18b6c8..e3d168911dbe1 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -407,7 +407,7 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) * unpopulated ptes via cramfs_read_folio(). */ int i; - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_set(vma, VM_MIXEDMAP); for (i = 0; i < pages && !ret; i++) { vm_fault_t vmf; unsigned long off = i * PAGE_SIZE; diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 2713257ee7180..ba815a210fd96 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -426,7 +426,7 @@ static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; vma->vm_ops = &erofs_dax_vm_ops; - vma->vm_flags |= VM_HUGEPAGE; + vm_flags_set(vma, VM_HUGEPAGE); return 0; } #else diff --git a/fs/exec.c b/fs/exec.c index b0598f0ff4b03..d40e019a5ac55 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -270,7 +270,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; + vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); err = insert_vm_struct(mm, vma); @@ -699,7 +699,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) /* * cover the whole range: [new_start, old_end) */ - if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL)) + if (vma_expand(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL)) return -ENOMEM; /* @@ -731,12 +731,9 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) } tlb_finish_mmu(&tlb); - /* - * Shrink the vma to just the new range. Always succeeds. - */ - vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); - - return 0; + vma_prev(&vmi); + /* Shrink the vma to just the new range */ + return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); } /* @@ -758,6 +755,7 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_expand; unsigned long rlim_stack; struct mmu_gather tlb; + struct vma_iterator vmi; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ @@ -812,8 +810,10 @@ int setup_arg_pages(struct linux_binprm *bprm, vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; + vma_iter_init(&vmi, mm, vma->vm_start); + tlb_gather_mmu(&tlb, mm); - ret = mprotect_fixup(&tlb, vma, &prev, vma->vm_start, vma->vm_end, + ret = mprotect_fixup(&vmi, &tlb, vma, &prev, vma->vm_start, vma->vm_end, vm_flags); tlb_finish_mmu(&tlb); @@ -834,7 +834,7 @@ int setup_arg_pages(struct linux_binprm *bprm, } /* mprotect_fixup is overkill to remove the temporary stack flags */ - vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP; + vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP); stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 7ac0a81bd3713..6bdf61a62c796 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -801,7 +801,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; - vma->vm_flags |= VM_HUGEPAGE; + vm_flags_set(vma, VM_HUGEPAGE); } else { vma->vm_ops = &ext4_file_vm_ops; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b936ee3af51e2..40579ef513b71 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -786,11 +786,10 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) * once we get rid of using bh as a container for mapping information * to pass to / from get_block functions, this can go away. */ + old_state = READ_ONCE(bh->b_state); do { - old_state = READ_ONCE(bh->b_state); new_state = (old_state & ~EXT4_MAP_FLAGS) | flags; - } while (unlikely( - cmpxchg(&bh->b_state, old_state, new_state) != old_state)); + } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state))); } static int _ext4_get_block(struct inode *inode, sector_t iblock, @@ -2596,8 +2595,8 @@ static bool ext4_page_nomap_can_writeout(struct page *page) static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; - struct pagevec pvec; - unsigned int nr_pages; + struct folio_batch fbatch; + unsigned int nr_folios; long left = mpd->wbc->nr_to_write; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; @@ -2611,18 +2610,17 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; - - pagevec_init(&pvec); + folio_batch_init(&fbatch); mpd->map.m_len = 0; mpd->next_page = index; while (index <= end) { - nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag); - if (nr_pages == 0) + nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch); + if (nr_folios == 0) break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; /* * Accumulated enough dirty pages? This doesn't apply @@ -2636,10 +2634,10 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) goto out; /* If we can't merge this page, we are done. */ - if (mpd->map.m_len > 0 && mpd->next_page != page->index) + if (mpd->map.m_len > 0 && mpd->next_page != folio->index) goto out; - lock_page(page); + folio_lock(folio); /* * If the page is no longer dirty, or its mapping no * longer corresponds to inode we are writing (which @@ -2647,16 +2645,16 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * page is already under writeback and we are not doing * a data integrity writeback, skip the page */ - if (!PageDirty(page) || - (PageWriteback(page) && + if (!folio_test_dirty(folio) || + (folio_test_writeback(folio) && (mpd->wbc->sync_mode == WB_SYNC_NONE)) || - unlikely(page->mapping != mapping)) { - unlock_page(page); + unlikely(folio->mapping != mapping)) { + folio_unlock(folio); continue; } - wait_on_page_writeback(page); - BUG_ON(PageWriteback(page)); + folio_wait_writeback(folio); + BUG_ON(folio_test_writeback(folio)); /* * Should never happen but for buggy code in @@ -2667,56 +2665,56 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz */ - if (!page_has_buffers(page)) { - ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", page->index); - ClearPageDirty(page); - unlock_page(page); + if (!folio_buffers(folio)) { + ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index); + folio_clear_dirty(folio); + folio_unlock(folio); continue; } if (mpd->map.m_len == 0) - mpd->first_page = page->index; - mpd->next_page = page->index + 1; + mpd->first_page = folio->index; + mpd->next_page = folio->index + folio_nr_pages(folio); /* * Writeout for transaction commit where we cannot * modify metadata is simple. Just submit the page. */ if (!mpd->can_map) { - if (ext4_page_nomap_can_writeout(page)) { - err = mpage_submit_page(mpd, page); + if (ext4_page_nomap_can_writeout(&folio->page)) { + err = mpage_submit_page(mpd, &folio->page); if (err < 0) goto out; } else { - unlock_page(page); - mpd->first_page++; + folio_unlock(folio); + mpd->first_page += folio_nr_pages(folio); } } else { /* Add all dirty buffers to mpd */ - lblk = ((ext4_lblk_t)page->index) << + lblk = ((ext4_lblk_t)folio->index) << (PAGE_SHIFT - blkbits); - head = page_buffers(page); + head = folio_buffers(folio); err = mpage_process_page_bufs(mpd, head, head, - lblk); + lblk); if (err <= 0) goto out; err = 0; } - left--; + left -= folio_nr_pages(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } mpd->scanned_until_end = 1; return 0; out: - pagevec_release(&pvec); + folio_batch_release(&fbatch); return err; } -static int ext4_writepage_cb(struct page *page, struct writeback_control *wbc, +static int ext4_writepage_cb(struct folio *folio, struct writeback_control *wbc, void *data) { - return ext4_writepage(page, wbc); + return ext4_writepage(&folio->page, wbc); } static int ext4_do_writepages(struct mpage_da_data *mpd) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b31db521d6bf3..b8aaf55dff342 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -482,7 +482,7 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) * * However, we may have to redirty a page (see below.) */ -static int ext4_journalled_writepage_callback(struct page *page, +static int ext4_journalled_writepage_callback(struct folio *folio, struct writeback_control *wbc, void *data) { @@ -490,7 +490,7 @@ static int ext4_journalled_writepage_callback(struct page *page, struct buffer_head *bh, *head; struct journal_head *jh; - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); do { /* * We have to redirty a page in these cases: @@ -509,7 +509,7 @@ static int ext4_journalled_writepage_callback(struct page *page, if (buffer_dirty(bh) || (jh && (jh->b_transaction != transaction || jh->b_next_transaction))) { - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); goto out; } } while ((bh = bh->b_this_page) != head); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 5c481824a4c63..b1b39c17f2d82 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -393,59 +393,62 @@ long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, { struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, prev = ULONG_MAX; - struct pagevec pvec; + struct folio_batch fbatch; long nwritten = 0; - int nr_pages; + int nr_folios; struct writeback_control wbc = { .for_reclaim = 0, }; struct blk_plug plug; - pagevec_init(&pvec); + folio_batch_init(&fbatch); blk_start_plug(&plug); - while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY))) { + while ((nr_folios = filemap_get_folios_tag(mapping, &index, + (pgoff_t)-1, + PAGECACHE_TAG_DIRTY, &fbatch))) { int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; - if (prev == ULONG_MAX) - prev = page->index - 1; - if (nr_to_write != LONG_MAX && page->index != prev + 1) { - pagevec_release(&pvec); + if (nr_to_write != LONG_MAX && i != 0 && + folio->index != prev + + folio_nr_pages(fbatch.folios[i-1])) { + folio_batch_release(&fbatch); goto stop; } - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != mapping)) { + if (unlikely(folio->mapping != mapping)) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } - f2fs_wait_on_page_writeback(page, META, true, true); + f2fs_wait_on_page_writeback(&folio->page, META, + true, true); - if (!clear_page_dirty_for_io(page)) + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; - if (__f2fs_write_meta_page(page, &wbc, io_type)) { - unlock_page(page); + if (__f2fs_write_meta_page(&folio->page, &wbc, + io_type)) { + folio_unlock(folio); break; } - nwritten++; - prev = page->index; + nwritten += folio_nr_pages(folio); + prev = folio->index; if (unlikely(nwritten >= nr_to_write)) break; } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } stop: @@ -1400,7 +1403,7 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi, }; /* - * pagevec_lookup_tag and lock_page again will take + * filemap_get_folios_tag and lock_page again will take * some extra time. Therefore, f2fs_update_meta_pages and * f2fs_sync_meta_pages are combined in this function. */ diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index bc9afc2d421ee..2c8360fc08452 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2951,6 +2951,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0, retry = 0; struct page *pages[F2FS_ONSTACK_PAGES]; + struct folio_batch fbatch; struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); struct bio *bio = NULL; sector_t last_block; @@ -2971,6 +2972,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, .private = NULL, }; #endif + int nr_folios, p, idx; int nr_pages; pgoff_t index; pgoff_t end; /* Inclusive */ @@ -2981,6 +2983,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int submitted = 0; int i; + folio_batch_init(&fbatch); + if (get_dirty_pages(mapping->host) <= SM_I(F2FS_M_SB(mapping))->min_hot_blocks) set_inode_flag(mapping->host, FI_HOT_DATA); @@ -3006,13 +3010,38 @@ static int f2fs_write_cache_pages(struct address_space *mapping, tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !retry && (index <= end)) { - nr_pages = find_get_pages_range_tag(mapping, &index, end, - tag, F2FS_ONSTACK_PAGES, pages); - if (nr_pages == 0) + nr_pages = 0; +again: + nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch); + if (nr_folios == 0) { + if (nr_pages) + goto write; break; + } + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; + + idx = 0; + p = folio_nr_pages(folio); +add_more: + pages[nr_pages] = folio_page(folio, idx); + folio_get(folio); + if (++nr_pages == F2FS_ONSTACK_PAGES) { + index = folio->index + idx + 1; + folio_batch_release(&fbatch); + goto write; + } + if (++idx < p) + goto add_more; + } + folio_batch_release(&fbatch); + goto again; +write: for (i = 0; i < nr_pages; i++) { struct page *page = pages[i]; + struct folio *folio = page_folio(page); bool need_readd; readd: need_readd = false; @@ -3029,7 +3058,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, } if (!f2fs_cluster_can_merge_page(&cc, - page->index)) { + folio->index)) { ret = f2fs_write_multi_pages(&cc, &submitted, wbc, io_type); if (!ret) @@ -3038,27 +3067,28 @@ static int f2fs_write_cache_pages(struct address_space *mapping, } if (unlikely(f2fs_cp_error(sbi))) - goto lock_page; + goto lock_folio; if (!f2fs_cluster_is_empty(&cc)) - goto lock_page; + goto lock_folio; if (f2fs_all_cluster_page_ready(&cc, pages, i, nr_pages, true)) - goto lock_page; + goto lock_folio; ret2 = f2fs_prepare_compress_overwrite( inode, &pagep, - page->index, &fsdata); + folio->index, &fsdata); if (ret2 < 0) { ret = ret2; done = 1; break; } else if (ret2 && (!f2fs_compress_write_end(inode, - fsdata, page->index, 1) || + fsdata, folio->index, 1) || !f2fs_all_cluster_page_ready(&cc, - pages, i, nr_pages, false))) { + pages, i, nr_pages, + false))) { retry = 1; break; } @@ -3071,46 +3101,47 @@ static int f2fs_write_cache_pages(struct address_space *mapping, break; } #ifdef CONFIG_F2FS_FS_COMPRESSION -lock_page: +lock_folio: #endif - done_index = page->index; + done_index = folio->index; retry_write: - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != mapping)) { + if (unlikely(folio->mapping != mapping)) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } - if (PageWriteback(page)) { + if (folio_test_writeback(folio)) { if (wbc->sync_mode != WB_SYNC_NONE) - f2fs_wait_on_page_writeback(page, + f2fs_wait_on_page_writeback( + &folio->page, DATA, true, true); else goto continue_unlock; } - if (!clear_page_dirty_for_io(page)) + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { - get_page(page); - f2fs_compress_ctx_add_page(&cc, page); + folio_get(folio); + f2fs_compress_ctx_add_page(&cc, &folio->page); continue; } #endif - ret = f2fs_write_single_data_page(page, &submitted, - &bio, &last_block, wbc, io_type, - 0, true); + ret = f2fs_write_single_data_page(&folio->page, + &submitted, &bio, &last_block, + wbc, io_type, 0, true); if (ret == AOP_WRITEPAGE_ACTIVATE) - unlock_page(page); + folio_unlock(folio); #ifdef CONFIG_F2FS_FS_COMPRESSION result: #endif @@ -3134,7 +3165,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping, } goto next; } - done_index = page->index + 1; + done_index = folio->index + + folio_nr_pages(folio); done = 1; break; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 19a1fee88a363..bd1dad5237967 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1518,23 +1518,24 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index; - struct pagevec pvec; + struct folio_batch fbatch; struct page *last_page = NULL; - int nr_pages; + int nr_folios; - pagevec_init(&pvec); + folio_batch_init(&fbatch); index = 0; - while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY))) { + while ((nr_folios = filemap_get_folios_tag(NODE_MAPPING(sbi), &index, + (pgoff_t)-1, PAGECACHE_TAG_DIRTY, + &fbatch))) { int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct page *page = &fbatch.folios[i]->page; if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); - pagevec_release(&pvec); + folio_batch_release(&fbatch); return ERR_PTR(-EIO); } @@ -1565,7 +1566,7 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) last_page = page; unlock_page(page); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } return last_page; @@ -1730,12 +1731,12 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, unsigned int *seq_id) { pgoff_t index; - struct pagevec pvec; + struct folio_batch fbatch; int ret = 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; - int nr_pages; + int nr_folios; int nwritten = 0; if (atomic) { @@ -1744,20 +1745,21 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, return PTR_ERR_OR_ZERO(last_page); } retry: - pagevec_init(&pvec); + folio_batch_init(&fbatch); index = 0; - while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY))) { + while ((nr_folios = filemap_get_folios_tag(NODE_MAPPING(sbi), &index, + (pgoff_t)-1, PAGECACHE_TAG_DIRTY, + &fbatch))) { int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct page *page = &fbatch.folios[i]->page; bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); - pagevec_release(&pvec); + folio_batch_release(&fbatch); ret = -EIO; goto out; } @@ -1823,7 +1825,7 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, break; } } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); if (ret || marked) @@ -1888,17 +1890,18 @@ static bool flush_dirty_inode(struct page *page) void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) { pgoff_t index = 0; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + int nr_folios; - pagevec_init(&pvec); + folio_batch_init(&fbatch); - while ((nr_pages = pagevec_lookup_tag(&pvec, - NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) { + while ((nr_folios = filemap_get_folios_tag(NODE_MAPPING(sbi), &index, + (pgoff_t)-1, PAGECACHE_TAG_DIRTY, + &fbatch))) { int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct page *page = &fbatch.folios[i]->page; if (!IS_DNODE(page)) continue; @@ -1925,7 +1928,7 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) } unlock_page(page); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } } @@ -1935,23 +1938,24 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, bool do_balance, enum iostat_type io_type) { pgoff_t index; - struct pagevec pvec; + struct folio_batch fbatch; int step = 0; int nwritten = 0; int ret = 0; - int nr_pages, done = 0; + int nr_folios, done = 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); next_step: index = 0; - while (!done && (nr_pages = pagevec_lookup_tag(&pvec, - NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) { + while (!done && (nr_folios = filemap_get_folios_tag(NODE_MAPPING(sbi), + &index, (pgoff_t)-1, PAGECACHE_TAG_DIRTY, + &fbatch))) { int i; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct page *page = &fbatch.folios[i]->page; bool submitted = false; /* give a priority to WB_SYNC threads */ @@ -2026,7 +2030,7 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, if (--wbc->nr_to_write == 0) break; } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); if (wbc->nr_to_write == 0) { diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index fceda1de48055..c4d00999a4330 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -200,7 +200,7 @@ static const struct dentry_operations vfat_dentry_ops = { /* Characters that are undesirable in an MS-DOS file name */ -static inline wchar_t vfat_bad_char(wchar_t w) +static inline bool vfat_bad_char(wchar_t w) { return (w < 0x0020) || (w == '*') || (w == '?') || (w == '<') || (w == '>') @@ -208,7 +208,7 @@ static inline wchar_t vfat_bad_char(wchar_t w) || (w == '\\'); } -static inline wchar_t vfat_replace_char(wchar_t w) +static inline bool vfat_replace_char(wchar_t w) { return (w == '[') || (w == ']') || (w == ';') || (w == ',') || (w == '+') || (w == '='); diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c index c99282df7761a..f439877ea6e80 100644 --- a/fs/freevxfs/vxfs_subr.c +++ b/fs/freevxfs/vxfs_subr.c @@ -31,7 +31,7 @@ vxfs_put_page(struct page *pp) /** * vxfs_get_page - read a page into memory. - * @ip: inode to read from + * @mapping: mapping to read from * @n: page number * * Description: @@ -81,14 +81,14 @@ vxfs_bread(struct inode *ip, int block) } /** - * vxfs_get_block - locate buffer for given inode,block tuple + * vxfs_getblk - locate buffer for given inode,block tuple * @ip: inode * @iblock: logical block * @bp: buffer skeleton * @create: %TRUE if blocks may be newly allocated. * * Description: - * The vxfs_get_block function fills @bp with the right physical + * The vxfs_getblk function fills @bp with the right physical * block and device number to perform a lowlevel read/write on * it. * diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index c3b82f716f9a7..310d73e254df2 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -165,7 +165,7 @@ static int vxfs_try_sb_magic(struct super_block *sbp, int silent, } /** - * vxfs_read_super - read superblock into memory and initialize filesystem + * vxfs_fill_super - read superblock into memory and initialize filesystem * @sbp: VFS superblock (to fill) * @dp: fs private mount data * @silent: do not complain loudly when sth is wrong diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6fba5a52127b2..195dc23e0d831 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -237,7 +237,7 @@ void wb_wait_for_completion(struct wb_completion *done) static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); static struct workqueue_struct *isw_wq; -void __inode_attach_wb(struct inode *inode, struct page *page) +void __inode_attach_wb(struct inode *inode, struct folio *folio) { struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = NULL; @@ -245,8 +245,8 @@ void __inode_attach_wb(struct inode *inode, struct page *page) if (inode_cgwb_enabled(inode)) { struct cgroup_subsys_state *memcg_css; - if (page) { - memcg_css = mem_cgroup_css_from_page(page); + if (folio) { + memcg_css = mem_cgroup_css_from_folio(folio); wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); } else { /* must pin memcg_css, see wb_get_create() */ @@ -859,6 +859,7 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode); void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, size_t bytes) { + struct folio *folio; struct cgroup_subsys_state *css; int id; @@ -871,7 +872,8 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, if (!wbc->wb || wbc->no_cgroup_owner) return; - css = mem_cgroup_css_from_page(page); + folio = page_folio(page); + css = mem_cgroup_css_from_folio(folio); /* dead cgroups shouldn't contribute to inode ownership arbitration */ if (!(css->flags & CSS_ONLINE)) return; diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index e23e802a80130..8e74f278a3f69 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -860,7 +860,7 @@ int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); vma->vm_ops = &fuse_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + vm_flags_set(vma, VM_MIXEDMAP | VM_HUGEPAGE); return 0; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 168a99e971934..73fc2479d0ca1 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2223,7 +2223,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, return false; } -static int fuse_writepages_fill(struct page *page, +static int fuse_writepages_fill(struct folio *folio, struct writeback_control *wbc, void *_data) { struct fuse_fill_wb_data *data = _data; @@ -2242,7 +2242,7 @@ static int fuse_writepages_fill(struct page *page, goto out_unlock; } - if (wpa && fuse_writepage_need_send(fc, page, ap, data)) { + if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) { fuse_writepages_send(data); data->wpa = NULL; } @@ -2277,7 +2277,7 @@ static int fuse_writepages_fill(struct page *page, data->max_pages = 1; ap = &wpa->ia.ap; - fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0); + fuse_write_args_fill(&wpa->ia, data->ff, folio_pos(folio), 0); wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; wpa->next = NULL; ap->args.in_pages = true; @@ -2285,13 +2285,13 @@ static int fuse_writepages_fill(struct page *page, ap->num_pages = 0; wpa->inode = inode; } - set_page_writeback(page); + folio_start_writeback(folio); - copy_highpage(tmp_page, page); + copy_highpage(tmp_page, &folio->page); ap->pages[ap->num_pages] = tmp_page; ap->descs[ap->num_pages].offset = 0; ap->descs[ap->num_pages].length = PAGE_SIZE; - data->orig_pages[ap->num_pages] = page; + data->orig_pages[ap->num_pages] = &folio->page; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); @@ -2305,13 +2305,13 @@ static int fuse_writepages_fill(struct page *page, spin_lock(&fi->lock); ap->num_pages++; spin_unlock(&fi->lock); - } else if (fuse_writepage_add(wpa, page)) { + } else if (fuse_writepage_add(wpa, &folio->page)) { data->wpa = wpa; } else { - end_page_writeback(page); + folio_end_writeback(folio); } out_unlock: - unlock_page(page); + folio_unlock(folio); return err; } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index e782b4f1d1043..0a47068f9acce 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -195,67 +195,71 @@ static int gfs2_writepages(struct address_space *mapping, } /** - * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages + * gfs2_write_jdata_batch - Write back a folio batch's worth of folios * @mapping: The mapping * @wbc: The writeback control - * @pvec: The vector of pages - * @nr_pages: The number of pages to write + * @fbatch: The batch of folios * @done_index: Page index * * Returns: non-zero if loop should terminate, zero otherwise */ -static int gfs2_write_jdata_pagevec(struct address_space *mapping, +static int gfs2_write_jdata_batch(struct address_space *mapping, struct writeback_control *wbc, - struct pagevec *pvec, - int nr_pages, + struct folio_batch *fbatch, pgoff_t *done_index) { struct inode *inode = mapping->host; struct gfs2_sbd *sdp = GFS2_SB(inode); - unsigned nrblocks = nr_pages * (PAGE_SIZE >> inode->i_blkbits); + unsigned nrblocks; int i; int ret; + int nr_pages = 0; + int nr_folios = folio_batch_count(fbatch); + + for (i = 0; i < nr_folios; i++) + nr_pages += folio_nr_pages(fbatch->folios[i]); + nrblocks = nr_pages * (PAGE_SIZE >> inode->i_blkbits); ret = gfs2_trans_begin(sdp, nrblocks, nrblocks); if (ret < 0) return ret; - for(i = 0; i < nr_pages; i++) { - struct page *page = pvec->pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch->folios[i]; - *done_index = page->index; + *done_index = folio->index; - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != mapping)) { + if (unlikely(folio->mapping != mapping)) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } - if (PageWriteback(page)) { + if (folio_test_writeback(folio)) { if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); + folio_wait_writeback(folio); else goto continue_unlock; } - BUG_ON(PageWriteback(page)); - if (!clear_page_dirty_for_io(page)) + BUG_ON(folio_test_writeback(folio)); + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; trace_wbc_writepage(wbc, inode_to_bdi(inode)); - ret = __gfs2_jdata_writepage(page, wbc); + ret = __gfs2_jdata_writepage(&folio->page, wbc); if (unlikely(ret)) { if (ret == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); + folio_unlock(folio); ret = 0; } else { @@ -268,7 +272,8 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping, * not be suitable for data integrity * writeout). */ - *done_index = page->index + 1; + *done_index = folio->index + + folio_nr_pages(folio); ret = 1; break; } @@ -305,8 +310,8 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, { int ret = 0; int done = 0; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + int nr_folios; pgoff_t writeback_index; pgoff_t index; pgoff_t end; @@ -315,7 +320,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, int range_whole = 0; xa_mark_t tag; - pagevec_init(&pvec); + folio_batch_init(&fbatch); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -341,17 +346,18 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { - nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag); - if (nr_pages == 0) + nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch); + if (nr_folios == 0) break; - ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, &done_index); + ret = gfs2_write_jdata_batch(mapping, wbc, &fbatch, + &done_index); if (ret) done = 1; if (ret > 0) ret = 0; - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 25fd21a9dec80..d750d1128bed7 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -80,11 +80,11 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd) brelse(bd->bd_bh); } -static int __gfs2_writepage(struct page *page, struct writeback_control *wbc, +static int __gfs2_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { struct address_space *mapping = data; - int ret = mapping->a_ops->writepage(page, wbc); + int ret = mapping->a_ops->writepage(&folio->page, wbc); mapping_set_error(mapping, ret); return ret; } diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index 2015e42e752a6..6add6ebfef896 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -274,6 +274,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) tree->node_hash[hash] = node; tree->node_hash_cnt++; } else { + hfs_bnode_get(node2); spin_unlock(&tree->hash_lock); kfree(node); wait_event(node2->lock_wq, !test_bit(HFS_BNODE_NEW, &node2->flags)); diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index 3f7e9bef98743..6d1878b99b305 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -486,7 +486,7 @@ void hfs_file_truncate(struct inode *inode) inode->i_size); if (inode->i_size > HFS_I(inode)->phys_size) { struct address_space *mapping = inode->i_mapping; - void *fsdata; + void *fsdata = NULL; struct page *page; /* XXX: Can use generic_cont_expand? */ diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 721f779b4ec3e..7a542f3dbe502 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -554,7 +554,7 @@ void hfsplus_file_truncate(struct inode *inode) if (inode->i_size > hip->phys_size) { struct address_space *mapping = inode->i_mapping; struct page *page; - void *fsdata; + void *fsdata = NULL; loff_t size = inode->i_size; res = hfsplus_write_begin(NULL, mapping, size, 0, diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 5b476f57eb173..58021e73c00bf 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -257,7 +257,7 @@ static int hfsplus_create_attributes_file(struct super_block *sb) int __hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags) { - int err = 0; + int err; struct hfs_find_data cat_fd; hfsplus_cat_entry entry; u16 cat_entry_flags, cat_entry_type; @@ -494,7 +494,7 @@ ssize_t __hfsplus_getxattr(struct inode *inode, const char *name, __be32 xattr_record_type; u32 record_type; u16 record_length = 0; - ssize_t res = 0; + ssize_t res; if ((!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) || @@ -606,7 +606,7 @@ static inline int can_list(const char *xattr_name) static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry, char *buffer, size_t size) { - ssize_t res = 0; + ssize_t res; struct inode *inode = d_inode(dentry); struct hfs_find_data fd; u16 entry_type; @@ -674,10 +674,9 @@ static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry, ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) { ssize_t err; - ssize_t res = 0; + ssize_t res; struct inode *inode = d_inode(dentry); struct hfs_find_data fd; - u16 key_len = 0; struct hfsplus_attr_key attr_key; char *strbuf; int xattr_name_len; @@ -719,7 +718,8 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) } for (;;) { - key_len = hfs_bnode_read_u16(fd.bnode, fd.keyoffset); + u16 key_len = hfs_bnode_read_u16(fd.bnode, fd.keyoffset); + if (key_len == 0 || key_len > fd.tree->max_key_len) { pr_err("invalid xattr key length: %d\n", key_len); res = -EIO; @@ -766,12 +766,12 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) static int hfsplus_removexattr(struct inode *inode, const char *name) { - int err = 0; + int err; struct hfs_find_data cat_fd; u16 flags; u16 cat_entry_type; - int is_xattr_acl_deleted = 0; - int is_all_xattrs_deleted = 0; + int is_xattr_acl_deleted; + int is_all_xattrs_deleted; if (!HFSPLUS_SB(inode->i_sb)->attr_tree) return -EOPNOTSUPP; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7038e47148fc0..9062da6da5675 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -132,7 +132,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap unwinds (may be important on powerpc * and ia64). */ - vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; + vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); vma->vm_ops = &hugetlb_vm_ops; ret = seal_check_future_write(info->seals, vma); @@ -811,7 +811,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * as input to create an allocation policy. */ vma_init(&pseudo_vma, mm); - pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); + vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pseudo_vma.vm_file = file; for (index = start; index < end; index++) { @@ -819,8 +819,9 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * This is supposed to be the vaddr where the page is being * faulted in, but we have no vaddr here. */ - struct page *page; + struct folio *folio; unsigned long addr; + bool present; cond_resched(); @@ -844,48 +845,49 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ - page = find_get_page(mapping, index); - if (page) { - put_page(page); + rcu_read_lock(); + present = page_cache_next_miss(mapping, index, 1) != index; + rcu_read_unlock(); + if (present) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); hugetlb_drop_vma_policy(&pseudo_vma); continue; } /* - * Allocate page without setting the avoid_reserve argument. + * Allocate folio without setting the avoid_reserve argument. * There certainly are no reserves associated with the * pseudo_vma. However, there could be shared mappings with * reserves for the file at the inode level. If we fallocate - * pages in these areas, we need to consume the reserves + * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. */ - page = alloc_huge_page(&pseudo_vma, addr, 0); + folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); hugetlb_drop_vma_policy(&pseudo_vma); - if (IS_ERR(page)) { + if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); - error = PTR_ERR(page); + error = PTR_ERR(folio); goto out; } - clear_huge_page(page, addr, pages_per_huge_page(h)); - __SetPageUptodate(page); - error = hugetlb_add_to_page_cache(page, mapping, index); + clear_huge_page(&folio->page, addr, pages_per_huge_page(h)); + __folio_mark_uptodate(folio); + error = hugetlb_add_to_page_cache(folio, mapping, index); if (unlikely(error)) { - restore_reserve_on_error(h, &pseudo_vma, addr, page); - put_page(page); + restore_reserve_on_error(h, &pseudo_vma, addr, folio); + folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out; } mutex_unlock(&hugetlb_fault_mutex_table[hash]); - SetHPageMigratable(page); + folio_set_hugetlb_migratable(folio); /* - * unlock_page because locked by hugetlb_add_to_page_cache() - * put_page() due to reference from alloc_huge_page() + * folio_unlock because locked by hugetlb_add_to_page_cache() + * folio_put() due to reference from alloc_hugetlb_folio() */ - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d3c300563eb8f..6f4c97a6d7e9d 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1714,10 +1714,9 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, * For unwritten space on the page, we need to start the conversion to * regular allocated space. */ -static int -iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) +static int iomap_do_writepage(struct folio *folio, + struct writeback_control *wbc, void *data) { - struct folio *folio = page_folio(page); struct iomap_writepage_ctx *wpc = data; struct inode *inode = folio->mapping->host; u64 end_pos, isize; diff --git a/fs/mpage.c b/fs/mpage.c index 9f7cf06caae21..22b9de5ddd684 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -269,11 +269,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) alloc_new: if (args->bio == NULL) { - if (first_hole == blocks_per_page) { - if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9), - &folio->page)) - goto out; - } args->bio = bio_alloc(bdev, bio_max_segs(args->nr_pages), opf, gfp); if (args->bio == NULL) @@ -445,15 +440,14 @@ void clean_page_buffers(struct page *page) clean_buffers(page, ~0U); } -static int __mpage_writepage(struct page *page, struct writeback_control *wbc, +static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { struct mpage_data *mpd = data; struct bio *bio = mpd->bio; - struct address_space *mapping = page->mapping; - struct inode *inode = page->mapping->host; + struct address_space *mapping = folio->mapping; + struct inode *inode = mapping->host; const unsigned blkbits = inode->i_blkbits; - unsigned long end_index; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; sector_t last_block; sector_t block_in_file; @@ -464,13 +458,13 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, int boundary = 0; sector_t boundary_block = 0; struct block_device *boundary_bdev = NULL; - int length; + size_t length; struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; + struct buffer_head *head = folio_buffers(folio); - if (page_has_buffers(page)) { - struct buffer_head *head = page_buffers(page); + if (head) { struct buffer_head *bh = head; /* If they're all mapped and dirty, do it */ @@ -522,8 +516,8 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, /* * The page has no buffers: map it to disk */ - BUG_ON(!PageUptodate(page)); - block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); + BUG_ON(!folio_test_uptodate(folio)); + block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits); /* * Whole page beyond EOF? Skip allocating blocks to avoid leaking * space. @@ -531,7 +525,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, if (block_in_file >= (i_size + (1 << blkbits) - 1) >> blkbits) goto page_is_mapped; last_block = (i_size - 1) >> blkbits; - map_bh.b_page = page; + map_bh.b_folio = folio; for (page_block = 0; page_block < blocks_per_page; ) { map_bh.b_state = 0; @@ -562,8 +556,11 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, first_unmapped = page_block; page_is_mapped: - end_index = i_size >> PAGE_SHIFT; - if (page->index >= end_index) { + /* Don't bother writing beyond EOF, truncate will discard the folio */ + if (folio_pos(folio) >= i_size) + goto confused; + length = folio_size(folio); + if (folio_pos(folio) + length > i_size) { /* * The page straddles i_size. It must be zeroed out on each * and every writepage invocation because it may be mmapped. @@ -572,11 +569,8 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, * is zeroed when mapped, and writes to that region are not * written out to the file." */ - unsigned offset = i_size & (PAGE_SIZE - 1); - - if (page->index > end_index || !offset) - goto confused; - zero_user_segment(page, offset, PAGE_SIZE); + length = i_size - folio_pos(folio); + folio_zero_segment(folio, length, folio_size(folio)); } /* @@ -587,11 +581,6 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, alloc_new: if (bio == NULL) { - if (first_unmapped == blocks_per_page) { - if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9), - page, wbc)) - goto out; - } bio = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_WRITE | wbc_to_write_flags(wbc), GFP_NOFS); @@ -604,18 +593,18 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. it will not write anything) */ - wbc_account_cgroup_owner(wbc, page, PAGE_SIZE); + wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio)); length = first_unmapped << blkbits; - if (bio_add_page(bio, page, length, 0) < length) { + if (!bio_add_folio(bio, folio, length, 0)) { bio = mpage_bio_submit(bio); goto alloc_new; } - clean_buffers(page, first_unmapped); + clean_buffers(&folio->page, first_unmapped); - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + folio_unlock(folio); if (boundary || (first_unmapped != blocks_per_page)) { bio = mpage_bio_submit(bio); if (boundary_block) { @@ -634,7 +623,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, /* * The caller has a ref on the inode, so *mapping is stable */ - ret = block_write_full_page(page, mpd->get_block, wbc); + ret = block_write_full_page(&folio->page, mpd->get_block, wbc); mapping_set_error(mapping, ret); out: mpd->bio = bio; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1a80d548253ac..ba3799097d1a5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -690,13 +690,14 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc) return ret; } -static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) +static int nfs_writepages_callback(struct folio *folio, + struct writeback_control *wbc, void *data) { int ret; - ret = nfs_do_writepage(page, wbc, data); + ret = nfs_do_writepage(&folio->page, wbc, data); if (ret != AOP_WRITEPAGE_ACTIVATE) - unlock_page(page); + folio_unlock(folio); return ret; } diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index b5f997e5e6708..2681a449edc16 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -2150,7 +2150,7 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode; struct address_space *btcache = btnc_inode->i_mapping; struct list_head lists[NILFS_BTREE_LEVEL_MAX]; - struct pagevec pvec; + struct folio_batch fbatch; struct buffer_head *bh, *head; pgoff_t index = 0; int level, i; @@ -2160,19 +2160,19 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, level++) INIT_LIST_HEAD(&lists[level]); - pagevec_init(&pvec); + folio_batch_init(&fbatch); - while (pagevec_lookup_tag(&pvec, btcache, &index, - PAGECACHE_TAG_DIRTY)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - bh = head = page_buffers(pvec.pages[i]); + while (filemap_get_folios_tag(btcache, &index, (pgoff_t)-1, + PAGECACHE_TAG_DIRTY, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + bh = head = folio_buffers(fbatch.folios[i]); do { if (buffer_dirty(bh)) nilfs_btree_add_dirty_buffer(btree, lists, bh); } while ((bh = bh->b_this_page) != head); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 9930fa901039f..9cf6ba58f5859 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -40,8 +40,21 @@ static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat) static int nilfs_dat_prepare_entry(struct inode *dat, struct nilfs_palloc_req *req, int create) { - return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr, - create, &req->pr_entry_bh); + int ret; + + ret = nilfs_palloc_get_entry_block(dat, req->pr_entry_nr, + create, &req->pr_entry_bh); + if (unlikely(ret == -ENOENT)) { + nilfs_err(dat->i_sb, + "DAT doesn't have a block to manage vblocknr = %llu", + (unsigned long long)req->pr_entry_nr); + /* + * Return internal code -EINVAL to notify bmap layer of + * metadata corruption. + */ + ret = -EINVAL; + } + return ret; } static void nilfs_dat_commit_entry(struct inode *dat, @@ -123,11 +136,7 @@ static void nilfs_dat_commit_free(struct inode *dat, int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req) { - int ret; - - ret = nilfs_dat_prepare_entry(dat, req, 0); - WARN_ON(ret == -ENOENT); - return ret; + return nilfs_dat_prepare_entry(dat, req, 0); } void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, @@ -149,19 +158,19 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) { struct nilfs_dat_entry *entry; + __u64 start; sector_t blocknr; void *kaddr; int ret; ret = nilfs_dat_prepare_entry(dat, req, 0); - if (ret < 0) { - WARN_ON(ret == -ENOENT); + if (ret < 0) return ret; - } kaddr = kmap_atomic(req->pr_entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, req->pr_entry_bh, kaddr); + start = le64_to_cpu(entry->de_start); blocknr = le64_to_cpu(entry->de_blocknr); kunmap_atomic(kaddr); @@ -172,6 +181,15 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) return ret; } } + if (unlikely(start > nilfs_mdt_cno(dat))) { + nilfs_err(dat->i_sb, + "vblocknr = %llu has abnormal lifetime: start cno (= %llu) > current cno (= %llu)", + (unsigned long long)req->pr_entry_nr, + (unsigned long long)start, + (unsigned long long)nilfs_mdt_cno(dat)); + nilfs_dat_abort_entry(dat, req); + return -EINVAL; + } return 0; } diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 39b7eea2642a0..41ccd43cd9797 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -240,42 +240,43 @@ static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) int nilfs_copy_dirty_pages(struct address_space *dmap, struct address_space *smap) { - struct pagevec pvec; + struct folio_batch fbatch; unsigned int i; pgoff_t index = 0; int err = 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); repeat: - if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY)) + if (!filemap_get_folios_tag(smap, &index, (pgoff_t)-1, + PAGECACHE_TAG_DIRTY, &fbatch)) return 0; - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i], *dpage; + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i], *dfolio; - lock_page(page); - if (unlikely(!PageDirty(page))) - NILFS_PAGE_BUG(page, "inconsistent dirty state"); + folio_lock(folio); + if (unlikely(!folio_test_dirty(folio))) + NILFS_PAGE_BUG(&folio->page, "inconsistent dirty state"); - dpage = grab_cache_page(dmap, page->index); - if (unlikely(!dpage)) { + dfolio = filemap_grab_folio(dmap, folio->index); + if (unlikely(!dfolio)) { /* No empty page is added to the page cache */ err = -ENOMEM; - unlock_page(page); + folio_unlock(folio); break; } - if (unlikely(!page_has_buffers(page))) - NILFS_PAGE_BUG(page, + if (unlikely(!folio_buffers(folio))) + NILFS_PAGE_BUG(&folio->page, "found empty page in dat page cache"); - nilfs_copy_page(dpage, page, 1); - __set_page_dirty_nobuffers(dpage); + nilfs_copy_page(&dfolio->page, &folio->page, 1); + filemap_dirty_folio(folio_mapping(dfolio), dfolio); - unlock_page(dpage); - put_page(dpage); - unlock_page(page); + folio_unlock(dfolio); + folio_put(dfolio); + folio_unlock(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); if (likely(!err)) @@ -357,22 +358,22 @@ void nilfs_copy_back_pages(struct address_space *dmap, */ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) { - struct pagevec pvec; + struct folio_batch fbatch; unsigned int i; pgoff_t index = 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); - while (pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + while (filemap_get_folios_tag(mapping, &index, (pgoff_t)-1, + PAGECACHE_TAG_DIRTY, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; - lock_page(page); - nilfs_clear_dirty_page(page, silent); - unlock_page(page); + folio_lock(folio); + nilfs_clear_dirty_page(&folio->page, silent); + folio_unlock(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index f7a14ed12a669..19446a8243d79 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -680,7 +680,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, loff_t start, loff_t end) { struct address_space *mapping = inode->i_mapping; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t index = 0, last = ULONG_MAX; size_t ndirties = 0; int i; @@ -694,23 +694,26 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, index = start >> PAGE_SHIFT; last = end >> PAGE_SHIFT; } - pagevec_init(&pvec); + folio_batch_init(&fbatch); repeat: if (unlikely(index > last) || - !pagevec_lookup_range_tag(&pvec, mapping, &index, last, - PAGECACHE_TAG_DIRTY)) + !filemap_get_folios_tag(mapping, &index, last, + PAGECACHE_TAG_DIRTY, &fbatch)) return ndirties; - for (i = 0; i < pagevec_count(&pvec); i++) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { struct buffer_head *bh, *head; - struct page *page = pvec.pages[i]; + struct folio *folio = fbatch.folios[i]; - lock_page(page); - if (!page_has_buffers(page)) - create_empty_buffers(page, i_blocksize(inode), 0); - unlock_page(page); + folio_lock(folio); + head = folio_buffers(folio); + if (!head) { + create_empty_buffers(&folio->page, i_blocksize(inode), 0); + head = folio_buffers(folio); + } + folio_unlock(folio); - bh = head = page_buffers(page); + bh = head; do { if (!buffer_dirty(bh) || buffer_async_write(bh)) continue; @@ -718,13 +721,13 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, list_add_tail(&bh->b_assoc_buffers, listp); ndirties++; if (unlikely(ndirties >= nlimit)) { - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); return ndirties; } } while (bh = bh->b_this_page, bh != head); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); goto repeat; } @@ -734,20 +737,19 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode, { struct nilfs_inode_info *ii = NILFS_I(inode); struct inode *btnc_inode = ii->i_assoc_inode; - struct pagevec pvec; + struct folio_batch fbatch; struct buffer_head *bh, *head; unsigned int i; pgoff_t index = 0; if (!btnc_inode) return; + folio_batch_init(&fbatch); - pagevec_init(&pvec); - - while (pagevec_lookup_tag(&pvec, btnc_inode->i_mapping, &index, - PAGECACHE_TAG_DIRTY)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - bh = head = page_buffers(pvec.pages[i]); + while (filemap_get_folios_tag(btnc_inode->i_mapping, &index, + (pgoff_t)-1, PAGECACHE_TAG_DIRTY, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + bh = head = folio_buffers(fbatch.folios[i]); do { if (buffer_dirty(bh) && !buffer_async_write(bh)) { @@ -758,7 +760,7 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode, bh = bh->b_this_page; } while (bh != head); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } } diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 9364d35b4a10e..e8aeba124a954 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * aops.c - NTFS kernel address space operations and page cache handling. * * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. @@ -1646,7 +1646,7 @@ static sector_t ntfs_bmap(struct address_space *mapping, sector_t block) return block; } -/** +/* * ntfs_normal_aops - address space operations for normal inodes and attributes * * Note these are not used for compressed or mst protected inodes and @@ -1664,7 +1664,7 @@ const struct address_space_operations ntfs_normal_aops = { .error_remove_page = generic_error_remove_page, }; -/** +/* * ntfs_compressed_aops - address space operations for compressed inodes */ const struct address_space_operations ntfs_compressed_aops = { @@ -1678,9 +1678,9 @@ const struct address_space_operations ntfs_compressed_aops = { .error_remove_page = generic_error_remove_page, }; -/** +/* * ntfs_mst_aops - general address space operations for mst protecteed inodes - * and attributes + * and attributes */ const struct address_space_operations ntfs_mst_aops = { .read_folio = ntfs_read_folio, /* Fill page with data. */ diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h index 0cac5458c023e..8d0958a149cb2 100644 --- a/fs/ntfs/aops.h +++ b/fs/ntfs/aops.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/** +/* * aops.h - Defines for NTFS kernel address space operations and page cache * handling. Part of the Linux-NTFS project. * diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index 587e9b1878738..f9cb180b6f6be 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * compress.c - NTFS kernel compressed attributes handling. * Part of the Linux-NTFS project. * @@ -41,12 +41,12 @@ typedef enum { NTFS_MAX_CB_SIZE = 64 * 1024, } ntfs_compression_constants; -/** +/* * ntfs_compression_buffer - one buffer for the decompression engine */ static u8 *ntfs_compression_buffer; -/** +/* * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer */ static DEFINE_SPINLOCK(ntfs_cb_lock); diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index cd96083a12c8e..518c3a21a556d 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project. * * Copyright (c) 2001-2007 Anton Altaparmakov @@ -17,7 +17,7 @@ #include "debug.h" #include "ntfs.h" -/** +/* * The little endian Unicode string $I30 as a global constant. */ ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'), diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index e6fc5f7cb1d75..6c3f38d665794 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * inode.c - NTFS kernel inode handling. * * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. @@ -2935,7 +2935,7 @@ int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } /** - * ntfs_write_inode - write out a dirty inode + * __ntfs_write_inode - write out a dirty inode * @vi: inode to write out * @sync: if true, write out synchronously * @@ -3033,7 +3033,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) * might not need to be written out. * NOTE: It is not a problem when the inode for $MFT itself is being * written out as mark_ntfs_record_dirty() will only set I_DIRTY_PAGES - * on the $MFT inode and hence ntfs_write_inode() will not be + * on the $MFT inode and hence __ntfs_write_inode() will not be * re-invoked because of it which in turn is ok since the dirtied mft * record will be cleaned and written out to disk below, i.e. before * this function returns. diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index f7bf5ce960ccc..48030899dc6ec 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. * * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 4e6a44bc654ce..ab44f2db533be 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -259,7 +259,7 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, } } -/** +/* * Inode operations for directories. */ const struct inode_operations ntfs_dir_inode_ops = { @@ -364,7 +364,7 @@ static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid, ntfs_nfs_get_inode); } -/** +/* * Export operations allowing NFS exporting of mounted NTFS partitions. * * We use the default ->encode_fh() for now. Note that they diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c index 97932fb5179cd..0d448e9881f71 100644 --- a/fs/ntfs/runlist.c +++ b/fs/ntfs/runlist.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project. * * Copyright (c) 2001-2007 Anton Altaparmakov diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 001f4e053c85a..2643a08182e18 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -58,9 +58,17 @@ const option_t on_errors_arr[] = { }; /** - * simple_getbool - + * simple_getbool - convert input string to a boolean value + * @s: input string to convert + * @setval: where to store the output boolean value * * Copied from old ntfs driver (which copied from vfat driver). + * + * "1", "yes", "true", or an empty string are converted to %true. + * "0", "no", and "false" are converted to %false. + * + * Return: %1 if the string is converted or was empty and *setval contains it; + * %0 if the string was not valid. */ static int simple_getbool(char *s, bool *setval) { @@ -2657,7 +2665,7 @@ static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc) } #endif -/** +/* * The complete super operations. */ static const struct super_operations ntfs_sops = { diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index bc4c9e35a6fc4..3d2e4c1270e49 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -844,7 +844,7 @@ int ntfs_set_size(struct inode *inode, u64 new_size) return err; } -static int ntfs_resident_writepage(struct page *page, +static int ntfs_resident_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { struct address_space *mapping = data; @@ -852,11 +852,11 @@ static int ntfs_resident_writepage(struct page *page, int ret; ni_lock(ni); - ret = attr_data_write_resident(ni, page); + ret = attr_data_write_resident(ni, &folio->page); ni_unlock(ni); if (ret != E_NTFS_NONRESIDENT) - unlock_page(page); + folio_unlock(folio); mapping_set_error(mapping, ret); return ret; } diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 4ecb91a9bbebe..1a4301a38aa7c 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -390,8 +390,7 @@ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) "orangefs_file_mmap: called on %pD\n", file); /* set the sequential readahead hint */ - vma->vm_flags |= VM_SEQ_READ; - vma->vm_flags &= ~VM_RAND_READ; + vm_flags_mod(vma, VM_SEQ_READ, VM_RAND_READ); file_accessed(file); vma->vm_ops = &orangefs_file_vm_ops; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 11e21a0e65ce4..c4c135f96aab2 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -154,21 +154,20 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow, return ret; } -static int orangefs_writepages_callback(struct page *page, - struct writeback_control *wbc, void *data) +static int orangefs_writepages_callback(struct folio *folio, + struct writeback_control *wbc, void *data) { struct orangefs_writepages *ow = data; - struct orangefs_write_range *wr; + struct orangefs_write_range *wr = folio->private; int ret; - if (!PagePrivate(page)) { - unlock_page(page); + if (!wr) { + folio_unlock(folio); /* It's not private so there's nothing to write, right? */ printk("writepages_callback not private!\n"); BUG(); return 0; } - wr = (struct orangefs_write_range *)page_private(page); ret = -1; if (ow->npages == 0) { @@ -176,7 +175,7 @@ static int orangefs_writepages_callback(struct page *page, ow->len = wr->len; ow->uid = wr->uid; ow->gid = wr->gid; - ow->pages[ow->npages++] = page; + ow->pages[ow->npages++] = &folio->page; ret = 0; goto done; } @@ -188,7 +187,7 @@ static int orangefs_writepages_callback(struct page *page, } if (ow->off + ow->len == wr->pos) { ow->len += wr->len; - ow->pages[ow->npages++] = page; + ow->pages[ow->npages++] = &folio->page; ret = 0; goto done; } @@ -198,10 +197,10 @@ static int orangefs_writepages_callback(struct page *page, orangefs_writepages_work(ow, wbc); ow->npages = 0; } - ret = orangefs_writepage_locked(page, wbc); - mapping_set_error(page->mapping, ret); - unlock_page(page); - end_page_writeback(page); + ret = orangefs_writepage_locked(&folio->page, wbc); + mapping_set_error(folio->mapping, ret); + folio_unlock(folio); + folio_end_writeback(folio); } else { if (ow->npages == ow->maxpages) { orangefs_writepages_work(ow, wbc); diff --git a/fs/proc/base.c b/fs/proc/base.c index b2edb2a4124ba..c2cfeb57dc2a4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3211,6 +3211,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); + seq_printf(m, "zero_pages_sharing %lu\n", mm->ksm_zero_pages_sharing); mmput(mm); } diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index 91fe1597af7b4..a6f76121955f6 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -17,6 +17,7 @@ static int __init proc_cmdline_init(void) struct proc_dir_entry *pde; pde = proc_create_single("cmdline", 0, NULL, cmdline_proc_show); + pde_make_permanent(pde); pde->size = saved_command_line_len + 1; return 0; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index af1c49ae11b16..6a96e1713fd58 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -890,7 +890,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) struct vm_area_struct *vma; unsigned long vma_start = 0, last_vma_end = 0; int ret = 0; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); priv->task = get_proc_task(priv->inode); if (!priv->task) @@ -908,7 +908,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) goto out_put_mm; hold_task_mempolicy(priv); - vma = mas_find(&mas, ULONG_MAX); + vma = vma_next(&vmi); if (unlikely(!vma)) goto empty_set; @@ -923,7 +923,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * access it for write request. */ if (mmap_lock_is_contended(mm)) { - mas_pause(&mas); + vma_iter_invalidate(&vmi); mmap_read_unlock(mm); ret = mmap_read_lock_killable(mm); if (ret) { @@ -948,31 +948,31 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * * 1) VMA2 is freed, but VMA3 exists: * - * find_vma(mm, 16k - 1) will return VMA3. + * vma_next(vmi) will return VMA3. * In this case, just continue from VMA3. * * 2) VMA2 still exists: * - * find_vma(mm, 16k - 1) will return VMA2. - * Iterate the loop like the original one. + * vma_next(vmi) will return VMA3. + * In this case, just continue from VMA3. * * 3) No more VMAs can be found: * - * find_vma(mm, 16k - 1) will return NULL. + * vma_next(vmi) will return NULL. * No more things to do, just break. * * 4) (last_vma_end - 1) is the middle of a vma (VMA'): * - * find_vma(mm, 16k - 1) will return VMA' whose range + * vma_next(vmi) will return VMA' whose range * contains last_vma_end. * Iterate VMA' from last_vma_end. */ - vma = mas_find(&mas, ULONG_MAX); + vma = vma_next(&vmi); /* Case 3 above */ if (!vma) break; - /* Case 1 above */ + /* Case 1 and 2 above */ if (vma->vm_start >= last_vma_end) continue; @@ -980,8 +980,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) if (vma->vm_end > last_vma_end) smap_gather_stats(vma, &mss, last_vma_end); } - /* Case 2 above */ - } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); + } for_each_vma(vmi, vma); empty_set: show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0); @@ -1277,7 +1276,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, @@ -1297,16 +1296,16 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, } if (type == CLEAR_REFS_SOFT_DIRTY) { - mas_for_each(&mas, vma, ULONG_MAX) { + for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; - vma->vm_flags &= ~VM_SOFTDIRTY; + vm_flags_clear(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); } inc_tlb_flush_pending(mm); mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, - 0, NULL, mm, 0, -1UL); + 0, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 09a81e4b12738..12af614f33ce4 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -582,8 +582,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; - vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_mod(vma, VM_MIXEDMAP, VM_MAYWRITE | VM_MAYEXEC); vma->vm_ops = &vmcore_mmap_ops; len = 0; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 3b2adf4cbc579..3a686dd7d2f4a 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -185,16 +185,17 @@ static void udf_write_failed(struct address_space *mapping, loff_t to) } } -static int udf_adinicb_writepage(struct page *page, +static int udf_adinicb_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { + struct page *page = &folio->page; struct inode *inode = page->mapping->host; struct udf_inode_info *iinfo = UDF_I(inode); - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); memcpy_to_page(page, 0, iinfo->i_data + iinfo->i_lenEAttr, i_size_read(inode)); - unlock_page(page); + folio_unlock(folio); mark_inode_dirty(inode); return 0; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 15a5bf765d432..44d1ee429eb0c 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -113,7 +113,7 @@ static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, { const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; - vma->vm_flags = flags; + vm_flags_reset(vma, flags); /* * For shared mappings, we want to enable writenotify while * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply @@ -883,7 +883,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, mm, 0); WRITE_ONCE(ctx->released, true); @@ -900,7 +900,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) */ mmap_write_lock(mm); prev = NULL; - mas_for_each(&mas, vma, ULONG_MAX) { + for_each_vma(vmi, vma) { cond_resched(); BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ !!(vma->vm_flags & __VM_UFFD_FLAGS)); @@ -909,13 +909,12 @@ static int userfaultfd_release(struct inode *inode, struct file *file) continue; } new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, + prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) { - mas_pause(&mas); vma = prev; } else { prev = vma; @@ -1302,7 +1301,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool found; bool basic_ioctls; unsigned long start, end, vma_end; - MA_STATE(mas, &mm->mm_mt, 0, 0); + struct vma_iterator vmi; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1344,17 +1343,13 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, if (!mmget_not_zero(mm)) goto out; + ret = -EINVAL; mmap_write_lock(mm); - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); + vma_iter_init(&vmi, mm, start); + vma = vma_find(&vmi, end); if (!vma) goto out_unlock; - /* check that there's at least one vma in the range */ - ret = -EINVAL; - if (vma->vm_start >= end) - goto out_unlock; - /* * If the first vma contains huge pages, make sure start address * is aligned to huge page size. @@ -1371,7 +1366,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, */ found = false; basic_ioctls = false; - for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { + cur = vma; + do { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1428,16 +1424,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, basic_ioctls = true; found = true; - } + } for_each_vma_range(vmi, cur, end); BUG_ON(!found); - mas_set(&mas, start); - prev = mas_prev(&mas, 0); - if (prev != vma) - mas_next(&mas, ULONG_MAX); + vma_iter_set(&vmi, start); + prev = vma_prev(&vmi); ret = 0; - do { + for_each_vma_range(vmi, vma, end) { cond_resched(); BUG_ON(!vma_can_userfault(vma, vm_flags)); @@ -1458,30 +1452,25 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma_end = min(end, vma->vm_end); new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; - prev = vma_merge(mm, prev, start, vma_end, new_flags, + prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), ((struct vm_userfaultfd_ctx){ ctx }), anon_vma_name(vma)); if (prev) { /* vma_merge() invalidated the mas */ - mas_pause(&mas); vma = prev; goto next; } if (vma->vm_start < start) { - ret = split_vma(mm, vma, start, 1); + ret = split_vma(&vmi, vma, start, 1); if (ret) break; - /* split_vma() invalidated the mas */ - mas_pause(&mas); } if (vma->vm_end > end) { - ret = split_vma(mm, vma, end, 0); + ret = split_vma(&vmi, vma, end, 0); if (ret) break; - /* split_vma() invalidated the mas */ - mas_pause(&mas); } next: /* @@ -1498,8 +1487,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = mas_next(&mas, end - 1); - } while (vma); + } + out_unlock: mmap_write_unlock(mm); mmput(mm); @@ -1543,7 +1532,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, bool found; unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; - MA_STATE(mas, &mm->mm_mt, 0, 0); + struct vma_iterator vmi; ret = -EFAULT; if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) @@ -1562,14 +1551,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); - if (!vma) - goto out_unlock; - - /* check that there's at least one vma in the range */ ret = -EINVAL; - if (vma->vm_start >= end) + vma_iter_init(&vmi, mm, start); + vma = vma_find(&vmi, end); + if (!vma) goto out_unlock; /* @@ -1587,8 +1572,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * Search for not compatible vmas. */ found = false; - ret = -EINVAL; - for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { + cur = vma; + do { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1605,16 +1590,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out_unlock; found = true; - } + } for_each_vma_range(vmi, cur, end); BUG_ON(!found); - mas_set(&mas, start); - prev = mas_prev(&mas, 0); - if (prev != vma) - mas_next(&mas, ULONG_MAX); - + vma_iter_set(&vmi, start); + prev = vma_prev(&vmi); ret = 0; - do { + for_each_vma_range(vmi, vma, end) { cond_resched(); BUG_ON(!vma_can_userfault(vma, vma->vm_flags)); @@ -1650,26 +1632,23 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, uffd_wp_range(mm, vma, start, vma_end - start, false); new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vma_merge(mm, prev, start, vma_end, new_flags, + prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) { vma = prev; - mas_pause(&mas); goto next; } if (vma->vm_start < start) { - ret = split_vma(mm, vma, start, 1); + ret = split_vma(&vmi, vma, start, 1); if (ret) break; - mas_pause(&mas); } if (vma->vm_end > end) { - ret = split_vma(mm, vma, end, 0); + ret = split_vma(&vmi, vma, end, 0); if (ret) break; - mas_pause(&mas); } next: /* @@ -1683,8 +1662,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = mas_next(&mas, end - 1); - } while (vma); + } + out_unlock: mmap_write_unlock(mm); mmput(mm); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index d06c0cc62f612..705250f9f90a1 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1429,7 +1429,7 @@ xfs_file_mmap( file_accessed(file); vma->vm_ops = &xfs_file_vm_ops; if (IS_DAX(inode)) - vma->vm_flags |= VM_HUGEPAGE; + vm_flags_set(vma, VM_HUGEPAGE); return 0; } diff --git a/include/asm-generic/error-injection.h b/include/asm-generic/error-injection.h index fbca56bd9cbc8..b05253f68eaa5 100644 --- a/include/asm-generic/error-injection.h +++ b/include/asm-generic/error-injection.h @@ -4,7 +4,6 @@ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) enum { - EI_ETYPE_NONE, /* Dummy value for undefined case */ EI_ETYPE_NULL, /* Return NULL if failure */ EI_ETYPE_ERRNO, /* Return -ERRNO if failure */ EI_ETYPE_ERRNO_NULL, /* Return -ERRNO or NULL if failure */ @@ -20,8 +19,10 @@ struct pt_regs; #ifdef CONFIG_FUNCTION_ERROR_INJECTION /* - * Whitelist generating macro. Specify functions which can be - * error-injectable using this macro. + * Whitelist generating macro. Specify functions which can be error-injectable + * using this macro. If you unsure what is required for the error-injectable + * functions, please read Documentation/fault-injection/fault-injection.rst + * 'Error Injectable Functions' section. */ #define ALLOW_ERROR_INJECTION(fname, _etype) \ static struct error_injection_entry __used \ diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index a2c8ed60233a4..6796abe1900e3 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -19,6 +19,18 @@ #define __page_to_pfn(page) ((unsigned long)((page) - mem_map) + \ ARCH_PFN_OFFSET) +#ifndef pfn_valid +static inline int pfn_valid(unsigned long pfn) +{ + /* avoid include hell */ + extern unsigned long max_mapnr; + unsigned long pfn_offset = ARCH_PFN_OFFSET; + + return pfn >= pfn_offset && (pfn - pfn_offset) < max_mapnr; +} +#define pfn_valid pfn_valid +#endif + #elif defined(CONFIG_SPARSEMEM_VMEMMAP) /* memmap is virtually contiguous. */ diff --git a/include/asm-generic/page.h b/include/asm-generic/page.h index 6fc47561814c6..c0be2edeb4840 100644 --- a/include/asm-generic/page.h +++ b/include/asm-generic/page.h @@ -84,8 +84,6 @@ extern unsigned long memory_end; #define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) #endif -#define pfn_valid(pfn) ((pfn) >= ARCH_PFN_OFFSET && ((pfn) - ARCH_PFN_OFFSET) < max_mapnr) - #define virt_addr_valid(kaddr) (((void *)(kaddr) >= (void *)PAGE_OFFSET) && \ ((void *)(kaddr) < (void *)memory_end)) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b9637d63e6f02..41a41561b7732 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -556,6 +556,7 @@ struct request_queue { #define QUEUE_FLAG_IO_STAT 7 /* do disk/partitions IO accounting */ #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ +#define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ #define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ @@ -1253,6 +1254,12 @@ static inline bool bdev_nonrot(struct block_device *bdev) return blk_queue_nonrot(bdev_get_queue(bdev)); } +static inline bool bdev_synchronous(struct block_device *bdev) +{ + return test_bit(QUEUE_FLAG_SYNCHRONOUS, + &bdev_get_queue(bdev)->queue_flags); +} + static inline bool bdev_stable_writes(struct block_device *bdev) { return test_bit(QUEUE_FLAG_STABLE_WRITES, @@ -1397,7 +1404,6 @@ struct block_device_operations { unsigned int flags); int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); - int (*rw_page)(struct block_device *, sector_t, struct page *, enum req_op); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); unsigned int (*check_events) (struct gendisk *disk, @@ -1432,10 +1438,6 @@ extern int blkdev_compat_ptr_ioctl(struct block_device *, fmode_t, #define blkdev_compat_ptr_ioctl NULL #endif -extern int bdev_read_page(struct block_device *, sector_t, struct page *); -extern int bdev_write_page(struct block_device *, sector_t, struct page *, - struct writeback_control *); - static inline void blk_wake_io_task(struct task_struct *waiter) { /* diff --git a/include/linux/damon.h b/include/linux/damon.h index 7907918ad2e02..d5d4d19928e0a 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -91,6 +91,12 @@ struct damon_target { * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. * @DAMOS_STAT: Do nothing but count the stat. * @NR_DAMOS_ACTIONS: Total number of DAMOS actions + * + * The support of each action is up to running &struct damon_operations. + * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR supports all actions except + * &enum DAMOS_LRU_PRIO and &enum DAMOS_LRU_DEPRIO. &enum DAMON_OPS_PADDR + * supports only &enum DAMOS_PAGEOUT, &enum DAMOS_LRU_PRIO, &enum + * DAMOS_LRU_DEPRIO, and &DAMOS_STAT. */ enum damos_action { DAMOS_WILLNEED, @@ -221,6 +227,11 @@ struct damos_stat { * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. * @NR_DAMOS_FILTER_TYPES: Number of filter types. + * + * The support of each filter type is up to running &struct damon_operations. + * &enum DAMON_OPS_PADDR is supporting all filter types, while + * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR are not supporting any + * filter types. */ enum damos_filter_type { DAMOS_FILTER_TYPE_ANON, @@ -343,10 +354,10 @@ struct damon_ctx; * users should register the low level operations for their target address * space and usecase via the &damon_ctx.ops. Then, the monitoring thread * (&damon_ctx.kdamond) calls @init and @prepare_access_checks before starting - * the monitoring, @update after each &damon_ctx.ops_update_interval, and + * the monitoring, @update after each &damon_attrs.ops_update_interval, and * @check_accesses, @target_valid and @prepare_access_checks after each - * &damon_ctx.sample_interval. Finally, @reset_aggregated is called after each - * &damon_ctx.aggr_interval. + * &damon_attrs.sample_interval. Finally, @reset_aggregated is called after + * each &damon_attrs.aggr_interval. * * Each &struct damon_operations instance having valid @id can be registered * via damon_register_ops() and selected by damon_select_ops() later. diff --git a/include/linux/error-injection.h b/include/linux/error-injection.h index 635a95caf29f3..20e738f4eae8a 100644 --- a/include/linux/error-injection.h +++ b/include/linux/error-injection.h @@ -3,6 +3,7 @@ #define _LINUX_ERROR_INJECTION_H #include +#include #include #ifdef CONFIG_FUNCTION_ERROR_INJECTION @@ -19,7 +20,7 @@ static inline bool within_error_injection_list(unsigned long addr) static inline int get_injectable_error_type(unsigned long addr) { - return EI_ETYPE_NONE; + return -EOPNOTSUPP; } #endif diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index d88c46ca82e17..5088637fe5c2d 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -31,7 +31,7 @@ typedef unsigned int __bitwise gfp_t; #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u #define ___GFP_ZERO 0x100u -#define ___GFP_ATOMIC 0x200u +/* 0x200u unused */ #define ___GFP_DIRECT_RECLAIM 0x400u #define ___GFP_KSWAPD_RECLAIM 0x800u #define ___GFP_WRITE 0x1000u @@ -116,11 +116,8 @@ typedef unsigned int __bitwise gfp_t; * * %__GFP_HIGH indicates that the caller is high-priority and that granting * the request is necessary before the system can make forward progress. - * For example, creating an IO context to clean pages. - * - * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is - * high priority. Users are typically interrupt handlers. This may be - * used in conjunction with %__GFP_HIGH + * For example creating an IO context to clean pages and requests + * from atomic context. * * %__GFP_MEMALLOC allows access to all memory. This should only be used when * the caller guarantees the allocation will allow more memory to be freed @@ -135,7 +132,6 @@ typedef unsigned int __bitwise gfp_t; * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. * This takes precedence over the %__GFP_MEMALLOC flag if both are set. */ -#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) #define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) #define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC) #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) @@ -329,7 +325,7 @@ typedef unsigned int __bitwise gfp_t; * version does not attempt reclaim/compaction at all and is by default used * in page fault path, while the non-light is used by khugepaged. */ -#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) +#define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index d7097b8158f20..348701dae77fd 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -207,31 +207,30 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) } #endif -#ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE +#ifndef vma_alloc_zeroed_movable_folio /** - * alloc_zeroed_user_highpage_movable - Allocate a zeroed HIGHMEM page for a VMA that the caller knows can move - * @vma: The VMA the page is to be allocated for - * @vaddr: The virtual address the page will be inserted into + * vma_alloc_zeroed_movable_folio - Allocate a zeroed page for a VMA. + * @vma: The VMA the page is to be allocated for. + * @vaddr: The virtual address the page will be inserted into. * - * Returns: The allocated and zeroed HIGHMEM page + * This function will allocate a page suitable for inserting into this + * VMA at this virtual address. It may be allocated from highmem or + * the movable zone. An architecture may provide its own implementation. * - * This function will allocate a page for a VMA that the caller knows will - * be able to migrate in the future using move_pages() or reclaimed - * - * An architecture may override this function by defining - * __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE and providing their own - * implementation. + * Return: A folio containing one allocated and zeroed page or NULL if + * we are out of memory. */ -static inline struct page * -alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, +static inline +struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { - struct page *page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); + struct folio *folio; - if (page) - clear_user_highpage(page, vaddr); + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr, false); + if (folio) + clear_user_highpage(&folio->page, vaddr); - return page; + return folio; } #endif @@ -414,6 +413,35 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len) kunmap_local(addr); } +/** + * memcpy_from_file_folio - Copy some bytes from a file folio. + * @to: The destination buffer. + * @folio: The folio to copy from. + * @pos: The position in the file. + * @len: The maximum number of bytes to copy. + * + * Copy up to @len bytes from this folio. This may be limited by PAGE_SIZE + * if the folio comes from HIGHMEM, and by the size of the folio. + * + * Return: The number of bytes copied from the folio. + */ +static inline size_t memcpy_from_file_folio(char *to, struct folio *folio, + loff_t pos, size_t len) +{ + size_t offset = offset_in_folio(folio, pos); + char *from = kmap_local_folio(folio, offset); + + if (folio_test_highmem(folio)) + len = min_t(size_t, len, PAGE_SIZE - offset); + else + len = min(len, folio_size(folio) - offset); + + memcpy(to, from, len); + kunmap_local(from); + + return len; +} + /** * folio_zero_segments() - Zero two byte ranges in a folio. * @folio: The folio to write to. diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a1341fdcf666d..70bd867eba949 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -187,7 +187,7 @@ static inline int split_huge_page(struct page *page) { return split_huge_page_to_list(page, NULL); } -void deferred_split_huge_page(struct page *page); +void deferred_split_folio(struct folio *folio); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct folio *folio); @@ -293,15 +293,6 @@ static inline bool thp_migration_supported(void) return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } -static inline struct list_head *page_deferred_list(struct page *page) -{ - /* - * See organization of tail pages of compound page in - * "struct page" definition. - */ - return &page[2].deferred_list; -} - #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) @@ -349,7 +340,7 @@ static inline int split_huge_page(struct page *page) { return 0; } -static inline void deferred_split_huge_page(struct page *page) {} +static inline void deferred_split_folio(struct folio *folio) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7d6413c3b8f5d..df6dd624ccfe8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -171,11 +171,11 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -int isolate_hugetlb(struct page *page, struct list_head *list); -int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison); +int isolate_hugetlb(struct folio *folio, struct list_head *list); +int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); -void putback_active_hugepage(struct page *page); +void folio_putback_active_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); @@ -413,12 +413,12 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline int isolate_hugetlb(struct page *page, struct list_head *list) +static inline int isolate_hugetlb(struct folio *folio, struct list_head *list) { return -EBUSY; } -static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison) +static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) { return 0; } @@ -429,7 +429,7 @@ static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return 0; } -static inline void putback_active_hugepage(struct page *page) +static inline void folio_putback_active_hugetlb(struct folio *folio) { } @@ -717,16 +717,16 @@ struct huge_bootmem_page { }; int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); -struct page *alloc_huge_page(struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); -struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, +struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); -struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); -int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, struct page *page); + unsigned long address, struct folio *folio); /* arch callback */ int __init __alloc_bootmem_huge_page(struct hstate *h, int nid); @@ -878,9 +878,9 @@ extern int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); #ifdef CONFIG_MEMORY_FAILURE -extern void hugetlb_clear_page_hwpoison(struct page *hpage); +extern void folio_clear_hugetlb_hwpoison(struct folio *folio); #else -static inline void hugetlb_clear_page_hwpoison(struct page *hpage) +static inline void folio_clear_hugetlb_hwpoison(struct folio *folio) { } #endif @@ -1033,21 +1033,21 @@ static inline int isolate_or_dissolve_huge_page(struct page *page, return -ENOMEM; } -static inline struct page *alloc_huge_page(struct vm_area_struct *vma, +static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { return NULL; } -static inline struct page * -alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, +static inline struct folio * +alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { return NULL; } -static inline struct page *alloc_huge_page_vma(struct hstate *h, +static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index f706626a80635..3d82d91f49acb 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -141,10 +141,10 @@ extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr); extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page); + struct folio *folio); extern void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page); + struct folio *folio); extern void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio); extern void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, @@ -230,14 +230,14 @@ static inline int hugetlb_cgroup_charge_cgroup_rsvd(int idx, static inline void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { } static inline void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { } diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 5dd4343c1bbe5..6883c59227015 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -403,7 +403,8 @@ extern int kimage_crash_copy_vmcoreinfo(struct kimage *image); extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; -extern int kexec_load_disabled; + +bool kexec_load_permitted(int kexec_image_type); #ifndef kexec_flush_icache_page #define kexec_flush_icache_page(page) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 815a276615178..1fadb5f5978b6 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -432,6 +432,7 @@ struct ma_wr_state { .min = 0, \ .max = ULONG_MAX, \ .alloc = NULL, \ + .mas_flags = 0, \ } #define MA_WR_STATE(name, ma_state, wr_entry) \ @@ -455,7 +456,7 @@ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); void mas_store_prealloc(struct ma_state *mas, void *entry); void *mas_find(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); -int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); +int mas_preallocate(struct ma_state *mas, gfp_t gfp); bool mas_is_err(struct ma_state *mas); bool mas_nomem(struct ma_state *mas, gfp_t gfp); @@ -470,6 +471,16 @@ void *mas_next(struct ma_state *mas, unsigned long max); int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); +static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, + unsigned long addr) +{ + memset(mas, 0, sizeof(struct ma_state)); + mas->tree = tree; + mas->index = mas->last = addr; + mas->max = ULONG_MAX; + mas->node = MAS_START; +} + /* Checks if a mas has not found anything */ static inline bool mas_is_none(struct ma_state *mas) { diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e605fc885f084..35478695cabf8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -890,7 +890,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, return match; } -struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page); +struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio); ino_t page_cgroup_ino(struct page *page); static inline bool mem_cgroup_online(struct mem_cgroup *memcg) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 3ef77f52a4f04..bdff950a8bb45 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -122,6 +122,15 @@ static inline bool folio_test_movable(struct folio *folio) return PageMovable(&folio->page); } +static inline +const struct movable_operations *folio_movable_ops(struct folio *folio) +{ + VM_BUG_ON(!__folio_test_movable(folio)); + + return (const struct movable_operations *) + ((unsigned long)folio->mapping - PAGE_MAPPING_MOVABLE); +} + static inline const struct movable_operations *page_movable_ops(struct page *page) { diff --git a/include/linux/mm.h b/include/linux/mm.h index c5af27ce23dfc..d6f8f41514cc9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -427,8 +427,8 @@ extern unsigned int kobjsize(const void *objp); /* This mask defines which mm->def_flags a process can inherit its parent */ #define VM_INIT_DEF_MASK VM_NOHUGEPAGE -/* This mask is used to clear all the VMA flags used by mlock */ -#define VM_LOCKED_CLEAR_MASK (~(VM_LOCKED | VM_LOCKONFAULT)) +/* This mask represents all the VMA flag bits used by mlock */ +#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR @@ -633,6 +633,56 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) INIT_LIST_HEAD(&vma->anon_vma_chain); } +/* Use when VMA is not part of the VMA tree and needs no locking */ +static inline void vm_flags_init(struct vm_area_struct *vma, + vm_flags_t flags) +{ + ACCESS_PRIVATE(vma, __vm_flags) = flags; +} + +/* Use when VMA is part of the VMA tree and modifications need coordination */ +static inline void vm_flags_reset(struct vm_area_struct *vma, + vm_flags_t flags) +{ + mmap_assert_write_locked(vma->vm_mm); + vm_flags_init(vma, flags); +} + +static inline void vm_flags_set(struct vm_area_struct *vma, + vm_flags_t flags) +{ + mmap_assert_write_locked(vma->vm_mm); + ACCESS_PRIVATE(vma, __vm_flags) |= flags; +} + +static inline void vm_flags_clear(struct vm_area_struct *vma, + vm_flags_t flags) +{ + mmap_assert_write_locked(vma->vm_mm); + ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; +} + +/* + * Use only if VMA is not part of the VMA tree or has no other users and + * therefore needs no locking. + */ +static inline void __vm_flags_mod(struct vm_area_struct *vma, + vm_flags_t set, vm_flags_t clear) +{ + vm_flags_init(vma, (vma->vm_flags | set) & ~clear); +} + +/* + * Use only when the order of set/clear operations is unimportant, otherwise + * use vm_flags_{set|clear} explicitly. + */ +static inline void vm_flags_mod(struct vm_area_struct *vma, + vm_flags_t set, vm_flags_t clear) +{ + mmap_assert_write_locked(vma->vm_mm); + __vm_flags_mod(vma, set, clear); +} + static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; @@ -676,16 +726,16 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { - return mas_find(&vmi->mas, max); + return mas_find(&vmi->mas, max - 1); } static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) { /* - * Uses vma_find() to get the first VMA when the iterator starts. + * Uses mas_find() to get the first VMA when the iterator starts. * Calling mas_next() could skip the first entry. */ - return vma_find(vmi, ULONG_MAX); + return mas_find(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) @@ -698,12 +748,50 @@ static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) return vmi->mas.index; } +static inline unsigned long vma_iter_end(struct vma_iterator *vmi) +{ + return vmi->mas.last + 1; +} +static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, + unsigned long count) +{ + return mas_expected_entries(&vmi->mas, count); +} + +/* Free any unused preallocations */ +static inline void vma_iter_free(struct vma_iterator *vmi) +{ + mas_destroy(&vmi->mas); +} + +static inline int vma_iter_bulk_store(struct vma_iterator *vmi, + struct vm_area_struct *vma) +{ + vmi->mas.index = vma->vm_start; + vmi->mas.last = vma->vm_end - 1; + mas_store(&vmi->mas, vma); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + +static inline void vma_iter_invalidate(struct vma_iterator *vmi) +{ + mas_pause(&vmi->mas); +} + +static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) +{ + mas_set(&vmi->mas, addr); +} + #define for_each_vma(__vmi, __vma) \ while (((__vma) = vma_next(&(__vmi))) != NULL) /* The MM code likes to work with exclusive end addresses */ #define for_each_vma_range(__vmi, __vma, __end) \ - while (((__vma) = vma_find(&(__vmi), (__end) - 1)) != NULL) + while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) #ifdef CONFIG_SHMEM /* @@ -725,11 +813,20 @@ int vma_is_stack_for_current(struct vm_area_struct *vma); struct mmu_gather; struct inode; +/* + * compound_order() can be called without holding a reference, which means + * that niceties like page_folio() don't work. These callers should be + * prepared to handle wild return values. For example, PG_head may be + * set before _folio_order is initialised, or this may be a tail page. + * See compaction.c for some good examples. + */ static inline unsigned int compound_order(struct page *page) { - if (!PageHead(page)) + struct folio *folio = (struct folio *)page; + + if (!test_bit(PG_head, &folio->flags)) return 0; - return page[1].compound_order; + return folio->_folio_order; } /** @@ -788,6 +885,13 @@ static inline bool get_page_unless_zero(struct page *page) return page_ref_add_unless(page, 1, 0); } +static inline struct folio *folio_get_nontail_page(struct page *page) +{ + if (unlikely(!get_page_unless_zero(page))) + return NULL; + return (struct folio *)page; +} + extern int page_is_ram(unsigned long pfn); enum { @@ -837,34 +941,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) static inline int folio_entire_mapcount(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); - return atomic_read(folio_mapcount_ptr(folio)) + 1; -} - -/* - * Mapcount of compound page as a whole, does not include mapped sub-pages. - * Must be called only on head of compound page. - */ -static inline int head_compound_mapcount(struct page *head) -{ - return atomic_read(compound_mapcount_ptr(head)) + 1; -} - -/* - * If a 16GB hugetlb page were mapped by PTEs of all of its 4kB sub-pages, - * its subpages_mapcount would be 0x400000: choose the COMPOUND_MAPPED bit - * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently - * leaves subpages_mapcount at 0, but avoid surprise if it participates later. - */ -#define COMPOUND_MAPPED 0x800000 -#define SUBPAGES_MAPPED (COMPOUND_MAPPED - 1) - -/* - * Number of sub-pages mapped by PTE, does not include compound mapcount. - * Must be called only on head of compound page. - */ -static inline int head_subpages_mapcount(struct page *head) -{ - return atomic_read(subpages_mapcount_ptr(head)) & SUBPAGES_MAPPED; + return atomic_read(&folio->_entire_mapcount) + 1; } /* @@ -877,25 +954,29 @@ static inline void page_mapcount_reset(struct page *page) atomic_set(&(page)->_mapcount, -1); } -/* - * Mapcount of 0-order page; when compound sub-page, includes - * compound_mapcount of compound_head of page. +/** + * page_mapcount() - Number of times this precise page is mapped. + * @page: The page. + * + * The number of times this page is mapped. If this page is part of + * a large folio, it includes the number of times this page is mapped + * as part of that folio. * - * Result is undefined for pages which cannot be mapped into userspace. + * The result is undefined for pages which cannot be mapped into userspace. * For example SLAB or special types of pages. See function page_has_type(). - * They use this place in struct page differently. + * They use this field in struct page differently. */ static inline int page_mapcount(struct page *page) { int mapcount = atomic_read(&page->_mapcount) + 1; - if (likely(!PageCompound(page))) - return mapcount; - page = compound_head(page); - return head_compound_mapcount(page) + mapcount; + if (unlikely(PageCompound(page))) + mapcount += folio_entire_mapcount(page_folio(page)); + + return mapcount; } -int total_compound_mapcount(struct page *head); +int folio_total_mapcount(struct folio *folio); /** * folio_mapcount() - Calculate the number of mappings of this folio. @@ -912,24 +993,24 @@ static inline int folio_mapcount(struct folio *folio) { if (likely(!folio_test_large(folio))) return atomic_read(&folio->_mapcount) + 1; - return total_compound_mapcount(&folio->page); + return folio_total_mapcount(folio); } static inline int total_mapcount(struct page *page) { if (likely(!PageCompound(page))) return atomic_read(&page->_mapcount) + 1; - return total_compound_mapcount(compound_head(page)); + return folio_total_mapcount(page_folio(page)); } static inline bool folio_large_is_mapped(struct folio *folio) { /* - * Reading folio_mapcount_ptr() below could be omitted if hugetlb - * participated in incrementing subpages_mapcount when compound mapped. + * Reading _entire_mapcount below could be omitted if hugetlb + * participated in incrementing nr_pages_mapped when compound mapped. */ - return atomic_read(folio_subpages_mapcount_ptr(folio)) > 0 || - atomic_read(folio_mapcount_ptr(folio)) >= 0; + return atomic_read(&folio->_nr_pages_mapped) > 0 || + atomic_read(&folio->_entire_mapcount) >= 0; } /** @@ -1004,8 +1085,11 @@ extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS]; static inline void set_compound_page_dtor(struct page *page, enum compound_dtor_id compound_dtor) { + struct folio *folio = (struct folio *)page; + VM_BUG_ON_PAGE(compound_dtor >= NR_COMPOUND_DTORS, page); - page[1].compound_dtor = compound_dtor; + VM_BUG_ON_PAGE(!PageHead(page), page); + folio->_folio_dtor = compound_dtor; } static inline void folio_set_compound_dtor(struct folio *folio, @@ -1017,28 +1101,13 @@ static inline void folio_set_compound_dtor(struct folio *folio, void destroy_large_folio(struct folio *folio); -static inline int head_compound_pincount(struct page *head) -{ - return atomic_read(compound_pincount_ptr(head)); -} - static inline void set_compound_order(struct page *page, unsigned int order) { - page[1].compound_order = order; -#ifdef CONFIG_64BIT - page[1].compound_nr = 1U << order; -#endif -} + struct folio *folio = (struct folio *)page; -/* Returns the number of pages in this potentially compound page. */ -static inline unsigned long compound_nr(struct page *page) -{ - if (!PageHead(page)) - return 1; + folio->_folio_order = order; #ifdef CONFIG_64BIT - return page[1].compound_nr; -#else - return 1UL << compound_order(page); + folio->_folio_nr_pages = 1U << order; #endif } @@ -1064,16 +1133,6 @@ static inline unsigned int thp_order(struct page *page) return compound_order(page); } -/** - * thp_nr_pages - The number of regular pages in this huge page. - * @page: The head page of a huge page. - */ -static inline int thp_nr_pages(struct page *page) -{ - VM_BUG_ON_PGFLAGS(PageTail(page), page); - return compound_nr(page); -} - /** * thp_size - Size of a transparent huge page. * @page: Head page of a transparent huge page. @@ -1647,11 +1706,6 @@ static inline struct folio *pfn_folio(unsigned long pfn) return page_folio(pfn_to_page(pfn)); } -static inline atomic_t *folio_pincount_ptr(struct folio *folio) -{ - return &folio_page(folio, 1)->compound_pincount; -} - /** * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. * @folio: The folio. @@ -1669,7 +1723,7 @@ static inline atomic_t *folio_pincount_ptr(struct folio *folio) * expected to be able to deal gracefully with a false positive. * * For large folios, the result will be exactly correct. That's because - * we have more tracking data available: the compound_pincount is used + * we have more tracking data available: the _pincount field is used * instead of the GUP_PIN_COUNTING_BIAS scheme. * * For more information, please see Documentation/core-api/pin_user_pages.rst. @@ -1680,7 +1734,7 @@ static inline atomic_t *folio_pincount_ptr(struct folio *folio) static inline bool folio_maybe_dma_pinned(struct folio *folio) { if (folio_test_large(folio)) - return atomic_read(folio_pincount_ptr(folio)) > 0; + return atomic_read(&folio->_pincount) > 0; /* * folio_ref_count() is signed. If that refcount overflows, then @@ -1788,6 +1842,33 @@ static inline long folio_nr_pages(struct folio *folio) #endif } +/* + * compound_nr() returns the number of pages in this potentially compound + * page. compound_nr() can be called on a tail page, and is defined to + * return 1 in that case. + */ +static inline unsigned long compound_nr(struct page *page) +{ + struct folio *folio = (struct folio *)page; + + if (!test_bit(PG_head, &folio->flags)) + return 1; +#ifdef CONFIG_64BIT + return folio->_folio_nr_pages; +#else + return 1L << folio->_folio_order; +#endif +} + +/** + * thp_nr_pages - The number of regular pages in this huge page. + * @page: The head page of a huge page. + */ +static inline int thp_nr_pages(struct page *page) +{ + return folio_nr_pages((struct folio *)page); +} + /** * folio_next - Move to the next physical folio. * @folio: The folio we're currently operating on. @@ -1837,6 +1918,24 @@ static inline size_t folio_size(struct folio *folio) return PAGE_SIZE << folio_order(folio); } +/** + * folio_estimated_sharers - Estimate the number of sharers of a folio. + * @folio: The folio. + * + * folio_estimated_sharers() aims to serve as a function to efficiently + * estimate the number of processes sharing a folio. This is done by + * looking at the precise mapcount of the first subpage in the folio, and + * assuming the other subpages are the same. This may not be true for large + * folios. If you want exact mapcounts for exact calculations, look at + * page_mapcount() or folio_total_mapcount(). + * + * Return: The estimated number of processes sharing a folio. + */ +static inline int folio_estimated_sharers(struct folio *folio) +{ + return page_mapcount(folio_page(folio, 0)); +} + #ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE static inline int arch_make_page_accessible(struct page *page) { @@ -2052,7 +2151,7 @@ static inline void zap_vma_pages(struct vm_area_struct *vma) } void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, struct vm_area_struct *start_vma, unsigned long start, - unsigned long end); + unsigned long end, bool mm_wr_locked); struct mmu_notifier_range; @@ -2204,9 +2303,9 @@ bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, extern long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long cp_flags); -extern int mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, - struct vm_area_struct **pprev, unsigned long start, - unsigned long end, unsigned long newflags); +extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, + struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long start, unsigned long end, unsigned long newflags); /* * doesn't attempt to fault and will return short. @@ -2838,23 +2937,21 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); -extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, - struct vm_area_struct *expand); -static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) -{ - return __vma_adjust(vma, start, end, pgoff, insert, NULL); -} -extern struct vm_area_struct *vma_merge(struct mm_struct *, - struct vm_area_struct *prev, unsigned long addr, unsigned long end, - unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *); +extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *next); +extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff); +extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, + struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, + unsigned long end, unsigned long vm_flags, struct anon_vma *, + struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, + struct anon_vma_name *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); -extern int __split_vma(struct mm_struct *, struct vm_area_struct *, - unsigned long addr, int new_below); -extern int split_vma(struct mm_struct *, struct vm_area_struct *, - unsigned long addr, int new_below); +extern int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *, + unsigned long addr, int new_below); +extern int split_vma(struct vma_iterator *vmi, struct vm_area_struct *, + unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, @@ -2862,9 +2959,6 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **, bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); -void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas); -void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas); - static inline int check_data_rlimit(unsigned long rlim, unsigned long new, unsigned long start, @@ -2912,7 +3006,7 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr, extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); -extern int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, +extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, @@ -2920,6 +3014,9 @@ extern int do_munmap(struct mm_struct *, unsigned long, size_t, extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); #ifdef CONFIG_MMU +extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *uf, bool downgrade); extern int __mm_populate(unsigned long addr, unsigned long len, int ignore_errors); static inline void mm_populate(unsigned long addr, unsigned long len) @@ -3500,6 +3597,11 @@ enum mf_action_page_type { MF_MSG_UNKNOWN, }; +/* + * Sysfs entries for memory failure handling statistics. + */ +extern const struct attribute_group memory_failure_attr_group; + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) extern void clear_huge_page(struct page *page, unsigned long addr_hint, @@ -3617,7 +3719,7 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) * VM_MAYWRITE as we still want them to be COW-writable. */ if (vma->vm_flags & VM_SHARED) - vma->vm_flags &= ~(VM_MAYWRITE); + vm_flags_clear(vma, VM_MAYWRITE); } return 0; diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 26dcbda07e923..de1e622dd3660 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -122,18 +122,6 @@ static inline bool lru_gen_in_fault(void) return current->in_lru_fault; } -#ifdef CONFIG_MEMCG -static inline int lru_gen_memcg_seg(struct lruvec *lruvec) -{ - return READ_ONCE(lruvec->lrugen.seg); -} -#else -static inline int lru_gen_memcg_seg(struct lruvec *lruvec) -{ - return 0; -} -#endif - static inline int lru_gen_from_seq(unsigned long seq) { return seq % MAX_NR_GENS; @@ -309,11 +297,6 @@ static inline bool lru_gen_in_fault(void) return false; } -static inline int lru_gen_memcg_seg(struct lruvec *lruvec) -{ - return 0; -} - static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c6b28567530ef..088556ef7f444 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -140,30 +140,6 @@ struct page { }; struct { /* Tail pages of compound page */ unsigned long compound_head; /* Bit zero is set */ - - /* First tail page only */ - unsigned char compound_dtor; - unsigned char compound_order; - atomic_t compound_mapcount; - atomic_t subpages_mapcount; - atomic_t compound_pincount; -#ifdef CONFIG_64BIT - unsigned int compound_nr; /* 1 << compound_order */ -#endif - }; - struct { /* Second tail page of transparent huge page */ - unsigned long _compound_pad_1; /* compound_head */ - unsigned long _compound_pad_2; - /* For both global and memcg */ - struct list_head deferred_list; - }; - struct { /* Second tail page of hugetlb page */ - unsigned long _hugetlb_pad_1; /* compound_head */ - void *hugetlb_subpool; - void *hugetlb_cgroup; - void *hugetlb_cgroup_rsvd; - void *hugetlb_hwpoison; - /* No more space on 32-bit: use third tail if more */ }; struct { /* Page table pages */ unsigned long _pt_pad_1; /* compound_head */ @@ -302,20 +278,17 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. - * @_flags_1: For large folios, additional page flags. - * @_head_1: Points to the folio. Do not use. * @_folio_dtor: Which destructor to use for this folio. * @_folio_order: Do not use directly, call folio_order(). - * @_compound_mapcount: Do not use directly, call folio_entire_mapcount(). - * @_subpages_mapcount: Do not use directly, call folio_mapcount(). + * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). + * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). - * @_flags_2: For alignment. Do not use. - * @_head_2: Points to the folio. Do not use. * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. * @_hugetlb_hwpoison: Do not use directly, call raw_hwp_list_head(). + * @_deferred_list: Folios to be split under memory pressure. * * A folio is a physically, virtually and logically contiguous set * of bytes. It is a power-of-two in size, and it is aligned to that @@ -358,14 +331,16 @@ struct folio { struct { unsigned long _flags_1; unsigned long _head_1; + /* public: */ unsigned char _folio_dtor; unsigned char _folio_order; - atomic_t _compound_mapcount; - atomic_t _subpages_mapcount; + atomic_t _entire_mapcount; + atomic_t _nr_pages_mapped; atomic_t _pincount; #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; #endif + /* private: the union with struct page is transitional */ }; struct page __page_1; }; @@ -373,10 +348,19 @@ struct folio { struct { unsigned long _flags_2; unsigned long _head_2; + /* public: */ void *_hugetlb_subpool; void *_hugetlb_cgroup; void *_hugetlb_cgroup_rsvd; void *_hugetlb_hwpoison; + /* private: the union with struct page is transitional */ + }; + struct { + unsigned long _flags_2a; + unsigned long _head_2a; + /* public: */ + struct list_head _deferred_list; + /* private: the union with struct page is transitional */ }; struct page __page_2; }; @@ -401,53 +385,14 @@ FOLIO_MATCH(memcg_data, memcg_data); offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); -FOLIO_MATCH(compound_dtor, _folio_dtor); -FOLIO_MATCH(compound_order, _folio_order); -FOLIO_MATCH(compound_mapcount, _compound_mapcount); -FOLIO_MATCH(subpages_mapcount, _subpages_mapcount); -FOLIO_MATCH(compound_pincount, _pincount); -#ifdef CONFIG_64BIT -FOLIO_MATCH(compound_nr, _folio_nr_pages); -#endif #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + 2 * sizeof(struct page)) FOLIO_MATCH(flags, _flags_2); FOLIO_MATCH(compound_head, _head_2); -FOLIO_MATCH(hugetlb_subpool, _hugetlb_subpool); -FOLIO_MATCH(hugetlb_cgroup, _hugetlb_cgroup); -FOLIO_MATCH(hugetlb_cgroup_rsvd, _hugetlb_cgroup_rsvd); -FOLIO_MATCH(hugetlb_hwpoison, _hugetlb_hwpoison); #undef FOLIO_MATCH -static inline atomic_t *folio_mapcount_ptr(struct folio *folio) -{ - struct page *tail = &folio->page + 1; - return &tail->compound_mapcount; -} - -static inline atomic_t *folio_subpages_mapcount_ptr(struct folio *folio) -{ - struct page *tail = &folio->page + 1; - return &tail->subpages_mapcount; -} - -static inline atomic_t *compound_mapcount_ptr(struct page *page) -{ - return &page[1].compound_mapcount; -} - -static inline atomic_t *subpages_mapcount_ptr(struct page *page) -{ - return &page[1].subpages_mapcount; -} - -static inline atomic_t *compound_pincount_ptr(struct page *page) -{ - return &page[1].compound_pincount; -} - /* * Used for sizing the vmemmap region on some architectures */ @@ -546,7 +491,15 @@ struct vm_area_struct { * See vmf_insert_mixed_prot() for discussion. */ pgprot_t vm_page_prot; - unsigned long vm_flags; /* Flags, see mm.h. */ + + /* + * Flags, see mm.h. + * To modify use vm_flags_{init|reset|set|clear|mod} functions. + */ + union { + const vm_flags_t vm_flags; + vm_flags_t __private __vm_flags; + }; /* * For areas with an address space and backing store, @@ -787,7 +740,7 @@ struct mm_struct { #ifdef CONFIG_KSM /* * Represent how many pages of this process are involved in KSM - * merging. + * merging (not including ksm_zero_pages_sharing). */ unsigned long ksm_merging_pages; /* @@ -795,6 +748,11 @@ struct mm_struct { * including merged and not merged. */ unsigned long ksm_rmap_items; + /* + * Represent how many empty pages are merged with kernel zero + * pages when enabling KSM use_zero_pages. + */ + unsigned long ksm_zero_pages_sharing; #endif #ifdef CONFIG_LRU_GEN struct { @@ -915,9 +873,7 @@ struct vma_iterator { static inline void vma_iter_init(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long addr) { - vmi->mas.tree = &mm->mm_mt; - vmi->mas.index = addr; - vmi->mas.node = MAS_START; + mas_init(&vmi->mas, &mm->mm_mt, addr); } #ifdef CONFIG_SCHED_MM_CID diff --git a/include/linux/mman.h b/include/linux/mman.h index 58b3abd457a38..cee1e4b566d80 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -156,4 +156,38 @@ calc_vm_flag_bits(unsigned long flags) } unsigned long vm_commit_limit(void); + +/* + * Denies creating a writable executable mapping or gaining executable permissions. + * + * This denies the following: + * + * a) mmap(PROT_WRITE | PROT_EXEC) + * + * b) mmap(PROT_WRITE) + * mprotect(PROT_EXEC) + * + * c) mmap(PROT_WRITE) + * mprotect(PROT_READ) + * mprotect(PROT_EXEC) + * + * But allows the following: + * + * d) mmap(PROT_READ | PROT_EXEC) + * mmap(PROT_READ | PROT_EXEC | PROT_BTI) + */ +static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags) +{ + if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + return false; + + if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE)) + return true; + + if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC)) + return true; + + return false; +} + #endif /* _LINUX_MMAN_H */ diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index d6c06e1402771..64a3e051c3c43 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -269,7 +269,6 @@ extern struct lockdep_map __mmu_notifier_invalidate_range_start_map; #endif struct mmu_notifier_range { - struct vm_area_struct *vma; struct mm_struct *mm; unsigned long start; unsigned long end; @@ -514,12 +513,10 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned flags, - struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end) { - range->vma = vma; range->event = event; range->mm = mm; range->start = start; @@ -530,10 +527,10 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, static inline void mmu_notifier_range_init_owner( struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned int flags, - struct vm_area_struct *vma, struct mm_struct *mm, - unsigned long start, unsigned long end, void *owner) + struct mm_struct *mm, unsigned long start, + unsigned long end, void *owner) { - mmu_notifier_range_init(range, event, flags, vma, mm, start, end); + mmu_notifier_range_init(range, event, flags, mm, start, end); range->owner = owner; } @@ -659,9 +656,9 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, range->end = end; } -#define mmu_notifier_range_init(range,event,flags,vma,mm,start,end) \ +#define mmu_notifier_range_init(range,event,flags,mm,start,end) \ _mmu_notifier_range_init(range, start, end) -#define mmu_notifier_range_init_owner(range, event, flags, vma, mm, start, \ +#define mmu_notifier_range_init_owner(range, event, flags, mm, start, \ end, owner) \ _mmu_notifier_range_init(range, start, end) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a526964b65ce5..ab94985ee7d95 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -370,15 +370,6 @@ struct page_vma_mapped_walk; #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) -/* see the comment on MEMCG_NR_GENS */ -enum { - MEMCG_LRU_NOP, - MEMCG_LRU_HEAD, - MEMCG_LRU_TAIL, - MEMCG_LRU_OLD, - MEMCG_LRU_YOUNG, -}; - #ifdef CONFIG_LRU_GEN enum { @@ -559,7 +550,7 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg); void lru_gen_online_memcg(struct mem_cgroup *memcg); void lru_gen_offline_memcg(struct mem_cgroup *memcg); void lru_gen_release_memcg(struct mem_cgroup *memcg); -void lru_gen_rotate_memcg(struct lruvec *lruvec, int op); +void lru_gen_soft_reclaim(struct lruvec *lruvec); #else /* !CONFIG_MEMCG */ @@ -610,7 +601,7 @@ static inline void lru_gen_release_memcg(struct mem_cgroup *memcg) { } -static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) +static inline void lru_gen_soft_reclaim(struct lruvec *lruvec) { } @@ -1223,6 +1214,31 @@ struct deferred_split { }; #endif +#ifdef CONFIG_MEMORY_FAILURE +/* + * Per NUMA node memory failure handling statistics. + */ +struct memory_failure_stats { + /* + * Number of raw pages poisoned. + * Cases not accounted: memory outside kernel control, offline page, + * arch-specific memory_failure (SGX), hwpoison_filter() filtered + * error events, and unpoison actions from hwpoison_unpoison. + */ + unsigned long total; + /* + * Recovery results of poisoned raw pages handled by memory_failure, + * in sync with mf_result. + * total = ignored + failed + delayed + recovered. + * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted. + */ + unsigned long ignored; + unsigned long failed; + unsigned long delayed; + unsigned long recovered; +}; +#endif + /* * On NUMA machines, each NUMA node would have a pg_data_t to describe * it's memory layout. On UMA machines there is a single pglist_data which @@ -1368,6 +1384,9 @@ typedef struct pglist_data { #ifdef CONFIG_NUMA struct memory_tier __rcu *memtier; #endif +#ifdef CONFIG_MEMORY_FAILURE + struct memory_failure_stats mf_stats; +#endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 69e93a0c12771..a7e3a3405520a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -531,6 +531,7 @@ PAGEFLAG(Readahead, readahead, PF_NO_COMPOUND) * available at this point. */ #define PageHighMem(__p) is_highmem_idx(page_zonenum(__p)) +#define folio_test_highmem(__f) is_highmem_idx(folio_zonenum(__f)) #else PAGEFLAG_FALSE(HighMem, highmem) #endif diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 22be4582faaed..bc2e39090a1f5 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -7,15 +7,35 @@ #include struct pglist_data; + +/** + * struct page_ext_operations - per page_ext client operations + * @offset: Offset to the client's data within page_ext. Offset is returned to + * the client by page_ext_init. + * @size: The size of the client data within page_ext. + * @need: Function that returns true if client requires page_ext. + * @init: (optional) Called to initialize client once page_exts are allocated. + * @need_shared_flags: True when client is using shared page_ext->flags + * field. + * + * Each Page Extension client must define page_ext_operations in + * page_ext_ops array. + */ struct page_ext_operations { size_t offset; size_t size; bool (*need)(void); void (*init)(void); + bool need_shared_flags; }; +extern bool deferred_struct_pages; + #ifdef CONFIG_PAGE_EXTENSION +/* + * The page_ext_flags users must set need_shared_flags to true. + */ enum page_ext_flags { PAGE_EXT_OWNER, PAGE_EXT_OWNER_ALLOCATED, diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 29e1f9e76eb6d..9f10816837719 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -546,6 +546,26 @@ static inline struct folio *filemap_lock_folio(struct address_space *mapping, return __filemap_get_folio(mapping, index, FGP_LOCK, 0); } +/** + * filemap_grab_folio - grab a folio from the page cache + * @mapping: The address space to search + * @index: The page index + * + * Looks up the page cache entry at @mapping & @index. If no folio is found, + * a new folio is created. The folio is locked, marked as accessed, and + * returned. + * + * Return: A found or created folio. NULL if no folio is found and failed to + * create a folio. + */ +static inline struct folio *filemap_grab_folio(struct address_space *mapping, + pgoff_t index) +{ + return __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_mask(mapping)); +} + /** * find_get_page - find and get a page reference * @mapping: the address_space to search @@ -719,16 +739,8 @@ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); -unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, - pgoff_t end, xa_mark_t tag, unsigned int nr_pages, - struct page **pages); -static inline unsigned find_get_pages_tag(struct address_space *mapping, - pgoff_t *index, xa_mark_t tag, unsigned int nr_pages, - struct page **pages) -{ - return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag, - nr_pages, pages); -} +unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, + pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index); diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 215eb6c3bdc9f..f582f7213ea52 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -26,14 +26,6 @@ struct pagevec { }; void __pagevec_release(struct pagevec *pvec); -unsigned pagevec_lookup_range_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, pgoff_t end, - xa_mark_t tag); -static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, xa_mark_t tag) -{ - return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag); -} static inline void pagevec_init(struct pagevec *pvec) { @@ -103,6 +95,11 @@ static inline void folio_batch_init(struct folio_batch *fbatch) fbatch->percpu_pvec_drained = false; } +static inline void folio_batch_reinit(struct folio_batch *fbatch) +{ + fbatch->nr = 0; +} + static inline unsigned int folio_batch_count(struct folio_batch *fbatch) { return fbatch->nr; diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index a3aae8d57a421..521a733e21a92 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -152,9 +152,11 @@ __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch) static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) { - preempt_disable(); + unsigned long flags; + + local_irq_save(flags); fbc->count += amount; - preempt_enable(); + local_irq_restore(flags); } /* non-SMP percpu_counter_add_local is the same with percpu_counter_add */ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 1159b25b05427..c63cd44777ec1 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1064,35 +1064,6 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define arch_start_context_switch(prev) do {} while (0) #endif -/* - * When replacing an anonymous page by a real (!non) swap entry, we clear - * PG_anon_exclusive from the page and instead remember whether the flag was - * set in the swp pte. During fork(), we have to mark the entry as !exclusive - * (possibly shared). On swapin, we use that information to restore - * PG_anon_exclusive, which is very helpful in cases where we might have - * additional (e.g., FOLL_GET) references on a page and wouldn't be able to - * detect exclusivity. - * - * These functions don't apply to non-swap entries (e.g., migration, hwpoison, - * ...). - */ -#ifndef __HAVE_ARCH_PTE_SWP_EXCLUSIVE -static inline pte_t pte_swp_mkexclusive(pte_t pte) -{ - return pte; -} - -static inline int pte_swp_exclusive(pte_t pte) -{ - return false; -} - -static inline pte_t pte_swp_clear_exclusive(pte_t pte) -{ - return pte; -} -#endif - #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) @@ -1214,7 +1185,8 @@ static inline int track_pfn_copy(struct vm_area_struct *vma) * can be for the entire vma (in which case pfn, size are zero). */ static inline void untrack_pfn(struct vm_area_struct *vma, - unsigned long pfn, unsigned long size) + unsigned long pfn, unsigned long size, + bool mm_wr_locked) { } @@ -1232,7 +1204,7 @@ extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn); extern int track_pfn_copy(struct vm_area_struct *vma); extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, - unsigned long size); + unsigned long size, bool mm_wr_locked); extern void untrack_pfn_moved(struct vm_area_struct *vma); #endif diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bd3504d11b155..a4570da03e58a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -194,6 +194,8 @@ void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address); +void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, + unsigned long address); void page_add_file_rmap(struct page *, struct vm_area_struct *, bool compound); void page_remove_rmap(struct page *, struct vm_area_struct *, @@ -201,12 +203,19 @@ void page_remove_rmap(struct page *, struct vm_area_struct *, void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); -void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, +void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); static inline void __page_dup_rmap(struct page *page, bool compound) { - atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount); + if (compound) { + struct folio *folio = (struct folio *)page; + + VM_BUG_ON_PAGE(compound && !PageHead(page), page); + atomic_inc(&folio->_entire_mapcount); + } else { + atomic_inc(&page->_mapcount); + } } static inline void page_dup_file_rmap(struct page *page, bool compound) diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 8270ad7ae14c2..0e17ae7fbfd35 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -81,9 +81,13 @@ static inline int get_dumpable(struct mm_struct *mm) * lifecycle of this mm, just for simplicity. */ #define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ + +#define MMF_HAS_MDWE 28 +#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE) + #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ - MMF_DISABLE_THP_MASK) + MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK) #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index d500ea967dc73..d09d54be4ffd9 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -92,14 +92,8 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); -extern bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, - pgoff_t index, bool shmem_huge_force); -static inline bool shmem_huge_enabled(struct vm_area_struct *vma, - bool shmem_huge_force) -{ - return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff, - shmem_huge_force); -} +extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, + struct mm_struct *mm, unsigned long vm_flags); extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 9ca7798d7a318..a828fbece1ba3 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -1,11 +1,22 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * A generic stack depot implementation + * Stack depot - a stack trace storage that avoids duplication. + * + * Stack depot is intended to be used by subsystems that need to store and + * later retrieve many potentially duplicated stack traces without wasting + * memory. + * + * For example, KASAN needs to save allocation and free stack traces for each + * object. Storing two stack traces per object requires a lot of memory (e.g. + * SLUB_DEBUG needs 256 bytes per object for that). Since allocation and free + * stack traces often repeat, using stack depot allows to save about 100x space. + * + * Stack traces are never removed from stack depot. * * Author: Alexander Potapenko * Copyright (C) 2016 Google, Inc. * - * Based on code by Dmitry Chernenkov. + * Based on the code by Dmitry Chernenkov. */ #ifndef _LINUX_STACKDEPOT_H @@ -14,62 +25,143 @@ #include typedef u32 depot_stack_handle_t; + /* * Number of bits in the handle that stack depot doesn't use. Users may store - * information in them. + * information in them via stack_depot_set/get_extra_bits. */ #define STACK_DEPOT_EXTRA_BITS 5 -depot_stack_handle_t __stack_depot_save(unsigned long *entries, - unsigned int nr_entries, - unsigned int extra_bits, - gfp_t gfp_flags, bool can_alloc); - /* - * Every user of stack depot has to call stack_depot_init() during its own init - * when it's decided that it will be calling stack_depot_save() later. This is - * recommended for e.g. modules initialized later in the boot process, when - * slab_is_available() is true. - * - * The alternative is to select STACKDEPOT_ALWAYS_INIT to have stack depot - * enabled as part of mm_init(), for subsystems where it's known at compile time - * that stack depot will be used. - * - * Another alternative is to call stack_depot_want_early_init(), when the - * decision to use stack depot is taken e.g. when evaluating kernel boot - * parameters, which precedes the enablement point in mm_init(). - * - * stack_depot_init() and stack_depot_want_early_init() can be called regardless - * of CONFIG_STACKDEPOT and are no-op when disabled. The actual save/fetch/print - * functions should only be called from code that makes sure CONFIG_STACKDEPOT - * is enabled. + * Using stack depot requires its initialization, which can be done in 3 ways: + * + * 1. Selecting CONFIG_STACKDEPOT_ALWAYS_INIT. This option is suitable in + * scenarios where it's known at compile time that stack depot will be used. + * Enabling this config makes the kernel initialize stack depot in mm_init(). + * + * 2. Calling stack_depot_request_early_init() during early boot, before + * stack_depot_early_init() in mm_init() completes. For example, this can + * be done when evaluating kernel boot parameters. + * + * 3. Calling stack_depot_init(). Possible after boot is complete. This option + * is recommended for modules initialized later in the boot process, after + * mm_init() completes. + * + * stack_depot_init() and stack_depot_request_early_init() can be called + * regardless of whether CONFIG_STACKDEPOT is enabled and are no-op when this + * config is disabled. The save/fetch/print stack depot functions can only be + * called from the code that makes sure CONFIG_STACKDEPOT is enabled _and_ + * initializes stack depot via one of the ways listed above. */ #ifdef CONFIG_STACKDEPOT int stack_depot_init(void); -void __init stack_depot_want_early_init(void); +void __init stack_depot_request_early_init(void); -/* This is supposed to be called only from mm_init() */ +/* Must be only called from mm_init(). */ int __init stack_depot_early_init(void); #else static inline int stack_depot_init(void) { return 0; } -static inline void stack_depot_want_early_init(void) { } +static inline void stack_depot_request_early_init(void) { } static inline int stack_depot_early_init(void) { return 0; } #endif +/** + * __stack_depot_save - Save a stack trace to stack depot + * + * @entries: Pointer to the stack trace + * @nr_entries: Number of frames in the stack + * @alloc_flags: Allocation GFP flags + * @can_alloc: Allocate stack slabs (increased chance of failure if false) + * + * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is + * %true, stack depot can replenish the stack slab pool in case no space is left + * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids + * any allocations and fails if no space is left to store the stack trace. + * + * If the provided stack trace comes from the interrupt context, only the part + * up to the interrupt entry is saved. + * + * Context: Any context, but setting @can_alloc to %false is required if + * alloc_pages() cannot be used from the current context. Currently + * this is the case for contexts where neither %GFP_ATOMIC nor + * %GFP_NOWAIT can be used (NMI, raw_spin_lock). + * + * Return: Handle of the stack struct stored in depot, 0 on failure + */ +depot_stack_handle_t __stack_depot_save(unsigned long *entries, + unsigned int nr_entries, + gfp_t gfp_flags, bool can_alloc); + +/** + * stack_depot_save - Save a stack trace to stack depot + * + * @entries: Pointer to the stack trace + * @nr_entries: Number of frames in the stack + * @alloc_flags: Allocation GFP flags + * + * Context: Contexts where allocations via alloc_pages() are allowed. + * See __stack_depot_save() for more details. + * + * Return: Handle of the stack trace stored in depot, 0 on failure + */ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); +/** + * stack_depot_fetch - Fetch a stack trace from stack depot + * + * @handle: Stack depot handle returned from stack_depot_save() + * @entries: Pointer to store the address of the stack trace + * + * Return: Number of frames for the fetched stack + */ unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); -unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); +/** + * stack_depot_print - Print a stack trace from stack depot + * + * @stack: Stack depot handle returned from stack_depot_save() + */ +void stack_depot_print(depot_stack_handle_t stack); +/** + * stack_depot_snprint - Print a stack trace from stack depot into a buffer + * + * @handle: Stack depot handle returned from stack_depot_save() + * @buf: Pointer to the print buffer + * @size: Size of the print buffer + * @spaces: Number of leading spaces to print + * + * Return: Number of bytes printed + */ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces); -void stack_depot_print(depot_stack_handle_t stack); +/** + * stack_depot_set_extra_bits - Set extra bits in a stack depot handle + * + * @handle: Stack depot handle + * @extra_bits: Value to set the extra bits + * + * Return: Stack depot handle with extra bits set + * + * Stack depot handles have a few unused bits, which can be used for storing + * user-specific information. These bits are transparent to the stack depot. + */ +depot_stack_handle_t stack_depot_set_extra_bits(depot_stack_handle_t handle, + unsigned int extra_bits); + +/** + * stack_depot_get_extra_bits - Retrieve extra bits from a stack depot handle + * + * @handle: Stack depot handle with extra bits saved + * + * Return: Extra bits retrieved from the stack depot handle + */ +unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); #endif diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h index 43db6e47503c7..6bb460c3e818b 100644 --- a/include/linux/util_macros.h +++ b/include/linux/util_macros.h @@ -2,6 +2,8 @@ #ifndef _LINUX_HELPER_MACROS_H_ #define _LINUX_HELPER_MACROS_H_ +#include + #define __find_closest(x, a, as, op) \ ({ \ typeof(as) __fc_i, __fc_as = (as) - 1; \ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 096d48aa34373..69250efa03d13 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -76,6 +76,7 @@ struct vmap_area { unsigned long subtree_max_size; /* in "free" tree */ struct vm_struct *vm; /* in "busy" tree */ }; + unsigned long flags; /* mark type of vm_map_ram area */ }; /* archs that select HAVE_ARCH_HUGE_VMAP should override one or more of these */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 2554b71765e9d..46020373e155b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -207,7 +207,7 @@ static inline void wait_on_inode(struct inode *inode) #include #include -void __inode_attach_wb(struct inode *inode, struct page *page); +void __inode_attach_wb(struct inode *inode, struct folio *folio); void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) __releases(&inode->i_lock); @@ -222,16 +222,16 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb); /** * inode_attach_wb - associate an inode with its wb * @inode: inode of interest - * @page: page being dirtied (may be NULL) + * @folio: folio being dirtied (may be NULL) * * If @inode doesn't have its wb, associate it with the wb matching the - * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o + * memcg of @folio or, if @folio is NULL, %current. May be called w/ or w/o * @inode->i_lock. */ -static inline void inode_attach_wb(struct inode *inode, struct page *page) +static inline void inode_attach_wb(struct inode *inode, struct folio *folio) { if (!inode->i_wb) - __inode_attach_wb(inode, page); + __inode_attach_wb(inode, folio); } /** @@ -290,7 +290,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) #else /* CONFIG_CGROUP_WRITEBACK */ -static inline void inode_attach_wb(struct inode *inode, struct page *page) +static inline void inode_attach_wb(struct inode *inode, struct folio *folio) { } @@ -366,7 +366,7 @@ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, bool wb_over_bg_thresh(struct bdi_writeback *wb); -typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, +typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc, void *data); void tag_pages_for_writeback(struct address_space *mapping, diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 44dd6d6e01bcc..741703b45f61a 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1643,7 +1643,8 @@ static inline void xas_set_order(struct xa_state *xas, unsigned long index, * @update: Function to call when updating a node. * * The XArray can notify a caller after it has updated an xa_node. - * This is advanced functionality and is only needed by the page cache. + * This is advanced functionality and is only needed by the page + * cache and swap cache. */ static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update) { diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 3e6fb05852f9a..46cce509957ba 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -36,7 +36,8 @@ EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \ EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \ EM( SCAN_TRUNCATED, "truncated") \ - EMe(SCAN_PAGE_HAS_PRIVATE, "page_has_private") \ + EM( SCAN_PAGE_HAS_PRIVATE, "page_has_private") \ + EMe(SCAN_COPY_MC, "copy_poisoned_page") \ #undef EM #undef EMe diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 412b5a46374c0..9db52bc4ce192 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -31,7 +31,6 @@ gfpflag_string(__GFP_HIGHMEM), \ gfpflag_string(GFP_DMA32), \ gfpflag_string(__GFP_HIGH), \ - gfpflag_string(__GFP_ATOMIC), \ gfpflag_string(__GFP_IO), \ gfpflag_string(__GFP_FS), \ gfpflag_string(__GFP_NOWARN), \ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index f12f3df124687..99b5592cf297f 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -281,6 +281,12 @@ struct prctl_mm_map { # define PR_SME_VL_LEN_MASK 0xffff # define PR_SME_VL_INHERIT (1 << 17) /* inherit across exec */ +/* Memory deny write / execute */ +#define PR_SET_MDWE 67 +# define PR_MDWE_REFUSE_EXEC_GAIN 1 + +#define PR_GET_MDWE 68 + #define PR_SET_VMA 0x53564d41 # define PR_SET_VMA_ANON_NAME 0 diff --git a/init/initramfs.c b/init/initramfs.c index 62321883fe613..f6c112e30bd47 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -571,8 +572,7 @@ __setup("keepinitrd", keepinitrd_setup); static bool __initdata initramfs_async = true; static int __init initramfs_async_setup(char *str) { - strtobool(str, &initramfs_async); - return 1; + return kstrtobool(str, &initramfs_async) == 0; } __setup("initramfs_async=", initramfs_async_setup); diff --git a/init/main.c b/init/main.c index 669cb892e6c17..4425d1783d5c2 100644 --- a/init/main.c +++ b/init/main.c @@ -855,8 +855,8 @@ static void __init mm_init(void) pgtable_init(); debug_objects_mem_init(); vmalloc_init(); - /* Should be run after vmap initialization */ - if (early_page_ext_enabled()) + /* If no deferred init page_ext now, as vmap is fully initialized */ + if (!deferred_struct_pages) page_ext_init(); /* Should be run before the first non-init thread is created */ init_espfix_bsp(); @@ -1628,7 +1628,7 @@ static noinline void __init kernel_init_freeable(void) padata_init(); page_alloc_init_late(); /* Initialize page ext after all struct pages are initialized. */ - if (!early_page_ext_enabled()) + if (deferred_struct_pages) page_ext_init(); do_basic_setup(); diff --git a/ipc/shm.c b/ipc/shm.c index bd2fcc4d454e0..60e45e7045d44 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1786,8 +1786,8 @@ long ksys_shmdt(char __user *shmaddr) */ file = vma->vm_file; size = i_size_read(file_inode(vma->vm_file)); - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); - mas_pause(&vmi.mas); + do_vma_munmap(&vmi, vma, vma->vm_start, vma->vm_end, + NULL, false); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next @@ -1811,8 +1811,8 @@ long ksys_shmdt(char __user *shmaddr) if ((vma->vm_ops == &shm_vm_ops) && ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && (vma->vm_file == file)) { - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); - mas_pause(&vmi.mas); + do_vma_munmap(&vmi, vma, vma->vm_start, vma->vm_end, + NULL, false); } vma = vma_next(&vmi); diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 80f4b4d88aafa..8732e0aadf366 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -269,7 +269,7 @@ static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE) return -EPERM; } else { - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, @@ -290,7 +290,7 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma */ return -EPERM; } else { - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 99417b3875477..8df53c93eaa83 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -882,10 +882,10 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) /* set default open/close callbacks */ vma->vm_ops = &bpf_map_default_vmops; vma->vm_private_data = map; - vma->vm_flags &= ~VM_MAYEXEC; + vm_flags_clear(vma, VM_MAYEXEC); if (!(vma->vm_flags & VM_WRITE)) /* disallow re-mapping with PROT_WRITE */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); err = map->ops->map_mmap(map, vma); if (err) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 87ef6096823f3..755f5f08ab383 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -455,8 +455,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(page, compound_dtor); - VMCOREINFO_OFFSET(page, compound_order); + VMCOREINFO_OFFSET(folio, _folio_dtor); + VMCOREINFO_OFFSET(folio, _folio_order); VMCOREINFO_OFFSET(page, compound_head); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); diff --git a/kernel/events/core.c b/kernel/events/core.c index 2a683e079258f..f79fd8b87f75e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6568,7 +6568,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. */ - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &perf_mmap_vmops; if (event->pmu->event_mapped) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d9e357b7e17c9..59887c69d54c1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -22,7 +22,6 @@ #include /* folio_free_swap */ #include /* user_enable_single_step */ #include /* notifier mechanism */ -#include "../../mm/internal.h" /* munlock_vma_page */ #include #include #include @@ -161,7 +160,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, int err; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); if (new_page) { @@ -1352,7 +1351,7 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma) } /* - * Called from mmap_region/vma_adjust with mm->mmap_lock acquired. + * Called from mmap_region/vma_merge with mm->mmap_lock acquired. * * Currently we ignore all errors and always return 0, the callers * can't handle the failure anyway. diff --git a/kernel/fork.c b/kernel/fork.c index 82b2b5846aae1..8d7c736a15a3c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -472,7 +472,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) * orig->shared.rb may be modified concurrently, but the clone * will be reinitialized. */ - *new = data_race(*orig); + data_race(memcpy(new, orig, sizeof(*new))); INIT_LIST_HEAD(&new->anon_vma_chain); dup_anon_vma_name(orig, new); } @@ -585,8 +585,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, int retval; unsigned long charge = 0; LIST_HEAD(uf); - MA_STATE(old_mas, &oldmm->mm_mt, 0, 0); - MA_STATE(mas, &mm->mm_mt, 0, 0); + VMA_ITERATOR(old_vmi, oldmm, 0); + VMA_ITERATOR(vmi, mm, 0); uprobe_start_dup_mmap(); if (mmap_write_lock_killable(oldmm)) { @@ -613,11 +613,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; khugepaged_fork(mm, oldmm); - retval = mas_expected_entries(&mas, oldmm->map_count); + retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count); if (retval) goto out; - mas_for_each(&old_mas, mpnt, ULONG_MAX) { + for_each_vma(old_vmi, mpnt) { struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { @@ -659,7 +659,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp->anon_vma = NULL; } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); + vm_flags_clear(tmp, VM_LOCKED_MASK); file = tmp->vm_file; if (file) { struct address_space *mapping = file->f_mapping; @@ -683,11 +683,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, hugetlb_dup_vma_private(tmp); /* Link the vma into the MT */ - mas.index = tmp->vm_start; - mas.last = tmp->vm_end - 1; - mas_store(&mas, tmp); - if (mas_is_err(&mas)) - goto fail_nomem_mas_store; + if (vma_iter_bulk_store(&vmi, tmp)) + goto fail_nomem_vmi_store; mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) @@ -702,7 +699,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); loop_out: - mas_destroy(&mas); + vma_iter_free(&vmi); out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); @@ -712,7 +709,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, uprobe_end_dup_mmap(); return retval; -fail_nomem_mas_store: +fail_nomem_vmi_store: unlink_anon_vmas(tmp); fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); diff --git a/kernel/kcov.c b/kernel/kcov.c index e5cd09fd8a050..84c717337df08 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -489,7 +489,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) goto exit; } spin_unlock_irqrestore(&kcov->lock, flags); - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); for (off = 0; off < size; off += PAGE_SIZE) { page = vmalloc_to_page(kcov->area + off); res = vm_insert_page(vma, vma->vm_start + off, page); diff --git a/kernel/kexec.c b/kernel/kexec.c index cb8e6e6f983c7..92d301f987766 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -190,10 +190,12 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, static inline int kexec_load_check(unsigned long nr_segments, unsigned long flags) { + int image_type = (flags & KEXEC_ON_CRASH) ? + KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT; int result; /* We only trust the superuser with rebooting the system. */ - if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) + if (!kexec_load_permitted(image_type)) return -EPERM; /* Permit LSMs and IMA to fail the kexec */ diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index b1cf259854ca1..3d578c6fefee3 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -921,10 +921,64 @@ int kimage_load_segment(struct kimage *image, return result; } +struct kexec_load_limit { + /* Mutex protects the limit count. */ + struct mutex mutex; + int limit; +}; + +static struct kexec_load_limit load_limit_reboot = { + .mutex = __MUTEX_INITIALIZER(load_limit_reboot.mutex), + .limit = -1, +}; + +static struct kexec_load_limit load_limit_panic = { + .mutex = __MUTEX_INITIALIZER(load_limit_panic.mutex), + .limit = -1, +}; + struct kimage *kexec_image; struct kimage *kexec_crash_image; -int kexec_load_disabled; +static int kexec_load_disabled; + #ifdef CONFIG_SYSCTL +static int kexec_limit_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct kexec_load_limit *limit = table->data; + int val; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + }; + int ret; + + if (write) { + ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); + if (ret) + return ret; + + if (val < 0) + return -EINVAL; + + mutex_lock(&limit->mutex); + if (limit->limit != -1 && val >= limit->limit) + ret = -EINVAL; + else + limit->limit = val; + mutex_unlock(&limit->mutex); + + return ret; + } + + mutex_lock(&limit->mutex); + val = limit->limit; + mutex_unlock(&limit->mutex); + + return proc_dointvec(&tmp, write, buffer, lenp, ppos); +} + static struct ctl_table kexec_core_sysctls[] = { { .procname = "kexec_load_disabled", @@ -936,6 +990,18 @@ static struct ctl_table kexec_core_sysctls[] = { .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_ONE, }, + { + .procname = "kexec_load_limit_panic", + .data = &load_limit_panic, + .mode = 0644, + .proc_handler = kexec_limit_handler, + }, + { + .procname = "kexec_load_limit_reboot", + .data = &load_limit_reboot, + .mode = 0644, + .proc_handler = kexec_limit_handler, + }, { } }; @@ -947,6 +1013,32 @@ static int __init kexec_core_sysctl_init(void) late_initcall(kexec_core_sysctl_init); #endif +bool kexec_load_permitted(int kexec_image_type) +{ + struct kexec_load_limit *limit; + + /* + * Only the superuser can use the kexec syscall and if it has not + * been disabled. + */ + if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) + return false; + + /* Check limit counter and decrease it.*/ + limit = (kexec_image_type == KEXEC_TYPE_CRASH) ? + &load_limit_panic : &load_limit_reboot; + mutex_lock(&limit->mutex); + if (!limit->limit) { + mutex_unlock(&limit->mutex); + return false; + } + if (limit->limit != -1) + limit->limit--; + mutex_unlock(&limit->mutex); + + return true; +} + /* * No panic_cpu check version of crash_kexec(). This function is called * only when panic_cpu holds the current CPU number; this is the only CPU diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index dd5983010b7b0..f1a0e4e3fb5cd 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -326,11 +326,13 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, unsigned long, cmdline_len, const char __user *, cmdline_ptr, unsigned long, flags) { - int ret = 0, i; + int image_type = (flags & KEXEC_FILE_ON_CRASH) ? + KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT; struct kimage **dest_image, *image; + int ret = 0, i; /* We only trust the superuser with rebooting the system. */ - if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) + if (!kexec_load_permitted(image_type)) return -EPERM; /* Make sure we have a legal set of flags */ @@ -342,11 +344,12 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (!kexec_trylock()) return -EBUSY; - dest_image = &kexec_image; - if (flags & KEXEC_FILE_ON_CRASH) { + if (image_type == KEXEC_TYPE_CRASH) { dest_image = &kexec_crash_image; if (kexec_crash_image) arch_kexec_unprotect_crashkres(); + } else { + dest_image = &kexec_image; } if (flags & KEXEC_FILE_UNLOAD) diff --git a/kernel/kthread.c b/kernel/kthread.c index f97fd01a29325..7e6751b29101e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1382,6 +1382,10 @@ EXPORT_SYMBOL_GPL(kthread_flush_worker); * Flush and destroy @worker. The simple flush is enough because the kthread * worker API is used only in trivial scenarios. There are no multi-step state * machines needed. + * + * Note that this function is not responsible for handling delayed work, so + * caller should be responsible for queuing or canceling all delayed work items + * before invoke this function. */ void kthread_destroy_worker(struct kthread_worker *worker) { @@ -1393,6 +1397,7 @@ void kthread_destroy_worker(struct kthread_worker *worker) kthread_flush_worker(worker); kthread_stop(task); + WARN_ON(!list_empty(&worker->delayed_work_list)); WARN_ON(!list_empty(&worker->work_list)); kfree(worker); } diff --git a/kernel/relay.c b/kernel/relay.c index ef12532168d9c..9aa70ae53d24d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -91,7 +91,7 @@ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) return -EINVAL; vma->vm_ops = &relay_file_mmap_ops; - vma->vm_flags |= VM_DONTEXPAND; + vm_flags_set(vma, VM_DONTEXPAND); vma->vm_private_data = buf; return 0; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c46485d65d79..32d0f5acaa7ef 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2938,11 +2938,11 @@ static void task_numa_work(struct callback_head *work) struct task_struct *p = current; struct mm_struct *mm = p->mm; u64 runtime = p->se.sum_exec_runtime; - MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; long pages, virtpages; + struct vma_iterator vmi; SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); @@ -2995,16 +2995,16 @@ static void task_numa_work(struct callback_head *work) if (!mmap_read_trylock(mm)) return; - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); + vma_iter_init(&vmi, mm, start); + vma = vma_next(&vmi); if (!vma) { reset_ptenuma_scan(p); start = 0; - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); + vma_iter_set(&vmi, start); + vma = vma_next(&vmi); } - for (; vma; vma = mas_find(&mas, ULONG_MAX)) { + do { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { continue; @@ -3051,7 +3051,7 @@ static void task_numa_work(struct callback_head *work) cond_resched(); } while (end != vma->vm_end); - } + } for_each_vma(vmi, vma); out: /* diff --git a/kernel/sys.c b/kernel/sys.c index 8437cca2c2b44..5f40ccb6487ef 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2350,6 +2350,33 @@ static int prctl_set_vma(unsigned long opt, unsigned long start, } #endif /* CONFIG_ANON_VMA_NAME */ +static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, + unsigned long arg4, unsigned long arg5) +{ + if (arg3 || arg4 || arg5) + return -EINVAL; + + if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN)) + return -EINVAL; + + if (bits & PR_MDWE_REFUSE_EXEC_GAIN) + set_bit(MMF_HAS_MDWE, ¤t->mm->flags); + else if (test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + return -EPERM; /* Cannot unset the flag */ + + return 0; +} + +static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5) +{ + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + + return test_bit(MMF_HAS_MDWE, ¤t->mm->flags) ? + PR_MDWE_REFUSE_EXEC_GAIN : 0; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2625,6 +2652,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = sched_core_share_pid(arg2, arg3, arg4, arg5); break; #endif + case PR_SET_MDWE: + error = prctl_set_mdwe(arg2, arg3, arg4, arg5); + break; + case PR_GET_MDWE: + error = prctl_get_mdwe(arg2, arg3, arg4, arg5); + break; case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); break; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 54211dbd516c5..1d8e47bed3f11 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -229,7 +229,7 @@ void __put_user_ns(struct user_namespace *ns) EXPORT_SYMBOL(__put_user_ns); /** - * idmap_key struct holds the information necessary to find an idmapping in a + * struct idmap_key - holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ struct idmap_key { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b7d955b61f769..e1b160a0474db 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -752,77 +752,6 @@ config SHRINKER_DEBUG visibility into the kernel memory shrinkers subsystem. Disable it to avoid an extra memory footprint. -config HAVE_DEBUG_KMEMLEAK - bool - -config DEBUG_KMEMLEAK - bool "Kernel memory leak detector" - depends on DEBUG_KERNEL && HAVE_DEBUG_KMEMLEAK - select DEBUG_FS - select STACKTRACE if STACKTRACE_SUPPORT - select KALLSYMS - select CRC32 - select STACKDEPOT - select STACKDEPOT_ALWAYS_INIT if !DEBUG_KMEMLEAK_DEFAULT_OFF - help - Say Y here if you want to enable the memory leak - detector. The memory allocation/freeing is traced in a way - similar to the Boehm's conservative garbage collector, the - difference being that the orphan objects are not freed but - only shown in /sys/kernel/debug/kmemleak. Enabling this - feature will introduce an overhead to memory - allocations. See Documentation/dev-tools/kmemleak.rst for more - details. - - Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances - of finding leaks due to the slab objects poisoning. - - In order to access the kmemleak file, debugfs needs to be - mounted (usually at /sys/kernel/debug). - -config DEBUG_KMEMLEAK_MEM_POOL_SIZE - int "Kmemleak memory pool size" - depends on DEBUG_KMEMLEAK - range 200 1000000 - default 16000 - help - Kmemleak must track all the memory allocations to avoid - reporting false positives. Since memory may be allocated or - freed before kmemleak is fully initialised, use a static pool - of metadata objects to track such callbacks. After kmemleak is - fully initialised, this memory pool acts as an emergency one - if slab allocations fail. - -config DEBUG_KMEMLEAK_TEST - tristate "Simple test for the kernel memory leak detector" - depends on DEBUG_KMEMLEAK && m - help - This option enables a module that explicitly leaks memory. - - If unsure, say N. - -config DEBUG_KMEMLEAK_DEFAULT_OFF - bool "Default kmemleak to off" - depends on DEBUG_KMEMLEAK - help - Say Y here to disable kmemleak by default. It can then be enabled - on the command line via kmemleak=on. - -config DEBUG_KMEMLEAK_AUTO_SCAN - bool "Enable kmemleak auto scan thread on boot up" - default y - depends on DEBUG_KMEMLEAK - help - Depending on the cpu, kmemleak scan may be cpu intensive and can - stall user tasks at times. This option enables/disables automatic - kmemleak scan at boot up. - - Say N here to disable kmemleak auto scan thread to stop automatic - scanning. Disabling this option disables automatic reporting of - memory leaks. - - If unsure, say Y. - config DEBUG_STACK_USAGE bool "Stack utilization instrumentation" depends on DEBUG_KERNEL && !IA64 @@ -1256,13 +1185,16 @@ config DEBUG_TIMEKEEPING config DEBUG_PREEMPT bool "Debug preemptible kernel" depends on DEBUG_KERNEL && PREEMPTION && TRACE_IRQFLAGS_SUPPORT - default y help If you say Y here then the kernel will use a debug variant of the commonly used smp_processor_id() function and will print warnings if kernel code uses it in a preemption-unsafe way. Also, the kernel will detect preemption count underflows. + This option has potential to introduce high runtime overhead, + depending on workload as it triggers debugging routines for each + this_cpu operation. It should only be used for debugging purposes. + menu "Lock Debugging (spinlocks, mutexes, etc...)" config LOCK_DEBUGGING_SUPPORT @@ -2109,6 +2041,41 @@ menuconfig RUNTIME_TESTING_MENU if RUNTIME_TESTING_MENU +config TEST_DHRY + tristate "Dhrystone benchmark test" + help + Enable this to include the Dhrystone 2.1 benchmark. This test + calculates the number of Dhrystones per second, and the number of + DMIPS (Dhrystone MIPS) obtained when the Dhrystone score is divided + by 1757 (the number of Dhrystones per second obtained on the VAX + 11/780, nominally a 1 MIPS machine). + + To run the benchmark, it needs to be enabled explicitly, either from + the kernel command line (when built-in), or from userspace (when + built-in or modular. + + Run once during kernel boot: + + test_dhry.run + + Set number of iterations from kernel command line: + + test_dhry.iterations= + + Set number of iterations from userspace: + + echo > /sys/module/test_dhry/parameters/iterations + + Trigger manual run from userspace: + + echo y > /sys/module/test_dhry/parameters/run + + If the number of iterations is <= 0, the test will devise a suitable + number of iterations (test runs for at least 2s) automatically. + This process takes ca. 4s. + + If unsure, say N. + config LKDTM tristate "Linux Kernel Dump Test Tool Module" depends on DEBUG_FS diff --git a/lib/Makefile b/lib/Makefile index 5ec2d0310fe00..fd79becdcebc4 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -57,6 +57,8 @@ obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o obj-y += kstrtox.o obj-$(CONFIG_FIND_BIT_BENCHMARK) += find_bit_benchmark.o obj-$(CONFIG_TEST_BPF) += test_bpf.o +test_dhry-objs := dhry_1.o dhry_2.o dhry_run.o +obj-$(CONFIG_TEST_DHRY) += test_dhry.o obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o obj-$(CONFIG_TEST_BITOPS) += test_bitops.o CFLAGS_test_bitops.o += -Werror diff --git a/lib/dhry.h b/lib/dhry.h new file mode 100644 index 0000000000000..e1a4db8e252c3 --- /dev/null +++ b/lib/dhry.h @@ -0,0 +1,358 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + **************************************************************************** + * + * "DHRYSTONE" Benchmark Program + * ----------------------------- + * + * Version: C, Version 2.1 + * + * File: dhry.h (part 1 of 3) + * + * Date: May 25, 1988 + * + * Author: Reinhold P. Weicker + * Siemens AG, AUT E 51 + * Postfach 3220 + * 8520 Erlangen + * Germany (West) + * Phone: [+49]-9131-7-20330 + * (8-17 Central European Time) + * Usenet: ..!mcsun!unido!estevax!weicker + * + * Original Version (in Ada) published in + * "Communications of the ACM" vol. 27., no. 10 (Oct. 1984), + * pp. 1013 - 1030, together with the statistics + * on which the distribution of statements etc. is based. + * + * In this C version, the following C library functions are used: + * - strcpy, strcmp (inside the measurement loop) + * - printf, scanf (outside the measurement loop) + * In addition, Berkeley UNIX system calls "times ()" or "time ()" + * are used for execution time measurement. For measurements + * on other systems, these calls have to be changed. + * + * Collection of Results: + * Reinhold Weicker (address see above) and + * + * Rick Richardson + * PC Research. Inc. + * 94 Apple Orchard Drive + * Tinton Falls, NJ 07724 + * Phone: (201) 389-8963 (9-17 EST) + * Usenet: ...!uunet!pcrat!rick + * + * Please send results to Rick Richardson and/or Reinhold Weicker. + * Complete information should be given on hardware and software used. + * Hardware information includes: Machine type, CPU, type and size + * of caches; for microprocessors: clock frequency, memory speed + * (number of wait states). + * Software information includes: Compiler (and runtime library) + * manufacturer and version, compilation switches, OS version. + * The Operating System version may give an indication about the + * compiler; Dhrystone itself performs no OS calls in the measurement loop. + * + * The complete output generated by the program should be mailed + * such that at least some checks for correctness can be made. + * + *************************************************************************** + * + * History: This version C/2.1 has been made for two reasons: + * + * 1) There is an obvious need for a common C version of + * Dhrystone, since C is at present the most popular system + * programming language for the class of processors + * (microcomputers, minicomputers) where Dhrystone is used most. + * There should be, as far as possible, only one C version of + * Dhrystone such that results can be compared without + * restrictions. In the past, the C versions distributed + * by Rick Richardson (Version 1.1) and by Reinhold Weicker + * had small (though not significant) differences. + * + * 2) As far as it is possible without changes to the Dhrystone + * statistics, optimizing compilers should be prevented from + * removing significant statements. + * + * This C version has been developed in cooperation with + * Rick Richardson (Tinton Falls, NJ), it incorporates many + * ideas from the "Version 1.1" distributed previously by + * him over the UNIX network Usenet. + * I also thank Chaim Benedelac (National Semiconductor), + * David Ditzel (SUN), Earl Killian and John Mashey (MIPS), + * Alan Smith and Rafael Saavedra-Barrera (UC at Berkeley) + * for their help with comments on earlier versions of the + * benchmark. + * + * Changes: In the initialization part, this version follows mostly + * Rick Richardson's version distributed via Usenet, not the + * version distributed earlier via floppy disk by Reinhold Weicker. + * As a concession to older compilers, names have been made + * unique within the first 8 characters. + * Inside the measurement loop, this version follows the + * version previously distributed by Reinhold Weicker. + * + * At several places in the benchmark, code has been added, + * but within the measurement loop only in branches that + * are not executed. The intention is that optimizing compilers + * should be prevented from moving code out of the measurement + * loop, or from removing code altogether. Since the statements + * that are executed within the measurement loop have NOT been + * changed, the numbers defining the "Dhrystone distribution" + * (distribution of statements, operand types and locality) + * still hold. Except for sophisticated optimizing compilers, + * execution times for this version should be the same as + * for previous versions. + * + * Since it has proven difficult to subtract the time for the + * measurement loop overhead in a correct way, the loop check + * has been made a part of the benchmark. This does have + * an impact - though a very minor one - on the distribution + * statistics which have been updated for this version. + * + * All changes within the measurement loop are described + * and discussed in the companion paper "Rationale for + * Dhrystone version 2". + * + * Because of the self-imposed limitation that the order and + * distribution of the executed statements should not be + * changed, there are still cases where optimizing compilers + * may not generate code for some statements. To a certain + * degree, this is unavoidable for small synthetic benchmarks. + * Users of the benchmark are advised to check code listings + * whether code is generated for all statements of Dhrystone. + * + * Version 2.1 is identical to version 2.0 distributed via + * the UNIX network Usenet in March 1988 except that it corrects + * some minor deficiencies that were found by users of version 2.0. + * The only change within the measurement loop is that a + * non-executed "else" part was added to the "if" statement in + * Func_3, and a non-executed "else" part removed from Proc_3. + * + *************************************************************************** + * + * Compilation model and measurement (IMPORTANT): + * + * This C version of Dhrystone consists of three files: + * - dhry.h (this file, containing global definitions and comments) + * - dhry_1.c (containing the code corresponding to Ada package Pack_1) + * - dhry_2.c (containing the code corresponding to Ada package Pack_2) + * + * The following "ground rules" apply for measurements: + * - Separate compilation + * - No procedure merging + * - Otherwise, compiler optimizations are allowed but should be indicated + * - Default results are those without register declarations + * See the companion paper "Rationale for Dhrystone Version 2" for a more + * detailed discussion of these ground rules. + * + * For 16-Bit processors (e.g. 80186, 80286), times for all compilation + * models ("small", "medium", "large" etc.) should be given if possible, + * together with a definition of these models for the compiler system used. + * + ************************************************************************** + * + * Dhrystone (C version) statistics: + * + * [Comment from the first distribution, updated for version 2. + * Note that because of language differences, the numbers are slightly + * different from the Ada version.] + * + * The following program contains statements of a high level programming + * language (here: C) in a distribution considered representative: + * + * assignments 52 (51.0 %) + * control statements 33 (32.4 %) + * procedure, function calls 17 (16.7 %) + * + * 103 statements are dynamically executed. The program is balanced with + * respect to the three aspects: + * + * - statement type + * - operand type + * - operand locality + * operand global, local, parameter, or constant. + * + * The combination of these three aspects is balanced only approximately. + * + * 1. Statement Type: + * ----------------- number + * + * V1 = V2 9 + * (incl. V1 = F(..) + * V = Constant 12 + * Assignment, 7 + * with array element + * Assignment, 6 + * with record component + * -- + * 34 34 + * + * X = Y +|-|"&&"|"|" Z 5 + * X = Y +|-|"==" Constant 6 + * X = X +|- 1 3 + * X = Y *|/ Z 2 + * X = Expression, 1 + * two operators + * X = Expression, 1 + * three operators + * -- + * 18 18 + * + * if .... 14 + * with "else" 7 + * without "else" 7 + * executed 3 + * not executed 4 + * for ... 7 | counted every time + * while ... 4 | the loop condition + * do ... while 1 | is evaluated + * switch ... 1 + * break 1 + * declaration with 1 + * initialization + * -- + * 34 34 + * + * P (...) procedure call 11 + * user procedure 10 + * library procedure 1 + * X = F (...) + * function call 6 + * user function 5 + * library function 1 + * -- + * 17 17 + * --- + * 103 + * + * The average number of parameters in procedure or function calls + * is 1.82 (not counting the function values as implicit parameters). + * + * + * 2. Operators + * ------------ + * number approximate + * percentage + * + * Arithmetic 32 50.8 + * + * + 21 33.3 + * - 7 11.1 + * * 3 4.8 + * / (int div) 1 1.6 + * + * Comparison 27 42.8 + * + * == 9 14.3 + * /= 4 6.3 + * > 1 1.6 + * < 3 4.8 + * >= 1 1.6 + * <= 9 14.3 + * + * Logic 4 6.3 + * + * && (AND-THEN) 1 1.6 + * | (OR) 1 1.6 + * ! (NOT) 2 3.2 + * + * -- ----- + * 63 100.1 + * + * + * 3. Operand Type (counted once per operand reference): + * --------------- + * number approximate + * percentage + * + * Integer 175 72.3 % + * Character 45 18.6 % + * Pointer 12 5.0 % + * String30 6 2.5 % + * Array 2 0.8 % + * Record 2 0.8 % + * --- ------- + * 242 100.0 % + * + * When there is an access path leading to the final operand (e.g. a record + * component), only the final data type on the access path is counted. + * + * + * 4. Operand Locality: + * ------------------- + * number approximate + * percentage + * + * local variable 114 47.1 % + * global variable 22 9.1 % + * parameter 45 18.6 % + * value 23 9.5 % + * reference 22 9.1 % + * function result 6 2.5 % + * constant 55 22.7 % + * --- ------- + * 242 100.0 % + * + * + * The program does not compute anything meaningful, but it is syntactically + * and semantically correct. All variables have a value assigned to them + * before they are used as a source operand. + * + * There has been no explicit effort to account for the effects of a + * cache, or to balance the use of long or short displacements for code or + * data. + * + *************************************************************************** + */ + +typedef enum { + Ident_1, + Ident_2, + Ident_3, + Ident_4, + Ident_5 +} Enumeration; /* for boolean and enumeration types in Ada, Pascal */ + +/* General definitions: */ + +typedef int One_Thirty; +typedef int One_Fifty; +typedef char Capital_Letter; +typedef int Boolean; +typedef char Str_30[31]; +typedef int Arr_1_Dim[50]; +typedef int Arr_2_Dim[50][50]; + +typedef struct record { + struct record *Ptr_Comp; + Enumeration Discr; + union { + struct { + Enumeration Enum_Comp; + int Int_Comp; + char Str_Comp[31]; + } var_1; + struct { + Enumeration E_Comp_2; + char Str_2_Comp[31]; + } var_2; + struct { + char Ch_1_Comp; + char Ch_2_Comp; + } var_3; + } variant; +} Rec_Type, *Rec_Pointer; + + +extern int Int_Glob; +extern char Ch_1_Glob; + +void Proc_6(Enumeration Enum_Val_Par, Enumeration *Enum_Ref_Par); +void Proc_7(One_Fifty Int_1_Par_Val, One_Fifty Int_2_Par_Val, + One_Fifty *Int_Par_Ref); +void Proc_8(Arr_1_Dim Arr_1_Par_Ref, Arr_2_Dim Arr_2_Par_Ref, + int Int_1_Par_Val, int Int_2_Par_Val); +Enumeration Func_1(Capital_Letter Ch_1_Par_Val, Capital_Letter Ch_2_Par_Val); +Boolean Func_2(Str_30 Str_1_Par_Ref, Str_30 Str_2_Par_Ref); + +int dhry(int n); diff --git a/lib/dhry_1.c b/lib/dhry_1.c new file mode 100644 index 0000000000000..83247106824cc --- /dev/null +++ b/lib/dhry_1.c @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* + **************************************************************************** + * + * "DHRYSTONE" Benchmark Program + * ----------------------------- + * + * Version: C, Version 2.1 + * + * File: dhry_1.c (part 2 of 3) + * + * Date: May 25, 1988 + * + * Author: Reinhold P. Weicker + * + **************************************************************************** + */ + +#include "dhry.h" + +#include +#include +#include + +/* Global Variables: */ + +int Int_Glob; +char Ch_1_Glob; + +static Rec_Pointer Ptr_Glob, Next_Ptr_Glob; +static Boolean Bool_Glob; +static char Ch_2_Glob; +static int Arr_1_Glob[50]; +static int Arr_2_Glob[50][50]; + +static void Proc_3(Rec_Pointer *Ptr_Ref_Par) +/******************/ +/* executed once */ +/* Ptr_Ref_Par becomes Ptr_Glob */ +{ + if (Ptr_Glob) { + /* then, executed */ + *Ptr_Ref_Par = Ptr_Glob->Ptr_Comp; + } + Proc_7(10, Int_Glob, &Ptr_Glob->variant.var_1.Int_Comp); +} /* Proc_3 */ + + +static void Proc_1(Rec_Pointer Ptr_Val_Par) +/******************/ +/* executed once */ +{ + Rec_Pointer Next_Record = Ptr_Val_Par->Ptr_Comp; + /* == Ptr_Glob_Next */ + /* Local variable, initialized with Ptr_Val_Par->Ptr_Comp, */ + /* corresponds to "rename" in Ada, "with" in Pascal */ + + *Ptr_Val_Par->Ptr_Comp = *Ptr_Glob; + Ptr_Val_Par->variant.var_1.Int_Comp = 5; + Next_Record->variant.var_1.Int_Comp = + Ptr_Val_Par->variant.var_1.Int_Comp; + Next_Record->Ptr_Comp = Ptr_Val_Par->Ptr_Comp; + Proc_3(&Next_Record->Ptr_Comp); + /* Ptr_Val_Par->Ptr_Comp->Ptr_Comp == Ptr_Glob->Ptr_Comp */ + if (Next_Record->Discr == Ident_1) { + /* then, executed */ + Next_Record->variant.var_1.Int_Comp = 6; + Proc_6(Ptr_Val_Par->variant.var_1.Enum_Comp, + &Next_Record->variant.var_1.Enum_Comp); + Next_Record->Ptr_Comp = Ptr_Glob->Ptr_Comp; + Proc_7(Next_Record->variant.var_1.Int_Comp, 10, + &Next_Record->variant.var_1.Int_Comp); + } else { + /* not executed */ + *Ptr_Val_Par = *Ptr_Val_Par->Ptr_Comp; + } +} /* Proc_1 */ + + +static void Proc_2(One_Fifty *Int_Par_Ref) +/******************/ +/* executed once */ +/* *Int_Par_Ref == 1, becomes 4 */ +{ + One_Fifty Int_Loc; + Enumeration Enum_Loc; + + Int_Loc = *Int_Par_Ref + 10; + do { + /* executed once */ + if (Ch_1_Glob == 'A') { + /* then, executed */ + Int_Loc -= 1; + *Int_Par_Ref = Int_Loc - Int_Glob; + Enum_Loc = Ident_1; + } /* if */ + } while (Enum_Loc != Ident_1); /* true */ +} /* Proc_2 */ + + +static void Proc_4(void) +/*******/ +/* executed once */ +{ + Boolean Bool_Loc; + + Bool_Loc = Ch_1_Glob == 'A'; + Bool_Glob = Bool_Loc | Bool_Glob; + Ch_2_Glob = 'B'; +} /* Proc_4 */ + + +static void Proc_5(void) +/*******/ +/* executed once */ +{ + Ch_1_Glob = 'A'; + Bool_Glob = false; +} /* Proc_5 */ + + +int dhry(int n) +/*****/ + + /* main program, corresponds to procedures */ + /* Main and Proc_0 in the Ada version */ +{ + One_Fifty Int_1_Loc; + One_Fifty Int_2_Loc; + One_Fifty Int_3_Loc; + char Ch_Index; + Enumeration Enum_Loc; + Str_30 Str_1_Loc; + Str_30 Str_2_Loc; + int Run_Index; + int Number_Of_Runs; + ktime_t Begin_Time, End_Time; + u32 User_Time; + + /* Initializations */ + + Next_Ptr_Glob = (Rec_Pointer)kzalloc(sizeof(Rec_Type), GFP_KERNEL); + Ptr_Glob = (Rec_Pointer)kzalloc(sizeof(Rec_Type), GFP_KERNEL); + + Ptr_Glob->Ptr_Comp = Next_Ptr_Glob; + Ptr_Glob->Discr = Ident_1; + Ptr_Glob->variant.var_1.Enum_Comp = Ident_3; + Ptr_Glob->variant.var_1.Int_Comp = 40; + strcpy(Ptr_Glob->variant.var_1.Str_Comp, + "DHRYSTONE PROGRAM, SOME STRING"); + strcpy(Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING"); + + Arr_2_Glob[8][7] = 10; + /* Was missing in published program. Without this statement, */ + /* Arr_2_Glob[8][7] would have an undefined value. */ + /* Warning: With 16-Bit processors and Number_Of_Runs > 32000, */ + /* overflow may occur for this array element. */ + + pr_debug("Dhrystone Benchmark, Version 2.1 (Language: C)\n"); + + Number_Of_Runs = n; + + pr_debug("Execution starts, %d runs through Dhrystone\n", + Number_Of_Runs); + + /***************/ + /* Start timer */ + /***************/ + + Begin_Time = ktime_get(); + + for (Run_Index = 1; Run_Index <= Number_Of_Runs; ++Run_Index) { + Proc_5(); + Proc_4(); + /* Ch_1_Glob == 'A', Ch_2_Glob == 'B', Bool_Glob == true */ + Int_1_Loc = 2; + Int_2_Loc = 3; + strcpy(Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING"); + Enum_Loc = Ident_2; + Bool_Glob = !Func_2(Str_1_Loc, Str_2_Loc); + /* Bool_Glob == 1 */ + while (Int_1_Loc < Int_2_Loc) { + /* loop body executed once */ + Int_3_Loc = 5 * Int_1_Loc - Int_2_Loc; + /* Int_3_Loc == 7 */ + Proc_7(Int_1_Loc, Int_2_Loc, &Int_3_Loc); + /* Int_3_Loc == 7 */ + Int_1_Loc += 1; + } /* while */ + /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */ + Proc_8(Arr_1_Glob, Arr_2_Glob, Int_1_Loc, Int_3_Loc); + /* Int_Glob == 5 */ + Proc_1(Ptr_Glob); + for (Ch_Index = 'A'; Ch_Index <= Ch_2_Glob; ++Ch_Index) { + /* loop body executed twice */ + if (Enum_Loc == Func_1(Ch_Index, 'C')) { + /* then, not executed */ + Proc_6(Ident_1, &Enum_Loc); + strcpy(Str_2_Loc, "DHRYSTONE PROGRAM, 3'RD STRING"); + Int_2_Loc = Run_Index; + Int_Glob = Run_Index; + } + } + /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */ + Int_2_Loc = Int_2_Loc * Int_1_Loc; + Int_1_Loc = Int_2_Loc / Int_3_Loc; + Int_2_Loc = 7 * (Int_2_Loc - Int_3_Loc) - Int_1_Loc; + /* Int_1_Loc == 1, Int_2_Loc == 13, Int_3_Loc == 7 */ + Proc_2(&Int_1_Loc); + /* Int_1_Loc == 5 */ + + } /* loop "for Run_Index" */ + + /**************/ + /* Stop timer */ + /**************/ + + End_Time = ktime_get(); + +#define dhry_assert_int_eq(val, expected) \ + if (val != expected) \ + pr_err("%s: %d (FAIL, expected %d)\n", #val, val, \ + expected); \ + else \ + pr_debug("%s: %d (OK)\n", #val, val) + +#define dhry_assert_char_eq(val, expected) \ + if (val != expected) \ + pr_err("%s: %c (FAIL, expected %c)\n", #val, val, \ + expected); \ + else \ + pr_debug("%s: %c (OK)\n", #val, val) + +#define dhry_assert_string_eq(val, expected) \ + if (strcmp(val, expected)) \ + pr_err("%s: %s (FAIL, expected %s)\n", #val, val, \ + expected); \ + else \ + pr_debug("%s: %s (OK)\n", #val, val) + + pr_debug("Execution ends\n"); + pr_debug("Final values of the variables used in the benchmark:\n"); + dhry_assert_int_eq(Int_Glob, 5); + dhry_assert_int_eq(Bool_Glob, 1); + dhry_assert_char_eq(Ch_1_Glob, 'A'); + dhry_assert_char_eq(Ch_2_Glob, 'B'); + dhry_assert_int_eq(Arr_1_Glob[8], 7); + dhry_assert_int_eq(Arr_2_Glob[8][7], Number_Of_Runs + 10); + pr_debug("Ptr_Comp: %px\n", Ptr_Glob->Ptr_Comp); + dhry_assert_int_eq(Ptr_Glob->Discr, 0); + dhry_assert_int_eq(Ptr_Glob->variant.var_1.Enum_Comp, 2); + dhry_assert_int_eq(Ptr_Glob->variant.var_1.Int_Comp, 17); + dhry_assert_string_eq(Ptr_Glob->variant.var_1.Str_Comp, + "DHRYSTONE PROGRAM, SOME STRING"); + if (Next_Ptr_Glob->Ptr_Comp != Ptr_Glob->Ptr_Comp) + pr_err("Next_Ptr_Glob->Ptr_Comp: %px (expected %px)\n", + Next_Ptr_Glob->Ptr_Comp, Ptr_Glob->Ptr_Comp); + else + pr_debug("Next_Ptr_Glob->Ptr_Comp: %px\n", + Next_Ptr_Glob->Ptr_Comp); + dhry_assert_int_eq(Next_Ptr_Glob->Discr, 0); + dhry_assert_int_eq(Next_Ptr_Glob->variant.var_1.Enum_Comp, 1); + dhry_assert_int_eq(Next_Ptr_Glob->variant.var_1.Int_Comp, 18); + dhry_assert_string_eq(Next_Ptr_Glob->variant.var_1.Str_Comp, + "DHRYSTONE PROGRAM, SOME STRING"); + dhry_assert_int_eq(Int_1_Loc, 5); + dhry_assert_int_eq(Int_2_Loc, 13); + dhry_assert_int_eq(Int_3_Loc, 7); + dhry_assert_int_eq(Enum_Loc, 1); + dhry_assert_string_eq(Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING"); + dhry_assert_string_eq(Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING"); + + User_Time = ktime_to_ms(ktime_sub(End_Time, Begin_Time)); + + kfree(Ptr_Glob); + kfree(Next_Ptr_Glob); + + /* Measurements should last at least 2 seconds */ + if (User_Time < 2 * MSEC_PER_SEC) + return -EAGAIN; + + return div_u64(mul_u32_u32(MSEC_PER_SEC, Number_Of_Runs), User_Time); +} diff --git a/lib/dhry_2.c b/lib/dhry_2.c new file mode 100644 index 0000000000000..c19e661f37d30 --- /dev/null +++ b/lib/dhry_2.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* + **************************************************************************** + * + * "DHRYSTONE" Benchmark Program + * ----------------------------- + * + * Version: C, Version 2.1 + * + * File: dhry_2.c (part 3 of 3) + * + * Date: May 25, 1988 + * + * Author: Reinhold P. Weicker + * + **************************************************************************** + */ + +#include "dhry.h" + +#include + + +static Boolean Func_3(Enumeration Enum_Par_Val) +/***************************/ +/* executed once */ +/* Enum_Par_Val == Ident_3 */ +{ + Enumeration Enum_Loc; + + Enum_Loc = Enum_Par_Val; + if (Enum_Loc == Ident_3) { + /* then, executed */ + return true; + } else { + /* not executed */ + return false; + } +} /* Func_3 */ + + +void Proc_6(Enumeration Enum_Val_Par, Enumeration *Enum_Ref_Par) +/*********************************/ +/* executed once */ +/* Enum_Val_Par == Ident_3, Enum_Ref_Par becomes Ident_2 */ +{ + *Enum_Ref_Par = Enum_Val_Par; + if (!Func_3(Enum_Val_Par)) { + /* then, not executed */ + *Enum_Ref_Par = Ident_4; + } + switch (Enum_Val_Par) { + case Ident_1: + *Enum_Ref_Par = Ident_1; + break; + case Ident_2: + if (Int_Glob > 100) { + /* then */ + *Enum_Ref_Par = Ident_1; + } else { + *Enum_Ref_Par = Ident_4; + } + break; + case Ident_3: /* executed */ + *Enum_Ref_Par = Ident_2; + break; + case Ident_4: + break; + case Ident_5: + *Enum_Ref_Par = Ident_3; + break; + } /* switch */ +} /* Proc_6 */ + + +void Proc_7(One_Fifty Int_1_Par_Val, One_Fifty Int_2_Par_Val, One_Fifty *Int_Par_Ref) +/**********************************************/ +/* executed three times */ +/* first call: Int_1_Par_Val == 2, Int_2_Par_Val == 3, */ +/* Int_Par_Ref becomes 7 */ +/* second call: Int_1_Par_Val == 10, Int_2_Par_Val == 5, */ +/* Int_Par_Ref becomes 17 */ +/* third call: Int_1_Par_Val == 6, Int_2_Par_Val == 10, */ +/* Int_Par_Ref becomes 18 */ +{ + One_Fifty Int_Loc; + + Int_Loc = Int_1_Par_Val + 2; + *Int_Par_Ref = Int_2_Par_Val + Int_Loc; +} /* Proc_7 */ + + +void Proc_8(Arr_1_Dim Arr_1_Par_Ref, Arr_2_Dim Arr_2_Par_Ref, int Int_1_Par_Val, int Int_2_Par_Val) +/*********************************************************************/ +/* executed once */ +/* Int_Par_Val_1 == 3 */ +/* Int_Par_Val_2 == 7 */ +{ + One_Fifty Int_Index; + One_Fifty Int_Loc; + + Int_Loc = Int_1_Par_Val + 5; + Arr_1_Par_Ref[Int_Loc] = Int_2_Par_Val; + Arr_1_Par_Ref[Int_Loc+1] = Arr_1_Par_Ref[Int_Loc]; + Arr_1_Par_Ref[Int_Loc+30] = Int_Loc; + for (Int_Index = Int_Loc; Int_Index <= Int_Loc+1; ++Int_Index) + Arr_2_Par_Ref[Int_Loc][Int_Index] = Int_Loc; + Arr_2_Par_Ref[Int_Loc][Int_Loc-1] += 1; + Arr_2_Par_Ref[Int_Loc+20][Int_Loc] = Arr_1_Par_Ref[Int_Loc]; + Int_Glob = 5; +} /* Proc_8 */ + + +Enumeration Func_1(Capital_Letter Ch_1_Par_Val, Capital_Letter Ch_2_Par_Val) +/*************************************************/ +/* executed three times */ +/* first call: Ch_1_Par_Val == 'H', Ch_2_Par_Val == 'R' */ +/* second call: Ch_1_Par_Val == 'A', Ch_2_Par_Val == 'C' */ +/* third call: Ch_1_Par_Val == 'B', Ch_2_Par_Val == 'C' */ +{ + Capital_Letter Ch_1_Loc; + Capital_Letter Ch_2_Loc; + + Ch_1_Loc = Ch_1_Par_Val; + Ch_2_Loc = Ch_1_Loc; + if (Ch_2_Loc != Ch_2_Par_Val) { + /* then, executed */ + return Ident_1; + } else { + /* not executed */ + Ch_1_Glob = Ch_1_Loc; + return Ident_2; + } +} /* Func_1 */ + + +Boolean Func_2(Str_30 Str_1_Par_Ref, Str_30 Str_2_Par_Ref) +/*************************************************/ +/* executed once */ +/* Str_1_Par_Ref == "DHRYSTONE PROGRAM, 1'ST STRING" */ +/* Str_2_Par_Ref == "DHRYSTONE PROGRAM, 2'ND STRING" */ +{ + One_Thirty Int_Loc; + Capital_Letter Ch_Loc; + + Int_Loc = 2; + while (Int_Loc <= 2) { + /* loop body executed once */ + if (Func_1(Str_1_Par_Ref[Int_Loc], + Str_2_Par_Ref[Int_Loc+1]) == Ident_1) { + /* then, executed */ + Ch_Loc = 'A'; + Int_Loc += 1; + } + } /* if, while */ + if (Ch_Loc >= 'W' && Ch_Loc < 'Z') { + /* then, not executed */ + Int_Loc = 7; + } + if (Ch_Loc == 'R') { + /* then, not executed */ + return true; + } else { + /* executed */ + if (strcmp(Str_1_Par_Ref, Str_2_Par_Ref) > 0) { + /* then, not executed */ + Int_Loc += 7; + Int_Glob = Int_Loc; + return true; + } else { + /* executed */ + return false; + } + } /* if Ch_Loc */ +} /* Func_2 */ diff --git a/lib/dhry_run.c b/lib/dhry_run.c new file mode 100644 index 0000000000000..f9d33efa6d090 --- /dev/null +++ b/lib/dhry_run.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Dhrystone benchmark test module + * + * Copyright (C) 2022 Glider bv + */ + +#include "dhry.h" + +#include +#include +#include +#include +#include + +#define DHRY_VAX 1757 + +static int dhry_run_set(const char *val, const struct kernel_param *kp); +static const struct kernel_param_ops run_ops = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = dhry_run_set, +}; +static bool dhry_run; +module_param_cb(run, &run_ops, &dhry_run, 0200); +MODULE_PARM_DESC(run, "Run the test (default: false)"); + +static int iterations = -1; +module_param(iterations, int, 0644); +MODULE_PARM_DESC(iterations, + "Number of iterations through the benchmark (default: auto)"); + +static void dhry_benchmark(void) +{ + int i, n; + + if (iterations > 0) { + n = dhry(iterations); + goto report; + } + + for (i = DHRY_VAX; i > 0; i <<= 1) { + n = dhry(i); + if (n != -EAGAIN) + break; + } + +report: + if (n >= 0) + pr_info("CPU%u: Dhrystones per Second: %d (%d DMIPS)\n", + smp_processor_id(), n, n / DHRY_VAX); + else if (n == -EAGAIN) + pr_err("Please increase the number of iterations\n"); + else + pr_err("Dhrystone benchmark failed error %pe\n", ERR_PTR(n)); +} + +static int dhry_run_set(const char *val, const struct kernel_param *kp) +{ + int ret; + + if (val) { + ret = param_set_bool(val, kp); + if (ret) + return ret; + } else { + dhry_run = true; + } + + if (dhry_run && system_state == SYSTEM_RUNNING) + dhry_benchmark(); + + return 0; +} + +static int __init dhry_init(void) +{ + if (dhry_run) + dhry_benchmark(); + + return 0; +} +module_init(dhry_init); + +MODULE_AUTHOR("Geert Uytterhoeven "); +MODULE_LICENSE("GPL"); diff --git a/lib/error-inject.c b/lib/error-inject.c index 1afca1b1cdead..32c14770508ec 100644 --- a/lib/error-inject.c +++ b/lib/error-inject.c @@ -40,7 +40,7 @@ bool within_error_injection_list(unsigned long addr) int get_injectable_error_type(unsigned long addr) { struct ei_entry *ent; - int ei_type = EI_ETYPE_NONE; + int ei_type = -EINVAL; mutex_lock(&ei_mutex); list_for_each_entry(ent, &error_injection_list, list) { diff --git a/lib/genalloc.c b/lib/genalloc.c index 00fc50d0a640e..0c883d6fbd44d 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -40,32 +40,30 @@ static inline size_t chunk_size(const struct gen_pool_chunk *chunk) return chunk->end_addr - chunk->start_addr + 1; } -static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set) +static inline int +set_bits_ll(unsigned long *addr, unsigned long mask_to_set) { - unsigned long val, nval; + unsigned long val = READ_ONCE(*addr); - nval = *addr; do { - val = nval; if (val & mask_to_set) return -EBUSY; cpu_relax(); - } while ((nval = cmpxchg(addr, val, val | mask_to_set)) != val); + } while (!try_cmpxchg(addr, &val, val | mask_to_set)); return 0; } -static int clear_bits_ll(unsigned long *addr, unsigned long mask_to_clear) +static inline int +clear_bits_ll(unsigned long *addr, unsigned long mask_to_clear) { - unsigned long val, nval; + unsigned long val = READ_ONCE(*addr); - nval = *addr; do { - val = nval; if ((val & mask_to_clear) != mask_to_clear) return -EBUSY; cpu_relax(); - } while ((nval = cmpxchg(addr, val, val & ~mask_to_clear)) != val); + } while (!try_cmpxchg(addr, &val, val & ~mask_to_clear)); return 0; } diff --git a/lib/maple_tree.c b/lib/maple_tree.c index baff62a012e18..5e97031892590 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4655,13 +4655,13 @@ static inline void *mas_next_nentry(struct ma_state *mas, pivots = ma_pivots(node, type); slots = ma_slots(node, type); mas->index = mas_safe_min(mas, pivots, mas->offset); + count = ma_data_end(node, type, pivots, mas->max); if (ma_dead_node(node)) return NULL; if (mas->index > max) return NULL; - count = ma_data_end(node, type, pivots, mas->max); if (mas->offset > count) return NULL; @@ -4732,6 +4732,11 @@ static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit) unsigned long last; enum maple_type mt; + if (mas->index > limit) { + mas->index = mas->last = limit; + mas_pause(mas); + return NULL; + } last = mas->last; retry: offset = mas->offset; @@ -4838,6 +4843,11 @@ static inline void *mas_prev_entry(struct ma_state *mas, unsigned long min) { void *entry; + if (mas->index < min) { + mas->index = mas->last = min; + mas->node = MAS_NONE; + return NULL; + } retry: while (likely(!mas_is_none(mas))) { entry = mas_prev_nentry(mas, min, mas->index); @@ -5579,8 +5589,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, /* * mte_destroy_walk() - Free a tree or sub-tree. - * @enode - the encoded maple node (maple_enode) to start - * @mn - the tree to free - needed for node types. + * @enode: the encoded maple node (maple_enode) to start + * @mt: the tree to free - needed for node types. * * Must hold the write lock. */ @@ -5599,6 +5609,9 @@ static inline void mte_destroy_walk(struct maple_enode *enode, static void mas_wr_store_setup(struct ma_wr_state *wr_mas) { + if (unlikely(mas_is_paused(wr_mas->mas))) + mas_reset(wr_mas->mas); + if (!mas_is_start(wr_mas->mas)) { if (mas_is_none(wr_mas->mas)) { mas_reset(wr_mas->mas); @@ -5700,12 +5713,11 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc); /** * mas_preallocate() - Preallocate enough nodes for a store operation * @mas: The maple state - * @entry: The entry that will be stored * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ -int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) +int mas_preallocate(struct ma_state *mas, gfp_t gfp) { int ret; @@ -5907,6 +5919,7 @@ void *mas_prev(struct ma_state *mas, unsigned long min) if (!mas->index) { /* Nothing comes before 0 */ mas->last = 0; + mas->node = MAS_NONE; return NULL; } @@ -5997,6 +6010,9 @@ void *mas_find(struct ma_state *mas, unsigned long max) mas->index = ++mas->last; } + if (unlikely(mas_is_none(mas))) + mas->node = MAS_START; + if (unlikely(mas_is_start(mas))) { /* First run or continue */ void *entry; diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 42f729c8e56c4..dba56c5c18379 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -73,28 +73,33 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount) EXPORT_SYMBOL(percpu_counter_set); /* - * This function is both preempt and irq safe. The former is due to explicit - * preemption disable. The latter is guaranteed by the fact that the slow path - * is explicitly protected by an irq-safe spinlock whereas the fast patch uses - * this_cpu_add which is irq-safe by definition. Hence there is no need muck - * with irq state before calling this one + * local_irq_save() is needed to make the function irq safe: + * - The slow path would be ok as protected by an irq-safe spinlock. + * - this_cpu_add would be ok as it is irq-safe by definition. + * But: + * The decision slow path/fast path and the actual update must be atomic, too. + * Otherwise a call in process context could check the current values and + * decide that the fast path can be used. If now an interrupt occurs before + * the this_cpu_add(), and the interrupt updates this_cpu(*fbc->counters), + * then the this_cpu_add() that is executed after the interrupt has completed + * can produce values larger than "batch" or even overflows. */ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; + unsigned long flags; - preempt_disable(); + local_irq_save(flags); count = __this_cpu_read(*fbc->counters) + amount; if (abs(count) >= batch) { - unsigned long flags; - raw_spin_lock_irqsave(&fbc->lock, flags); + raw_spin_lock(&fbc->lock); fbc->count += count; __this_cpu_sub(*fbc->counters, count - amount); - raw_spin_unlock_irqrestore(&fbc->lock, flags); + raw_spin_unlock(&fbc->lock); } else { this_cpu_add(*fbc->counters, amount); } - preempt_enable(); + local_irq_restore(flags); } EXPORT_SYMBOL(percpu_counter_add_batch); diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 79e894cf84064..d2ab4d29fa1a3 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -1,24 +1,19 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Generic stack depot for storing stack traces. + * Stack depot - a stack trace storage that avoids duplication. * - * Some debugging tools need to save stack traces of certain events which can - * be later presented to the user. For example, KASAN needs to safe alloc and - * free stacks for each object, but storing two stack traces per object - * requires too much memory (e.g. SLUB_DEBUG needs 256 bytes per object for - * that). - * - * Instead, stack depot maintains a hashtable of unique stacktraces. Since alloc - * and free stacks repeat a lot, we save about 100x space. - * Stacks are never removed from depot, so we store them contiguously one after - * another in a contiguous memory allocation. + * Internally, stack depot maintains a hash table of unique stacktraces. The + * stack traces themselves are stored contiguously one after another in a set + * of separate page allocations. * * Author: Alexander Potapenko * Copyright (C) 2016 Google, Inc. * - * Based on code by Dmitry Chernenkov. + * Based on the code by Dmitry Chernenkov. */ +#define pr_fmt(fmt) "stackdepot: " fmt + #include #include #include @@ -34,243 +29,277 @@ #include #include -#define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8) - -#define STACK_ALLOC_NULL_PROTECTION_BITS 1 -#define STACK_ALLOC_ORDER 2 /* 'Slab' size order for stack depot, 4 pages */ -#define STACK_ALLOC_SIZE (1LL << (PAGE_SHIFT + STACK_ALLOC_ORDER)) -#define STACK_ALLOC_ALIGN 4 -#define STACK_ALLOC_OFFSET_BITS (STACK_ALLOC_ORDER + PAGE_SHIFT - \ - STACK_ALLOC_ALIGN) -#define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - \ - STACK_ALLOC_NULL_PROTECTION_BITS - \ - STACK_ALLOC_OFFSET_BITS - STACK_DEPOT_EXTRA_BITS) -#define STACK_ALLOC_SLABS_CAP 8192 -#define STACK_ALLOC_MAX_SLABS \ - (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \ - (1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_SLABS_CAP) - -/* The compact structure to store the reference to stacks. */ +#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8) + +#define DEPOT_VALID_BITS 1 +#define DEPOT_SLAB_ORDER 2 /* Slab size order, 4 pages */ +#define DEPOT_SLAB_SIZE (1LL << (PAGE_SHIFT + DEPOT_SLAB_ORDER)) +#define DEPOT_STACK_ALIGN 4 +#define DEPOT_OFFSET_BITS (DEPOT_SLAB_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN) +#define DEPOT_SLAB_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_VALID_BITS - \ + DEPOT_OFFSET_BITS - STACK_DEPOT_EXTRA_BITS) +#define DEPOT_SLABS_CAP 8192 +#define DEPOT_MAX_SLABS \ + (((1LL << (DEPOT_SLAB_INDEX_BITS)) < DEPOT_SLABS_CAP) ? \ + (1LL << (DEPOT_SLAB_INDEX_BITS)) : DEPOT_SLABS_CAP) + +/* Compact structure that stores a reference to a stack. */ union handle_parts { depot_stack_handle_t handle; struct { - u32 slabindex : STACK_ALLOC_INDEX_BITS; - u32 offset : STACK_ALLOC_OFFSET_BITS; - u32 valid : STACK_ALLOC_NULL_PROTECTION_BITS; - u32 extra : STACK_DEPOT_EXTRA_BITS; + u32 slab_index : DEPOT_SLAB_INDEX_BITS; + u32 offset : DEPOT_OFFSET_BITS; + u32 valid : DEPOT_VALID_BITS; + u32 extra : STACK_DEPOT_EXTRA_BITS; }; }; struct stack_record { - struct stack_record *next; /* Link in the hashtable */ - u32 hash; /* Hash in the hastable */ - u32 size; /* Number of frames in the stack */ + struct stack_record *next; /* Link in the hash table */ + u32 hash; /* Hash in the hash table */ + u32 size; /* Number of stored frames */ union handle_parts handle; - unsigned long entries[]; /* Variable-sized array of entries. */ + unsigned long entries[]; /* Variable-sized array of frames */ }; -static bool __stack_depot_want_early_init __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); +static bool stack_depot_disabled; +static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); static bool __stack_depot_early_init_passed __initdata; -static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; - -static int depot_index; -static int next_slab_inited; -static size_t depot_offset; -static DEFINE_RAW_SPINLOCK(depot_lock); - -unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle) -{ - union handle_parts parts = { .handle = handle }; - - return parts.extra; -} -EXPORT_SYMBOL(stack_depot_get_extra_bits); - -static bool init_stack_slab(void **prealloc) -{ - if (!*prealloc) - return false; - /* - * This smp_load_acquire() pairs with smp_store_release() to - * |next_slab_inited| below and in depot_alloc_stack(). - */ - if (smp_load_acquire(&next_slab_inited)) - return true; - if (stack_slabs[depot_index] == NULL) { - stack_slabs[depot_index] = *prealloc; - *prealloc = NULL; - } else { - /* If this is the last depot slab, do not touch the next one. */ - if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) { - stack_slabs[depot_index + 1] = *prealloc; - *prealloc = NULL; - } - /* - * This smp_store_release pairs with smp_load_acquire() from - * |next_slab_inited| above and in stack_depot_save(). - */ - smp_store_release(&next_slab_inited, 1); - } - return true; -} - -/* Allocation of a new stack in raw storage */ -static struct stack_record * -depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) -{ - struct stack_record *stack; - size_t required_size = struct_size(stack, entries, size); - - required_size = ALIGN(required_size, 1 << STACK_ALLOC_ALIGN); - - if (unlikely(depot_offset + required_size > STACK_ALLOC_SIZE)) { - if (unlikely(depot_index + 1 >= STACK_ALLOC_MAX_SLABS)) { - WARN_ONCE(1, "Stack depot reached limit capacity"); - return NULL; - } - depot_index++; - depot_offset = 0; - /* - * smp_store_release() here pairs with smp_load_acquire() from - * |next_slab_inited| in stack_depot_save() and - * init_stack_slab(). - */ - if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) - smp_store_release(&next_slab_inited, 0); - } - init_stack_slab(prealloc); - if (stack_slabs[depot_index] == NULL) - return NULL; - - stack = stack_slabs[depot_index] + depot_offset; - - stack->hash = hash; - stack->size = size; - stack->handle.slabindex = depot_index; - stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN; - stack->handle.valid = 1; - stack->handle.extra = 0; - memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); - depot_offset += required_size; - - return stack; -} - -/* one hash table bucket entry per 16kB of memory */ -#define STACK_HASH_SCALE 14 -/* limited between 4k and 1M buckets */ -#define STACK_HASH_ORDER_MIN 12 -#define STACK_HASH_ORDER_MAX 20 +/* Use one hash table bucket per 16 KB of memory. */ +#define STACK_TABLE_SCALE 14 +/* Limit the number of buckets between 4K and 1M. */ +#define STACK_BUCKET_NUMBER_ORDER_MIN 12 +#define STACK_BUCKET_NUMBER_ORDER_MAX 20 +/* Initial seed for jhash2. */ #define STACK_HASH_SEED 0x9747b28c -static unsigned int stack_hash_order; +/* Hash table of pointers to stored stack traces. */ +static struct stack_record **stack_table; +/* Fixed order of the number of table buckets. Used when KASAN is enabled. */ +static unsigned int stack_bucket_number_order; +/* Hash mask for indexing the table. */ static unsigned int stack_hash_mask; -static bool stack_depot_disable; -static struct stack_record **stack_table; +/* Array of memory regions that store stack traces. */ +static void *stack_slabs[DEPOT_MAX_SLABS]; +/* Currently used slab in stack_slabs. */ +static int slab_index; +/* Offset to the unused space in the currently used slab. */ +static size_t slab_offset; +/* Lock that protects the variables above. */ +static DEFINE_RAW_SPINLOCK(slab_lock); +/* Whether the next slab is initialized. */ +static int next_slab_inited; -static int __init is_stack_depot_disabled(char *str) +static int __init disable_stack_depot(char *str) { int ret; - ret = kstrtobool(str, &stack_depot_disable); - if (!ret && stack_depot_disable) { - pr_info("Stack Depot is disabled\n"); + ret = kstrtobool(str, &stack_depot_disabled); + if (!ret && stack_depot_disabled) { + pr_info("disabled\n"); stack_table = NULL; } return 0; } -early_param("stack_depot_disable", is_stack_depot_disabled); +early_param("stack_depot_disable", disable_stack_depot); -void __init stack_depot_want_early_init(void) +void __init stack_depot_request_early_init(void) { - /* Too late to request early init now */ + /* Too late to request early init now. */ WARN_ON(__stack_depot_early_init_passed); - __stack_depot_want_early_init = true; + __stack_depot_early_init_requested = true; } +/* Allocates a hash table via memblock. Can only be used during early boot. */ int __init stack_depot_early_init(void) { unsigned long entries = 0; - /* This is supposed to be called only once, from mm_init() */ + /* This function must be called only once, from mm_init(). */ if (WARN_ON(__stack_depot_early_init_passed)) return 0; - __stack_depot_early_init_passed = true; - if (kasan_enabled() && !stack_hash_order) - stack_hash_order = STACK_HASH_ORDER_MAX; + /* + * If KASAN is enabled, use the maximum order: KASAN is frequently used + * in fuzzing scenarios, which leads to a large number of different + * stack traces being stored in stack depot. + */ + if (kasan_enabled() && !stack_bucket_number_order) + stack_bucket_number_order = STACK_BUCKET_NUMBER_ORDER_MAX; - if (!__stack_depot_want_early_init || stack_depot_disable) + if (!__stack_depot_early_init_requested || stack_depot_disabled) return 0; - if (stack_hash_order) - entries = 1UL << stack_hash_order; + /* + * If stack_bucket_number_order is not set, leave entries as 0 to rely + * on the automatic calculations performed by alloc_large_system_hash. + */ + if (stack_bucket_number_order) + entries = 1UL << stack_bucket_number_order; + pr_info("allocating hash table via alloc_large_system_hash\n"); stack_table = alloc_large_system_hash("stackdepot", sizeof(struct stack_record *), entries, - STACK_HASH_SCALE, + STACK_TABLE_SCALE, HASH_EARLY | HASH_ZERO, NULL, &stack_hash_mask, - 1UL << STACK_HASH_ORDER_MIN, - 1UL << STACK_HASH_ORDER_MAX); - + 1UL << STACK_BUCKET_NUMBER_ORDER_MIN, + 1UL << STACK_BUCKET_NUMBER_ORDER_MAX); if (!stack_table) { - pr_err("Stack Depot hash table allocation failed, disabling\n"); - stack_depot_disable = true; + pr_err("hash table allocation failed, disabling\n"); + stack_depot_disabled = true; return -ENOMEM; } return 0; } +/* Allocates a hash table via kvmalloc. Can be used after boot. */ int stack_depot_init(void) { static DEFINE_MUTEX(stack_depot_init_mutex); + unsigned long entries; int ret = 0; mutex_lock(&stack_depot_init_mutex); - if (!stack_depot_disable && !stack_table) { - unsigned long entries; - int scale = STACK_HASH_SCALE; - - if (stack_hash_order) { - entries = 1UL << stack_hash_order; - } else { - entries = nr_free_buffer_pages(); - entries = roundup_pow_of_two(entries); - - if (scale > PAGE_SHIFT) - entries >>= (scale - PAGE_SHIFT); - else - entries <<= (PAGE_SHIFT - scale); - } - if (entries < 1UL << STACK_HASH_ORDER_MIN) - entries = 1UL << STACK_HASH_ORDER_MIN; - if (entries > 1UL << STACK_HASH_ORDER_MAX) - entries = 1UL << STACK_HASH_ORDER_MAX; - - pr_info("Stack Depot allocating hash table of %lu entries with kvcalloc\n", - entries); - stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); - if (!stack_table) { - pr_err("Stack Depot hash table allocation failed, disabling\n"); - stack_depot_disable = true; - ret = -ENOMEM; - } - stack_hash_mask = entries - 1; + if (stack_depot_disabled || stack_table) + goto out_unlock; + + /* + * Similarly to stack_depot_early_init, use stack_bucket_number_order + * if assigned, and rely on automatic scaling otherwise. + */ + if (stack_bucket_number_order) { + entries = 1UL << stack_bucket_number_order; + } else { + int scale = STACK_TABLE_SCALE; + + entries = nr_free_buffer_pages(); + entries = roundup_pow_of_two(entries); + + if (scale > PAGE_SHIFT) + entries >>= (scale - PAGE_SHIFT); + else + entries <<= (PAGE_SHIFT - scale); + } + + if (entries < 1UL << STACK_BUCKET_NUMBER_ORDER_MIN) + entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MIN; + if (entries > 1UL << STACK_BUCKET_NUMBER_ORDER_MAX) + entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MAX; + + pr_info("allocating hash table of %lu entries via kvcalloc\n", entries); + stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); + if (!stack_table) { + pr_err("hash table allocation failed, disabling\n"); + stack_depot_disabled = true; + ret = -ENOMEM; + goto out_unlock; } + stack_hash_mask = entries - 1; + +out_unlock: mutex_unlock(&stack_depot_init_mutex); + return ret; } EXPORT_SYMBOL_GPL(stack_depot_init); -/* Calculate hash for a stack */ +/* Uses preallocated memory to initialize a new stack depot slab. */ +static void depot_init_slab(void **prealloc) +{ + /* + * If the next slab is already initialized, do not use the + * preallocated memory. + * smp_load_acquire() here pairs with smp_store_release() below and + * in depot_alloc_stack(). + */ + if (smp_load_acquire(&next_slab_inited)) + return; + + /* Check if the current slab is not yet allocated. */ + if (stack_slabs[slab_index] == NULL) { + /* Use the preallocated memory for the current slab. */ + stack_slabs[slab_index] = *prealloc; + *prealloc = NULL; + } else { + /* + * Otherwise, use the preallocated memory for the next slab + * as long as we do not exceed the maximum number of slabs. + */ + if (slab_index + 1 < DEPOT_MAX_SLABS) { + stack_slabs[slab_index + 1] = *prealloc; + *prealloc = NULL; + /* + * This smp_store_release pairs with smp_load_acquire() + * above and in stack_depot_save(). + */ + smp_store_release(&next_slab_inited, 1); + } + } +} + +/* Allocates a new stack in a stack depot slab. */ +static struct stack_record * +depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) +{ + struct stack_record *stack; + size_t required_size = struct_size(stack, entries, size); + + required_size = ALIGN(required_size, 1 << DEPOT_STACK_ALIGN); + + /* Check if there is not enough space in the current slab. */ + if (unlikely(slab_offset + required_size > DEPOT_SLAB_SIZE)) { + /* Bail out if we reached the slab limit. */ + if (unlikely(slab_index + 1 >= DEPOT_MAX_SLABS)) { + WARN_ONCE(1, "Stack depot reached limit capacity"); + return NULL; + } + + /* + * Move on to the next slab. + * WRITE_ONCE pairs with potential concurrent read in + * stack_depot_fetch(). + */ + WRITE_ONCE(slab_index, slab_index + 1); + slab_offset = 0; + /* + * smp_store_release() here pairs with smp_load_acquire() in + * stack_depot_save() and depot_init_slab(). + */ + if (slab_index + 1 < DEPOT_MAX_SLABS) + smp_store_release(&next_slab_inited, 0); + } + + /* Assign the preallocated memory to a slab if required. */ + if (*prealloc) + depot_init_slab(prealloc); + + /* Check if we have a slab to save the stack trace. */ + if (stack_slabs[slab_index] == NULL) + return NULL; + + /* Save the stack trace. */ + stack = stack_slabs[slab_index] + slab_offset; + stack->hash = hash; + stack->size = size; + stack->handle.slab_index = slab_index; + stack->handle.offset = slab_offset >> DEPOT_STACK_ALIGN; + stack->handle.valid = 1; + stack->handle.extra = 0; + memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); + slab_offset += required_size; + + return stack; +} + +/* Calculates the hash for a stack. */ static inline u32 hash_stack(unsigned long *entries, unsigned int size) { return jhash2((u32 *)entries, @@ -278,9 +307,9 @@ static inline u32 hash_stack(unsigned long *entries, unsigned int size) STACK_HASH_SEED); } -/* Use our own, non-instrumented version of memcmp(). - * - * We actually don't care about the order, just the equality. +/* + * Non-instrumented version of memcmp(). + * Does not check the lexicographical order, only the equality. */ static inline int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2, @@ -293,7 +322,7 @@ int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2, return 0; } -/* Find a stack that is equal to the one stored in entries in the hash */ +/* Finds a stack in a bucket of the hash table. */ static inline struct stack_record *find_stack(struct stack_record *bucket, unsigned long *entries, int size, u32 hash) @@ -309,116 +338,8 @@ static inline struct stack_record *find_stack(struct stack_record *bucket, return NULL; } -/** - * stack_depot_snprint - print stack entries from a depot into a buffer - * - * @handle: Stack depot handle which was returned from - * stack_depot_save(). - * @buf: Pointer to the print buffer - * - * @size: Size of the print buffer - * - * @spaces: Number of leading spaces to print - * - * Return: Number of bytes printed. - */ -int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, - int spaces) -{ - unsigned long *entries; - unsigned int nr_entries; - - nr_entries = stack_depot_fetch(handle, &entries); - return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries, - spaces) : 0; -} -EXPORT_SYMBOL_GPL(stack_depot_snprint); - -/** - * stack_depot_print - print stack entries from a depot - * - * @stack: Stack depot handle which was returned from - * stack_depot_save(). - * - */ -void stack_depot_print(depot_stack_handle_t stack) -{ - unsigned long *entries; - unsigned int nr_entries; - - nr_entries = stack_depot_fetch(stack, &entries); - if (nr_entries > 0) - stack_trace_print(entries, nr_entries, 0); -} -EXPORT_SYMBOL_GPL(stack_depot_print); - -/** - * stack_depot_fetch - Fetch stack entries from a depot - * - * @handle: Stack depot handle which was returned from - * stack_depot_save(). - * @entries: Pointer to store the entries address - * - * Return: The number of trace entries for this depot. - */ -unsigned int stack_depot_fetch(depot_stack_handle_t handle, - unsigned long **entries) -{ - union handle_parts parts = { .handle = handle }; - void *slab; - size_t offset = parts.offset << STACK_ALLOC_ALIGN; - struct stack_record *stack; - - *entries = NULL; - if (!handle) - return 0; - - if (parts.slabindex > depot_index) { - WARN(1, "slab index %d out of bounds (%d) for stack id %08x\n", - parts.slabindex, depot_index, handle); - return 0; - } - slab = stack_slabs[parts.slabindex]; - if (!slab) - return 0; - stack = slab + offset; - - *entries = stack->entries; - return stack->size; -} -EXPORT_SYMBOL_GPL(stack_depot_fetch); - -/** - * __stack_depot_save - Save a stack trace from an array - * - * @entries: Pointer to storage array - * @nr_entries: Size of the storage array - * @extra_bits: Flags to store in unused bits of depot_stack_handle_t - * @alloc_flags: Allocation gfp flags - * @can_alloc: Allocate stack slabs (increased chance of failure if false) - * - * Saves a stack trace from @entries array of size @nr_entries. If @can_alloc is - * %true, is allowed to replenish the stack slab pool in case no space is left - * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids - * any allocations and will fail if no space is left to store the stack trace. - * - * If the stack trace in @entries is from an interrupt, only the portion up to - * interrupt entry is saved. - * - * Additional opaque flags can be passed in @extra_bits, stored in the unused - * bits of the stack handle, and retrieved using stack_depot_get_extra_bits() - * without calling stack_depot_fetch(). - * - * Context: Any context, but setting @can_alloc to %false is required if - * alloc_pages() cannot be used from the current context. Currently - * this is the case from contexts where neither %GFP_ATOMIC nor - * %GFP_NOWAIT can be used (NMI, raw_spin_lock). - * - * Return: The handle of the stack struct stored in depot, 0 on failure. - */ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, - unsigned int extra_bits, gfp_t alloc_flags, bool can_alloc) { struct stack_record *found = NULL, **bucket; @@ -430,15 +351,15 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, /* * If this stack trace is from an interrupt, including anything before - * interrupt entry usually leads to unbounded stackdepot growth. + * interrupt entry usually leads to unbounded stack depot growth. * - * Because use of filter_irq_stacks() is a requirement to ensure - * stackdepot can efficiently deduplicate interrupt stacks, always - * filter_irq_stacks() to simplify all callers' use of stackdepot. + * Since use of filter_irq_stacks() is a requirement to ensure stack + * depot can efficiently deduplicate interrupt stacks, always + * filter_irq_stacks() to simplify all callers' use of stack depot. */ nr_entries = filter_irq_stacks(entries, nr_entries); - if (unlikely(nr_entries == 0) || stack_depot_disable) + if (unlikely(nr_entries == 0) || stack_depot_disabled) goto fast_exit; hash = hash_stack(entries, nr_entries); @@ -449,8 +370,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * The smp_load_acquire() here pairs with smp_store_release() to * |bucket| below. */ - found = find_stack(smp_load_acquire(bucket), entries, - nr_entries, hash); + found = find_stack(smp_load_acquire(bucket), entries, nr_entries, hash); if (found) goto exit; @@ -460,7 +380,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * lock. * * The smp_load_acquire() here pairs with smp_store_release() to - * |next_slab_inited| in depot_alloc_stack() and init_stack_slab(). + * |next_slab_inited| in depot_alloc_stack() and depot_init_slab(). */ if (unlikely(can_alloc && !smp_load_acquire(&next_slab_inited))) { /* @@ -471,16 +391,17 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, alloc_flags &= ~GFP_ZONEMASK; alloc_flags &= (GFP_ATOMIC | GFP_KERNEL); alloc_flags |= __GFP_NOWARN; - page = alloc_pages(alloc_flags, STACK_ALLOC_ORDER); + page = alloc_pages(alloc_flags, DEPOT_SLAB_ORDER); if (page) prealloc = page_address(page); } - raw_spin_lock_irqsave(&depot_lock, flags); + raw_spin_lock_irqsave(&slab_lock, flags); found = find_stack(*bucket, entries, nr_entries, hash); if (!found) { - struct stack_record *new = depot_alloc_stack(entries, nr_entries, hash, &prealloc); + struct stack_record *new = + depot_alloc_stack(entries, nr_entries, hash, &prealloc); if (new) { new->next = *bucket; @@ -493,43 +414,99 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, } } else if (prealloc) { /* - * We didn't need to store this stack trace, but let's keep - * the preallocated memory for the future. + * Stack depot already contains this stack trace, but let's + * keep the preallocated memory for the future. */ - WARN_ON(!init_stack_slab(&prealloc)); + depot_init_slab(&prealloc); } - raw_spin_unlock_irqrestore(&depot_lock, flags); + raw_spin_unlock_irqrestore(&slab_lock, flags); exit: if (prealloc) { - /* Nobody used this memory, ok to free it. */ - free_pages((unsigned long)prealloc, STACK_ALLOC_ORDER); + /* Stack depot didn't use this memory, free it. */ + free_pages((unsigned long)prealloc, DEPOT_SLAB_ORDER); } if (found) retval.handle = found->handle.handle; fast_exit: - retval.extra = extra_bits; - return retval.handle; } EXPORT_SYMBOL_GPL(__stack_depot_save); -/** - * stack_depot_save - Save a stack trace from an array - * - * @entries: Pointer to storage array - * @nr_entries: Size of the storage array - * @alloc_flags: Allocation gfp flags - * - * Context: Contexts where allocations via alloc_pages() are allowed. - * See __stack_depot_save() for more details. - * - * Return: The handle of the stack struct stored in depot, 0 on failure. - */ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t alloc_flags) { - return __stack_depot_save(entries, nr_entries, 0, alloc_flags, true); + return __stack_depot_save(entries, nr_entries, alloc_flags, true); } EXPORT_SYMBOL_GPL(stack_depot_save); + +unsigned int stack_depot_fetch(depot_stack_handle_t handle, + unsigned long **entries) +{ + union handle_parts parts = { .handle = handle }; + /* READ_ONCE annotates a race with depot_alloc_stack. */ + int slab_index_cached = READ_ONCE(slab_index); + void *slab; + size_t offset = parts.offset << DEPOT_STACK_ALIGN; + struct stack_record *stack; + + *entries = NULL; + if (!handle) + return 0; + + if (parts.slab_index > slab_index_cached) { + WARN(1, "slab index %d out of bounds (%d) for stack id %08x\n", + parts.slab_index, slab_index_cached, handle); + return 0; + } + slab = stack_slabs[parts.slab_index]; + if (!slab) + return 0; + stack = slab + offset; + + *entries = stack->entries; + return stack->size; +} +EXPORT_SYMBOL_GPL(stack_depot_fetch); + +void stack_depot_print(depot_stack_handle_t stack) +{ + unsigned long *entries; + unsigned int nr_entries; + + nr_entries = stack_depot_fetch(stack, &entries); + if (nr_entries > 0) + stack_trace_print(entries, nr_entries, 0); +} +EXPORT_SYMBOL_GPL(stack_depot_print); + +int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, + int spaces) +{ + unsigned long *entries; + unsigned int nr_entries; + + nr_entries = stack_depot_fetch(handle, &entries); + return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries, + spaces) : 0; +} +EXPORT_SYMBOL_GPL(stack_depot_snprint); + +depot_stack_handle_t stack_depot_set_extra_bits(depot_stack_handle_t handle, + unsigned int extra_bits) +{ + union handle_parts parts = { .handle = handle }; + + parts.extra = extra_bits; + return parts.handle; +} +EXPORT_SYMBOL(stack_depot_set_extra_bits); + +unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle) +{ + union handle_parts parts = { .handle = handle }; + + return parts.extra; +} +EXPORT_SYMBOL(stack_depot_get_extra_bits); diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index ec847bf4dcb4d..3d19b1f78d718 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -1709,6 +1709,74 @@ static noinline void check_forking(struct maple_tree *mt) mtree_destroy(&newmt); } +static noinline void check_iteration(struct maple_tree *mt) +{ + int i, nr_entries = 125; + void *val; + MA_STATE(mas, mt, 0, 0); + + for (i = 0; i <= nr_entries; i++) + mtree_store_range(mt, i * 10, i * 10 + 9, + xa_mk_value(i), GFP_KERNEL); + + mt_set_non_kernel(99999); + + i = 0; + mas_lock(&mas); + mas_for_each(&mas, val, 925) { + MT_BUG_ON(mt, mas.index != i * 10); + MT_BUG_ON(mt, mas.last != i * 10 + 9); + /* Overwrite end of entry 92 */ + if (i == 92) { + mas.index = 925; + mas.last = 929; + mas_store(&mas, val); + } + i++; + } + /* Ensure mas_find() gets the next value */ + val = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, val != xa_mk_value(i)); + + mas_set(&mas, 0); + i = 0; + mas_for_each(&mas, val, 785) { + MT_BUG_ON(mt, mas.index != i * 10); + MT_BUG_ON(mt, mas.last != i * 10 + 9); + /* Overwrite start of entry 78 */ + if (i == 78) { + mas.index = 780; + mas.last = 785; + mas_store(&mas, val); + } else { + i++; + } + } + val = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, val != xa_mk_value(i)); + + mas_set(&mas, 0); + i = 0; + mas_for_each(&mas, val, 765) { + MT_BUG_ON(mt, mas.index != i * 10); + MT_BUG_ON(mt, mas.last != i * 10 + 9); + /* Overwrite end of entry 76 and advance to the end */ + if (i == 76) { + mas.index = 760; + mas.last = 765; + mas_store(&mas, val); + mas_next(&mas, ULONG_MAX); + } + i++; + } + /* Make sure the next find returns the one after 765, 766-769 */ + val = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, val != xa_mk_value(76)); + mas_unlock(&mas); + mas_destroy(&mas); + mt_set_non_kernel(0); +} + static noinline void check_mas_store_gfp(struct maple_tree *mt) { @@ -2659,6 +2727,10 @@ static int maple_tree_seed(void) goto skip; #endif + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_iteration(&tree); + mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_forking(&tree); mtree_destroy(&tree); diff --git a/lib/test_printf.c b/lib/test_printf.c index d34dc636b81c7..46b4e6c414a35 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -674,17 +674,17 @@ flags(void) gfp = GFP_ATOMIC|__GFP_DMA; test("GFP_ATOMIC|GFP_DMA", "%pGg", &gfp); - gfp = __GFP_ATOMIC; - test("__GFP_ATOMIC", "%pGg", &gfp); + gfp = __GFP_HIGH; + test("__GFP_HIGH", "%pGg", &gfp); /* Any flags not translated by the table should remain numeric */ gfp = ~__GFP_BITS_MASK; snprintf(cmp_buffer, BUF_SIZE, "%#lx", (unsigned long) gfp); test(cmp_buffer, "%pGg", &gfp); - snprintf(cmp_buffer, BUF_SIZE, "__GFP_ATOMIC|%#lx", + snprintf(cmp_buffer, BUF_SIZE, "__GFP_HIGH|%#lx", (unsigned long) gfp); - gfp |= __GFP_ATOMIC; + gfp |= __GFP_HIGH; test(cmp_buffer, "%pGg", &gfp); kfree(cmp_buffer); diff --git a/lib/zlib_deflate/deflate.c b/lib/zlib_deflate/deflate.c index 8a878d0d892ce..3a1d8d34182e4 100644 --- a/lib/zlib_deflate/deflate.c +++ b/lib/zlib_deflate/deflate.c @@ -54,7 +54,7 @@ /* architecture-specific bits */ #ifdef CONFIG_ZLIB_DFLTCC -# include "../zlib_dfltcc/dfltcc.h" +# include "../zlib_dfltcc/dfltcc_deflate.h" #else #define DEFLATE_RESET_HOOK(strm) do {} while (0) #define DEFLATE_HOOK(strm, flush, bstate) 0 @@ -106,7 +106,7 @@ typedef struct deflate_workspace { deflate_state deflate_memory; #ifdef CONFIG_ZLIB_DFLTCC /* State memory for s390 hardware deflate */ - struct dfltcc_state dfltcc_memory; + struct dfltcc_deflate_state dfltcc_memory; #endif Byte *window_memory; Pos *prev_memory; @@ -451,17 +451,24 @@ int zlib_deflate( Assert(strm->avail_out > 0, "bug2"); if (flush != Z_FINISH) return Z_OK; - if (s->noheader) return Z_STREAM_END; - /* Write the zlib trailer (adler32) */ - putShortMSB(s, (uInt)(strm->adler >> 16)); - putShortMSB(s, (uInt)(strm->adler & 0xffff)); + if (!s->noheader) { + /* Write zlib trailer (adler32) */ + putShortMSB(s, (uInt)(strm->adler >> 16)); + putShortMSB(s, (uInt)(strm->adler & 0xffff)); + } flush_pending(strm); /* If avail_out is zero, the application will call deflate again * to flush the rest. */ - s->noheader = -1; /* write the trailer only once! */ - return s->pending != 0 ? Z_OK : Z_STREAM_END; + if (!s->noheader) { + s->noheader = -1; /* write the trailer only once! */ + } + if (s->pending == 0) { + Assert(s->bi_valid == 0, "bi_buf not flushed"); + return Z_STREAM_END; + } + return Z_OK; } /* ========================================================================= */ diff --git a/lib/zlib_dfltcc/dfltcc.c b/lib/zlib_dfltcc/dfltcc.c index 782f76e9d4dab..ac6ac9739f5b8 100644 --- a/lib/zlib_dfltcc/dfltcc.c +++ b/lib/zlib_dfltcc/dfltcc.c @@ -23,37 +23,18 @@ char *oesc_msg( } } -void dfltcc_reset( - z_streamp strm, - uInt size -) -{ - struct dfltcc_state *dfltcc_state = - (struct dfltcc_state *)((char *)strm->state + size); - struct dfltcc_qaf_param *param = - (struct dfltcc_qaf_param *)&dfltcc_state->param; - +void dfltcc_reset_state(struct dfltcc_state *dfltcc_state) { /* Initialize available functions */ if (is_dfltcc_enabled()) { - dfltcc(DFLTCC_QAF, param, NULL, NULL, NULL, NULL, NULL); - memmove(&dfltcc_state->af, param, sizeof(dfltcc_state->af)); + dfltcc(DFLTCC_QAF, &dfltcc_state->param, NULL, NULL, NULL, NULL, NULL); + memmove(&dfltcc_state->af, &dfltcc_state->param, sizeof(dfltcc_state->af)); } else memset(&dfltcc_state->af, 0, sizeof(dfltcc_state->af)); /* Initialize parameter block */ memset(&dfltcc_state->param, 0, sizeof(dfltcc_state->param)); dfltcc_state->param.nt = 1; - - /* Initialize tuning parameters */ - if (zlib_dfltcc_support == ZLIB_DFLTCC_FULL_DEBUG) - dfltcc_state->level_mask = DFLTCC_LEVEL_MASK_DEBUG; - else - dfltcc_state->level_mask = DFLTCC_LEVEL_MASK; - dfltcc_state->block_size = DFLTCC_BLOCK_SIZE; - dfltcc_state->block_threshold = DFLTCC_FIRST_FHT_BLOCK_SIZE; - dfltcc_state->dht_threshold = DFLTCC_DHT_MIN_SAMPLE_SIZE; dfltcc_state->param.ribm = DFLTCC_RIBM; } -EXPORT_SYMBOL(dfltcc_reset); MODULE_LICENSE("GPL"); diff --git a/lib/zlib_dfltcc/dfltcc.h b/lib/zlib_dfltcc/dfltcc.h index 2a2fac1d050a2..b96232bdd44d2 100644 --- a/lib/zlib_dfltcc/dfltcc.h +++ b/lib/zlib_dfltcc/dfltcc.h @@ -93,63 +93,32 @@ static_assert(sizeof(struct dfltcc_param_v0) == 1536); struct dfltcc_state { struct dfltcc_param_v0 param; /* Parameter block */ struct dfltcc_qaf_param af; /* Available functions */ + char msg[64]; /* Buffer for strm->msg */ +}; + +/* + * Extension of inflate_state and deflate_state for DFLTCC. + */ +struct dfltcc_deflate_state { + struct dfltcc_state common; /* Parameter block */ uLong level_mask; /* Levels on which to use DFLTCC */ uLong block_size; /* New block each X bytes */ uLong block_threshold; /* New block after total_in > X */ uLong dht_threshold; /* New block only if avail_in >= X */ - char msg[64]; /* Buffer for strm->msg */ }; +#define ALIGN_UP(p, size) (__typeof__(p))(((uintptr_t)(p) + ((size) - 1)) & ~((size) - 1)) /* Resides right after inflate_state or deflate_state */ -#define GET_DFLTCC_STATE(state) ((struct dfltcc_state *)((state) + 1)) - -/* External functions */ -int dfltcc_can_deflate(z_streamp strm); -int dfltcc_deflate(z_streamp strm, - int flush, - block_state *result); -void dfltcc_reset(z_streamp strm, uInt size); -int dfltcc_can_inflate(z_streamp strm); -typedef enum { - DFLTCC_INFLATE_CONTINUE, - DFLTCC_INFLATE_BREAK, - DFLTCC_INFLATE_SOFTWARE, -} dfltcc_inflate_action; -dfltcc_inflate_action dfltcc_inflate(z_streamp strm, - int flush, int *ret); +#define GET_DFLTCC_STATE(state) ((struct dfltcc_state *)((char *)(state) + ALIGN_UP(sizeof(*state), 8))) + +void dfltcc_reset_state(struct dfltcc_state *dfltcc_state); + static inline int is_dfltcc_enabled(void) { return (zlib_dfltcc_support != ZLIB_DFLTCC_DISABLED && test_facility(DFLTCC_FACILITY)); } -#define DEFLATE_RESET_HOOK(strm) \ - dfltcc_reset((strm), sizeof(deflate_state)) - -#define DEFLATE_HOOK dfltcc_deflate - -#define DEFLATE_NEED_CHECKSUM(strm) (!dfltcc_can_deflate((strm))) - #define DEFLATE_DFLTCC_ENABLED() is_dfltcc_enabled() -#define INFLATE_RESET_HOOK(strm) \ - dfltcc_reset((strm), sizeof(struct inflate_state)) - -#define INFLATE_TYPEDO_HOOK(strm, flush) \ - if (dfltcc_can_inflate((strm))) { \ - dfltcc_inflate_action action; \ -\ - RESTORE(); \ - action = dfltcc_inflate((strm), (flush), &ret); \ - LOAD(); \ - if (action == DFLTCC_INFLATE_CONTINUE) \ - break; \ - else if (action == DFLTCC_INFLATE_BREAK) \ - goto inf_leave; \ - } - -#define INFLATE_NEED_CHECKSUM(strm) (!dfltcc_can_inflate((strm))) - -#define INFLATE_NEED_UPDATEWINDOW(strm) (!dfltcc_can_inflate((strm))) - #endif /* DFLTCC_H */ diff --git a/lib/zlib_dfltcc/dfltcc_deflate.c b/lib/zlib_dfltcc/dfltcc_deflate.c index 6c946e8532eec..b732b6d9e35d6 100644 --- a/lib/zlib_dfltcc/dfltcc_deflate.c +++ b/lib/zlib_dfltcc/dfltcc_deflate.c @@ -2,11 +2,13 @@ #include "../zlib_deflate/defutil.h" #include "dfltcc_util.h" -#include "dfltcc.h" +#include "dfltcc_deflate.h" #include #include #include +#define GET_DFLTCC_DEFLATE_STATE(state) ((struct dfltcc_deflate_state *)GET_DFLTCC_STATE(state)) + /* * Compress. */ @@ -15,7 +17,7 @@ int dfltcc_can_deflate( ) { deflate_state *state = (deflate_state *)strm->state; - struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state); + struct dfltcc_deflate_state *dfltcc_state = GET_DFLTCC_DEFLATE_STATE(state); /* Check for kernel dfltcc command line parameter */ if (zlib_dfltcc_support == ZLIB_DFLTCC_DISABLED || @@ -28,22 +30,39 @@ int dfltcc_can_deflate( return 0; /* Unsupported hardware */ - if (!is_bit_set(dfltcc_state->af.fns, DFLTCC_GDHT) || - !is_bit_set(dfltcc_state->af.fns, DFLTCC_CMPR) || - !is_bit_set(dfltcc_state->af.fmts, DFLTCC_FMT0)) + if (!is_bit_set(dfltcc_state->common.af.fns, DFLTCC_GDHT) || + !is_bit_set(dfltcc_state->common.af.fns, DFLTCC_CMPR) || + !is_bit_set(dfltcc_state->common.af.fmts, DFLTCC_FMT0)) return 0; return 1; } EXPORT_SYMBOL(dfltcc_can_deflate); +void dfltcc_reset_deflate_state(z_streamp strm) { + deflate_state *state = (deflate_state *)strm->state; + struct dfltcc_deflate_state *dfltcc_state = GET_DFLTCC_DEFLATE_STATE(state); + + dfltcc_reset_state(&dfltcc_state->common); + + /* Initialize tuning parameters */ + if (zlib_dfltcc_support == ZLIB_DFLTCC_FULL_DEBUG) + dfltcc_state->level_mask = DFLTCC_LEVEL_MASK_DEBUG; + else + dfltcc_state->level_mask = DFLTCC_LEVEL_MASK; + dfltcc_state->block_size = DFLTCC_BLOCK_SIZE; + dfltcc_state->block_threshold = DFLTCC_FIRST_FHT_BLOCK_SIZE; + dfltcc_state->dht_threshold = DFLTCC_DHT_MIN_SAMPLE_SIZE; +} +EXPORT_SYMBOL(dfltcc_reset_deflate_state); + static void dfltcc_gdht( z_streamp strm ) { deflate_state *state = (deflate_state *)strm->state; struct dfltcc_param_v0 *param = &GET_DFLTCC_STATE(state)->param; - size_t avail_in = avail_in = strm->avail_in; + size_t avail_in = strm->avail_in; dfltcc(DFLTCC_GDHT, param, NULL, NULL, @@ -104,39 +123,46 @@ int dfltcc_deflate( ) { deflate_state *state = (deflate_state *)strm->state; - struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state); - struct dfltcc_param_v0 *param = &dfltcc_state->param; + struct dfltcc_deflate_state *dfltcc_state = GET_DFLTCC_DEFLATE_STATE(state); + struct dfltcc_param_v0 *param = &dfltcc_state->common.param; uInt masked_avail_in; dfltcc_cc cc; int need_empty_block; int soft_bcc; int no_flush; - if (!dfltcc_can_deflate(strm)) + if (!dfltcc_can_deflate(strm)) { + /* Clear history. */ + if (flush == Z_FULL_FLUSH) + param->hl = 0; return 0; + } again: masked_avail_in = 0; soft_bcc = 0; no_flush = flush == Z_NO_FLUSH; - /* Trailing empty block. Switch to software, except when Continuation Flag - * is set, which means that DFLTCC has buffered some output in the - * parameter block and needs to be called again in order to flush it. + /* No input data. Return, except when Continuation Flag is set, which means + * that DFLTCC has buffered some output in the parameter block and needs to + * be called again in order to flush it. */ - if (flush == Z_FINISH && strm->avail_in == 0 && !param->cf) { - if (param->bcf) { - /* A block is still open, and the hardware does not support closing - * blocks without adding data. Thus, close it manually. - */ + if (strm->avail_in == 0 && !param->cf) { + /* A block is still open, and the hardware does not support closing + * blocks without adding data. Thus, close it manually. + */ + if (!no_flush && param->bcf) { send_eobs(strm, param); param->bcf = 0; } - return 0; - } - - if (strm->avail_in == 0 && !param->cf) { - *result = need_more; + /* Let one of deflate_* functions write a trailing empty block. */ + if (flush == Z_FINISH) + return 0; + /* Clear history. */ + if (flush == Z_FULL_FLUSH) + param->hl = 0; + /* Trigger block post-processing if necessary. */ + *result = no_flush ? need_more : block_done; return 1; } @@ -163,13 +189,18 @@ int dfltcc_deflate( param->bcf = 0; dfltcc_state->block_threshold = strm->total_in + dfltcc_state->block_size; - if (strm->avail_out == 0) { - *result = need_more; - return 1; - } } } + /* No space for compressed data. If we proceed, dfltcc_cmpr() will return + * DFLTCC_CC_OP1_TOO_SHORT without buffering header bits, but we will still + * set BCF=1, which is wrong. Avoid complications and return early. + */ + if (strm->avail_out == 0) { + *result = need_more; + return 1; + } + /* The caller gave us too much data. Pass only one block worth of * uncompressed data to DFLTCC and mask the rest, so that on the next * iteration we start a new block. @@ -189,7 +220,7 @@ int dfltcc_deflate( param->cvt = CVT_ADLER32; if (!no_flush) /* We need to close a block. Always do this in software - when there is - * no input data, the hardware will not nohor BCC. */ + * no input data, the hardware will not hohor BCC. */ soft_bcc = 1; if (flush == Z_FINISH && !param->bcf) /* We are about to open a BFINAL block, set Block Header Final bit @@ -204,8 +235,8 @@ int dfltcc_deflate( param->sbb = (unsigned int)state->bi_valid; if (param->sbb > 0) *strm->next_out = (Byte)state->bi_buf; - if (param->hl) - param->nt = 0; /* Honor history */ + /* Honor history and check value */ + param->nt = 0; param->cv = strm->adler; /* When opening a block, choose a Huffman-Table Type */ @@ -232,7 +263,7 @@ int dfltcc_deflate( } while (cc == DFLTCC_CC_AGAIN); /* Translate parameter block to stream */ - strm->msg = oesc_msg(dfltcc_state->msg, param->oesc); + strm->msg = oesc_msg(dfltcc_state->common.msg, param->oesc); state->bi_valid = param->sbb; if (state->bi_valid == 0) state->bi_buf = 0; /* Avoid accessing next_out */ diff --git a/lib/zlib_dfltcc/dfltcc_deflate.h b/lib/zlib_dfltcc/dfltcc_deflate.h new file mode 100644 index 0000000000000..be44b43833b1c --- /dev/null +++ b/lib/zlib_dfltcc/dfltcc_deflate.h @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: Zlib +#ifndef DFLTCC_DEFLATE_H +#define DFLTCC_DEFLATE_H + +#include "dfltcc.h" + +/* External functions */ +int dfltcc_can_deflate(z_streamp strm); +int dfltcc_deflate(z_streamp strm, + int flush, + block_state *result); +void dfltcc_reset_deflate_state(z_streamp strm); + +#define DEFLATE_RESET_HOOK(strm) \ + dfltcc_reset_deflate_state((strm)) + +#define DEFLATE_HOOK dfltcc_deflate + +#define DEFLATE_NEED_CHECKSUM(strm) (!dfltcc_can_deflate((strm))) + +#endif /* DFLTCC_DEFLATE_H */ diff --git a/lib/zlib_dfltcc/dfltcc_inflate.c b/lib/zlib_dfltcc/dfltcc_inflate.c index fb60b5a6a1cb6..437cd34c84906 100644 --- a/lib/zlib_dfltcc/dfltcc_inflate.c +++ b/lib/zlib_dfltcc/dfltcc_inflate.c @@ -2,7 +2,7 @@ #include "../zlib_inflate/inflate.h" #include "dfltcc_util.h" -#include "dfltcc.h" +#include "dfltcc_inflate.h" #include #include #include @@ -22,16 +22,20 @@ int dfltcc_can_inflate( zlib_dfltcc_support == ZLIB_DFLTCC_DEFLATE_ONLY) return 0; - /* Unsupported compression settings */ - if (state->wbits != HB_BITS) - return 0; - /* Unsupported hardware */ return is_bit_set(dfltcc_state->af.fns, DFLTCC_XPND) && is_bit_set(dfltcc_state->af.fmts, DFLTCC_FMT0); } EXPORT_SYMBOL(dfltcc_can_inflate); +void dfltcc_reset_inflate_state(z_streamp strm) { + struct inflate_state *state = (struct inflate_state *)strm->state; + struct dfltcc_state *dfltcc_state = GET_DFLTCC_STATE(state); + + dfltcc_reset_state(dfltcc_state); +} +EXPORT_SYMBOL(dfltcc_reset_inflate_state); + static int dfltcc_was_inflate_used( z_streamp strm ) @@ -91,8 +95,10 @@ dfltcc_inflate_action dfltcc_inflate( struct dfltcc_param_v0 *param = &dfltcc_state->param; dfltcc_cc cc; - if (flush == Z_BLOCK) { - /* DFLTCC does not support stopping on block boundaries */ + if (flush == Z_BLOCK || flush == Z_PACKET_FLUSH) { + /* DFLTCC does not support stopping on block boundaries (Z_BLOCK flush option) + * as well as the use of Z_PACKET_FLUSH option (used exclusively by PPP driver) + */ if (dfltcc_inflate_disable(strm)) { *ret = Z_STREAM_ERROR; return DFLTCC_INFLATE_BREAK; @@ -121,8 +127,6 @@ dfltcc_inflate_action dfltcc_inflate( /* Translate stream to parameter block */ param->cvt = CVT_ADLER32; param->sbb = state->bits; - param->hl = state->whave; /* Software and hardware history formats match */ - param->ho = (state->write - state->whave) & ((1 << HB_BITS) - 1); if (param->hl) param->nt = 0; /* Honor history for the first block */ param->cv = state->check; @@ -136,8 +140,6 @@ dfltcc_inflate_action dfltcc_inflate( strm->msg = oesc_msg(dfltcc_state->msg, param->oesc); state->last = cc == DFLTCC_CC_OK; state->bits = param->sbb; - state->whave = param->hl; - state->write = (param->ho + param->hl) & ((1 << HB_BITS) - 1); state->check = param->cv; if (cc == DFLTCC_CC_OP2_CORRUPT && param->oesc != 0) { /* Report an error if stream is corrupted */ diff --git a/lib/zlib_dfltcc/dfltcc_inflate.h b/lib/zlib_dfltcc/dfltcc_inflate.h new file mode 100644 index 0000000000000..98d4bc42e5267 --- /dev/null +++ b/lib/zlib_dfltcc/dfltcc_inflate.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: Zlib +#ifndef DFLTCC_INFLATE_H +#define DFLTCC_INFLATE_H + +#include "dfltcc.h" + +/* External functions */ +void dfltcc_reset_inflate_state(z_streamp strm); +int dfltcc_can_inflate(z_streamp strm); +typedef enum { + DFLTCC_INFLATE_CONTINUE, + DFLTCC_INFLATE_BREAK, + DFLTCC_INFLATE_SOFTWARE, +} dfltcc_inflate_action; +dfltcc_inflate_action dfltcc_inflate(z_streamp strm, + int flush, int *ret); +#define INFLATE_RESET_HOOK(strm) \ + dfltcc_reset_inflate_state((strm)) + +#define INFLATE_TYPEDO_HOOK(strm, flush) \ + if (dfltcc_can_inflate((strm))) { \ + dfltcc_inflate_action action; \ +\ + RESTORE(); \ + action = dfltcc_inflate((strm), (flush), &ret); \ + LOAD(); \ + if (action == DFLTCC_INFLATE_CONTINUE) \ + break; \ + else if (action == DFLTCC_INFLATE_BREAK) \ + goto inf_leave; \ + } + +#define INFLATE_NEED_CHECKSUM(strm) (!dfltcc_can_inflate((strm))) + +#define INFLATE_NEED_UPDATEWINDOW(strm) (!dfltcc_can_inflate((strm))) + +#endif /* DFLTCC_DEFLATE_H */ diff --git a/lib/zlib_inflate/inflate.c b/lib/zlib_inflate/inflate.c index ee39b5eb71f7b..d1efad69f02b3 100644 --- a/lib/zlib_inflate/inflate.c +++ b/lib/zlib_inflate/inflate.c @@ -17,7 +17,7 @@ /* architecture-specific bits */ #ifdef CONFIG_ZLIB_DFLTCC -# include "../zlib_dfltcc/dfltcc.h" +# include "../zlib_dfltcc/dfltcc_inflate.h" #else #define INFLATE_RESET_HOOK(strm) do {} while (0) #define INFLATE_TYPEDO_HOOK(strm, flush) do {} while (0) diff --git a/mm/Kconfig b/mm/Kconfig index 4eb4afa53e6d1..28ee03f39e93f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -191,6 +191,25 @@ config ZSMALLOC_STAT information to userspace via debugfs. If unsure, say N. +config ZSMALLOC_CHAIN_SIZE + int "Maximum number of physical pages per-zspage" + default 8 + range 4 16 + depends on ZSMALLOC + help + This option sets the upper limit on the number of physical pages + that a zmalloc page (zspage) can consist of. The optimal zspage + chain size is calculated for each size class during the + initialization of the pool. + + Changing this option can alter the characteristics of size classes, + such as the number of pages per zspage and the number of objects + per zspage. This can also result in different configurations of + the pool, as zsmalloc merges size classes with similar + characteristics. + + For more information, see zsmalloc documentation. + menu "SLAB allocator options" choice @@ -1080,6 +1099,15 @@ comment "GUP_TEST needs to have DEBUG_FS enabled" config GUP_GET_PXX_LOW_HIGH bool +config DMAPOOL_TEST + tristate "Enable a module to run time tests on dma_pool" + depends on HAS_DMA + help + Provides a test module that will allocate and free many blocks of + various sizes and report how long it takes. This is intended to + provide a consistent way to measure how changes to the + dma_pool_alloc/free routines affect performance. + config ARCH_HAS_PTE_SPECIAL bool diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index d62f481319526..c3547a373c9ca 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -207,3 +207,75 @@ config PTDUMP_DEBUGFS kernel. If in doubt, say N. + +config HAVE_DEBUG_KMEMLEAK + bool + +config DEBUG_KMEMLEAK + bool "Kernel memory leak detector" + depends on DEBUG_KERNEL && HAVE_DEBUG_KMEMLEAK + select DEBUG_FS + select STACKTRACE if STACKTRACE_SUPPORT + select KALLSYMS + select CRC32 + select STACKDEPOT + select STACKDEPOT_ALWAYS_INIT if !DEBUG_KMEMLEAK_DEFAULT_OFF + help + Say Y here if you want to enable the memory leak + detector. The memory allocation/freeing is traced in a way + similar to the Boehm's conservative garbage collector, the + difference being that the orphan objects are not freed but + only shown in /sys/kernel/debug/kmemleak. Enabling this + feature will introduce an overhead to memory + allocations. See Documentation/dev-tools/kmemleak.rst for more + details. + + Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances + of finding leaks due to the slab objects poisoning. + + In order to access the kmemleak file, debugfs needs to be + mounted (usually at /sys/kernel/debug). + +config DEBUG_KMEMLEAK_MEM_POOL_SIZE + int "Kmemleak memory pool size" + depends on DEBUG_KMEMLEAK + range 200 1000000 + default 16000 + help + Kmemleak must track all the memory allocations to avoid + reporting false positives. Since memory may be allocated or + freed before kmemleak is fully initialised, use a static pool + of metadata objects to track such callbacks. After kmemleak is + fully initialised, this memory pool acts as an emergency one + if slab allocations fail. + +config DEBUG_KMEMLEAK_TEST + tristate "Simple test for the kernel memory leak detector" + depends on DEBUG_KMEMLEAK && m + help + This option enables a module that explicitly leaks memory. + + If unsure, say N. + +config DEBUG_KMEMLEAK_DEFAULT_OFF + bool "Default kmemleak to off" + depends on DEBUG_KMEMLEAK + help + Say Y here to disable kmemleak by default. It can then be enabled + on the command line via kmemleak=on. + +config DEBUG_KMEMLEAK_AUTO_SCAN + bool "Enable kmemleak auto scan thread on boot up" + default y + depends on DEBUG_KMEMLEAK + help + Depending on the cpu, kmemleak scan may be cpu intensive and can + stall user tasks at times. This option enables/disables automatic + kmemleak scan at boot up. + + Say N here to disable kmemleak auto scan thread to stop automatic + scanning. Disabling this option disables automatic reporting of + memory leaks. + + If unsure, say Y. + diff --git a/mm/Makefile b/mm/Makefile index 8e105e5b3e293..3a08f5d7b1782 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -103,6 +103,7 @@ obj-$(CONFIG_MEMCG) += swap_cgroup.o endif obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_GUP_TEST) += gup_test.o +obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o diff --git a/mm/cma.c b/mm/cma.c index a75b17b03b66a..a7263aa02c92d 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -321,18 +321,6 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, } else { phys_addr_t addr = 0; - /* - * All pages in the reserved area must come from the same zone. - * If the requested region crosses the low/high memory boundary, - * try allocating from high memory first and fall back to low - * memory in case of failure. - */ - if (base < highmem_start && limit > highmem_start) { - addr = memblock_alloc_range_nid(size, alignment, - highmem_start, limit, nid, true); - limit = highmem_start; - } - /* * If there is enough memory, try a bottom-up allocation first. * It will place the new cma area close to the start of the node @@ -350,6 +338,18 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, } #endif + /* + * All pages in the reserved area must come from the same zone. + * If the requested region crosses the low/high memory boundary, + * try allocating from high memory first and fall back to low + * memory in case of failure. + */ + if (!addr && base < highmem_start && limit > highmem_start) { + addr = memblock_alloc_range_nid(size, alignment, + highmem_start, limit, nid, true); + limit = highmem_start; + } + if (!addr) { addr = memblock_alloc_range_nid(size, alignment, base, limit, nid, true); diff --git a/mm/compaction.c b/mm/compaction.c index 9008ba4cd8515..98ff938e0532d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1101,12 +1101,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* * Avoid isolating too much unless this block is being - * rescanned (e.g. dirty/writeback pages, parallel allocation) + * fully scanned (e.g. dirty/writeback pages, parallel allocation) * or a lock is contended. For contention, isolate quickly to * potentially remove one source of contention. */ if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX && - !cc->rescan && !cc->contended) { + !cc->finish_pageblock && !cc->contended) { ++low_pfn; break; } @@ -1171,14 +1171,14 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* - * Updated the cached scanner pfn once the pageblock has been scanned + * Update the cached scanner pfn once the pageblock has been scanned. * Pages will either be migrated in which case there is no point * scanning in the near future or migration failed in which case the * failure reason may persist. The block is marked for skipping if * there were no pages isolated in the block or if the block is * rescanned twice in a row. */ - if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) { + if (low_pfn == end_pfn && (!nr_isolated || cc->finish_pageblock)) { if (valid_page && !skip_updated) set_pageblock_skip(valid_page); update_cached_migrate(cc, low_pfn); @@ -1761,6 +1761,13 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) if (cc->ignore_skip_hint) return pfn; + /* + * If the pageblock should be finished then do not select a different + * pageblock. + */ + if (cc->finish_pageblock) + return pfn; + /* * If the migrate_pfn is not at the start of a zone or the start * of a pageblock then assume this is a continuation of a previous @@ -2026,6 +2033,8 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat) struct zone *zone; zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; score += fragmentation_score_zone_weighted(zone); } @@ -2314,9 +2323,6 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) return ret; - /* huh, compaction_suitable is returning something unexpected */ - VM_BUG_ON(ret != COMPACT_CONTINUE); - /* * Clear pageblock skip if there were failures recently and compaction * is about to be retried after being deferred. @@ -2374,19 +2380,20 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) unsigned long iteration_start_pfn = cc->migrate_pfn; /* - * Avoid multiple rescans which can happen if a page cannot be - * isolated (dirty/writeback in async mode) or if the migrated - * pages are being allocated before the pageblock is cleared. - * The first rescan will capture the entire pageblock for - * migration. If it fails, it'll be marked skip and scanning - * will proceed as normal. + * Avoid multiple rescans of the same pageblock which can + * happen if a page cannot be isolated (dirty/writeback in + * async mode) or if the migrated pages are being allocated + * before the pageblock is cleared. The first rescan will + * capture the entire pageblock for migration. If it fails, + * it'll be marked skip and scanning will proceed as normal. */ - cc->rescan = false; + cc->finish_pageblock = false; if (pageblock_start_pfn(last_migrated_pfn) == pageblock_start_pfn(iteration_start_pfn)) { - cc->rescan = true; + cc->finish_pageblock = true; } +rescan: switch (isolate_migratepages(cc)) { case ISOLATE_ABORT: ret = COMPACT_CONTENDED; @@ -2429,18 +2436,37 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) goto out; } /* - * We failed to migrate at least one page in the current - * order-aligned block, so skip the rest of it. + * If an ASYNC or SYNC_LIGHT fails to migrate a page + * within the current order-aligned block, scan the + * remainder of the pageblock. This will mark the + * pageblock "skip" to avoid rescanning in the near + * future. This will isolate more pages than necessary + * for the request but avoid loops due to + * fast_find_migrateblock revisiting blocks that were + * recently partially scanned. */ - if (cc->direct_compaction && - (cc->mode == MIGRATE_ASYNC)) { - cc->migrate_pfn = block_end_pfn( - cc->migrate_pfn - 1, cc->order); - /* Draining pcplists is useless in this case */ - last_migrated_pfn = 0; + if (cc->direct_compaction && !cc->finish_pageblock && + (cc->mode < MIGRATE_SYNC)) { + cc->finish_pageblock = true; + + /* + * Draining pcplists does not help THP if + * any page failed to migrate. Even after + * drain, the pageblock will not be free. + */ + if (cc->order == COMPACTION_HPAGE_ORDER) + last_migrated_pfn = 0; + + goto rescan; } } + /* Stop if a page has been captured */ + if (capc && capc->page) { + ret = COMPACT_SUCCESS; + break; + } + check_drain: /* * Has the migration scanner moved away from the previous @@ -2459,12 +2485,6 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) last_migrated_pfn = 0; } } - - /* Stop if a page has been captured */ - if (capc && capc->page) { - ret = COMPACT_SUCCESS; - break; - } } out: @@ -2492,6 +2512,9 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) trace_mm_compaction_end(cc, start_pfn, end_pfn, sync, ret); + VM_BUG_ON(!list_empty(&cc->freepages)); + VM_BUG_ON(!list_empty(&cc->migratepages)); + return ret; } @@ -2530,9 +2553,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, ret = compact_zone(&cc, &capc); - VM_BUG_ON(!list_empty(&cc.freepages)); - VM_BUG_ON(!list_empty(&cc.migratepages)); - /* * Make sure we hide capture control first before we read the captured * page pointer, otherwise an interrupt could free and capture a page @@ -2664,8 +2684,10 @@ static void proactive_compact_node(pg_data_t *pgdat) compact_zone(&cc, NULL); - VM_BUG_ON(!list_empty(&cc.freepages)); - VM_BUG_ON(!list_empty(&cc.migratepages)); + count_compact_events(KCOMPACTD_MIGRATE_SCANNED, + cc.total_migrate_scanned); + count_compact_events(KCOMPACTD_FREE_SCANNED, + cc.total_free_scanned); } } @@ -2693,9 +2715,6 @@ static void compact_node(int nid) cc.zone = zone; compact_zone(&cc, NULL); - - VM_BUG_ON(!list_empty(&cc.freepages)); - VM_BUG_ON(!list_empty(&cc.migratepages)); } } @@ -2735,6 +2754,8 @@ int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, continue; pgdat->proactive_compact_trigger = true; + trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, -1, + pgdat->nr_zones - 1); wake_up_interruptible(&pgdat->kcompactd_wait); } } @@ -2872,9 +2893,6 @@ static void kcompactd_do_work(pg_data_t *pgdat) cc.total_migrate_scanned); count_compact_events(KCOMPACTD_FREE_SCANNED, cc.total_free_scanned); - - VM_BUG_ON(!list_empty(&cc.freepages)); - VM_BUG_ON(!list_empty(&cc.migratepages)); } /* diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 3db9b73687562..fae64d32b9257 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -289,6 +289,35 @@ static void damon_test_set_regions(struct kunit *test) damon_destroy_target(t); } +static void damon_test_update_monitoring_result(struct kunit *test) +{ + struct damon_attrs old_attrs = { + .sample_interval = 10, .aggr_interval = 1000,}; + struct damon_attrs new_attrs; + struct damon_region *r = damon_new_region(3, 7); + + r->nr_accesses = 15; + r->age = 20; + + new_attrs = (struct damon_attrs){ + .sample_interval = 100, .aggr_interval = 10000,}; + damon_update_monitoring_result(r, &old_attrs, &new_attrs); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 15); + KUNIT_EXPECT_EQ(test, r->age, 2); + + new_attrs = (struct damon_attrs){ + .sample_interval = 1, .aggr_interval = 1000}; + damon_update_monitoring_result(r, &old_attrs, &new_attrs); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 150); + KUNIT_EXPECT_EQ(test, r->age, 2); + + new_attrs = (struct damon_attrs){ + .sample_interval = 1, .aggr_interval = 100}; + damon_update_monitoring_result(r, &old_attrs, &new_attrs); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 150); + KUNIT_EXPECT_EQ(test, r->age, 20); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -299,6 +328,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_split_regions_of), KUNIT_CASE(damon_test_ops_registration), KUNIT_CASE(damon_test_set_regions), + KUNIT_CASE(damon_test_update_monitoring_result), {}, }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 1bf0654ae189d..d9ef62047bf5f 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -465,6 +465,76 @@ void damon_destroy_ctx(struct damon_ctx *ctx) kfree(ctx); } +static unsigned int damon_age_for_new_attrs(unsigned int age, + struct damon_attrs *old_attrs, struct damon_attrs *new_attrs) +{ + return age * old_attrs->aggr_interval / new_attrs->aggr_interval; +} + +/* convert access ratio in bp (per 10,000) to nr_accesses */ +static unsigned int damon_accesses_bp_to_nr_accesses( + unsigned int accesses_bp, struct damon_attrs *attrs) +{ + unsigned int max_nr_accesses = + attrs->aggr_interval / attrs->sample_interval; + + return accesses_bp * max_nr_accesses / 10000; +} + +/* convert nr_accesses to access ratio in bp (per 10,000) */ +static unsigned int damon_nr_accesses_to_accesses_bp( + unsigned int nr_accesses, struct damon_attrs *attrs) +{ + unsigned int max_nr_accesses = + attrs->aggr_interval / attrs->sample_interval; + + return nr_accesses * 10000 / max_nr_accesses; +} + +static unsigned int damon_nr_accesses_for_new_attrs(unsigned int nr_accesses, + struct damon_attrs *old_attrs, struct damon_attrs *new_attrs) +{ + return damon_accesses_bp_to_nr_accesses( + damon_nr_accesses_to_accesses_bp( + nr_accesses, old_attrs), + new_attrs); +} + +static void damon_update_monitoring_result(struct damon_region *r, + struct damon_attrs *old_attrs, struct damon_attrs *new_attrs) +{ + r->nr_accesses = damon_nr_accesses_for_new_attrs(r->nr_accesses, + old_attrs, new_attrs); + r->age = damon_age_for_new_attrs(r->age, old_attrs, new_attrs); +} + +/* + * region->nr_accesses is the number of sampling intervals in the last + * aggregation interval that access to the region has found, and region->age is + * the number of aggregation intervals that its access pattern has maintained. + * For the reason, the real meaning of the two fields depend on current + * sampling interval and aggregation interval. This function updates + * ->nr_accesses and ->age of given damon_ctx's regions for new damon_attrs. + */ +static void damon_update_monitoring_results(struct damon_ctx *ctx, + struct damon_attrs *new_attrs) +{ + struct damon_attrs *old_attrs = &ctx->attrs; + struct damon_target *t; + struct damon_region *r; + + /* if any interval is zero, simply forgive conversion */ + if (!old_attrs->sample_interval || !old_attrs->aggr_interval || + !new_attrs->sample_interval || + !new_attrs->aggr_interval) + return; + + damon_for_each_target(t, ctx) + damon_for_each_region(r, t) + damon_update_monitoring_result( + r, old_attrs, new_attrs); +} + /** * damon_set_attrs() - Set attributes for the monitoring. * @ctx: monitoring context @@ -482,6 +552,7 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) if (attrs->min_nr_regions > attrs->max_nr_regions) return -EINVAL; + damon_update_monitoring_results(ctx, attrs); ctx->attrs = *attrs; return 0; } @@ -1269,7 +1340,8 @@ static int kdamond_fn(void *data) if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) break; - kdamond_apply_schemes(ctx); + if (!list_empty(&ctx->schemes)) + kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); if (ctx->ops.reset_aggregated) diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index bce37c4875402..c4b455b5ee30b 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -14,19 +14,26 @@ #include -static void __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas, +static int __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas, ssize_t nr_vmas) { - int i; + int i, ret = -ENOMEM; MA_STATE(mas, mt, 0, 0); if (!nr_vmas) - return; + return 0; mas_lock(&mas); - for (i = 0; i < nr_vmas; i++) - vma_mas_store(&vmas[i], &mas); + for (i = 0; i < nr_vmas; i++) { + mas_set_range(&mas, vmas[i].vm_start, vmas[i].vm_end - 1); + if (mas_store_gfp(&mas, &vmas[i], GFP_KERNEL)) + goto failed; + } + + ret = 0; +failed: mas_unlock(&mas); + return ret; } /* @@ -71,7 +78,8 @@ static void damon_test_three_regions_in_vmas(struct kunit *test) }; mt_init_flags(&mm.mm_mt, MM_MT_FLAGS); - __link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas)); + if (__link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas))) + kunit_skip(test, "Failed to create VMA tree"); __damon_va_three_regions(&mm, regions); diff --git a/mm/debug.c b/mm/debug.c index 7f8e5f744e420..96d594e162925 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -94,11 +94,11 @@ static void __dump_page(struct page *page) page, page_ref_count(head), mapcount, mapping, page_to_pgoff(page), page_to_pfn(page)); if (compound) { - pr_warn("head:%p order:%u compound_mapcount:%d subpages_mapcount:%d compound_pincount:%d\n", + pr_warn("head:%p order:%u entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n", head, compound_order(head), - head_compound_mapcount(head), - head_subpages_mapcount(head), - head_compound_pincount(head)); + folio_entire_mapcount(folio), + folio_nr_pages_mapped(folio), + atomic_read(&folio->_pincount)); } #ifdef CONFIG_MEMCG @@ -215,6 +215,7 @@ void dump_mm(const struct mm_struct *mm) mm->def_flags, &mm->def_flags ); } +EXPORT_SYMBOL(dump_mm); static bool page_init_poisoning __read_mostly = true; diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index bb3328f46126a..af59cc7bd3071 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -810,15 +810,36 @@ static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args) { -#ifdef __HAVE_ARCH_PTE_SWP_EXCLUSIVE - pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); + unsigned long max_swap_offset; + swp_entry_t entry, entry2; + pte_t pte; pr_debug("Validating PTE swap exclusive\n"); + + /* See generic_max_swapfile_size(): probe the maximum offset */ + max_swap_offset = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0, ~0UL)))); + + /* Create a swp entry with all possible bits set */ + entry = swp_entry((1 << MAX_SWAPFILES_SHIFT) - 1, max_swap_offset); + + pte = swp_entry_to_pte(entry); + WARN_ON(pte_swp_exclusive(pte)); + WARN_ON(!is_swap_pte(pte)); + entry2 = pte_to_swp_entry(pte); + WARN_ON(memcmp(&entry, &entry2, sizeof(entry))); + pte = pte_swp_mkexclusive(pte); WARN_ON(!pte_swp_exclusive(pte)); + WARN_ON(!is_swap_pte(pte)); + WARN_ON(pte_swp_soft_dirty(pte)); + entry2 = pte_to_swp_entry(pte); + WARN_ON(memcmp(&entry, &entry2, sizeof(entry))); + pte = pte_swp_clear_exclusive(pte); WARN_ON(pte_swp_exclusive(pte)); -#endif /* __HAVE_ARCH_PTE_SWP_EXCLUSIVE */ + WARN_ON(!is_swap_pte(pte)); + entry2 = pte_to_swp_entry(pte); + WARN_ON(memcmp(&entry, &entry2, sizeof(entry))); } static void __init pte_swap_tests(struct pgtable_debug_args *args) diff --git a/mm/dmapool.c b/mm/dmapool.c index a7eb5d0eb2da7..1920890ff8d3d 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -15,7 +15,7 @@ * represented by the 'struct dma_pool' which keeps a doubly-linked list of * allocated pages. Each page in the page_list is split into blocks of at * least 'size' bytes. Free blocks are tracked in an unsorted singly-linked - * list of free blocks within the page. Used blocks aren't tracked, but we + * list of free blocks across all pages. Used blocks aren't tracked, but we * keep a count of how many are currently allocated from each page. */ @@ -40,13 +40,22 @@ #define DMAPOOL_DEBUG 1 #endif +struct dma_block { + struct dma_block *next_block; + dma_addr_t dma; +}; + struct dma_pool { /* the pool */ struct list_head page_list; spinlock_t lock; - size_t size; + struct dma_block *next_block; + size_t nr_blocks; + size_t nr_active; + size_t nr_pages; struct device *dev; - size_t allocation; - size_t boundary; + unsigned int size; + unsigned int allocation; + unsigned int boundary; char name[32]; struct list_head pools; }; @@ -55,8 +64,6 @@ struct dma_page { /* cacheable header for 'allocation' bytes */ struct list_head page_list; void *vaddr; dma_addr_t dma; - unsigned int in_use; - unsigned int offset; }; static DEFINE_MUTEX(pools_lock); @@ -64,46 +71,133 @@ static DEFINE_MUTEX(pools_reg_lock); static ssize_t pools_show(struct device *dev, struct device_attribute *attr, char *buf) { - unsigned temp; - unsigned size; - char *next; - struct dma_page *page; struct dma_pool *pool; + unsigned size; - next = buf; - size = PAGE_SIZE; - - temp = scnprintf(next, size, "poolinfo - 0.1\n"); - size -= temp; - next += temp; + size = sysfs_emit(buf, "poolinfo - 0.1\n"); mutex_lock(&pools_lock); list_for_each_entry(pool, &dev->dma_pools, pools) { - unsigned pages = 0; - unsigned blocks = 0; - - spin_lock_irq(&pool->lock); - list_for_each_entry(page, &pool->page_list, page_list) { - pages++; - blocks += page->in_use; - } - spin_unlock_irq(&pool->lock); - /* per-pool info, no real statistics yet */ - temp = scnprintf(next, size, "%-16s %4u %4zu %4zu %2u\n", - pool->name, blocks, - pages * (pool->allocation / pool->size), - pool->size, pages); - size -= temp; - next += temp; + size += sysfs_emit_at(buf, size, "%-16s %4zu %4zu %4u %2zu\n", + pool->name, pool->nr_active, + pool->nr_blocks, pool->size, + pool->nr_pages); } mutex_unlock(&pools_lock); - return PAGE_SIZE - size; + return size; } static DEVICE_ATTR_RO(pools); +#ifdef DMAPOOL_DEBUG +static void pool_check_block(struct dma_pool *pool, struct dma_block *block, + gfp_t mem_flags) +{ + u8 *data = (void *)block; + int i; + + for (i = sizeof(struct dma_block); i < pool->size; i++) { + if (data[i] == POOL_POISON_FREED) + continue; + dev_err(pool->dev, "%s %s, %p (corrupted)\n", __func__, + pool->name, block); + + /* + * Dump the first 4 bytes even if they are not + * POOL_POISON_FREED + */ + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, + data, pool->size, 1); + break; + } + + if (!want_init_on_alloc(mem_flags)) + memset(block, POOL_POISON_ALLOCATED, pool->size); +} + +static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) +{ + struct dma_page *page; + + list_for_each_entry(page, &pool->page_list, page_list) { + if (dma < page->dma) + continue; + if ((dma - page->dma) < pool->allocation) + return page; + } + return NULL; +} + +static bool pool_block_err(struct dma_pool *pool, void *vaddr, dma_addr_t dma) +{ + struct dma_block *block = pool->next_block; + struct dma_page *page; + + page = pool_find_page(pool, dma); + if (!page) { + dev_err(pool->dev, "%s %s, %p/%pad (bad dma)\n", + __func__, pool->name, vaddr, &dma); + return true; + } + + while (block) { + if (block != vaddr) { + block = block->next_block; + continue; + } + dev_err(pool->dev, "%s %s, dma %pad already free\n", + __func__, pool->name, &dma); + return true; + } + + memset(vaddr, POOL_POISON_FREED, pool->size); + return false; +} + +static void pool_init_page(struct dma_pool *pool, struct dma_page *page) +{ + memset(page->vaddr, POOL_POISON_FREED, pool->allocation); +} +#else +static void pool_check_block(struct dma_pool *pool, struct dma_block *block, + gfp_t mem_flags) +{ +} + +static bool pool_block_err(struct dma_pool *pool, void *vaddr, dma_addr_t dma) +{ + if (want_init_on_free()) + memset(vaddr, 0, pool->size); + return false; +} + +static void pool_init_page(struct dma_pool *pool, struct dma_page *page) +{ +} +#endif + +static struct dma_block *pool_block_pop(struct dma_pool *pool) +{ + struct dma_block *block = pool->next_block; + + if (block) { + pool->next_block = block->next_block; + pool->nr_active++; + } + return block; +} + +static void pool_block_push(struct dma_pool *pool, struct dma_block *block, + dma_addr_t dma) +{ + block->dma = dma; + block->next_block = pool->next_block; + pool->next_block = block; +} + + /** * dma_pool_create - Creates a pool of consistent memory blocks, for dma. * @name: name of pool, for diagnostics @@ -132,17 +226,20 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, { struct dma_pool *retval; size_t allocation; - bool empty = false; + bool empty; + + if (!dev) + return NULL; if (align == 0) align = 1; else if (align & (align - 1)) return NULL; - if (size == 0) + if (size == 0 || size > INT_MAX) return NULL; - else if (size < 4) - size = 4; + if (size < sizeof(struct dma_block)) + size = sizeof(struct dma_block); size = ALIGN(size, align); allocation = max_t(size_t, size, PAGE_SIZE); @@ -152,7 +249,9 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, else if ((boundary < size) || (boundary & (boundary - 1))) return NULL; - retval = kmalloc(sizeof(*retval), GFP_KERNEL); + boundary = min(boundary, allocation); + + retval = kzalloc(sizeof(*retval), GFP_KERNEL); if (!retval) return retval; @@ -165,7 +264,6 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, retval->size = size; retval->boundary = boundary; retval->allocation = allocation; - INIT_LIST_HEAD(&retval->pools); /* @@ -178,8 +276,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, */ mutex_lock(&pools_reg_lock); mutex_lock(&pools_lock); - if (list_empty(&dev->dma_pools)) - empty = true; + empty = list_empty(&dev->dma_pools); list_add(&retval->pools, &dev->dma_pools); mutex_unlock(&pools_lock); if (empty) { @@ -202,18 +299,25 @@ EXPORT_SYMBOL(dma_pool_create); static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page) { - unsigned int offset = 0; - unsigned int next_boundary = pool->boundary; + unsigned int next_boundary = pool->boundary, offset = 0; + struct dma_block *block; - do { - unsigned int next = offset + pool->size; - if (unlikely((next + pool->size) >= next_boundary)) { - next = next_boundary; + pool_init_page(pool, page); + while (offset + pool->size <= pool->allocation) { + if (offset + pool->size > next_boundary) { + offset = next_boundary; next_boundary += pool->boundary; + continue; } - *(int *)(page->vaddr + offset) = next; - offset = next; - } while (offset < pool->allocation); + + block = page->vaddr + offset; + pool_block_push(pool, block, page->dma + offset); + offset += pool->size; + pool->nr_blocks++; + } + + list_add(&page->page_list, &pool->page_list); + pool->nr_pages++; } static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) @@ -223,37 +327,15 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) page = kmalloc(sizeof(*page), mem_flags); if (!page) return NULL; + page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation, &page->dma, mem_flags); - if (page->vaddr) { -#ifdef DMAPOOL_DEBUG - memset(page->vaddr, POOL_POISON_FREED, pool->allocation); -#endif - pool_initialise_page(pool, page); - page->in_use = 0; - page->offset = 0; - } else { + if (!page->vaddr) { kfree(page); - page = NULL; + return NULL; } - return page; -} -static inline bool is_page_busy(struct dma_page *page) -{ - return page->in_use != 0; -} - -static void pool_free_page(struct dma_pool *pool, struct dma_page *page) -{ - dma_addr_t dma = page->dma; - -#ifdef DMAPOOL_DEBUG - memset(page->vaddr, POOL_POISON_FREED, pool->allocation); -#endif - dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma); - list_del(&page->page_list); - kfree(page); + return page; } /** @@ -267,7 +349,7 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page) void dma_pool_destroy(struct dma_pool *pool) { struct dma_page *page, *tmp; - bool empty = false; + bool empty, busy = false; if (unlikely(!pool)) return; @@ -275,26 +357,23 @@ void dma_pool_destroy(struct dma_pool *pool) mutex_lock(&pools_reg_lock); mutex_lock(&pools_lock); list_del(&pool->pools); - if (pool->dev && list_empty(&pool->dev->dma_pools)) - empty = true; + empty = list_empty(&pool->dev->dma_pools); mutex_unlock(&pools_lock); if (empty) device_remove_file(pool->dev, &dev_attr_pools); mutex_unlock(&pools_reg_lock); + if (pool->nr_active) { + dev_err(pool->dev, "%s %s busy\n", __func__, pool->name); + busy = true; + } + list_for_each_entry_safe(page, tmp, &pool->page_list, page_list) { - if (is_page_busy(page)) { - if (pool->dev) - dev_err(pool->dev, "%s %s, %p busy\n", __func__, - pool->name, page->vaddr); - else - pr_err("%s %s, %p busy\n", __func__, - pool->name, page->vaddr); - /* leak the still-in-use consistent memory */ - list_del(&page->page_list); - kfree(page); - } else - pool_free_page(pool, page); + if (!busy) + dma_free_coherent(pool->dev, pool->allocation, + page->vaddr, page->dma); + list_del(&page->page_list); + kfree(page); } kfree(pool); @@ -314,84 +393,40 @@ EXPORT_SYMBOL(dma_pool_destroy); void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, dma_addr_t *handle) { - unsigned long flags; + struct dma_block *block; struct dma_page *page; - size_t offset; - void *retval; + unsigned long flags; might_alloc(mem_flags); spin_lock_irqsave(&pool->lock, flags); - list_for_each_entry(page, &pool->page_list, page_list) { - if (page->offset < pool->allocation) - goto ready; - } - - /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */ - spin_unlock_irqrestore(&pool->lock, flags); - - page = pool_alloc_page(pool, mem_flags & (~__GFP_ZERO)); - if (!page) - return NULL; + block = pool_block_pop(pool); + if (!block) { + /* + * pool_alloc_page() might sleep, so temporarily drop + * &pool->lock + */ + spin_unlock_irqrestore(&pool->lock, flags); - spin_lock_irqsave(&pool->lock, flags); + page = pool_alloc_page(pool, mem_flags & (~__GFP_ZERO)); + if (!page) + return NULL; - list_add(&page->page_list, &pool->page_list); - ready: - page->in_use++; - offset = page->offset; - page->offset = *(int *)(page->vaddr + offset); - retval = offset + page->vaddr; - *handle = offset + page->dma; -#ifdef DMAPOOL_DEBUG - { - int i; - u8 *data = retval; - /* page->offset is stored in first 4 bytes */ - for (i = sizeof(page->offset); i < pool->size; i++) { - if (data[i] == POOL_POISON_FREED) - continue; - if (pool->dev) - dev_err(pool->dev, "%s %s, %p (corrupted)\n", - __func__, pool->name, retval); - else - pr_err("%s %s, %p (corrupted)\n", - __func__, pool->name, retval); - - /* - * Dump the first 4 bytes even if they are not - * POOL_POISON_FREED - */ - print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, - data, pool->size, 1); - break; - } + spin_lock_irqsave(&pool->lock, flags); + pool_initialise_page(pool, page); + block = pool_block_pop(pool); } - if (!(mem_flags & __GFP_ZERO)) - memset(retval, POOL_POISON_ALLOCATED, pool->size); -#endif spin_unlock_irqrestore(&pool->lock, flags); + *handle = block->dma; + pool_check_block(pool, block, mem_flags); if (want_init_on_alloc(mem_flags)) - memset(retval, 0, pool->size); + memset(block, 0, pool->size); - return retval; + return block; } EXPORT_SYMBOL(dma_pool_alloc); -static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) -{ - struct dma_page *page; - - list_for_each_entry(page, &pool->page_list, page_list) { - if (dma < page->dma) - continue; - if ((dma - page->dma) < pool->allocation) - return page; - } - return NULL; -} - /** * dma_pool_free - put block back into dma pool * @pool: the dma pool holding the block @@ -403,65 +438,14 @@ static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) */ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) { - struct dma_page *page; + struct dma_block *block = vaddr; unsigned long flags; - unsigned int offset; spin_lock_irqsave(&pool->lock, flags); - page = pool_find_page(pool, dma); - if (!page) { - spin_unlock_irqrestore(&pool->lock, flags); - if (pool->dev) - dev_err(pool->dev, "%s %s, %p/%pad (bad dma)\n", - __func__, pool->name, vaddr, &dma); - else - pr_err("%s %s, %p/%pad (bad dma)\n", - __func__, pool->name, vaddr, &dma); - return; + if (!pool_block_err(pool, vaddr, dma)) { + pool_block_push(pool, block, dma); + pool->nr_active--; } - - offset = vaddr - page->vaddr; - if (want_init_on_free()) - memset(vaddr, 0, pool->size); -#ifdef DMAPOOL_DEBUG - if ((dma - page->dma) != offset) { - spin_unlock_irqrestore(&pool->lock, flags); - if (pool->dev) - dev_err(pool->dev, "%s %s, %p (bad vaddr)/%pad\n", - __func__, pool->name, vaddr, &dma); - else - pr_err("%s %s, %p (bad vaddr)/%pad\n", - __func__, pool->name, vaddr, &dma); - return; - } - { - unsigned int chain = page->offset; - while (chain < pool->allocation) { - if (chain != offset) { - chain = *(int *)(page->vaddr + chain); - continue; - } - spin_unlock_irqrestore(&pool->lock, flags); - if (pool->dev) - dev_err(pool->dev, "%s %s, dma %pad already free\n", - __func__, pool->name, &dma); - else - pr_err("%s %s, dma %pad already free\n", - __func__, pool->name, &dma); - return; - } - } - memset(vaddr, POOL_POISON_FREED, pool->size); -#endif - - page->in_use--; - *(int *)vaddr = page->offset; - page->offset = offset; - /* - * Resist a temptation to do - * if (!is_page_busy(page)) pool_free_page(pool, page); - * Better have a few empty pages hang around. - */ spin_unlock_irqrestore(&pool->lock, flags); } EXPORT_SYMBOL(dma_pool_free); diff --git a/mm/dmapool_test.c b/mm/dmapool_test.c new file mode 100644 index 0000000000000..370fb9e209eff --- /dev/null +++ b/mm/dmapool_test.c @@ -0,0 +1,147 @@ +#include +#include +#include +#include +#include +#include +#include + +#define NR_TESTS (100) + +struct dma_pool_pair { + dma_addr_t dma; + void *v; +}; + +struct dmapool_parms { + size_t size; + size_t align; + size_t boundary; +}; + +static const struct dmapool_parms pool_parms[] = { + { .size = 16, .align = 16, .boundary = 0 }, + { .size = 64, .align = 64, .boundary = 0 }, + { .size = 256, .align = 256, .boundary = 0 }, + { .size = 1024, .align = 1024, .boundary = 0 }, + { .size = 4096, .align = 4096, .boundary = 0 }, + { .size = 68, .align = 32, .boundary = 4096 }, +}; + +static struct dma_pool *pool; +static struct device test_dev; +static u64 dma_mask; + +static inline int nr_blocks(int size) +{ + return clamp_t(int, (PAGE_SIZE / size) * 512, 1024, 8192); +} + +static int dmapool_test_alloc(struct dma_pool_pair *p, int blocks) +{ + int i; + + for (i = 0; i < blocks; i++) { + p[i].v = dma_pool_alloc(pool, GFP_KERNEL, + &p[i].dma); + if (!p[i].v) + goto pool_fail; + } + + for (i = 0; i < blocks; i++) + dma_pool_free(pool, p[i].v, p[i].dma); + + return 0; + +pool_fail: + for (--i; i >= 0; i--) + dma_pool_free(pool, p[i].v, p[i].dma); + return -ENOMEM; +} + +static int dmapool_test_block(const struct dmapool_parms *parms) +{ + int blocks = nr_blocks(parms->size); + ktime_t start_time, end_time; + struct dma_pool_pair *p; + int i, ret; + + p = kcalloc(blocks, sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + pool = dma_pool_create("test pool", &test_dev, parms->size, + parms->align, parms->boundary); + if (!pool) { + ret = -ENOMEM; + goto free_pairs; + } + + start_time = ktime_get(); + for (i = 0; i < NR_TESTS; i++) { + ret = dmapool_test_alloc(p, blocks); + if (ret) + goto free_pool; + if (need_resched()) + cond_resched(); + } + end_time = ktime_get(); + + printk("dmapool test: size:%-4zu align:%-4zu blocks:%-4d time:%llu\n", + parms->size, parms->align, blocks, + ktime_us_delta(end_time, start_time)); + +free_pool: + dma_pool_destroy(pool); +free_pairs: + kfree(p); + return ret; +} + +static void dmapool_test_release(struct device *dev) +{ +} + +static int dmapool_checks(void) +{ + int i, ret; + + ret = dev_set_name(&test_dev, "dmapool-test"); + if (ret) + return ret; + + ret = device_register(&test_dev); + if (ret) { + printk("%s: register failed:%d\n", __func__, ret); + goto put_device; + } + + test_dev.release = dmapool_test_release; + set_dma_ops(&test_dev, NULL); + test_dev.dma_mask = &dma_mask; + ret = dma_set_mask_and_coherent(&test_dev, DMA_BIT_MASK(64)); + if (ret) { + printk("%s: mask failed:%d\n", __func__, ret); + goto del_device; + } + + for (i = 0; i < ARRAY_SIZE(pool_parms); i++) { + ret = dmapool_test_block(&pool_parms[i]); + if (ret) + break; + } + +del_device: + device_del(&test_dev); +put_device: + put_device(&test_dev); + return ret; +} + +static void dmapool_exit(void) +{ +} + +module_init(dmapool_checks); +module_exit(dmapool_exit); +MODULE_LICENSE("GPL"); diff --git a/mm/filemap.c b/mm/filemap.c index c4d4ace9cc700..992554c18f1f8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -97,7 +97,7 @@ * ->i_pages lock (__sync_single_inode) * * ->i_mmap_rwsem - * ->anon_vma.lock (vma_adjust) + * ->anon_vma.lock (vma_merge) * * ->anon_vma.lock * ->page_table_lock or pte_lock (anon_vma_prepare and various) @@ -470,7 +470,7 @@ EXPORT_SYMBOL(filemap_flush); bool filemap_range_has_page(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { - struct page *page; + struct folio *folio; XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; @@ -479,11 +479,11 @@ bool filemap_range_has_page(struct address_space *mapping, rcu_read_lock(); for (;;) { - page = xas_find(&xas, max); - if (xas_retry(&xas, page)) + folio = xas_find(&xas, max); + if (xas_retry(&xas, folio)) continue; /* Shadow entries don't count */ - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; /* * We don't need to try to pin this page; we're about to @@ -494,7 +494,7 @@ bool filemap_range_has_page(struct address_space *mapping, } rcu_read_unlock(); - return page != NULL; + return folio != NULL; } EXPORT_SYMBOL(filemap_range_has_page); @@ -503,25 +503,27 @@ static void __filemap_fdatawait_range(struct address_space *mapping, { pgoff_t index = start_byte >> PAGE_SHIFT; pgoff_t end = end_byte >> PAGE_SHIFT; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + unsigned nr_folios; + + folio_batch_init(&fbatch); - pagevec_init(&pvec); while (index <= end) { unsigned i; - nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, - end, PAGECACHE_TAG_WRITEBACK); - if (!nr_pages) + nr_folios = filemap_get_folios_tag(mapping, &index, end, + PAGECACHE_TAG_WRITEBACK, &fbatch); + + if (!nr_folios) break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; - wait_on_page_writeback(page); - ClearPageError(page); + folio_wait_writeback(folio); + folio_clear_error(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } } @@ -2282,64 +2284,58 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, EXPORT_SYMBOL(filemap_get_folios_contig); /** - * find_get_pages_range_tag - Find and return head pages matching @tag. - * @mapping: the address_space to search - * @index: the starting page index - * @end: The final page index (inclusive) - * @tag: the tag index - * @nr_pages: the maximum number of pages - * @pages: where the resulting pages are placed + * filemap_get_folios_tag - Get a batch of folios matching @tag + * @mapping: The address_space to search + * @start: The starting page index + * @end: The final page index (inclusive) + * @tag: The tag index + * @fbatch: The batch to fill * - * Like find_get_pages_range(), except we only return head pages which are - * tagged with @tag. @index is updated to the index immediately after the - * last page we return, ready for the next iteration. + * Same as filemap_get_folios(), but only returning folios tagged with @tag. * - * Return: the number of pages which were found. + * Return: The number of folios found. + * Also update @start to index the next folio for traversal. */ -unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, - pgoff_t end, xa_mark_t tag, unsigned int nr_pages, - struct page **pages) +unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, + pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch) { - XA_STATE(xas, &mapping->i_pages, *index); + XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; - unsigned ret = 0; - - if (unlikely(!nr_pages)) - return 0; rcu_read_lock(); - while ((folio = find_get_entry(&xas, end, tag))) { + while ((folio = find_get_entry(&xas, end, tag)) != NULL) { /* * Shadow entries should never be tagged, but this iteration * is lockless so there is a window for page reclaim to evict - * a page we saw tagged. Skip over it. + * a page we saw tagged. Skip over it. */ if (xa_is_value(folio)) continue; + if (!folio_batch_add(fbatch, folio)) { + unsigned long nr = folio_nr_pages(folio); - pages[ret] = &folio->page; - if (++ret == nr_pages) { - *index = folio->index + folio_nr_pages(folio); + if (folio_test_hugetlb(folio)) + nr = 1; + *start = folio->index + nr; goto out; } } - /* - * We come here when we got to @end. We take care to not overflow the - * index @index as it confuses some of the callers. This breaks the - * iteration when there is a page at index -1 but that is already - * broken anyway. + * We come here when there is no page beyond @end. We take care to not + * overflow the index @start as it confuses some of the callers. This + * breaks the iteration when there is a page at index -1 but that is + * already broke anyway. */ if (end == (pgoff_t)-1) - *index = (pgoff_t)-1; + *start = (pgoff_t)-1; else - *index = end + 1; + *start = end + 1; out: rcu_read_unlock(); - return ret; + return folio_batch_count(fbatch); } -EXPORT_SYMBOL(find_get_pages_range_tag); +EXPORT_SYMBOL(filemap_get_folios_tag); /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail @@ -3263,22 +3259,24 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); -static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) +static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, + pgoff_t start) { struct mm_struct *mm = vmf->vma->vm_mm; /* Huge page is mapped? No need to proceed. */ if (pmd_trans_huge(*vmf->pmd)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return true; } - if (pmd_none(*vmf->pmd) && PageTransHuge(page)) { + if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) { + struct page *page = folio_file_page(folio, start); vm_fault_t ret = do_set_pmd(vmf, page); if (!ret) { /* The page is mapped successfully, reference consumed. */ - unlock_page(page); + folio_unlock(folio); return true; } } @@ -3288,8 +3286,8 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) /* See comment in handle_pte_fault() */ if (pmd_devmap_trans_unstable(vmf->pmd)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return true; } @@ -3372,7 +3370,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, if (!folio) goto out; - if (filemap_map_pmd(vmf, &folio->page)) { + if (filemap_map_pmd(vmf, folio, start_pgoff)) { ret = VM_FAULT_NOPAGE; goto out; } diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 69ed25790c68d..18c48b5579263 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -6,6 +6,7 @@ #include #include +#include #include #include "internal.h" @@ -123,3 +124,13 @@ void putback_lru_page(struct page *page) { folio_putback_lru(page_folio(page)); } + +#ifdef CONFIG_MMU +void page_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, + unsigned long address) +{ + VM_BUG_ON_PAGE(PageTail(page), page); + + return folio_add_new_anon_rmap((struct folio *)page, vma, address); +} +#endif diff --git a/mm/gup.c b/mm/gup.c index 41abb16286ec0..9bb969a24d8f0 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -111,7 +111,7 @@ static inline struct folio *try_get_folio(struct page *page, int refs) * FOLL_GET: folio's refcount will be incremented by @refs. * * FOLL_PIN on large folios: folio's refcount will be incremented by - * @refs, and its compound_pincount will be incremented by @refs. + * @refs, and its pincount will be incremented by @refs. * * FOLL_PIN on single-page folios: folio's refcount will be incremented by * @refs * GUP_PIN_COUNTING_BIAS. @@ -157,7 +157,7 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) * try_get_folio() is left intact. */ if (folio_test_large(folio)) - atomic_add(refs, folio_pincount_ptr(folio)); + atomic_add(refs, &folio->_pincount); else folio_ref_add(folio, refs * (GUP_PIN_COUNTING_BIAS - 1)); @@ -186,7 +186,7 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs); #endif if (folio_test_large(folio)) - atomic_sub(refs, folio_pincount_ptr(folio)); + atomic_sub(refs, &folio->_pincount); else refs *= GUP_PIN_COUNTING_BIAS; } @@ -236,7 +236,7 @@ int __must_check try_grab_page(struct page *page, unsigned int flags) */ if (folio_test_large(folio)) { folio_ref_add(folio, 1); - atomic_add(1, folio_pincount_ptr(folio)); + atomic_add(1, &folio->_pincount); } else { folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); } @@ -1060,7 +1060,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * This does not guarantee that the page exists in the user mappings when * __get_user_pages returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated - * and subsequently re faulted). However it does guarantee that the page + * and subsequently re-faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page * contains data that was valid *at some point in time*. Typically, an IO * or similar operation cannot guarantee anything stronger anyway because @@ -1910,7 +1910,7 @@ static unsigned long collect_longterm_unpinnable_pages( continue; if (folio_test_hugetlb(folio)) { - isolate_hugetlb(&folio->page, movable_page_list); + isolate_hugetlb(folio, movable_page_list); continue; } @@ -1919,7 +1919,7 @@ static unsigned long collect_longterm_unpinnable_pages( drain_allow = false; } - if (!folio_isolate_lru(folio)) + if (folio_isolate_lru(folio)) continue; list_add_tail(&folio->lru, movable_page_list); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7e68a36b4f7d5..1d6977dc6b31b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -119,7 +119,8 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, * own flags. */ if (!in_pf && shmem_file(vma->vm_file)) - return shmem_huge_enabled(vma, !enforce_sysfs); + return shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff, + !enforce_sysfs, vma->vm_mm, vm_flags); /* Enforce sysfs THP requirements as necessary */ if (enforce_sysfs && @@ -559,10 +560,11 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) } #ifdef CONFIG_MEMCG -static inline struct deferred_split *get_deferred_split_queue(struct page *page) +static inline +struct deferred_split *get_deferred_split_queue(struct folio *folio) { - struct mem_cgroup *memcg = page_memcg(compound_head(page)); - struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + struct mem_cgroup *memcg = folio_memcg(folio); + struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); if (memcg) return &memcg->deferred_split_queue; @@ -570,9 +572,10 @@ static inline struct deferred_split *get_deferred_split_queue(struct page *page) return &pgdat->deferred_split_queue; } #else -static inline struct deferred_split *get_deferred_split_queue(struct page *page) +static inline +struct deferred_split *get_deferred_split_queue(struct folio *folio) { - struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + struct pglist_data *pgdat = NODE_DATA(folio_nid(folio)); return &pgdat->deferred_split_queue; } @@ -580,23 +583,23 @@ static inline struct deferred_split *get_deferred_split_queue(struct page *page) void prep_transhuge_page(struct page *page) { - /* - * we use page->mapping and page->index in second tail page - * as list_head: assuming THP order >= 2 - */ + struct folio *folio = (struct folio *)page; - INIT_LIST_HEAD(page_deferred_list(page)); + VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); + INIT_LIST_HEAD(&folio->_deferred_list); set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } static inline bool is_transparent_hugepage(struct page *page) { + struct folio *folio; + if (!PageCompound(page)) return false; - page = compound_head(page); - return is_huge_zero_page(page) || - page[1].compound_dtor == TRANSHUGE_PAGE_DTOR; + folio = page_folio(page); + return is_huge_zero_page(&folio->page) || + folio->_folio_dtor == TRANSHUGE_PAGE_DTOR; } static unsigned long __thp_get_unmapped_area(struct file *filp, @@ -2020,7 +2023,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address & HPAGE_PUD_MASK, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -2282,7 +2285,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address & HPAGE_PMD_MASK, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -2477,9 +2480,9 @@ static void __split_huge_page_tail(struct page *head, int tail, * of swap cache pages that store the swp_entry_t in tail pages. * Fix up and warn once if private is unexpectedly set. * - * What of 32-bit systems, on which head[1].compound_pincount overlays + * What of 32-bit systems, on which folio->_pincount overlays * head[1].private? No problem: THP_SWAP is not enabled on 32-bit, and - * compound_pincount must be 0 for folio_ref_freeze() to have succeeded. + * pincount must be 0 for folio_ref_freeze() to have succeeded. */ if (!folio_test_swapcache(page_folio(head))) { VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail); @@ -2650,7 +2653,7 @@ bool can_split_folio(struct folio *folio, int *pextra_pins) int split_huge_page_to_list(struct page *page, struct list_head *list) { struct folio *folio = page_folio(page); - struct deferred_split *ds_queue = get_deferred_split_queue(&folio->page); + struct deferred_split *ds_queue = get_deferred_split_queue(folio); XA_STATE(xas, &folio->mapping->i_pages, folio->index); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; @@ -2754,9 +2757,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); if (folio_ref_freeze(folio, 1 + extra_pins)) { - if (!list_empty(page_deferred_list(&folio->page))) { + if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; - list_del(page_deferred_list(&folio->page)); + list_del(&folio->_deferred_list); } spin_unlock(&ds_queue->split_queue_lock); if (mapping) { @@ -2800,52 +2803,53 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) void free_transhuge_page(struct page *page) { - struct deferred_split *ds_queue = get_deferred_split_queue(page); + struct folio *folio = (struct folio *)page; + struct deferred_split *ds_queue = get_deferred_split_queue(folio); unsigned long flags; spin_lock_irqsave(&ds_queue->split_queue_lock, flags); - if (!list_empty(page_deferred_list(page))) { + if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; - list_del(page_deferred_list(page)); + list_del(&folio->_deferred_list); } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); free_compound_page(page); } -void deferred_split_huge_page(struct page *page) +void deferred_split_folio(struct folio *folio) { - struct deferred_split *ds_queue = get_deferred_split_queue(page); + struct deferred_split *ds_queue = get_deferred_split_queue(folio); #ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = page_memcg(compound_head(page)); + struct mem_cgroup *memcg = folio_memcg(folio); #endif unsigned long flags; - VM_BUG_ON_PAGE(!PageTransHuge(page), page); + VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); /* * The try_to_unmap() in page reclaim path might reach here too, * this may cause a race condition to corrupt deferred split queue. - * And, if page reclaim is already handling the same page, it is + * And, if page reclaim is already handling the same folio, it is * unnecessary to handle it again in shrinker. * - * Check PageSwapCache to determine if the page is being - * handled by page reclaim since THP swap would add the page into + * Check the swapcache flag to determine if the folio is being + * handled by page reclaim since THP swap would add the folio into * swap cache before calling try_to_unmap(). */ - if (PageSwapCache(page)) + if (folio_test_swapcache(folio)) return; - if (!list_empty(page_deferred_list(page))) + if (!list_empty(&folio->_deferred_list)) return; spin_lock_irqsave(&ds_queue->split_queue_lock, flags); - if (list_empty(page_deferred_list(page))) { + if (list_empty(&folio->_deferred_list)) { count_vm_event(THP_DEFERRED_SPLIT_PAGE); - list_add_tail(page_deferred_list(page), &ds_queue->split_queue); + list_add_tail(&folio->_deferred_list, &ds_queue->split_queue); ds_queue->split_queue_len++; #ifdef CONFIG_MEMCG if (memcg) - set_shrinker_bit(memcg, page_to_nid(page), + set_shrinker_bit(memcg, folio_nid(folio), deferred_split_shrinker.id); #endif } @@ -2871,8 +2875,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, struct pglist_data *pgdata = NODE_DATA(sc->nid); struct deferred_split *ds_queue = &pgdata->deferred_split_queue; unsigned long flags; - LIST_HEAD(list), *pos, *next; - struct page *page; + LIST_HEAD(list); + struct folio *folio, *next; int split = 0; #ifdef CONFIG_MEMCG @@ -2882,14 +2886,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, spin_lock_irqsave(&ds_queue->split_queue_lock, flags); /* Take pin on all head pages to avoid freeing them under us */ - list_for_each_safe(pos, next, &ds_queue->split_queue) { - page = list_entry((void *)pos, struct page, deferred_list); - page = compound_head(page); - if (get_page_unless_zero(page)) { - list_move(page_deferred_list(page), &list); + list_for_each_entry_safe(folio, next, &ds_queue->split_queue, + _deferred_list) { + if (folio_try_get(folio)) { + list_move(&folio->_deferred_list, &list); } else { - /* We lost race with put_compound_page() */ - list_del_init(page_deferred_list(page)); + /* We lost race with folio_put() */ + list_del_init(&folio->_deferred_list); ds_queue->split_queue_len--; } if (!--sc->nr_to_scan) @@ -2897,16 +2900,15 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); - list_for_each_safe(pos, next, &list) { - page = list_entry((void *)pos, struct page, deferred_list); - if (!trylock_page(page)) + list_for_each_entry_safe(folio, next, &list, _deferred_list) { + if (!folio_trylock(folio)) goto next; /* split_huge_page() removes page from list on success */ - if (!split_huge_page(page)) + if (!split_folio(folio)) split++; - unlock_page(page); + folio_unlock(folio); next: - put_page(page); + folio_put(folio); } spin_lock_irqsave(&ds_queue->split_queue_lock, flags); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6fe65f14d33b0..3a01a9dbf445a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1282,32 +1282,33 @@ static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) folio_set_hugetlb_freed(folio); } -static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) +static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, + int nid) { - struct page *page; + struct folio *folio; bool pin = !!(current->flags & PF_MEMALLOC_PIN); lockdep_assert_held(&hugetlb_lock); - list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { - if (pin && !is_longterm_pinnable_page(page)) + list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) { + if (pin && !folio_is_longterm_pinnable(folio)) continue; - if (PageHWPoison(page)) + if (folio_test_hwpoison(folio)) continue; - list_move(&page->lru, &h->hugepage_activelist); - set_page_refcounted(page); - ClearHPageFreed(page); + list_move(&folio->lru, &h->hugepage_activelist); + folio_ref_unfreeze(folio, 1); + folio_clear_hugetlb_freed(folio); h->free_huge_pages--; h->free_huge_pages_node[nid]--; - return page; + return folio; } return NULL; } -static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, - nodemask_t *nmask) +static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nmask) { unsigned int cpuset_mems_cookie; struct zonelist *zonelist; @@ -1320,7 +1321,7 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { - struct page *page; + struct folio *folio; if (!cpuset_zone_allowed(zone, gfp_mask)) continue; @@ -1332,9 +1333,9 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, continue; node = zone_to_nid(zone); - page = dequeue_huge_page_node_exact(h, node); - if (page) - return page; + folio = dequeue_hugetlb_folio_node_exact(h, node); + if (folio) + return folio; } if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; @@ -1347,12 +1348,12 @@ static unsigned long available_huge_pages(struct hstate *h) return h->free_huge_pages - h->resv_huge_pages; } -static struct page *dequeue_huge_page_vma(struct hstate *h, +static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve, long chg) { - struct page *page = NULL; + struct folio *folio = NULL; struct mempolicy *mpol; gfp_t gfp_mask; nodemask_t *nodemask; @@ -1374,22 +1375,24 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); if (mpol_is_preferred_many(mpol)) { - page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, + nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } - if (!page) - page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + if (!folio) + folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, + nid, nodemask); - if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { - SetHPageRestoreReserve(page); + if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) { + folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } mpol_cond_put(mpol); - return page; + return folio; err: return NULL; @@ -1474,9 +1477,9 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, int nr_pages = 1 << order; struct page *p; - atomic_set(folio_mapcount_ptr(folio), 0); - atomic_set(folio_subpages_mapcount_ptr(folio), 0); - atomic_set(folio_pincount_ptr(folio), 0); + atomic_set(&folio->_entire_mapcount, 0); + atomic_set(&folio->_nr_pages_mapped, 0); + atomic_set(&folio->_pincount, 0); for (i = 1; i < nr_pages; i++) { p = folio_page(folio, i); @@ -1698,10 +1701,10 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio, enqueue_hugetlb_folio(h, folio); } -static void __update_and_free_page(struct hstate *h, struct page *page) +static void __update_and_free_hugetlb_folio(struct hstate *h, + struct folio *folio) { int i; - struct folio *folio = page_folio(page); struct page *subpage; if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) @@ -1714,7 +1717,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; - if (hugetlb_vmemmap_restore(h, page)) { + if (hugetlb_vmemmap_restore(h, &folio->page)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1731,7 +1734,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) * which makes any healthy subpages reusable. */ if (unlikely(folio_test_hwpoison(folio))) - hugetlb_clear_page_hwpoison(&folio->page); + folio_clear_hugetlb_hwpoison(folio); for (i = 0; i < pages_per_huge_page(h); i++) { subpage = folio_page(folio, i); @@ -1750,7 +1753,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) destroy_compound_gigantic_folio(folio, huge_page_order(h)); free_gigantic_folio(folio, huge_page_order(h)); } else { - __free_pages(page, huge_page_order(h)); + __free_pages(&folio->page, huge_page_order(h)); } } @@ -1790,7 +1793,7 @@ static void free_hpage_workfn(struct work_struct *work) */ h = size_to_hstate(page_size(page)); - __update_and_free_page(h, page); + __update_and_free_hugetlb_folio(h, page_folio(page)); cond_resched(); } @@ -1807,7 +1810,7 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, bool atomic) { if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) { - __update_and_free_page(h, &folio->page); + __update_and_free_hugetlb_folio(h, folio); return; } @@ -1996,9 +1999,9 @@ static bool __prep_compound_gigantic_folio(struct folio *folio, if (i != 0) set_compound_head(p, &folio->page); } - atomic_set(folio_mapcount_ptr(folio), -1); - atomic_set(folio_subpages_mapcount_ptr(folio), 0); - atomic_set(folio_pincount_ptr(folio), 0); + atomic_set(&folio->_entire_mapcount, -1); + atomic_set(&folio->_nr_pages_mapped, 0); + atomic_set(&folio->_pincount, 0); return true; out_error: @@ -2038,11 +2041,12 @@ static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, */ int PageHuge(struct page *page) { + struct folio *folio; + if (!PageCompound(page)) return 0; - - page = compound_head(page); - return page[1].compound_dtor == HUGETLB_PAGE_DTOR; + folio = page_folio(page); + return folio->_folio_dtor == HUGETLB_PAGE_DTOR; } EXPORT_SYMBOL_GPL(PageHuge); @@ -2052,10 +2056,11 @@ EXPORT_SYMBOL_GPL(PageHuge); */ int PageHeadHuge(struct page *page_head) { - if (!PageHead(page_head)) + struct folio *folio = (struct folio *)page_head; + if (!folio_test_large(folio)) return 0; - return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR; + return folio->_folio_dtor == HUGETLB_PAGE_DTOR; } EXPORT_SYMBOL_GPL(PageHeadHuge); @@ -2373,8 +2378,8 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) /* * Allocates a fresh surplus page from the page allocator. */ -static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nmask) +static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio = NULL; @@ -2411,10 +2416,10 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, out_unlock: spin_unlock_irq(&hugetlb_lock); - return &folio->page; + return folio; } -static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio; @@ -2434,17 +2439,17 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, */ folio_set_hugetlb_temporary(folio); - return &folio->page; + return folio; } /* * Use the VMA's mpolicy to allocate a huge page from the buddy. */ static -struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, +struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct page *page = NULL; + struct folio *folio = NULL; struct mempolicy *mpol; gfp_t gfp_mask = htlb_alloc_mask(h); int nid; @@ -2455,53 +2460,54 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, gfp_t gfp = gfp_mask | __GFP_NOWARN; gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = alloc_surplus_huge_page(h, gfp, nid, nodemask); + folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } - if (!page) - page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); + if (!folio) + folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); - return page; + return folio; } -/* page migration callback function */ -struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, +/* folio migration callback function */ +struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { spin_lock_irq(&hugetlb_lock); if (available_huge_pages(h)) { - struct page *page; + struct folio *folio; - page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); - if (page) { + folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, + preferred_nid, nmask); + if (folio) { spin_unlock_irq(&hugetlb_lock); - return page; + return folio; } } spin_unlock_irq(&hugetlb_lock); - return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); + return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } /* mempolicy aware migration callback */ -struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct mempolicy *mpol; nodemask_t *nodemask; - struct page *page; + struct folio *folio; gfp_t gfp_mask; int node; gfp_mask = htlb_alloc_mask(h); node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask); + folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); mpol_cond_put(mpol); - return page; + return folio; } /* @@ -2512,6 +2518,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) __must_hold(&hugetlb_lock) { LIST_HEAD(surplus_list); + struct folio *folio; struct page *page, *tmp; int ret; long i; @@ -2531,13 +2538,13 @@ static int gather_surplus_pages(struct hstate *h, long delta) retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), + folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), NUMA_NO_NODE, NULL); - if (!page) { + if (!folio) { alloc_ok = false; break; } - list_add(&page->lru, &surplus_list); + list_add(&folio->lru, &surplus_list); cond_resched(); } allocated += i; @@ -2791,14 +2798,14 @@ static long vma_del_reservation(struct hstate *h, /* * This routine is called to restore reservation information on error paths. - * It should ONLY be called for pages allocated via alloc_huge_page(), and - * the hugetlb mutex should remain held when calling this routine. + * It should ONLY be called for folios allocated via alloc_hugetlb_folio(), + * and the hugetlb mutex should remain held when calling this routine. * * It handles two specific cases: - * 1) A reservation was in place and the page consumed the reservation. - * HPageRestoreReserve is set in the page. - * 2) No reservation was in place for the page, so HPageRestoreReserve is - * not set. However, alloc_huge_page always updates the reserve map. + * 1) A reservation was in place and the folio consumed the reservation. + * hugetlb_restore_reserve is set in the folio. + * 2) No reservation was in place for the page, so hugetlb_restore_reserve is + * not set. However, alloc_hugetlb_folio always updates the reserve map. * * In case 1, free_huge_page later in the error path will increment the * global reserve count. But, free_huge_page does not have enough context @@ -2807,27 +2814,27 @@ static long vma_del_reservation(struct hstate *h, * reserve count adjustments to be made by free_huge_page. Make sure the * reserve map indicates there is a reservation present. * - * In case 2, simply undo reserve map modifications done by alloc_huge_page. + * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio. */ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, struct page *page) + unsigned long address, struct folio *folio) { long rc = vma_needs_reservation(h, vma, address); - if (HPageRestoreReserve(page)) { + if (folio_test_hugetlb_restore_reserve(folio)) { if (unlikely(rc < 0)) /* * Rare out of memory condition in reserve map - * manipulation. Clear HPageRestoreReserve so that - * global reserve count will not be incremented + * manipulation. Clear hugetlb_restore_reserve so + * that global reserve count will not be incremented * by free_huge_page. This will make it appear - * as though the reservation for this page was + * as though the reservation for this folio was * consumed. This may prevent the task from - * faulting in the page at a later time. This + * faulting in the folio at a later time. This * is better than inconsistent global huge page * accounting of reserve counts. */ - ClearHPageRestoreReserve(page); + folio_clear_hugetlb_restore_reserve(folio); else if (rc) (void)vma_add_reservation(h, vma, address); else @@ -2836,9 +2843,9 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, if (!rc) { /* * This indicates there is an entry in the reserve map - * not added by alloc_huge_page. We know it was added - * before the alloc_huge_page call, otherwise - * HPageRestoreReserve would be set on the page. + * not added by alloc_hugetlb_folio. We know it was added + * before the alloc_hugetlb_folio call, otherwise + * hugetlb_restore_reserve would be set on the folio. * Remove the entry so that a subsequent allocation * does not consume a reservation. */ @@ -2847,12 +2854,12 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, /* * VERY rare out of memory condition. Since * we can not delete the entry, set - * HPageRestoreReserve so that the reserve - * count will be incremented when the page + * hugetlb_restore_reserve so that the reserve + * count will be incremented when the folio * is freed. This reserve will be consumed * on a subsequent allocation. */ - SetHPageRestoreReserve(page); + folio_set_hugetlb_restore_reserve(folio); } else if (rc < 0) { /* * Rare out of memory condition from @@ -2868,12 +2875,12 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, /* * For private mappings, no entry indicates * a reservation is present. Since we can - * not add an entry, set SetHPageRestoreReserve - * on the page so reserve count will be + * not add an entry, set hugetlb_restore_reserve + * on the folio so reserve count will be * incremented when freed. This reserve will * be consumed on a subsequent allocation. */ - SetHPageRestoreReserve(page); + folio_set_hugetlb_restore_reserve(folio); } else /* * No reservation present, do nothing @@ -2923,7 +2930,7 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - ret = isolate_hugetlb(&old_folio->page, list); + ret = isolate_hugetlb(old_folio, list); spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!folio_test_hugetlb_freed(old_folio)) { @@ -2998,7 +3005,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (folio_ref_count(folio) && !isolate_hugetlb(&folio->page, list)) + if (folio_ref_count(folio) && !isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); @@ -3006,17 +3013,16 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) return ret; } -struct page *alloc_huge_page(struct vm_area_struct *vma, +struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); - struct page *page; struct folio *folio; long map_chg, map_commit; long gbl_chg; int ret, idx; - struct hugetlb_cgroup *h_cg; + struct hugetlb_cgroup *h_cg = NULL; bool deferred_reserve; idx = hstate_index(h); @@ -3075,34 +3081,34 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, * from the global free pool (global change). gbl_chg == 0 indicates * a reservation exists for the allocation. */ - page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); - if (!page) { + folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg); + if (!folio) { spin_unlock_irq(&hugetlb_lock); - page = alloc_buddy_huge_page_with_mpol(h, vma, addr); - if (!page) + folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); + if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { - SetHPageRestoreReserve(page); + folio_set_hugetlb_restore_reserve(folio); h->resv_huge_pages--; } - list_add(&page->lru, &h->hugepage_activelist); - set_page_refcounted(page); + list_add(&folio->lru, &h->hugepage_activelist); + folio_ref_unfreeze(folio, 1); /* Fall through */ } - folio = page_folio(page); - hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); + + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. */ if (deferred_reserve) { hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), - h_cg, page); + h_cg, folio); } spin_unlock_irq(&hugetlb_lock); - hugetlb_set_page_subpool(page, spool); + hugetlb_set_folio_subpool(folio, spool); map_commit = vma_commit_reservation(h, vma, addr); if (unlikely(map_chg > map_commit)) { @@ -3123,7 +3129,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); } - return page; + return folio; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); @@ -3490,7 +3496,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * First take pages out of surplus state. Then make up the * remaining difference by allocating fresh huge pages. * - * We might race with alloc_surplus_huge_page() here and be unable + * We might race with alloc_surplus_hugetlb_folio() here and be unable * to convert a surplus huge page to a normal huge page. That is * not critical, though, it just means the overall size of the * pool might be one hugepage larger than it needs to be, but @@ -3533,7 +3539,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * By placing pages into the surplus state independent of the * overcommit value, we are allowing the surplus pool size to * exceed overcommit. There are few sane options here. Since - * alloc_surplus_huge_page() is checking the global counter, + * alloc_surplus_hugetlb_folio() is checking the global counter, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. @@ -3572,12 +3578,12 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, return 0; } -static int demote_free_huge_page(struct hstate *h, struct page *page) +static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) { - int i, nid = page_to_nid(page); + int i, nid = folio_nid(folio); struct hstate *target_hstate; - struct folio *folio = page_folio(page); struct page *subpage; + struct folio *inner_folio; int rc = 0; target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); @@ -3585,18 +3591,18 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) remove_hugetlb_folio_for_demote(h, folio, false); spin_unlock_irq(&hugetlb_lock); - rc = hugetlb_vmemmap_restore(h, page); + rc = hugetlb_vmemmap_restore(h, &folio->page); if (rc) { - /* Allocation of vmemmmap failed, we can not demote page */ + /* Allocation of vmemmmap failed, we can not demote folio */ spin_lock_irq(&hugetlb_lock); - set_page_refcounted(page); - add_hugetlb_folio(h, page_folio(page), false); + folio_ref_unfreeze(folio, 1); + add_hugetlb_folio(h, folio, false); return rc; } /* * Use destroy_compound_hugetlb_folio_for_demote for all huge page - * sizes as it will not ref count pages. + * sizes as it will not ref count folios. */ destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h)); @@ -3611,15 +3617,15 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) mutex_lock(&target_hstate->resize_lock); for (i = 0; i < pages_per_huge_page(h); i += pages_per_huge_page(target_hstate)) { - subpage = nth_page(page, i); - folio = page_folio(subpage); + subpage = folio_page(folio, i); + inner_folio = page_folio(subpage); if (hstate_is_gigantic(target_hstate)) - prep_compound_gigantic_folio_for_demote(folio, + prep_compound_gigantic_folio_for_demote(inner_folio, target_hstate->order); else prep_compound_page(subpage, target_hstate->order); - set_page_private(subpage, 0); - prep_new_hugetlb_folio(target_hstate, folio, nid); + folio_change_private(inner_folio, NULL); + prep_new_hugetlb_folio(target_hstate, inner_folio, nid); free_huge_page(subpage); } mutex_unlock(&target_hstate->resize_lock); @@ -3641,7 +3647,7 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) __must_hold(&hugetlb_lock) { int nr_nodes, node; - struct page *page; + struct folio *folio; lockdep_assert_held(&hugetlb_lock); @@ -3652,11 +3658,10 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) } for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { - list_for_each_entry(page, &h->hugepage_freelists[node], lru) { - if (PageHWPoison(page)) + list_for_each_entry(folio, &h->hugepage_freelists[node], lru) { + if (folio_test_hwpoison(folio)) continue; - - return demote_free_huge_page(h, page); + return demote_free_hugetlb_folio(h, folio); } } @@ -4940,14 +4945,14 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte) } static void -hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, - struct page *new_page) +hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, + struct folio *new_folio) { - __SetPageUptodate(new_page); - hugepage_add_new_anon_rmap(new_page, vma, addr); - set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); + __folio_mark_uptodate(new_folio); + hugepage_add_new_anon_rmap(new_folio, vma, addr); + set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, &new_folio->page, 1)); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); - SetHPageMigratable(new_page); + folio_set_hugetlb_migratable(new_folio); } int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, @@ -4966,7 +4971,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, int ret = 0; if (cow) { - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src, src_vma->vm_start, src_vma->vm_end); mmu_notifier_invalidate_range_start(&range); @@ -5074,34 +5079,34 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, } else if (page_try_dup_anon_rmap(ptepage, true, src_vma)) { pte_t src_pte_old = entry; - struct page *new; + struct folio *new_folio; spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ - new = alloc_huge_page(dst_vma, addr, 1); - if (IS_ERR(new)) { + new_folio = alloc_hugetlb_folio(dst_vma, addr, 1); + if (IS_ERR(new_folio)) { put_page(ptepage); - ret = PTR_ERR(new); + ret = PTR_ERR(new_folio); break; } - copy_user_huge_page(new, ptepage, addr, dst_vma, + copy_user_huge_page(&new_folio->page, ptepage, addr, dst_vma, npages); put_page(ptepage); - /* Install the new huge page if src pte stable */ + /* Install the new hugetlb folio if src pte stable */ dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); if (!pte_same(src_pte_old, entry)) { restore_reserve_on_error(h, dst_vma, addr, - new); - put_page(new); + new_folio); + folio_put(new_folio); /* huge_ptep of dst_pte won't change as in child */ goto again; } - hugetlb_install_page(dst_vma, dst_pte, addr, new); + hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio); spin_unlock(src_ptl); spin_unlock(dst_ptl); continue; @@ -5177,7 +5182,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, struct mmu_notifier_range range; bool shared_pmd = false; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr, old_end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); /* @@ -5391,7 +5396,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, struct mmu_notifier_range range; struct mmu_gather tlb; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); mmu_notifier_invalidate_range_start(&range); @@ -5467,12 +5472,13 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, unsigned int flags, - struct page *pagecache_page, spinlock_t *ptl) + struct folio *pagecache_folio, spinlock_t *ptl) { const bool unshare = flags & FAULT_FLAG_UNSHARE; pte_t pte; struct hstate *h = hstate_vma(vma); - struct page *old_page, *new_page; + struct page *old_page; + struct folio *new_folio; int outside_reserve = 0; vm_fault_t ret = 0; unsigned long haddr = address & huge_page_mask(h); @@ -5523,7 +5529,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * of the full address range. */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && - old_page != pagecache_page) + page_folio(old_page) != pagecache_folio) outside_reserve = 1; get_page(old_page); @@ -5533,9 +5539,9 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * be acquired again before returning to the caller, as expected. */ spin_unlock(ptl); - new_page = alloc_huge_page(vma, haddr, outside_reserve); + new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve); - if (IS_ERR(new_page)) { + if (IS_ERR(new_folio)) { /* * If a process owning a MAP_PRIVATE mapping fails to COW, * it is due to references held by a child and an insufficient @@ -5580,7 +5586,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } - ret = vmf_error(PTR_ERR(new_page)); + ret = vmf_error(PTR_ERR(new_folio)); goto out_release_old; } @@ -5593,11 +5599,11 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, goto out_release_all; } - copy_user_huge_page(new_page, old_page, address, vma, + copy_user_huge_page(&new_folio->page, old_page, address, vma, pages_per_huge_page(h)); - __SetPageUptodate(new_page); + __folio_mark_uptodate(new_folio); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, haddr + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); @@ -5612,12 +5618,12 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); page_remove_rmap(old_page, vma, true); - hugepage_add_new_anon_rmap(new_page, vma, haddr); + hugepage_add_new_anon_rmap(new_folio, vma, haddr); set_huge_pte_at(mm, haddr, ptep, - make_huge_pte(vma, new_page, !unshare)); - SetHPageMigratable(new_page); + make_huge_pte(vma, &new_folio->page, !unshare)); + folio_set_hugetlb_migratable(new_folio); /* Make the old page be freed below */ - new_page = old_page; + new_folio = page_folio(old_page); } spin_unlock(ptl); mmu_notifier_invalidate_range_end(&range); @@ -5626,9 +5632,9 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * No restore in case of successful pagetable update (Break COW or * unshare) */ - if (new_page != old_page) - restore_reserve_on_error(h, vma, haddr, new_page); - put_page(new_page); + if (new_folio != page_folio(old_page)) + restore_reserve_on_error(h, vma, haddr, new_folio); + folio_put(new_folio); out_release_old: put_page(old_page); @@ -5645,23 +5651,20 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, static bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { - struct address_space *mapping; - pgoff_t idx; - struct page *page; + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t idx = vma_hugecache_offset(h, vma, address); + bool present; - mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, address); + rcu_read_lock(); + present = page_cache_next_miss(mapping, idx, 1) != idx; + rcu_read_unlock(); - page = find_get_page(mapping, idx); - if (page) - put_page(page); - return page != NULL; + return present; } -int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct hstate *h = hstate_inode(inode); int err; @@ -5673,7 +5676,7 @@ int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, __folio_clear_locked(folio); return err; } - ClearHPageRestoreReserve(page); + folio_clear_hugetlb_restore_reserve(folio); /* * mark folio dirty so that it will not be removed from cache/file @@ -5749,11 +5752,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, vm_fault_t ret = VM_FAULT_SIGBUS; int anon_rmap = 0; unsigned long size; - struct page *page; + struct folio *folio; pte_t new_pte; spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); - bool new_page, new_pagecache_page = false; + bool new_folio, new_pagecache_folio = false; u32 hash = hugetlb_fault_mutex_hash(mapping, idx); /* @@ -5772,9 +5775,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * Use page lock to guard against racing truncation * before we get page_table_lock. */ - new_page = false; - page = find_lock_page(mapping, idx); - if (!page) { + new_folio = false; + folio = filemap_lock_folio(mapping, idx); + if (!folio) { size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto out; @@ -5807,8 +5810,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, VM_UFFD_MISSING); } - page = alloc_huge_page(vma, haddr, 0); - if (IS_ERR(page)) { + folio = alloc_hugetlb_folio(vma, haddr, 0); + if (IS_ERR(folio)) { /* * Returning error will result in faulting task being * sent SIGBUS. The hugetlb fault mutex prevents two @@ -5822,17 +5825,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * sure there really is no pte entry. */ if (hugetlb_pte_stable(h, mm, ptep, old_pte)) - ret = vmf_error(PTR_ERR(page)); + ret = vmf_error(PTR_ERR(folio)); else ret = 0; goto out; } - clear_huge_page(page, address, pages_per_huge_page(h)); - __SetPageUptodate(page); - new_page = true; + clear_huge_page(&folio->page, address, pages_per_huge_page(h)); + __folio_mark_uptodate(folio); + new_folio = true; if (vma->vm_flags & VM_MAYSHARE) { - int err = hugetlb_add_to_page_cache(page, mapping, idx); + int err = hugetlb_add_to_page_cache(folio, mapping, idx); if (err) { /* * err can't be -EEXIST which implies someone @@ -5841,13 +5844,13 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * to the page cache. So it's safe to call * restore_reserve_on_error() here. */ - restore_reserve_on_error(h, vma, haddr, page); - put_page(page); + restore_reserve_on_error(h, vma, haddr, folio); + folio_put(folio); goto out; } - new_pagecache_page = true; + new_pagecache_folio = true; } else { - lock_page(page); + folio_lock(folio); if (unlikely(anon_vma_prepare(vma))) { ret = VM_FAULT_OOM; goto backout_unlocked; @@ -5860,7 +5863,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * don't have hwpoisoned swap entry for errored virtual address. * So we need to block hugepage fault by PG_hwpoison bit check. */ - if (unlikely(PageHWPoison(page))) { + if (unlikely(folio_test_hwpoison(folio))) { ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; @@ -5868,8 +5871,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, /* Check for page in userfault range. */ if (userfaultfd_minor(vma)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); /* See comment in userfaultfd_missing() block above */ if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { ret = 0; @@ -5903,10 +5906,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, goto backout; if (anon_rmap) - hugepage_add_new_anon_rmap(page, vma, haddr); + hugepage_add_new_anon_rmap(folio, vma, haddr); else - page_dup_file_rmap(page, true); - new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) + page_dup_file_rmap(&folio->page, true); + new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); /* * If this pte was previously wr-protected, keep it wr-protected even @@ -5919,20 +5922,20 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl); + ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl); } spin_unlock(ptl); /* - * Only set HPageMigratable in newly allocated pages. Existing pages - * found in the pagecache may not have HPageMigratableset if they have + * Only set hugetlb_migratable in newly allocated pages. Existing pages + * found in the pagecache may not have hugetlb_migratable if they have * been isolated for migration. */ - if (new_page) - SetHPageMigratable(page); + if (new_folio) + folio_set_hugetlb_migratable(folio); - unlock_page(page); + folio_unlock(folio); out: hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); @@ -5941,11 +5944,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, backout: spin_unlock(ptl); backout_unlocked: - if (new_page && !new_pagecache_page) - restore_reserve_on_error(h, vma, haddr, page); + if (new_folio && !new_pagecache_folio) + restore_reserve_on_error(h, vma, haddr, folio); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out; } @@ -5982,7 +5985,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, u32 hash; pgoff_t idx; struct page *page = NULL; - struct page *pagecache_page = NULL; + struct folio *pagecache_folio = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; @@ -6064,7 +6067,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, haddr); - pagecache_page = find_lock_page(mapping, idx); + pagecache_folio = filemap_lock_folio(mapping, idx); } ptl = huge_pte_lock(h, mm, ptep); @@ -6084,9 +6087,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, }; spin_unlock(ptl); - if (pagecache_page) { - unlock_page(pagecache_page); - put_page(pagecache_page); + if (pagecache_folio) { + folio_unlock(pagecache_folio); + folio_put(pagecache_folio); } hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); @@ -6095,11 +6098,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* * hugetlb_wp() requires page locks of pte_page(entry) and - * pagecache_page, so here we need take the former one - * when page != pagecache_page or !pagecache_page. + * pagecache_folio, so here we need take the former one + * when page != pagecache_folio or !pagecache_folio. */ page = pte_page(entry); - if (page != pagecache_page) + if (page_folio(page) != pagecache_folio) if (!trylock_page(page)) { need_wait_lock = 1; goto out_ptl; @@ -6110,7 +6113,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!huge_pte_write(entry)) { ret = hugetlb_wp(mm, vma, address, ptep, flags, - pagecache_page, ptl); + pagecache_folio, ptl); goto out_put_page; } else if (likely(flags & FAULT_FLAG_WRITE)) { entry = huge_pte_mkdirty(entry); @@ -6121,15 +6124,15 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, haddr, ptep); out_put_page: - if (page != pagecache_page) + if (page_folio(page) != pagecache_folio) unlock_page(page); put_page(page); out_ptl: spin_unlock(ptl); - if (pagecache_page) { - unlock_page(pagecache_page); - put_page(pagecache_page); + if (pagecache_folio) { + folio_unlock(pagecache_folio); + folio_put(pagecache_folio); } out_mutex: hugetlb_vma_unlock_read(vma); @@ -6169,16 +6172,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t _dst_pte; spinlock_t *ptl; int ret = -ENOMEM; - struct page *page; + struct folio *folio; int writable; - bool page_in_pagecache = false; + bool folio_in_pagecache = false; if (is_continue) { ret = -EFAULT; - page = find_lock_page(mapping, idx); - if (!page) + folio = filemap_lock_folio(mapping, idx); + if (!folio) goto out; - page_in_pagecache = true; + folio_in_pagecache = true; } else if (!*pagep) { /* If a page already exists, then it's UFFDIO_COPY for * a non-missing case. Return -EEXIST. @@ -6189,34 +6192,34 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, goto out; } - page = alloc_huge_page(dst_vma, dst_addr, 0); - if (IS_ERR(page)) { + folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + if (IS_ERR(folio)) { ret = -ENOMEM; goto out; } - ret = copy_huge_page_from_user(page, + ret = copy_huge_page_from_user(&folio->page, (const void __user *) src_addr, pages_per_huge_page(h), false); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { ret = -ENOENT; - /* Free the allocated page which may have + /* Free the allocated folio which may have * consumed a reservation. */ - restore_reserve_on_error(h, dst_vma, dst_addr, page); - put_page(page); + restore_reserve_on_error(h, dst_vma, dst_addr, folio); + folio_put(folio); - /* Allocate a temporary page to hold the copied + /* Allocate a temporary folio to hold the copied * contents. */ - page = alloc_huge_page_vma(h, dst_vma, dst_addr); - if (!page) { + folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr); + if (!folio) { ret = -ENOMEM; goto out; } - *pagep = page; + *pagep = &folio->page; /* Set the outparam pagep and return to the caller to * copy the contents outside the lock. Don't free the * page. @@ -6232,25 +6235,25 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, goto out; } - page = alloc_huge_page(dst_vma, dst_addr, 0); - if (IS_ERR(page)) { + folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + if (IS_ERR(folio)) { put_page(*pagep); ret = -ENOMEM; *pagep = NULL; goto out; } - copy_user_huge_page(page, *pagep, dst_addr, dst_vma, + copy_user_huge_page(&folio->page, *pagep, dst_addr, dst_vma, pages_per_huge_page(h)); put_page(*pagep); *pagep = NULL; } /* - * The memory barrier inside __SetPageUptodate makes sure that + * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before * the set_pte_at() write. */ - __SetPageUptodate(page); + __folio_mark_uptodate(folio); /* Add shared, newly allocated pages to the page cache. */ if (vm_shared && !is_continue) { @@ -6265,16 +6268,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, * hugetlb_fault_mutex_table that here must be hold by * the caller. */ - ret = hugetlb_add_to_page_cache(page, mapping, idx); + ret = hugetlb_add_to_page_cache(folio, mapping, idx); if (ret) goto out_release_nounlock; - page_in_pagecache = true; + folio_in_pagecache = true; } ptl = huge_pte_lock(h, dst_mm, dst_pte); ret = -EIO; - if (PageHWPoison(page)) + if (folio_test_hwpoison(folio)) goto out_release_unlock; /* @@ -6286,10 +6289,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) goto out_release_unlock; - if (page_in_pagecache) - page_dup_file_rmap(page, true); + if (folio_in_pagecache) + page_dup_file_rmap(&folio->page, true); else - hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); + hugepage_add_new_anon_rmap(folio, dst_vma, dst_addr); /* * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY @@ -6300,7 +6303,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, else writable = dst_vma->vm_flags & VM_WRITE; - _dst_pte = make_huge_pte(dst_vma, page, writable); + _dst_pte = make_huge_pte(dst_vma, &folio->page, writable); /* * Always mark UFFDIO_COPY page dirty; note that this may not be * extremely important for hugetlbfs for now since swapping is not @@ -6322,20 +6325,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, spin_unlock(ptl); if (!is_continue) - SetHPageMigratable(page); + folio_set_hugetlb_migratable(folio); if (vm_shared || is_continue) - unlock_page(page); + folio_unlock(folio); ret = 0; out: return ret; out_release_unlock: spin_unlock(ptl); if (vm_shared || is_continue) - unlock_page(page); + folio_unlock(folio); out_release_nounlock: - if (!page_in_pagecache) - restore_reserve_on_error(h, dst_vma, dst_addr, page); - put_page(page); + if (!folio_in_pagecache) + restore_reserve_on_error(h, dst_vma, dst_addr, folio); + folio_put(folio); goto out; } #endif /* CONFIG_USERFAULTFD */ @@ -6637,7 +6640,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, * range if PMD sharing is possible. */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, - 0, vma, mm, start, end); + 0, mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); BUG_ON(address >= end); @@ -6867,7 +6870,7 @@ bool hugetlb_reserve_pages(struct inode *inode, /* * pages in this range were added to the reserve * map between region_chg and region_add. This - * indicates a race with alloc_huge_page. Adjust + * indicates a race with alloc_hugetlb_folio. Adjust * the subpool and reserve counts modified above * based on the difference. */ @@ -6967,8 +6970,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, unsigned long s_end = sbase + PUD_SIZE; /* Allow segments to share if only one is marked locked */ - unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; - unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK; + unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK; + unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK; /* * match the virtual addresses, permission and the alignment of the @@ -7248,36 +7251,36 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) * These functions are overwritable if your architecture needs its own * behavior. */ -int isolate_hugetlb(struct page *page, struct list_head *list) +int isolate_hugetlb(struct folio *folio, struct list_head *list) { int ret = 0; spin_lock_irq(&hugetlb_lock); - if (!PageHeadHuge(page) || - !HPageMigratable(page) || - !get_page_unless_zero(page)) { + if (!folio_test_hugetlb(folio) || + !folio_test_hugetlb_migratable(folio) || + !folio_try_get(folio)) { ret = -EBUSY; goto unlock; } - ClearHPageMigratable(page); - list_move_tail(&page->lru, list); + folio_clear_hugetlb_migratable(folio); + list_move_tail(&folio->lru, list); unlock: spin_unlock_irq(&hugetlb_lock); return ret; } -int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison) +int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison) { int ret = 0; *hugetlb = false; spin_lock_irq(&hugetlb_lock); - if (PageHeadHuge(page)) { + if (folio_test_hugetlb(folio)) { *hugetlb = true; - if (HPageFreed(page)) + if (folio_test_hugetlb_freed(folio)) ret = 0; - else if (HPageMigratable(page) || unpoison) - ret = get_page_unless_zero(page); + else if (folio_test_hugetlb_migratable(folio) || unpoison) + ret = folio_try_get(folio); else ret = -EBUSY; } @@ -7296,13 +7299,13 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return ret; } -void putback_active_hugepage(struct page *page) +void folio_putback_active_hugetlb(struct folio *folio) { spin_lock_irq(&hugetlb_lock); - SetHPageMigratable(page); - list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); + folio_set_hugetlb_migratable(folio); + list_move_tail(&folio->lru, &(folio_hstate(folio))->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); - put_page(page); + folio_put(folio); } void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) @@ -7368,7 +7371,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, * No need to call adjust_range_if_pmd_sharing_possible(), because * we have already done the PUD_SIZE alignment. */ - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start, end); mmu_notifier_invalidate_range_start(&range); hugetlb_vma_lock_write(vma); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index d9e4425d81ac4..dedd2edb076ec 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -331,19 +331,15 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { - struct folio *folio = page_folio(page); - __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false); } void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { - struct folio *folio = page_folio(page); - __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true); } diff --git a/mm/internal.h b/mm/internal.h index 1d6f4e1685105..90bb2078444c1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -24,7 +24,7 @@ struct folio_batch; #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC|__GFP_NOLOCKDEP) + __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) @@ -52,6 +52,24 @@ struct folio_batch; void page_writeback_init(void); +/* + * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages, + * its nr_pages_mapped would be 0x400000: choose the COMPOUND_MAPPED bit + * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently + * leaves nr_pages_mapped at 0, but avoid surprise if it participates later. + */ +#define COMPOUND_MAPPED 0x800000 +#define FOLIO_PAGES_MAPPED (COMPOUND_MAPPED - 1) + +/* + * How many individual pages have an elevated _mapcount. Excludes + * the folio's entire_mapcount. + */ +static inline int folio_nr_pages_mapped(struct folio *folio) +{ + return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED; +} + static inline void *folio_raw_mapping(struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; @@ -141,17 +159,6 @@ static inline bool folio_evictable(struct folio *folio) return ret; } -static inline bool page_evictable(struct page *page) -{ - bool ret; - - /* Prevent address_space of inode and swap cache from being freed */ - rcu_read_lock(); - ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); - rcu_read_unlock(); - return ret; -} - /* * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. @@ -441,7 +448,11 @@ struct compact_control { bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock contention */ - bool rescan; /* Rescanning the same pageblock */ + bool finish_pageblock; /* Scan the remainder of a pageblock. Used + * when there are potentially transient + * isolation or migration failures to + * ensure forward progress. + */ bool alloc_contig; /* alloc_contig_range allocation */ }; @@ -511,14 +522,13 @@ extern long faultin_vma_page_range(struct vm_area_struct *vma, extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, unsigned long len); /* - * mlock_vma_page() and munlock_vma_page(): + * mlock_vma_folio() and munlock_vma_folio(): * should be called with vma's mmap_lock held for read or write, * under page table lock for the pte/pmd being added or removed. * - * mlock is usually called at the end of page_add_*_rmap(), - * munlock at the end of page_remove_rmap(); but new anon - * pages are managed by lru_cache_add_inactive_or_unevictable() - * calling mlock_new_page(). + * mlock is usually called at the end of page_add_*_rmap(), munlock at + * the end of page_remove_rmap(); but new anon folios are managed by + * folio_add_lru_vma() calling mlock_new_folio(). * * @compound is used to include pmd mappings of THPs, but filter out * pte mappings of THPs, which cannot be consistently counted: a pte @@ -541,24 +551,19 @@ static inline void mlock_vma_folio(struct folio *folio, mlock_folio(folio); } -static inline void mlock_vma_page(struct page *page, - struct vm_area_struct *vma, bool compound) -{ - mlock_vma_folio(page_folio(page), vma, compound); -} - -void munlock_page(struct page *page); -static inline void munlock_vma_page(struct page *page, +void munlock_folio(struct folio *folio); +static inline void munlock_vma_folio(struct folio *folio, struct vm_area_struct *vma, bool compound) { if (unlikely(vma->vm_flags & VM_LOCKED) && - (compound || !PageTransCompound(page))) - munlock_page(page); + (compound || !folio_test_large(folio))) + munlock_folio(folio); } -void mlock_new_page(struct page *page); -bool need_mlock_page_drain(int cpu); -void mlock_page_drain_local(void); -void mlock_page_drain_remote(int cpu); + +void mlock_new_folio(struct folio *folio); +bool need_mlock_drain(int cpu); +void mlock_drain_local(void); +void mlock_drain_remote(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); @@ -643,14 +648,10 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, } #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } -static inline void mlock_vma_page(struct page *page, - struct vm_area_struct *vma, bool compound) { } -static inline void munlock_vma_page(struct page *page, - struct vm_area_struct *vma, bool compound) { } -static inline void mlock_new_page(struct page *page) { } -static inline bool need_mlock_page_drain(int cpu) { return false; } -static inline void mlock_page_drain_local(void) { } -static inline void mlock_page_drain_remote(int cpu) { } +static inline void mlock_new_folio(struct folio *folio) { } +static inline bool need_mlock_drain(int cpu) { return false; } +static inline void mlock_drain_local(void) { } +static inline void mlock_drain_remote(int cpu) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } @@ -754,8 +755,13 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_OOM ALLOC_NO_WATERMARKS #endif -#define ALLOC_HARDER 0x10 /* try to alloc harder */ -#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_NON_BLOCK 0x10 /* Caller cannot block. Allow access + * to 25% of the min watermark or + * 62.5% if __GFP_HIGH is set. + */ +#define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50% + * of the min watermark. + */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ #ifdef CONFIG_ZONE_DMA32 @@ -763,8 +769,12 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #else #define ALLOC_NOFRAGMENT 0x0 #endif +#define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ +/* Flags that allow allocations below the min watermark. */ +#define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) + enum ttu_flags; struct tlbflush_unmap_batch; @@ -867,4 +877,82 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) return !(vma->vm_flags & VM_SOFTDIRTY); } +/* + * VMA Iterator functions shared between nommu and mmap + */ +static inline int vma_iter_prealloc(struct vma_iterator *vmi) +{ + return mas_preallocate(&vmi->mas, GFP_KERNEL); +} + +static inline void vma_iter_clear(struct vma_iterator *vmi, + unsigned long start, unsigned long end) +{ + mas_set_range(&vmi->mas, start, end - 1); + mas_store_prealloc(&vmi->mas, NULL); +} + +static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) +{ + return mas_walk(&vmi->mas); +} + +/* Store a VMA with preallocated memory */ +static inline void vma_iter_store(struct vma_iterator *vmi, + struct vm_area_struct *vma) +{ + +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.index > vma->vm_start)) { + printk("%lu > %lu\n", vmi->mas.index, vma->vm_start); + printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); + printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); + mt_dump(vmi->mas.tree); + } + if (WARN_ON(vmi->mas.node != MAS_START && vmi->mas.last < vma->vm_start)) { + printk("%lu < %lu\n", vmi->mas.last, vma->vm_start); + printk("store of vma %lu-%lu", vma->vm_start, vma->vm_end); + printk("into slot %lu-%lu", vmi->mas.index, vmi->mas.last); + mt_dump(vmi->mas.tree); + } +#endif + + if (vmi->mas.node != MAS_START && + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) + vma_iter_invalidate(vmi); + + vmi->mas.index = vma->vm_start; + vmi->mas.last = vma->vm_end - 1; + mas_store_prealloc(&vmi->mas, vma); +} + +static inline int vma_iter_store_gfp(struct vma_iterator *vmi, + struct vm_area_struct *vma, gfp_t gfp) +{ + if (vmi->mas.node != MAS_START && + ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) + vma_iter_invalidate(vmi); + + vmi->mas.index = vma->vm_start; + vmi->mas.last = vma->vm_end - 1; + mas_store_gfp(&vmi->mas, vma, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + +/* + * VMA lock generalization + */ +struct vma_prepare { + struct vm_area_struct *vma; + struct vm_area_struct *adj_next; + struct file *file; + struct address_space *mapping; + struct anon_vma *anon_vma; + struct vm_area_struct *insert; + struct vm_area_struct *remove; + struct vm_area_struct *remove2; +}; #endif /* __MM_INTERNAL_H */ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 6b8e9c8485739..b376a5d055e55 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -43,7 +43,7 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc) unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); - return __stack_depot_save(entries, nr_entries, 0, flags, can_alloc); + return __stack_depot_save(entries, nr_entries, flags, can_alloc); } void kasan_set_track(struct kasan_track *track, gfp_t flags) @@ -246,6 +246,9 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) { + if (!kasan_arch_is_ready()) + return false; + if (ptr != page_address(virt_to_head_page(ptr))) { kasan_report_invalid_free(ptr, ip, KASAN_REPORT_INVALID_FREE); return true; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index b076f597a3781..e5eef670735ef 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -172,10 +172,8 @@ static __always_inline bool check_region_inline(unsigned long addr, if (unlikely(addr + size < addr)) return !kasan_report(addr, size, write, ret_ip); - if (unlikely((void *)addr < - kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { + if (unlikely(!addr_has_metadata((void *)addr))) return !kasan_report(addr, size, write, ret_ip); - } if (likely(!memory_is_poisoned(addr, size))) return true; @@ -191,7 +189,12 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write, bool kasan_byte_accessible(const void *addr) { - s8 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr)); + s8 shadow_byte; + + if (!kasan_arch_is_ready()) + return true; + + shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr)); return shadow_byte >= 0 && shadow_byte < KASAN_GRANULE_SIZE; } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index dcc2a88e81219..3231314e071f0 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -207,6 +207,7 @@ struct kasan_report_info { void *first_bad_addr; struct kmem_cache *cache; void *object; + size_t alloc_size; /* Filled in by the mode-specific reporting code. */ const char *bug_type; @@ -323,6 +324,7 @@ static inline bool addr_has_metadata(const void *addr) #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ void *kasan_find_first_bad_addr(void *addr, size_t size); +size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache); void kasan_complete_mode_report_info(struct kasan_report_info *info); void kasan_metadata_fetch_row(char *buffer, void *row); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 22598b20c7b75..e0492124e90a7 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -231,33 +231,46 @@ static inline struct page *addr_to_page(const void *addr) return NULL; } -static void describe_object_addr(const void *addr, struct kmem_cache *cache, - void *object) +static void describe_object_addr(const void *addr, struct kasan_report_info *info) { unsigned long access_addr = (unsigned long)addr; - unsigned long object_addr = (unsigned long)object; - const char *rel_type; + unsigned long object_addr = (unsigned long)info->object; + const char *rel_type, *region_state = ""; int rel_bytes; pr_err("The buggy address belongs to the object at %px\n" " which belongs to the cache %s of size %d\n", - object, cache->name, cache->object_size); + info->object, info->cache->name, info->cache->object_size); if (access_addr < object_addr) { rel_type = "to the left"; rel_bytes = object_addr - access_addr; - } else if (access_addr >= object_addr + cache->object_size) { + } else if (access_addr >= object_addr + info->alloc_size) { rel_type = "to the right"; - rel_bytes = access_addr - (object_addr + cache->object_size); + rel_bytes = access_addr - (object_addr + info->alloc_size); } else { rel_type = "inside"; rel_bytes = access_addr - object_addr; } + /* + * Tag-Based modes use the stack ring to infer the bug type, but the + * memory region state description is generated based on the metadata. + * Thus, defining the region state as below can contradict the metadata. + * Fixing this requires further improvements, so only infer the state + * for the Generic mode. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) { + if (strcmp(info->bug_type, "slab-out-of-bounds") == 0) + region_state = "allocated "; + else if (strcmp(info->bug_type, "slab-use-after-free") == 0) + region_state = "freed "; + } + pr_err("The buggy address is located %d bytes %s of\n" - " %d-byte region [%px, %px)\n", - rel_bytes, rel_type, cache->object_size, (void *)object_addr, - (void *)(object_addr + cache->object_size)); + " %s%lu-byte region [%px, %px)\n", + rel_bytes, rel_type, region_state, info->alloc_size, + (void *)object_addr, (void *)(object_addr + info->alloc_size)); } static void describe_object_stacks(struct kasan_report_info *info) @@ -279,7 +292,7 @@ static void describe_object(const void *addr, struct kasan_report_info *info) { if (kasan_stack_collection_enabled()) describe_object_stacks(info); - describe_object_addr(addr, info->cache, info->object); + describe_object_addr(addr, info); } static inline bool kernel_or_module_addr(const void *addr) @@ -436,6 +449,12 @@ static void complete_report_info(struct kasan_report_info *info) if (slab) { info->cache = slab->slab_cache; info->object = nearest_obj(info->cache, slab, addr); + + /* Try to determine allocation size based on the metadata. */ + info->alloc_size = kasan_get_alloc_size(info->object, info->cache); + /* Fallback to the object size if failed. */ + if (!info->alloc_size) + info->alloc_size = info->cache->object_size; } else info->cache = info->object = NULL; diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 043c94b046054..87d39bc0a6735 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -43,6 +43,34 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) return p; } +size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache) +{ + size_t size = 0; + u8 *shadow; + + /* + * Skip the addr_has_metadata check, as this function only operates on + * slab memory, which must have metadata. + */ + + /* + * The loop below returns 0 for freed objects, for which KASAN cannot + * calculate the allocation size based on the metadata. + */ + shadow = (u8 *)kasan_mem_to_shadow(object); + while (size < cache->object_size) { + if (*shadow == 0) + size += KASAN_GRANULE_SIZE; + else if (*shadow >= 1 && *shadow <= KASAN_GRANULE_SIZE - 1) + return size + *shadow; + else + return size; + shadow++; + } + + return cache->object_size; +} + static const char *get_shadow_bug_type(struct kasan_report_info *info) { const char *bug_type = "unknown-crash"; @@ -79,9 +107,11 @@ static const char *get_shadow_bug_type(struct kasan_report_info *info) bug_type = "stack-out-of-bounds"; break; case KASAN_PAGE_FREE: + bug_type = "use-after-free"; + break; case KASAN_SLAB_FREE: case KASAN_SLAB_FREETRACK: - bug_type = "use-after-free"; + bug_type = "slab-use-after-free"; break; case KASAN_ALLOCA_LEFT: case KASAN_ALLOCA_RIGHT: diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c index f3d3be614e4b0..32e80f78de7d0 100644 --- a/mm/kasan/report_hw_tags.c +++ b/mm/kasan/report_hw_tags.c @@ -17,10 +17,43 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) { - /* Return the same value regardless of whether addr_has_metadata(). */ + /* + * Hardware Tag-Based KASAN only calls this function for normal memory + * accesses, and thus addr points precisely to the first bad address + * with an invalid (and present) memory tag. Therefore: + * 1. Return the address as is without walking memory tags. + * 2. Skip the addr_has_metadata check. + */ return kasan_reset_tag(addr); } +size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache) +{ + size_t size = 0; + int i = 0; + u8 memory_tag; + + /* + * Skip the addr_has_metadata check, as this function only operates on + * slab memory, which must have metadata. + */ + + /* + * The loop below returns 0 for freed objects, for which KASAN cannot + * calculate the allocation size based on the metadata. + */ + while (size < cache->object_size) { + memory_tag = hw_get_mem_tag(object + i * KASAN_GRANULE_SIZE); + if (memory_tag != KASAN_TAG_INVALID) + size += KASAN_GRANULE_SIZE; + else + return size; + i++; + } + + return cache->object_size; +} + void kasan_metadata_fetch_row(char *buffer, void *row) { int i; diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c index 7a26397297edb..8b1f5a73ee6d3 100644 --- a/mm/kasan/report_sw_tags.c +++ b/mm/kasan/report_sw_tags.c @@ -45,6 +45,32 @@ void *kasan_find_first_bad_addr(void *addr, size_t size) return p; } +size_t kasan_get_alloc_size(void *object, struct kmem_cache *cache) +{ + size_t size = 0; + u8 *shadow; + + /* + * Skip the addr_has_metadata check, as this function only operates on + * slab memory, which must have metadata. + */ + + /* + * The loop below returns 0 for freed objects, for which KASAN cannot + * calculate the allocation size based on the metadata. + */ + shadow = (u8 *)kasan_mem_to_shadow(object); + while (size < cache->object_size) { + if (*shadow != KASAN_TAG_INVALID) + size += KASAN_GRANULE_SIZE; + else + return size; + shadow++; + } + + return cache->object_size; +} + void kasan_metadata_fetch_row(char *buffer, void *row) { memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index ecede06ef374a..8b8bfdb3cfdb5 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -89,7 +89,7 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) * a use-after-free. */ if (!info->bug_type) - info->bug_type = "use-after-free"; + info->bug_type = "slab-use-after-free"; } else { /* Second alloc of the same object. Give up. */ if (alloc_found) diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 98269936a5e4b..3703983a8e556 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -329,6 +329,9 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) unsigned long shadow_start, shadow_end; int ret; + if (!kasan_arch_is_ready()) + return 0; + if (!is_vmalloc_or_module_addr((void *)addr)) return 0; @@ -497,6 +500,9 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long region_start, region_end; unsigned long size; + if (!kasan_arch_is_ready()) + return; + region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE); region_end = ALIGN_DOWN(end, KASAN_MEMORY_PER_SHADOW_PAGE); @@ -540,6 +546,9 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored. */ + if (!kasan_arch_is_ready()) + return (void *)start; + if (!is_vmalloc_or_module_addr(start)) return (void *)start; @@ -562,6 +571,9 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, */ void __kasan_poison_vmalloc(const void *start, unsigned long size) { + if (!kasan_arch_is_ready()) + return; + if (!is_vmalloc_or_module_addr(start)) return; diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index a3afaf2ad1b11..30da65fa02a1e 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -106,10 +106,8 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write, return true; untagged_addr = kasan_reset_tag((const void *)addr); - if (unlikely(untagged_addr < - kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { + if (unlikely(!addr_has_metadata(untagged_addr))) return !kasan_report(addr, size, write, ret_ip); - } shadow_first = kasan_mem_to_shadow(untagged_addr); shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1); for (shadow = shadow_first; shadow <= shadow_last; shadow++) { @@ -127,7 +125,7 @@ bool kasan_byte_accessible(const void *addr) void *untagged_addr = kasan_reset_tag(addr); u8 shadow_byte; - if (untagged_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) + if (!addr_has_metadata(untagged_addr)) return false; shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(untagged_addr)); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 90acfea40c13c..eb38bd1b1b2f0 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -55,6 +56,7 @@ enum scan_result { SCAN_CGROUP_CHARGE_FAIL, SCAN_TRUNCATED, SCAN_PAGE_HAS_PRIVATE, + SCAN_COPY_MC, }; #define CREATE_TRACE_POINTS @@ -490,32 +492,37 @@ void __khugepaged_exit(struct mm_struct *mm) } } +static void release_pte_folio(struct folio *folio) +{ + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + -folio_nr_pages(folio)); + folio_unlock(folio); + folio_putback_lru(folio); +} + static void release_pte_page(struct page *page) { - mod_node_page_state(page_pgdat(page), - NR_ISOLATED_ANON + page_is_file_lru(page), - -compound_nr(page)); - unlock_page(page); - putback_lru_page(page); + release_pte_folio(page_folio(page)); } static void release_pte_pages(pte_t *pte, pte_t *_pte, struct list_head *compound_pagelist) { - struct page *page, *tmp; + struct folio *folio, *tmp; while (--_pte >= pte) { pte_t pteval = *_pte; - page = pte_page(pteval); + folio = pfn_folio(pte_pfn(pteval)); if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) && - !PageCompound(page)) - release_pte_page(page); + !folio_test_large(folio)) + release_pte_folio(folio); } - list_for_each_entry_safe(page, tmp, compound_pagelist, lru) { - list_del(&page->lru); - release_pte_page(page); + list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) { + list_del(&folio->lru); + release_pte_folio(folio); } } @@ -530,6 +537,27 @@ static bool is_refcount_suitable(struct page *page) return page_count(page) == expected_refcount; } +/* + * Copies memory with #MC in source page (@from) handled. Returns number + * of bytes not copied if there was an exception; otherwise 0 for success. + * Note handling #MC requires arch opt-in. + */ +static int copy_mc_page(struct page *to, struct page *from) +{ + char *vfrom, *vto; + unsigned long ret; + + vfrom = kmap_local_page(from); + vto = kmap_local_page(to); + ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); + if (ret == 0) + kmsan_copy_page_meta(to, from); + kunmap_local(vto); + kunmap_local(vfrom); + + return ret; +} + static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte, @@ -670,56 +698,124 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, return result; } -static void __collapse_huge_page_copy(pte_t *pte, struct page *page, - struct vm_area_struct *vma, - unsigned long address, - spinlock_t *ptl, - struct list_head *compound_pagelist) +/* + * __collapse_huge_page_copy - attempts to copy memory contents from normal + * pages to a hugepage. Cleans up the normal pages if copying succeeds; + * otherwise restores the original page table and releases isolated normal pages. + * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC. + * + * @pte: starting of the PTEs to copy from + * @page: the new hugepage to copy contents to + * @pmd: pointer to the new hugepage's PMD + * @rollback: the original normal pages' PMD + * @vma: the original normal pages' virtual memory area + * @address: starting address to copy + * @pte_ptl: lock on normal pages' PTEs + * @compound_pagelist: list that stores compound pages + */ +static int __collapse_huge_page_copy(pte_t *pte, + struct page *page, + pmd_t *pmd, + pmd_t rollback, + struct vm_area_struct *vma, + unsigned long address, + spinlock_t *pte_ptl, + struct list_head *compound_pagelist) { struct page *src_page, *tmp; pte_t *_pte; - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; - _pte++, page++, address += PAGE_SIZE) { - pte_t pteval = *_pte; + pte_t pteval; + unsigned long _address; + spinlock_t *pmd_ptl; + int result = SCAN_SUCCEED; - if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - clear_user_highpage(page, address); - add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); - if (is_zero_pfn(pte_pfn(pteval))) { + /* + * Copying pages' contents is subject to memory poison at any iteration. + */ + for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR; + _pte++, page++, _address += PAGE_SIZE) { + pteval = *_pte; + + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) + clear_user_highpage(page, _address); + else { + src_page = pte_page(pteval); + if (copy_mc_page(page, src_page) > 0) { + result = SCAN_COPY_MC; + break; + } + } + } + + if (likely(result == SCAN_SUCCEED)) { + for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR; + _pte++, _address += PAGE_SIZE) { + pteval = *_pte; + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + if (is_zero_pfn(pte_pfn(pteval))) { + /* + * pte_ptl mostly unnecessary. + */ + spin_lock(pte_ptl); + pte_clear(vma->vm_mm, _address, _pte); + spin_unlock(pte_ptl); + } + } else { + src_page = pte_page(pteval); + if (!PageCompound(src_page)) + release_pte_page(src_page); /* - * ptl mostly unnecessary. + * pte_ptl mostly unnecessary, but preempt has + * to be disabled to update the per-cpu stats + * inside page_remove_rmap(). */ - spin_lock(ptl); - ptep_clear(vma->vm_mm, address, _pte); - spin_unlock(ptl); + spin_lock(pte_ptl); + ptep_clear(vma->vm_mm, _address, _pte); + page_remove_rmap(src_page, vma, false); + spin_unlock(pte_ptl); + free_page_and_swap_cache(src_page); } - } else { - src_page = pte_page(pteval); - copy_user_highpage(page, src_page, address, vma); - if (!PageCompound(src_page)) - release_pte_page(src_page); - /* - * ptl mostly unnecessary, but preempt has to - * be disabled to update the per-cpu stats - * inside page_remove_rmap(). - */ - spin_lock(ptl); - ptep_clear(vma->vm_mm, address, _pte); - page_remove_rmap(src_page, vma, false); - spin_unlock(ptl); - free_page_and_swap_cache(src_page); + } + list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { + list_del(&src_page->lru); + mod_node_page_state(page_pgdat(src_page), + NR_ISOLATED_ANON + page_is_file_lru(src_page), + -compound_nr(src_page)); + unlock_page(src_page); + free_swap_cache(src_page); + putback_lru_page(src_page); + } + } else { + /* + * Re-establish the regular PMD that points to the regular + * page table. Restoring PMD needs to be done prior to + * releasing pages. Since pages are still isolated and + * locked here, acquiring anon_vma_lock_write is unnecessary. + */ + pmd_ptl = pmd_lock(vma->vm_mm, pmd); + pmd_populate(vma->vm_mm, pmd, pmd_pgtable(rollback)); + spin_unlock(pmd_ptl); + /* + * Release both raw and compound pages isolated + * in __collapse_huge_page_isolate. + */ + for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR; + _pte++, _address += PAGE_SIZE) { + pteval = *_pte; + if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) { + src_page = pte_page(pteval); + if (!PageCompound(src_page)) + release_pte_page(src_page); + } + } + list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { + list_del(&src_page->lru); + release_pte_page(src_page); } } - list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { - list_del(&src_page->lru); - mod_node_page_state(page_pgdat(src_page), - NR_ISOLATED_ANON + page_is_file_lru(src_page), - -compound_nr(src_page)); - unlock_page(src_page); - free_swap_cache(src_page); - putback_lru_page(src_page); - } + return result; } static void khugepaged_alloc_sleep(void) @@ -1040,8 +1136,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, anon_vma_lock_write(vma->anon_vma); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, - address, address + HPAGE_PMD_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, + address + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pte = pte_offset_map(pmd, address); @@ -1087,9 +1183,13 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl, - &compound_pagelist); + result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd, + vma, address, pte_ptl, + &compound_pagelist); pte_unmap(pte); + if (unlikely(result != SCAN_SUCCEED)) + goto out_up_write; + /* * spin_lock() below is not the equivalent of smp_wmb(), but * the smp_wmb() inside __SetPageUptodate() can be reused to @@ -1412,7 +1512,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v if (vma->anon_vma) lockdep_assert_held_write(&vma->anon_vma->root->rwsem); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pmd = pmdp_collapse_flush(vma, addr, pmdp); @@ -1762,6 +1862,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, { struct address_space *mapping = file->f_mapping; struct page *hpage; + struct page *page; + struct page *tmp; + struct folio *folio; pgoff_t index = 0, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); @@ -1806,8 +1909,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, xas_set(&xas, start); for (index = start; index < end; index++) { - struct page *page = xas_next(&xas); - struct folio *folio; + page = xas_next(&xas); VM_BUG_ON(index != xas.xa_index); if (is_shmem) { @@ -1989,10 +2091,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, } nr = thp_nr_pages(hpage); - if (is_shmem) - __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr); - else { - __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr); + if (!is_shmem) { filemap_nr_thps_inc(mapping); /* * Paired with smp_mb() in do_dentry_open() to ensure @@ -2003,21 +2102,10 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, smp_mb(); if (inode_is_open_for_write(mapping->host)) { result = SCAN_FAIL; - __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); goto xa_locked; } } - - if (nr_none) { - __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none); - /* nr_none is always 0 for non-shmem. */ - __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none); - } - - /* Join all the small entries into a single multi-index entry */ - xas_set_order(&xas, start, HPAGE_PMD_ORDER); - xas_store(&xas, hpage); xa_locked: xas_unlock_irq(&xas); xa_unlocked: @@ -2030,21 +2118,35 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, try_to_unmap_flush(); if (result == SCAN_SUCCEED) { - struct page *page, *tmp; - struct folio *folio; - /* * Replacing old pages with new one has succeeded, now we - * need to copy the content and free the old pages. + * attempt to copy the contents. */ index = start; - list_for_each_entry_safe(page, tmp, &pagelist, lru) { + list_for_each_entry(page, &pagelist, lru) { while (index < page->index) { clear_highpage(hpage + (index % HPAGE_PMD_NR)); index++; } - copy_highpage(hpage + (page->index % HPAGE_PMD_NR), - page); + if (copy_mc_page(hpage + (page->index % HPAGE_PMD_NR), + page) > 0) { + result = SCAN_COPY_MC; + break; + } + index++; + } + while (result == SCAN_SUCCEED && index < end) { + clear_highpage(hpage + (index % HPAGE_PMD_NR)); + index++; + } + } + + if (result == SCAN_SUCCEED) { + /* + * Copying old pages to huge one has succeeded, now we + * need to free the old pages. + */ + list_for_each_entry_safe(page, tmp, &pagelist, lru) { list_del(&page->lru); page->mapping = NULL; page_ref_unfreeze(page, 1); @@ -2052,12 +2154,23 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, ClearPageUnevictable(page); unlock_page(page); put_page(page); - index++; } - while (index < end) { - clear_highpage(hpage + (index % HPAGE_PMD_NR)); - index++; + + xas_lock_irq(&xas); + if (is_shmem) + __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr); + else + __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr); + + if (nr_none) { + __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none); + /* nr_none is always 0 for non-shmem. */ + __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none); } + /* Join all the small entries into a single multi-index entry. */ + xas_set_order(&xas, start, HPAGE_PMD_ORDER); + xas_store(&xas, hpage); + xas_unlock_irq(&xas); folio = page_folio(hpage); folio_mark_uptodate(folio); @@ -2075,8 +2188,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, unlock_page(hpage); hpage = NULL; } else { - struct page *page; - /* Something went wrong: roll back page cache changes */ xas_lock_irq(&xas); if (nr_none) { @@ -2110,6 +2221,13 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, xas_lock_irq(&xas); } VM_BUG_ON(nr_none); + /* + * Undo the updates of filemap_nr_thps_inc for non-SHMEM file only. + * This undo is not needed unless failure is due to SCAN_COPY_MC. + */ + if (!is_shmem && result == SCAN_COPY_MC) + filemap_nr_thps_dec(mapping); + xas_unlock_irq(&xas); hpage->mapping = NULL; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 55dc8b8b06162..374d65219c410 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -13,11 +13,12 @@ * * The following locks and mutexes are used by kmemleak: * - * - kmemleak_lock (raw_spinlock_t): protects the object_list modifications and - * accesses to the object_tree_root (or object_phys_tree_root). The - * object_list is the main list holding the metadata (struct kmemleak_object) - * for the allocated memory blocks. The object_tree_root and object_phys_tree_root - * are red black trees used to look-up metadata based on a pointer to the + * - kmemleak_lock (raw_spinlock_t): protects the object_list as well as + * del_state modifications and accesses to the object_tree_root (or + * object_phys_tree_root). The object_list is the main list holding the + * metadata (struct kmemleak_object) for the allocated memory blocks. + * The object_tree_root and object_phys_tree_root are red + * black trees used to look-up metadata based on a pointer to the * corresponding memory block. The object_phys_tree_root is for objects * allocated with physical address. The kmemleak_object structures are * added to the object_list and object_tree_root (or object_phys_tree_root) @@ -104,6 +105,9 @@ #include #include +/* akpm: stupid temporary hack */ +#define stack_depot_want_early_init stack_depot_request_early_init + /* * Kmemleak configuration and common defines. */ @@ -148,6 +152,7 @@ struct kmemleak_object { struct rcu_head rcu; /* object_list lockless traversal */ /* object usage count; object freed when use_count == 0 */ atomic_t use_count; + unsigned int del_state; /* deletion state */ unsigned long pointer; size_t size; /* pass surplus references to this pointer */ @@ -177,6 +182,11 @@ struct kmemleak_object { /* flag set for object allocated with physical address */ #define OBJECT_PHYS (1 << 4) +/* set when __remove_object() called */ +#define DELSTATE_REMOVED (1 << 0) +/* set to temporarily prevent deletion from object_list */ +#define DELSTATE_NO_DELETE (1 << 1) + #define HEX_PREFIX " " /* number of bytes to print per line; must be 16 or 32 */ #define HEX_ROW_SIZE 16 @@ -571,7 +581,9 @@ static void __remove_object(struct kmemleak_object *object) rb_erase(&object->rb_node, object->flags & OBJECT_PHYS ? &object_phys_tree_root : &object_tree_root); - list_del_rcu(&object->object_list); + if (!(object->del_state & DELSTATE_NO_DELETE)) + list_del_rcu(&object->object_list); + object->del_state |= DELSTATE_REMOVED; } /* @@ -643,6 +655,7 @@ static void __create_object(unsigned long ptr, size_t size, object->count = 0; /* white color initially */ object->jiffies = jiffies; object->checksum = 0; + object->del_state = 0; /* task information */ if (in_hardirq()) { @@ -1472,22 +1485,30 @@ static void scan_gray_list(void) /* * Conditionally call resched() in an object iteration loop while making sure * that the given object won't go away without RCU read lock by performing a - * get_object() if !pinned. - * - * Return: false if can't do a cond_resched() due to get_object() failure - * true otherwise + * get_object() if necessaary. */ -static bool kmemleak_cond_resched(struct kmemleak_object *object, bool pinned) +static void kmemleak_cond_resched(struct kmemleak_object *object) { - if (!pinned && !get_object(object)) - return false; + if (!get_object(object)) + return; /* Try next object */ + + raw_spin_lock_irq(&kmemleak_lock); + if (object->del_state & DELSTATE_REMOVED) + goto unlock_put; /* Object removed */ + object->del_state |= DELSTATE_NO_DELETE; + raw_spin_unlock_irq(&kmemleak_lock); rcu_read_unlock(); cond_resched(); rcu_read_lock(); - if (!pinned) - put_object(object); - return true; + + raw_spin_lock_irq(&kmemleak_lock); + if (object->del_state & DELSTATE_REMOVED) + list_del_rcu(&object->object_list); + object->del_state &= ~DELSTATE_NO_DELETE; +unlock_put: + raw_spin_unlock_irq(&kmemleak_lock); + put_object(object); } /* @@ -1501,15 +1522,12 @@ static void kmemleak_scan(void) struct zone *zone; int __maybe_unused i; int new_leaks = 0; - int loop_cnt = 0; jiffies_last_scan = jiffies; /* prepare the kmemleak_object's */ rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { - bool obj_pinned = false; - raw_spin_lock_irq(&object->lock); #ifdef DEBUG /* @@ -1535,19 +1553,13 @@ static void kmemleak_scan(void) /* reset the reference count (whiten the object) */ object->count = 0; - if (color_gray(object) && get_object(object)) { + if (color_gray(object) && get_object(object)) list_add_tail(&object->gray_list, &gray_list); - obj_pinned = true; - } raw_spin_unlock_irq(&object->lock); - /* - * Do a cond_resched() every 64k objects to avoid soft lockup. - */ - if (!(++loop_cnt & 0xffff) && - !kmemleak_cond_resched(object, obj_pinned)) - loop_cnt--; /* Try again on next object */ + if (need_resched()) + kmemleak_cond_resched(object); } rcu_read_unlock(); @@ -1614,14 +1626,9 @@ static void kmemleak_scan(void) * scan and color them gray until the next scan. */ rcu_read_lock(); - loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { - /* - * Do a cond_resched() every 64k objects to avoid soft lockup. - */ - if (!(++loop_cnt & 0xffff) && - !kmemleak_cond_resched(object, false)) - loop_cnt--; /* Try again on next object */ + if (need_resched()) + kmemleak_cond_resched(object); /* * This is racy but we can save the overhead of lock/unlock @@ -1656,14 +1663,9 @@ static void kmemleak_scan(void) * Scanning result reporting. */ rcu_read_lock(); - loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { - /* - * Do a cond_resched() every 64k objects to avoid soft lockup. - */ - if (!(++loop_cnt & 0xffff) && - !kmemleak_cond_resched(object, false)) - loop_cnt--; /* Try again on next object */ + if (need_resched()) + kmemleak_cond_resched(object); /* * This is racy but we can save the overhead of lock/unlock diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index 112dce135c7f6..f710257d68670 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -69,13 +69,15 @@ depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, { unsigned long entries[KMSAN_STACK_DEPTH]; unsigned int nr_entries; + depot_stack_handle_t handle; nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0); /* Don't sleep (see might_sleep_if() in __alloc_pages_nodemask()). */ flags &= ~__GFP_DIRECT_RECLAIM; - return __stack_depot_save(entries, nr_entries, extra, flags, true); + handle = __stack_depot_save(entries, nr_entries, flags, true); + return stack_depot_set_extra_bits(handle, extra); } /* Copy the metadata following the memmove() behavior. */ @@ -215,6 +217,7 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id) u32 extra_bits; int depth; bool uaf; + depot_stack_handle_t handle; if (!id) return id; @@ -250,8 +253,9 @@ depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id) * positives when __stack_depot_save() passes it to instrumented code. */ kmsan_internal_unpoison_memory(entries, sizeof(entries), false); - return __stack_depot_save(entries, ARRAY_SIZE(entries), extra_bits, - GFP_ATOMIC, true); + handle = __stack_depot_save(entries, ARRAY_SIZE(entries), GFP_ATOMIC, + true); + return stack_depot_set_extra_bits(handle, extra_bits); } void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b, diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c index 770fe02904f36..cf12e9616b243 100644 --- a/mm/kmsan/instrumentation.c +++ b/mm/kmsan/instrumentation.c @@ -38,7 +38,15 @@ get_shadow_origin_ptr(void *addr, u64 size, bool store) return ret; } +/* + * KMSAN instrumentation functions follow. They are not declared elsewhere in + * the kernel code, so they are preceded by prototypes, to silence + * -Wmissing-prototypes warnings. + */ + /* Get shadow and origin pointers for a memory load with non-standard size. */ +struct shadow_origin_ptr __msan_metadata_ptr_for_load_n(void *addr, + uintptr_t size); struct shadow_origin_ptr __msan_metadata_ptr_for_load_n(void *addr, uintptr_t size) { @@ -47,6 +55,8 @@ struct shadow_origin_ptr __msan_metadata_ptr_for_load_n(void *addr, EXPORT_SYMBOL(__msan_metadata_ptr_for_load_n); /* Get shadow and origin pointers for a memory store with non-standard size. */ +struct shadow_origin_ptr __msan_metadata_ptr_for_store_n(void *addr, + uintptr_t size); struct shadow_origin_ptr __msan_metadata_ptr_for_store_n(void *addr, uintptr_t size) { @@ -59,12 +69,16 @@ EXPORT_SYMBOL(__msan_metadata_ptr_for_store_n); * with fixed size. */ #define DECLARE_METADATA_PTR_GETTER(size) \ + struct shadow_origin_ptr __msan_metadata_ptr_for_load_##size( \ + void *addr); \ struct shadow_origin_ptr __msan_metadata_ptr_for_load_##size( \ void *addr) \ { \ return get_shadow_origin_ptr(addr, size, /*store*/ false); \ } \ EXPORT_SYMBOL(__msan_metadata_ptr_for_load_##size); \ + struct shadow_origin_ptr __msan_metadata_ptr_for_store_##size( \ + void *addr); \ struct shadow_origin_ptr __msan_metadata_ptr_for_store_##size( \ void *addr) \ { \ @@ -86,6 +100,7 @@ DECLARE_METADATA_PTR_GETTER(8); * entering or leaving IRQ. We omit the check for kmsan_in_runtime() to ensure * the memory written to in these cases is also marked as initialized. */ +void __msan_instrument_asm_store(void *addr, uintptr_t size); void __msan_instrument_asm_store(void *addr, uintptr_t size) { unsigned long ua_flags; @@ -138,6 +153,7 @@ static inline void set_retval_metadata(u64 shadow, depot_stack_handle_t origin) } /* Handle llvm.memmove intrinsic. */ +void *__msan_memmove(void *dst, const void *src, uintptr_t n); void *__msan_memmove(void *dst, const void *src, uintptr_t n) { depot_stack_handle_t origin; @@ -162,6 +178,7 @@ void *__msan_memmove(void *dst, const void *src, uintptr_t n) EXPORT_SYMBOL(__msan_memmove); /* Handle llvm.memcpy intrinsic. */ +void *__msan_memcpy(void *dst, const void *src, uintptr_t n); void *__msan_memcpy(void *dst, const void *src, uintptr_t n) { depot_stack_handle_t origin; @@ -188,6 +205,7 @@ void *__msan_memcpy(void *dst, const void *src, uintptr_t n) EXPORT_SYMBOL(__msan_memcpy); /* Handle llvm.memset intrinsic. */ +void *__msan_memset(void *dst, int c, uintptr_t n); void *__msan_memset(void *dst, int c, uintptr_t n) { depot_stack_handle_t origin; @@ -217,6 +235,7 @@ EXPORT_SYMBOL(__msan_memset); * uninitialized value to memory. When reporting an error, KMSAN unrolls and * prints the whole chain of stores that preceded the use of this value. */ +depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin); depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin) { depot_stack_handle_t ret = 0; @@ -237,6 +256,7 @@ depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin) EXPORT_SYMBOL(__msan_chain_origin); /* Poison a local variable when entering a function. */ +void __msan_poison_alloca(void *address, uintptr_t size, char *descr); void __msan_poison_alloca(void *address, uintptr_t size, char *descr) { depot_stack_handle_t handle; @@ -272,6 +292,7 @@ void __msan_poison_alloca(void *address, uintptr_t size, char *descr) EXPORT_SYMBOL(__msan_poison_alloca); /* Unpoison a local variable. */ +void __msan_unpoison_alloca(void *address, uintptr_t size); void __msan_unpoison_alloca(void *address, uintptr_t size) { if (!kmsan_enabled || kmsan_in_runtime()) @@ -287,6 +308,7 @@ EXPORT_SYMBOL(__msan_unpoison_alloca); * Report that an uninitialized value with the given origin was used in a way * that constituted undefined behavior. */ +void __msan_warning(u32 origin); void __msan_warning(u32 origin) { if (!kmsan_enabled || kmsan_in_runtime()) @@ -303,6 +325,7 @@ EXPORT_SYMBOL(__msan_warning); * At the beginning of an instrumented function, obtain the pointer to * `struct kmsan_context_state` holding the metadata for function parameters. */ +struct kmsan_context_state *__msan_get_context_state(void); struct kmsan_context_state *__msan_get_context_state(void) { return &kmsan_get_context()->cstate; diff --git a/mm/ksm.c b/mm/ksm.c index 83e2f74ae7daa..04f1c8c2df11d 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -214,6 +214,7 @@ struct ksm_rmap_item { #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ #define STABLE_FLAG 0x200 /* is listed from the stable tree */ +#define ZERO_PAGE_FLAG 0x400 /* is zero page placed by KSM */ /* The stable and unstable tree heads */ static struct rb_root one_stable_tree[1] = { RB_ROOT }; @@ -275,6 +276,9 @@ static unsigned int zero_checksum __read_mostly; /* Whether to merge empty (zeroed) pages with actual zero pages */ static bool ksm_use_zero_pages __read_mostly; +/* The number of zero pages placed by KSM use_zero_pages */ +static unsigned long ksm_zero_pages_sharing; + #ifdef CONFIG_NUMA /* Zeroed when merging across nodes is not allowed */ static unsigned int ksm_merge_across_nodes = 1; @@ -420,6 +424,11 @@ static inline bool ksm_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } +enum break_ksm_pmd_entry_return_flag { + HAVE_KSM_PAGE = 1, + HAVE_ZERO_PAGE +}; + static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { @@ -427,6 +436,7 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex spinlock_t *ptl; pte_t *pte; int ret; + bool is_zero_page = false; if (pmd_leaf(*pmd) || !pmd_present(*pmd)) return 0; @@ -434,6 +444,8 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (pte_present(*pte)) { page = vm_normal_page(walk->vma, addr, *pte); + if (!page) + is_zero_page = is_zero_pfn(pte_pfn(*pte)); } else if (!pte_none(*pte)) { swp_entry_t entry = pte_to_swp_entry(*pte); @@ -444,7 +456,14 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex if (is_migration_entry(entry)) page = pfn_swap_entry_to_page(entry); } - ret = page && PageKsm(page); + + if (page && PageKsm(page)) + ret = HAVE_KSM_PAGE; + else if (is_zero_page) + ret = HAVE_ZERO_PAGE; + else + ret = 0; + pte_unmap_unlock(pte, ptl); return ret; } @@ -466,19 +485,22 @@ static const struct mm_walk_ops break_ksm_ops = { * of the process that owns 'vma'. We also do not want to enforce * protection keys here anyway. */ -static int break_ksm(struct vm_area_struct *vma, unsigned long addr) +static int break_ksm(struct vm_area_struct *vma, unsigned long addr, + bool unshare_zero_page) { vm_fault_t ret = 0; do { - int ksm_page; + int walk_result; cond_resched(); - ksm_page = walk_page_range_vma(vma, addr, addr + 1, + walk_result = walk_page_range_vma(vma, addr, addr + 1, &break_ksm_ops, NULL); - if (WARN_ON_ONCE(ksm_page < 0)) - return ksm_page; - if (!ksm_page) + if (WARN_ON_ONCE(walk_result < 0)) + return walk_result; + if (!walk_result) + return 0; + if (walk_result == HAVE_ZERO_PAGE && !unshare_zero_page) return 0; ret = handle_mm_fault(vma, addr, FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, @@ -539,7 +561,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item) mmap_read_lock(mm); vma = find_mergeable_vma(mm, addr); if (vma) - break_ksm(vma, addr); + break_ksm(vma, addr, false); mmap_read_unlock(mm); } @@ -764,6 +786,33 @@ static struct page *get_ksm_page(struct ksm_stable_node *stable_node, return NULL; } +/* + * Cleaning the rmap_item's ZERO_PAGE_FLAG + * This function will be called when unshare or writing on zero pages. + */ +static inline void clean_rmap_item_zero_flag(struct ksm_rmap_item *rmap_item) +{ + if (rmap_item->address & ZERO_PAGE_FLAG) { + ksm_zero_pages_sharing--; + rmap_item->mm->ksm_zero_pages_sharing--; + rmap_item->address &= PAGE_MASK; + } +} + +/* Only called when rmap_item is going to be freed */ +static inline void unshare_zero_pages(struct ksm_rmap_item *rmap_item) +{ + struct vm_area_struct *vma; + + if (rmap_item->address & ZERO_PAGE_FLAG) { + vma = vma_lookup(rmap_item->mm, rmap_item->address); + if (vma && !ksm_test_exit(rmap_item->mm)) + break_ksm(vma, rmap_item->address, true); + } + /* Put at last. */ + clean_rmap_item_zero_flag(rmap_item); +} + /* * Removing rmap_item from stable or unstable tree. * This function will clean the information from the stable/unstable tree. @@ -824,6 +873,7 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) struct ksm_rmap_item *rmap_item = *rmap_list; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); + unshare_zero_pages(rmap_item); free_rmap_item(rmap_item); } } @@ -853,7 +903,7 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, if (signal_pending(current)) err = -ERESTARTSYS; else - err = break_ksm(vma, addr); + err = break_ksm(vma, addr, false); } return err; } @@ -1057,8 +1107,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, BUG_ON(PageTransCompound(page)); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, - pvmw.address, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -1164,7 +1213,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd_present(pmde) || pmd_trans_huge(pmde)) goto out; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -2044,6 +2093,42 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, rmap_item->mm->ksm_merging_pages++; } +static int try_to_merge_with_kernel_zero_page(struct ksm_rmap_item *rmap_item, + struct page *page) +{ + struct mm_struct *mm = rmap_item->mm; + int err = 0; + + /* + * It should not take ZERO_PAGE_FLAG because on one hand, + * get_next_rmap_item don't return zero pages' rmap_item. + * On the other hand, even if zero page was writen as + * anonymous page, rmap_item has been cleaned after + * stable_tree_search + */ + if (!WARN_ON_ONCE(rmap_item->address & ZERO_PAGE_FLAG)) { + struct vm_area_struct *vma; + + mmap_read_lock(mm); + vma = find_mergeable_vma(mm, rmap_item->address); + if (vma) { + err = try_to_merge_one_page(vma, page, + ZERO_PAGE(rmap_item->address)); + if (!err) { + rmap_item->address |= ZERO_PAGE_FLAG; + ksm_zero_pages_sharing++; + rmap_item->mm->ksm_zero_pages_sharing++; + } + } else { + /* If the vma is out of date, we do not need to continue. */ + err = 0; + } + mmap_read_unlock(mm); + } + + return err; +} + /* * cmp_and_merge_page - first see if page can be merged into the stable tree; * if not, compare checksum to previous and if it's the same, see if page can @@ -2055,7 +2140,6 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, */ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) { - struct mm_struct *mm = rmap_item->mm; struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; struct ksm_stable_node *stable_node; @@ -2092,6 +2176,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite } remove_rmap_item_from_tree(rmap_item); + clean_rmap_item_zero_flag(rmap_item); if (kpage) { if (PTR_ERR(kpage) == -EBUSY) @@ -2128,29 +2213,16 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite * Same checksum as an empty page. We attempt to merge it with the * appropriate zero page if the user enabled this via sysfs. */ - if (ksm_use_zero_pages && (checksum == zero_checksum)) { - struct vm_area_struct *vma; - - mmap_read_lock(mm); - vma = find_mergeable_vma(mm, rmap_item->address); - if (vma) { - err = try_to_merge_one_page(vma, page, - ZERO_PAGE(rmap_item->address)); - } else { + if (ksm_use_zero_pages) { + if (checksum == zero_checksum) /* - * If the vma is out of date, we do not need to - * continue. + * In case of failure, the page was not really empty, so we + * need to continue. Otherwise we're done. */ - err = 0; - } - mmap_read_unlock(mm); - /* - * In case of failure, the page was not really empty, so we - * need to continue. Otherwise we're done. - */ - if (!err) - return; + if (!try_to_merge_with_kernel_zero_page(rmap_item, page)) + return; } + tree_rmap_item = unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { @@ -2214,23 +2286,37 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite } } -static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, - struct ksm_rmap_item **rmap_list, - unsigned long addr) +static struct ksm_rmap_item *try_to_get_old_rmap_item(unsigned long addr, + struct ksm_rmap_item **rmap_list) { - struct ksm_rmap_item *rmap_item; - while (*rmap_list) { - rmap_item = *rmap_list; + struct ksm_rmap_item *rmap_item = *rmap_list; if ((rmap_item->address & PAGE_MASK) == addr) return rmap_item; if (rmap_item->address > addr) break; *rmap_list = rmap_item->rmap_list; + /* Running here indicates it's vma has been UNMERGEABLE */ remove_rmap_item_from_tree(rmap_item); + unshare_zero_pages(rmap_item); free_rmap_item(rmap_item); } + return NULL; +} + +static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, + struct ksm_rmap_item **rmap_list, + unsigned long addr) +{ + struct ksm_rmap_item *rmap_item; + + /* lookup if we have a old rmap_item matching the addr*/ + rmap_item = try_to_get_old_rmap_item(addr, rmap_list); + if (rmap_item) + return rmap_item; + + /* Need to allocate a new rmap_item */ rmap_item = alloc_rmap_item(); if (rmap_item) { /* It has already been zeroed */ @@ -2337,6 +2423,22 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) } if (is_zone_device_page(*page)) goto next_page; + if (is_zero_pfn(page_to_pfn(*page))) { + /* + * To monitor ksm zero pages which becomes non-anonymous, + * we have to save each rmap_item of zero pages by + * try_to_get_old_rmap_item() walking on + * ksm_scan.rmap_list, otherwise their rmap_items will be + * freed by the next turn of get_next_rmap_item(). The + * function get_next_rmap_item() will free all "skipped" + * rmap_items because it thinks its areas as UNMERGEABLE. + */ + rmap_item = try_to_get_old_rmap_item(ksm_scan.address, + ksm_scan.rmap_list); + if (rmap_item && (rmap_item->address & ZERO_PAGE_FLAG)) + ksm_scan.rmap_list = &rmap_item->rmap_list; + goto next_page; + } if (PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); @@ -3134,6 +3236,13 @@ static ssize_t pages_volatile_show(struct kobject *kobj, } KSM_ATTR_RO(pages_volatile); +static ssize_t zero_pages_sharing_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%ld\n", ksm_zero_pages_sharing); +} +KSM_ATTR_RO(zero_pages_sharing); + static ssize_t stable_node_dups_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -3189,6 +3298,7 @@ static struct attribute *ksm_attrs[] = { &pages_sharing_attr.attr, &pages_unshared_attr.attr, &pages_volatile_attr.attr, + &zero_pages_sharing_attr.attr, &full_scans_attr.attr, #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, diff --git a/mm/madvise.c b/mm/madvise.c index 6bd4f94627dba..200e7d6ee5ae1 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -142,6 +142,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; int error; pgoff_t pgoff; + VMA_ITERATOR(vmi, mm, start); if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { *prev = vma; @@ -149,8 +150,8 @@ static int madvise_update_vma(struct vm_area_struct *vma, } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma), + *prev = vma_merge(&vmi, mm, *prev, start, end, new_flags, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_name); if (*prev) { vma = *prev; @@ -160,17 +161,13 @@ static int madvise_update_vma(struct vm_area_struct *vma, *prev = vma; if (start != vma->vm_start) { - if (unlikely(mm->map_count >= sysctl_max_map_count)) - return -ENOMEM; - error = __split_vma(mm, vma, start, 1); + error = split_vma(&vmi, vma, start, 1); if (error) return error; } if (end != vma->vm_end) { - if (unlikely(mm->map_count >= sysctl_max_map_count)) - return -ENOMEM; - error = __split_vma(mm, vma, end, 0); + error = split_vma(&vmi, vma, end, 0); if (error) return error; } @@ -179,7 +176,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, /* * vm_flags is protected by the mmap_lock held in write mode. */ - vma->vm_flags = new_flags; + vm_flags_reset(vma, new_flags); if (!vma->vm_file || vma_is_anon_shmem(vma)) { error = replace_anon_vma_name(vma, anon_name); if (error) @@ -617,7 +614,6 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; pte_t *orig_pte, *pte, ptent; struct folio *folio; - struct page *page; int nr_swap = 0; unsigned long next; @@ -658,10 +654,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, continue; } - page = vm_normal_page(vma, addr, ptent); - if (!page || is_zone_device_page(page)) + folio = vm_normal_folio(vma, addr, ptent); + if (!folio || folio_is_zone_device(folio)) continue; - folio = page_folio(page); /* * If pmd isn't transhuge but the folio is large and @@ -765,7 +760,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, range.end = min(vma->vm_end, end_addr); if (range.end <= vma->vm_start) return -EINVAL; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, range.start, range.end); lru_add_drain(); diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index 175e424b9ab1b..e1eb33f490592 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -191,7 +191,7 @@ static int wp_clean_pre_vma(unsigned long start, unsigned long end, wpwalk->tlbflush_end = start; mmu_notifier_range_init(&wpwalk->range, MMU_NOTIFY_PROTECTION_PAGE, 0, - walk->vma, walk->mm, start, end); + walk->mm, start, end); mmu_notifier_invalidate_range_start(&wpwalk->range); flush_cache_range(walk->vma, start, end); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index faeea84964aa8..17335459d8dc7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -350,21 +350,19 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); #endif /** - * mem_cgroup_css_from_page - css of the memcg associated with a page - * @page: page of interest + * mem_cgroup_css_from_folio - css of the memcg associated with a folio + * @folio: folio of interest * * If memcg is bound to the default hierarchy, css of the memcg associated - * with @page is returned. The returned css remains associated with @page + * with @folio is returned. The returned css remains associated with @folio * until it is released. * * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup * is returned. */ -struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) +struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio) { - struct mem_cgroup *memcg; - - memcg = page_memcg(page); + struct mem_cgroup *memcg = folio_memcg(folio); if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) memcg = root_mem_cgroup; @@ -478,12 +476,8 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) struct mem_cgroup_tree_per_node *mctz; if (lru_gen_enabled()) { - struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; - - /* see the comment on MEMCG_NR_GENS */ - if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) - lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); - + if (soft_limit_excess(memcg)) + lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec); return; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6bf07345ea2c0..db85c2d37f70a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -87,6 +87,41 @@ inline void num_poisoned_pages_sub(unsigned long pfn, long i) memblk_nr_poison_sub(pfn, i); } +/** + * MF_ATTR_RO - Create sysfs entry for each memory failure statistics. + * @_name: name of the file in the per NUMA sysfs directory. + */ +#define MF_ATTR_RO(_name) \ +static ssize_t _name##_show(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + struct memory_failure_stats *mf_stats = \ + &NODE_DATA(dev->id)->mf_stats; \ + return sprintf(buf, "%lu\n", mf_stats->_name); \ +} \ +static DEVICE_ATTR_RO(_name) + +MF_ATTR_RO(total); +MF_ATTR_RO(ignored); +MF_ATTR_RO(failed); +MF_ATTR_RO(delayed); +MF_ATTR_RO(recovered); + +static struct attribute *memory_failure_attr[] = { + &dev_attr_total.attr, + &dev_attr_ignored.attr, + &dev_attr_failed.attr, + &dev_attr_delayed.attr, + &dev_attr_recovered.attr, + NULL, +}; + +const struct attribute_group memory_failure_attr_group = { + .name = "memory_failure", + .attrs = memory_failure_attr, +}; + /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, @@ -1192,6 +1227,39 @@ static struct page_state error_states[] = { #undef slab #undef reserved +static void update_per_node_mf_stats(unsigned long pfn, + enum mf_result result) +{ + int nid = MAX_NUMNODES; + struct memory_failure_stats *mf_stats = NULL; + + nid = pfn_to_nid(pfn); + if (unlikely(nid < 0 || nid >= MAX_NUMNODES)) { + WARN_ONCE(1, "Memory failure: pfn=%#lx, invalid nid=%d", pfn, nid); + return; + } + + mf_stats = &NODE_DATA(nid)->mf_stats; + switch (result) { + case MF_IGNORED: + ++mf_stats->ignored; + break; + case MF_FAILED: + ++mf_stats->failed; + break; + case MF_DELAYED: + ++mf_stats->delayed; + break; + case MF_RECOVERED: + ++mf_stats->recovered; + break; + default: + WARN_ONCE(1, "Memory failure: mf_result=%d is not properly handled", result); + break; + } + ++mf_stats->total; +} + /* * "Dirty/Clean" indication is not 100% accurate due to the possibility of * setting PG_dirty outside page lock. See also comment above set_page_dirty(). @@ -1202,6 +1270,9 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type, trace_memory_failure_event(pfn, type, result); num_poisoned_pages_inc(pfn); + + update_per_node_mf_stats(pfn, result); + pr_err("%#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); @@ -1257,28 +1328,28 @@ static inline bool HWPoisonHandlable(struct page *page, unsigned long flags) static int __get_hwpoison_page(struct page *page, unsigned long flags) { - struct page *head = compound_head(page); + struct folio *folio = page_folio(page); int ret = 0; bool hugetlb = false; - ret = get_hwpoison_huge_page(head, &hugetlb, false); + ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, false); if (hugetlb) return ret; /* - * This check prevents from calling get_page_unless_zero() for any - * unsupported type of page in order to reduce the risk of unexpected - * races caused by taking a page refcount. + * This check prevents from calling folio_try_get() for any + * unsupported type of folio in order to reduce the risk of unexpected + * races caused by taking a folio refcount. */ - if (!HWPoisonHandlable(head, flags)) + if (!HWPoisonHandlable(&folio->page, flags)) return -EBUSY; - if (get_page_unless_zero(head)) { - if (head == compound_head(page)) + if (folio_try_get(folio)) { + if (folio == page_folio(page)) return 1; pr_info("%#lx cannot catch tail\n", page_to_pfn(page)); - put_page(head); + folio_put(folio); } return 0; @@ -1347,11 +1418,11 @@ static int get_any_page(struct page *p, unsigned long flags) static int __get_unpoison_page(struct page *page) { - struct page *head = compound_head(page); + struct folio *folio = page_folio(page); int ret = 0; bool hugetlb = false; - ret = get_hwpoison_huge_page(head, &hugetlb, true); + ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, true); if (hugetlb) return ret; @@ -1695,18 +1766,18 @@ struct raw_hwp_page { struct page *page; }; -static inline struct llist_head *raw_hwp_list_head(struct page *hpage) +static inline struct llist_head *raw_hwp_list_head(struct folio *folio) { - return (struct llist_head *)&page_folio(hpage)->_hugetlb_hwpoison; + return (struct llist_head *)&folio->_hugetlb_hwpoison; } -static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag) +static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag) { struct llist_head *head; struct llist_node *t, *tnode; unsigned long count = 0; - head = raw_hwp_list_head(hpage); + head = raw_hwp_list_head(folio); llist_for_each_safe(tnode, t, head->first) { struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node); @@ -1721,21 +1792,21 @@ static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag) return count; } -static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) +static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page) { struct llist_head *head; struct raw_hwp_page *raw_hwp; struct llist_node *t, *tnode; - int ret = TestSetPageHWPoison(hpage) ? -EHWPOISON : 0; + int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0; /* * Once the hwpoison hugepage has lost reliable raw error info, * there is little meaning to keep additional error info precisely, * so skip to add additional raw error info. */ - if (HPageRawHwpUnreliable(hpage)) + if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return -EHWPOISON; - head = raw_hwp_list_head(hpage); + head = raw_hwp_list_head(folio); llist_for_each_safe(tnode, t, head->first) { struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node); @@ -1756,41 +1827,41 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) * hwpoisoned subpages, and we need refuse to free/dissolve * this hwpoisoned hugepage. */ - SetHPageRawHwpUnreliable(hpage); + folio_set_hugetlb_raw_hwp_unreliable(folio); /* - * Once HPageRawHwpUnreliable is set, raw_hwp_page is not + * Once hugetlb_raw_hwp_unreliable is set, raw_hwp_page is not * used any more, so free it. */ - __free_raw_hwp_pages(hpage, false); + __folio_free_raw_hwp(folio, false); } return ret; } -static unsigned long free_raw_hwp_pages(struct page *hpage, bool move_flag) +static unsigned long folio_free_raw_hwp(struct folio *folio, bool move_flag) { /* - * HPageVmemmapOptimized hugepages can't be freed because struct + * hugetlb_vmemmap_optimized hugepages can't be freed because struct * pages for tail pages are required but they don't exist. */ - if (move_flag && HPageVmemmapOptimized(hpage)) + if (move_flag && folio_test_hugetlb_vmemmap_optimized(folio)) return 0; /* - * HPageRawHwpUnreliable hugepages shouldn't be unpoisoned by + * hugetlb_raw_hwp_unreliable hugepages shouldn't be unpoisoned by * definition. */ - if (HPageRawHwpUnreliable(hpage)) + if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return 0; - return __free_raw_hwp_pages(hpage, move_flag); + return __folio_free_raw_hwp(folio, move_flag); } -void hugetlb_clear_page_hwpoison(struct page *hpage) +void folio_clear_hugetlb_hwpoison(struct folio *folio) { - if (HPageRawHwpUnreliable(hpage)) + if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; - ClearPageHWPoison(hpage); - free_raw_hwp_pages(hpage, true); + folio_clear_hwpoison(folio); + folio_free_raw_hwp(folio, true); } /* @@ -1807,20 +1878,20 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { struct page *page = pfn_to_page(pfn); - struct page *head = compound_head(page); + struct folio *folio = page_folio(page); int ret = 2; /* fallback to normal page handling */ bool count_increased = false; - if (!PageHeadHuge(head)) + if (!folio_test_hugetlb(folio)) goto out; if (flags & MF_COUNT_INCREASED) { ret = 1; count_increased = true; - } else if (HPageFreed(head)) { + } else if (folio_test_hugetlb_freed(folio)) { ret = 0; - } else if (HPageMigratable(head)) { - ret = get_page_unless_zero(head); + } else if (folio_test_hugetlb_migratable(folio)) { + ret = folio_try_get(folio); if (ret) count_increased = true; } else { @@ -1829,24 +1900,24 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, goto out; } - if (hugetlb_set_page_hwpoison(head, page)) { + if (folio_set_hugetlb_hwpoison(folio, page)) { ret = -EHWPOISON; goto out; } /* - * Clearing HPageMigratable for hwpoisoned hugepages to prevent them + * Clearing hugetlb_migratable for hwpoisoned hugepages to prevent them * from being migrated by memory hotremove. */ - if (count_increased && HPageMigratable(head)) { - ClearHPageMigratable(head); + if (count_increased && folio_test_hugetlb_migratable(folio)) { + folio_clear_hugetlb_migratable(folio); *migratable_cleared = true; } return ret; out: if (count_increased) - put_page(head); + folio_put(folio); return ret; } @@ -1860,7 +1931,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb { int res; struct page *p = pfn_to_page(pfn); - struct page *head; + struct folio *folio; unsigned long page_flags; bool migratable_cleared = false; @@ -1873,8 +1944,8 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb } else if (res == -EHWPOISON) { pr_err("%#lx: already hardware poisoned\n", pfn); if (flags & MF_ACTION_REQUIRED) { - head = compound_head(p); - res = kill_accessing_process(current, page_to_pfn(head), flags); + folio = page_folio(p); + res = kill_accessing_process(current, folio_pfn(folio), flags); } return res; } else if (res == -EBUSY) { @@ -1885,16 +1956,16 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb return action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); } - head = compound_head(p); - lock_page(head); + folio = page_folio(p); + folio_lock(folio); if (hwpoison_filter(p)) { - hugetlb_clear_page_hwpoison(head); + folio_clear_hugetlb_hwpoison(folio); if (migratable_cleared) - SetHPageMigratable(head); - unlock_page(head); + folio_set_hugetlb_migratable(folio); + folio_unlock(folio); if (res == 1) - put_page(head); + folio_put(folio); return -EOPNOTSUPP; } @@ -1903,7 +1974,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb * or demotion can be prevented by PageHWPoison flag. */ if (res == 0) { - unlock_page(head); + folio_unlock(folio); if (__page_handle_poison(p) >= 0) { page_ref_inc(p); res = MF_RECOVERED; @@ -1913,10 +1984,10 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb return action_result(pfn, MF_MSG_FREE_HUGE, res); } - page_flags = head->flags; + page_flags = folio->flags; - if (!hwpoison_user_mappings(p, pfn, flags, head)) { - unlock_page(head); + if (!hwpoison_user_mappings(p, pfn, flags, &folio->page)) { + folio_unlock(folio); return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); } @@ -1929,7 +2000,7 @@ static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int * return 0; } -static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag) +static inline unsigned long folio_free_raw_hwp(struct folio *folio, bool flag) { return 0; } @@ -2167,7 +2238,7 @@ int memory_failure(unsigned long pfn, int flags) } /* - * __munlock_pagevec may clear a writeback page's LRU flag without + * __munlock_folio() may clear a writeback page's LRU flag without * page_lock. We need wait writeback completion for this page or it * may trigger vfs BUG while evict inode. */ @@ -2335,7 +2406,7 @@ core_initcall(memory_failure_init); */ int unpoison_memory(unsigned long pfn) { - struct page *page; + struct folio *folio; struct page *p; int ret = -EBUSY; unsigned long count = 1; @@ -2347,7 +2418,7 @@ int unpoison_memory(unsigned long pfn) return -ENXIO; p = pfn_to_page(pfn); - page = compound_head(p); + folio = page_folio(p); mutex_lock(&mf_mutex); @@ -2358,44 +2429,44 @@ int unpoison_memory(unsigned long pfn) goto unlock_mutex; } - if (!PageHWPoison(p)) { + if (!folio_test_hwpoison(folio)) { unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", pfn, &unpoison_rs); goto unlock_mutex; } - if (page_count(page) > 1) { + if (folio_ref_count(folio) > 1) { unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n", pfn, &unpoison_rs); goto unlock_mutex; } - if (page_mapped(page)) { + if (folio_mapped(folio)) { unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n", pfn, &unpoison_rs); goto unlock_mutex; } - if (page_mapping(page)) { + if (folio_mapping(folio)) { unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n", pfn, &unpoison_rs); goto unlock_mutex; } - if (PageSlab(page) || PageTable(page) || PageReserved(page)) + if (folio_test_slab(folio) || PageTable(&folio->page) || folio_test_reserved(folio)) goto unlock_mutex; ret = get_hwpoison_page(p, MF_UNPOISON); if (!ret) { if (PageHuge(p)) { huge = true; - count = free_raw_hwp_pages(page, false); + count = folio_free_raw_hwp(folio, false); if (count == 0) { ret = -EBUSY; goto unlock_mutex; } } - ret = TestClearPageHWPoison(page) ? 0 : -EBUSY; + ret = folio_test_clear_hwpoison(folio) ? 0 : -EBUSY; } else if (ret < 0) { if (ret == -EHWPOISON) { ret = put_page_back_buddy(p) ? 0 : -EBUSY; @@ -2405,17 +2476,17 @@ int unpoison_memory(unsigned long pfn) } else { if (PageHuge(p)) { huge = true; - count = free_raw_hwp_pages(page, false); + count = folio_free_raw_hwp(folio, false); if (count == 0) { ret = -EBUSY; - put_page(page); + folio_put(folio); goto unlock_mutex; } } - put_page(page); + folio_put(folio); if (TestClearPageHWPoison(p)) { - put_page(page); + folio_put(folio); ret = 0; } } @@ -2437,7 +2508,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) bool isolated = false; if (PageHuge(page)) { - isolated = !isolate_hugetlb(page, pagelist); + isolated = !isolate_hugetlb(page_folio(page), pagelist); } else { bool lru = !__PageMovable(page); diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index c734658c62424..e593e56e530b7 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -211,8 +211,8 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty ret = device_register(&new_memtier->dev); if (ret) { - list_del(&memtier->list); - put_device(&memtier->dev); + list_del(&new_memtier->list); + put_device(&new_memtier->dev); return ERR_PTR(ret); } memtier = new_memtier; diff --git a/mm/memory.c b/mm/memory.c index 635a85e7c3f0e..7a04a1130ec15 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -863,13 +863,13 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, static inline int copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, - struct page **prealloc, struct page *page) + struct folio **prealloc, struct page *page) { - struct page *new_page; + struct folio *new_folio; pte_t pte; - new_page = *prealloc; - if (!new_page) + new_folio = *prealloc; + if (!new_folio) return -EAGAIN; /* @@ -877,14 +877,14 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * over and copy the page & arm it. */ *prealloc = NULL; - copy_user_highpage(new_page, page, addr, src_vma); - __SetPageUptodate(new_page); - page_add_new_anon_rmap(new_page, dst_vma, addr); - lru_cache_add_inactive_or_unevictable(new_page, dst_vma); - rss[mm_counter(new_page)]++; + copy_user_highpage(&new_folio->page, page, addr, src_vma); + __folio_mark_uptodate(new_folio); + folio_add_new_anon_rmap(new_folio, dst_vma, addr); + folio_add_lru_vma(new_folio, dst_vma); + rss[MM_ANONPAGES]++; /* All done, just insert the new page copy in the child */ - pte = mk_pte(new_page, dst_vma->vm_page_prot); + pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); if (userfaultfd_pte_wp(dst_vma, *src_pte)) /* Uffd-wp needs to be delivered to dest pte as well */ @@ -900,33 +900,36 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma static inline int copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, - struct page **prealloc) + struct folio **prealloc) { struct mm_struct *src_mm = src_vma->vm_mm; unsigned long vm_flags = src_vma->vm_flags; pte_t pte = *src_pte; struct page *page; + struct folio *folio; page = vm_normal_page(src_vma, addr, pte); - if (page && PageAnon(page)) { + if (page) + folio = page_folio(page); + if (page && folio_test_anon(folio)) { /* * If this page may have been pinned by the parent process, * copy the page immediately for the child so that we'll always * guarantee the pinned page won't be randomly replaced in the * future. */ - get_page(page); + folio_get(folio); if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) { - /* Page maybe pinned, we have to copy. */ - put_page(page); + /* Page may be pinned, we have to copy. */ + folio_put(folio); return copy_present_page(dst_vma, src_vma, dst_pte, src_pte, addr, rss, prealloc, page); } - rss[mm_counter(page)]++; + rss[MM_ANONPAGES]++; } else if (page) { - get_page(page); + folio_get(folio); page_dup_file_rmap(page, false); - rss[mm_counter(page)]++; + rss[mm_counter_file(page)]++; } /* @@ -937,7 +940,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, ptep_set_wrprotect(src_mm, addr, src_pte); pte = pte_wrprotect(pte); } - VM_BUG_ON(page && PageAnon(page) && PageAnonExclusive(page)); + VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page)); /* * If it's a shared mapping, mark it clean in @@ -954,23 +957,22 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, return 0; } -static inline struct page * -page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, - unsigned long addr) +static inline struct folio *page_copy_prealloc(struct mm_struct *src_mm, + struct vm_area_struct *vma, unsigned long addr) { - struct page *new_page; + struct folio *new_folio; - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr); - if (!new_page) + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false); + if (!new_folio) return NULL; - if (mem_cgroup_charge(page_folio(new_page), src_mm, GFP_KERNEL)) { - put_page(new_page); + if (mem_cgroup_charge(new_folio, src_mm, GFP_KERNEL)) { + folio_put(new_folio); return NULL; } - cgroup_throttle_swaprate(new_page, GFP_KERNEL); + cgroup_throttle_swaprate(&new_folio->page, GFP_KERNEL); - return new_page; + return new_folio; } static int @@ -986,7 +988,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, int progress, ret = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; - struct page *prealloc = NULL; + struct folio *prealloc = NULL; again: progress = 0; @@ -1056,7 +1058,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, * will allocate page according to address). This * could only happen if one pinned pte changed. */ - put_page(prealloc); + folio_put(prealloc); prealloc = NULL; } progress += 8; @@ -1093,7 +1095,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, goto again; out: if (unlikely(prealloc)) - put_page(prealloc); + folio_put(prealloc); return ret; } @@ -1266,7 +1268,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) if (is_cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, - 0, src_vma, src_mm, addr, end); + 0, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); /* * Disabling preemption is not needed for the write side, as @@ -1611,7 +1613,7 @@ void unmap_page_range(struct mmu_gather *tlb, static void unmap_single_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, - struct zap_details *details) + struct zap_details *details, bool mm_wr_locked) { unsigned long start = max(vma->vm_start, start_addr); unsigned long end; @@ -1626,7 +1628,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, uprobe_munmap(vma, start, end); if (unlikely(vma->vm_flags & VM_PFNMAP)) - untrack_pfn(vma, 0, 0); + untrack_pfn(vma, 0, 0, mm_wr_locked); if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { @@ -1673,7 +1675,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, */ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr) + unsigned long end_addr, bool mm_wr_locked) { struct mmu_notifier_range range; struct zap_details details = { @@ -1683,11 +1685,12 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, }; MA_STATE(mas, mt, vma->vm_end, vma->vm_end); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); do { - unmap_single_vma(tlb, vma, start_addr, end_addr, &details); + unmap_single_vma(tlb, vma, start_addr, end_addr, &details, + mm_wr_locked); } while ((vma = mas_find(&mas, end_addr - 1)) != NULL); mmu_notifier_invalidate_range_end(&range); } @@ -1709,7 +1712,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, struct mmu_gather tlb; lru_add_drain(); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, end); if (is_vm_hugetlb_page(vma)) adjust_range_if_pmd_sharing_possible(vma, &range.start, @@ -1721,7 +1724,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, * unmap 'address-end' not 'range.start-range.end' as range * could have been expanded for hugetlb pmd sharing. */ - unmap_single_vma(&tlb, vma, address, end, details); + unmap_single_vma(&tlb, vma, address, end, details, false); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } @@ -1926,7 +1929,7 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, if (!(vma->vm_flags & VM_MIXEDMAP)) { BUG_ON(mmap_read_trylock(vma->vm_mm)); BUG_ON(vma->vm_flags & VM_PFNMAP); - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_set(vma, VM_MIXEDMAP); } /* Defer page refcount checking till we're about to map that page. */ return insert_pages(vma, addr, pages, num, vma->vm_page_prot); @@ -1984,7 +1987,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, if (!(vma->vm_flags & VM_MIXEDMAP)) { BUG_ON(mmap_read_trylock(vma->vm_mm)); BUG_ON(vma->vm_flags & VM_PFNMAP); - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_set(vma, VM_MIXEDMAP); } return insert_page(vma, addr, page, vma->vm_page_prot); } @@ -2450,7 +2453,7 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, vma->vm_pgoff = pfn; } - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -2490,7 +2493,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); if (err) - untrack_pfn(vma, pfn, PAGE_ALIGN(size)); + untrack_pfn(vma, pfn, PAGE_ALIGN(size), true); return err; } EXPORT_SYMBOL(remap_pfn_range); @@ -3043,8 +3046,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; - struct page *old_page = vmf->page; - struct page *new_page = NULL; + struct folio *old_folio = NULL; + struct folio *new_folio = NULL; pte_t entry; int page_copied = 0; struct mmu_notifier_range range; @@ -3052,21 +3055,22 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) delayacct_wpcopy_start(); + if (vmf->page) + old_folio = page_folio(vmf->page); if (unlikely(anon_vma_prepare(vma))) goto oom; if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { - new_page = alloc_zeroed_user_highpage_movable(vma, - vmf->address); - if (!new_page) + new_folio = vma_alloc_zeroed_movable_folio(vma, vmf->address); + if (!new_folio) goto oom; } else { - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, - vmf->address); - if (!new_page) + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, + vmf->address, false); + if (!new_folio) goto oom; - ret = __wp_page_copy_user(new_page, old_page, vmf); + ret = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); if (ret) { /* * COW failed, if the fault was solved by other, @@ -3075,23 +3079,23 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * from the second attempt. * The -EHWPOISON case will not be retried. */ - put_page(new_page); - if (old_page) - put_page(old_page); + folio_put(new_folio); + if (old_folio) + folio_put(old_folio); delayacct_wpcopy_end(); return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0; } - kmsan_copy_page_meta(new_page, old_page); + kmsan_copy_page_meta(&new_folio->page, vmf->page); } - if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL)) + if (mem_cgroup_charge(new_folio, mm, GFP_KERNEL)) goto oom_free_new; - cgroup_throttle_swaprate(new_page, GFP_KERNEL); + cgroup_throttle_swaprate(&new_folio->page, GFP_KERNEL); - __SetPageUptodate(new_page); + __folio_mark_uptodate(new_folio); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -3101,16 +3105,16 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { - if (old_page) { - if (!PageAnon(old_page)) { - dec_mm_counter(mm, mm_counter_file(old_page)); + if (old_folio) { + if (!folio_test_anon(old_folio)) { + dec_mm_counter(mm, mm_counter_file(&old_folio->page)); inc_mm_counter(mm, MM_ANONPAGES); } } else { inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); - entry = mk_pte(new_page, vma->vm_page_prot); + entry = mk_pte(&new_folio->page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (unlikely(unshare)) { if (pte_soft_dirty(vmf->orig_pte)) @@ -3129,8 +3133,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * some TLBs while the old PTE remains in others. */ ptep_clear_flush_notify(vma, vmf->address, vmf->pte); - page_add_new_anon_rmap(new_page, vma, vmf->address); - lru_cache_add_inactive_or_unevictable(new_page, vma); + folio_add_new_anon_rmap(new_folio, vma, vmf->address); + folio_add_lru_vma(new_folio, vma); /* * We call the notify macro here because, when using secondary * mmu page tables (such as kvm shadow page tables), we want the @@ -3139,7 +3143,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) BUG_ON(unshare && pte_write(entry)); set_pte_at_notify(mm, vmf->address, vmf->pte, entry); update_mmu_cache(vma, vmf->address, vmf->pte); - if (old_page) { + if (old_folio) { /* * Only after switching the pte to the new page may * we remove the mapcount here. Otherwise another @@ -3162,18 +3166,18 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ - page_remove_rmap(old_page, vma, false); + page_remove_rmap(vmf->page, vma, false); } /* Free the old page.. */ - new_page = old_page; + new_folio = old_folio; page_copied = 1; } else { update_mmu_tlb(vma, vmf->address, vmf->pte); } - if (new_page) - put_page(new_page); + if (new_folio) + folio_put(new_folio); pte_unmap_unlock(vmf->pte, vmf->ptl); /* @@ -3181,19 +3185,19 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * the above ptep_clear_flush_notify() did already call it. */ mmu_notifier_invalidate_range_only_end(&range); - if (old_page) { + if (old_folio) { if (page_copied) - free_swap_cache(old_page); - put_page(old_page); + free_swap_cache(&old_folio->page); + folio_put(old_folio); } delayacct_wpcopy_end(); return 0; oom_free_new: - put_page(new_page); + folio_put(new_folio); oom: - if (old_page) - put_page(old_page); + if (old_folio) + folio_put(old_folio); delayacct_wpcopy_end(); return VM_FAULT_OOM; @@ -3561,7 +3565,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) return VM_FAULT_RETRY; - mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma, + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL); mmu_notifier_invalidate_range_start(&range); @@ -3867,10 +3871,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * the swap entry concurrently) for certainly exclusive pages. */ if (!folio_test_ksm(folio)) { - /* - * Note that pte_swp_exclusive() == false for architectures - * without __HAVE_ARCH_PTE_SWP_EXCLUSIVE. - */ exclusive = pte_swp_exclusive(vmf->orig_pte); if (folio != swapcache) { /* @@ -4001,7 +4001,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page; + struct folio *folio; vm_fault_t ret = 0; pte_t entry; @@ -4051,22 +4051,22 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Allocate our own private page. */ if (unlikely(anon_vma_prepare(vma))) goto oom; - page = alloc_zeroed_user_highpage_movable(vma, vmf->address); - if (!page) + folio = vma_alloc_zeroed_movable_folio(vma, vmf->address); + if (!folio) goto oom; - if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL)) + if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) goto oom_free_page; - cgroup_throttle_swaprate(page, GFP_KERNEL); + cgroup_throttle_swaprate(&folio->page, GFP_KERNEL); /* - * The memory barrier inside __SetPageUptodate makes sure that + * The memory barrier inside __folio_mark_uptodate makes sure that * preceding stores to the page contents become visible before * the set_pte_at() write. */ - __SetPageUptodate(page); + __folio_mark_uptodate(folio); - entry = mk_pte(page, vma->vm_page_prot); + entry = mk_pte(&folio->page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); @@ -4085,13 +4085,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); - put_page(page); + folio_put(folio); return handle_userfault(vmf, VM_UFFD_MISSING); } inc_mm_counter(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, vmf->address); - lru_cache_add_inactive_or_unevictable(page, vma); + folio_add_new_anon_rmap(folio, vma, vmf->address); + folio_add_lru_vma(folio, vma); setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); @@ -4101,10 +4101,10 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; release: - put_page(page); + folio_put(folio); goto unlock; oom_free_page: - put_page(page); + folio_put(folio); oom: return VM_FAULT_OOM; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fd40f7e9f1763..a1e8c3e9ab080 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1641,7 +1641,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (PageHuge(page)) { pfn = page_to_pfn(head) + compound_nr(head) - 1; - isolate_hugetlb(head, &source); + isolate_hugetlb(folio, &source); continue; } else if (PageTransHuge(page)) pfn = page_to_pfn(head) + thp_nr_pages(page) - 1; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 72142fbe76528..0919c7a719d41 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -414,7 +414,7 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { }, }; -static int migrate_page_add(struct page *page, struct list_head *pagelist, +static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); struct queue_pages { @@ -427,36 +427,36 @@ struct queue_pages { }; /* - * Check if the page's nid is in qp->nmask. + * Check if the folio's nid is in qp->nmask. * * If MPOL_MF_INVERT is set in qp->flags, check if the nid is * in the invert of qp->nmask. */ -static inline bool queue_pages_required(struct page *page, +static inline bool queue_folio_required(struct folio *folio, struct queue_pages *qp) { - int nid = page_to_nid(page); + int nid = folio_nid(folio); unsigned long flags = qp->flags; return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); } /* - * queue_pages_pmd() has three possible return values: - * 0 - pages are placed on the right node or queued successfully, or + * queue_folios_pmd() has three possible return values: + * 0 - folios are placed on the right node or queued successfully, or * special page is met, i.e. huge zero page. - * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were + * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an - * existing page was already on a node that does not follow the + * existing folio was already on a node that does not follow the * policy. */ -static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, +static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, unsigned long end, struct mm_walk *walk) __releases(ptl) { int ret = 0; - struct page *page; + struct folio *folio; struct queue_pages *qp = walk->private; unsigned long flags; @@ -464,19 +464,19 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, ret = -EIO; goto unlock; } - page = pmd_page(*pmd); - if (is_huge_zero_page(page)) { + folio = pfn_folio(pmd_pfn(*pmd)); + if (is_huge_zero_page(&folio->page)) { walk->action = ACTION_CONTINUE; goto unlock; } - if (!queue_pages_required(page, qp)) + if (!queue_folio_required(folio, qp)) goto unlock; flags = qp->flags; - /* go to thp migration */ + /* go to folio migration */ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { if (!vma_migratable(walk->vma) || - migrate_page_add(page, qp->pagelist, flags)) { + migrate_folio_add(folio, qp->pagelist, flags)) { ret = 1; goto unlock; } @@ -491,19 +491,19 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, * Scan through pages checking if pages follow certain conditions, * and move them to the pagelist if they do. * - * queue_pages_pte_range() has three possible return values: - * 0 - pages are placed on the right node or queued successfully, or + * queue_folios_pte_range() has three possible return values: + * 0 - folios are placed on the right node or queued successfully, or * special page is met, i.e. zero page. - * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were + * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. - * -EIO - only MPOL_MF_STRICT was specified and an existing page was already + * -EIO - only MPOL_MF_STRICT was specified and an existing folio was already * on a node that does not follow the policy. */ -static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, +static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; - struct page *page; + struct folio *folio; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; bool has_unmovable = false; @@ -512,7 +512,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) - return queue_pages_pmd(pmd, ptl, addr, end, walk); + return queue_folios_pmd(pmd, ptl, addr, end, walk); if (pmd_trans_unstable(pmd)) return 0; @@ -521,16 +521,16 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) continue; - page = vm_normal_page(vma, addr, *pte); - if (!page || is_zone_device_page(page)) + folio = vm_normal_folio(vma, addr, *pte); + if (!folio || folio_is_zone_device(folio)) continue; /* - * vm_normal_page() filters out zero pages, but there might - * still be PageReserved pages to skip, perhaps in a VDSO. + * vm_normal_folio() filters out zero pages, but there might + * still be reserved folios to skip, perhaps in a VDSO. */ - if (PageReserved(page)) + if (folio_test_reserved(folio)) continue; - if (!queue_pages_required(page, qp)) + if (!queue_folio_required(folio, qp)) continue; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { /* MPOL_MF_STRICT must be specified if we get here */ @@ -544,7 +544,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, * temporary off LRU pages in the range. Still * need migrate other LRU pages. */ - if (migrate_page_add(page, qp->pagelist, flags)) + if (migrate_folio_add(folio, qp->pagelist, flags)) has_unmovable = true; } else break; @@ -558,7 +558,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, return addr != end ? -EIO : 0; } -static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, +static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -566,7 +566,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; unsigned long flags = (qp->flags & MPOL_MF_VALID); - struct page *page; + struct folio *folio; spinlock_t *ptl; pte_t entry; @@ -574,13 +574,13 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, entry = huge_ptep_get(pte); if (!pte_present(entry)) goto unlock; - page = pte_page(entry); - if (!queue_pages_required(page, qp)) + folio = pfn_folio(pte_pfn(entry)); + if (!queue_folio_required(folio, qp)) goto unlock; if (flags == MPOL_MF_STRICT) { /* - * STRICT alone means only detecting misplaced page and no + * STRICT alone means only detecting misplaced folio and no * need to further check other vma. */ ret = -EIO; @@ -591,21 +591,28 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, /* * Must be STRICT with MOVE*, otherwise .test_walk() have * stopped walking current vma. - * Detecting misplaced page but allow migrating pages which + * Detecting misplaced folio but allow migrating folios which * have been queued. */ ret = 1; goto unlock; } - /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ + /* + * With MPOL_MF_MOVE, we try to migrate only unshared folios. If it + * is shared it is likely not worth migrating. + * + * To check if the folio is shared, ideally we want to make sure + * every page is mapped to the same process. Doing that is very + * expensive, so check the estimated mapcount of the folio instead. + */ if (flags & (MPOL_MF_MOVE_ALL) || - (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 && + (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte))) { - if (isolate_hugetlb(page, qp->pagelist) && + if (isolate_hugetlb(folio, qp->pagelist) && (flags & MPOL_MF_STRICT)) /* - * Failed to isolate page but allow migrating pages + * Failed to isolate folio but allow migrating pages * which have been queued. */ ret = 1; @@ -703,8 +710,8 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, } static const struct mm_walk_ops queue_pages_walk_ops = { - .hugetlb_entry = queue_pages_hugetlb, - .pmd_entry = queue_pages_pte_range, + .hugetlb_entry = queue_folios_hugetlb, + .pmd_entry = queue_folios_pte_range, .test_walk = queue_pages_test_walk, }; @@ -787,24 +794,21 @@ static int vma_replace_policy(struct vm_area_struct *vma, static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) { - MA_STATE(mas, &mm->mm_mt, start, start); + VMA_ITERATOR(vmi, mm, start); struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; pgoff_t pgoff; - prev = mas_prev(&mas, 0); - if (unlikely(!prev)) - mas_set(&mas, start); - - vma = mas_find(&mas, end - 1); + prev = vma_prev(&vmi); + vma = vma_find(&vmi, end); if (WARN_ON(!vma)) return 0; if (start > vma->vm_start) prev = vma; - for (; vma; vma = mas_next(&mas, end - 1)) { + do { unsigned long vmstart = max(start, vma->vm_start); unsigned long vmend = min(end, vma->vm_end); @@ -813,29 +817,23 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); - prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, + prev = vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (prev) { - /* vma_merge() invalidated the mas */ - mas_pause(&mas); vma = prev; goto replace; } if (vma->vm_start != vmstart) { - err = split_vma(vma->vm_mm, vma, vmstart, 1); + err = split_vma(&vmi, vma, vmstart, 1); if (err) goto out; - /* split_vma() invalidated the mas */ - mas_pause(&mas); } if (vma->vm_end != vmend) { - err = split_vma(vma->vm_mm, vma, vmend, 0); + err = split_vma(&vmi, vma, vmend, 0); if (err) goto out; - /* split_vma() invalidated the mas */ - mas_pause(&mas); } replace: err = vma_replace_policy(vma, new_pol); @@ -843,7 +841,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, goto out; next: prev = vma; - } + } for_each_vma_range(vmi, vma, end); out: return err; @@ -1023,27 +1021,28 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } #ifdef CONFIG_MIGRATION -/* - * page migration, thp tail pages can be passed. - */ -static int migrate_page_add(struct page *page, struct list_head *pagelist, +static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { - struct page *head = compound_head(page); /* - * Avoid migrating a page that is shared with others. + * We try to migrate only unshared folios. If it is shared it + * is likely not worth migrating. + * + * To check if the folio is shared, ideally we want to make sure + * every page is mapped to the same process. Doing that is very + * expensive, so check the estimated mapcount of the folio instead. */ - if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) { - if (!isolate_lru_page(head)) { - list_add_tail(&head->lru, pagelist); - mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_lru(head), - thp_nr_pages(head)); + if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) { + if (!folio_isolate_lru(folio)) { + list_add_tail(&folio->lru, foliolist); + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + folio_nr_pages(folio)); } else if (flags & MPOL_MF_STRICT) { /* - * Non-movable page may reach here. And, there may be - * temporary off LRU pages or non-LRU movable pages. - * Treat them as unmovable pages since they can't be + * Non-movable folio may reach here. And, there may be + * temporary off LRU folios or non-LRU movable folios. + * Treat them as unmovable folios since they can't be * isolated, so they can't be moved at the moment. It * should return -EIO for this case too. */ @@ -1219,9 +1218,11 @@ static struct page *new_page(struct page *page, unsigned long start) break; } - if (folio_test_hugetlb(src)) - return alloc_huge_page_vma(page_hstate(&src->page), + if (folio_test_hugetlb(src)) { + dst = alloc_hugetlb_folio_vma(folio_hstate(src), vma, address); + return &dst->page; + } if (folio_test_large(src)) gfp = GFP_TRANSHUGE; @@ -1235,7 +1236,7 @@ static struct page *new_page(struct page *page, unsigned long start) } #else -static int migrate_page_add(struct page *page, struct list_head *pagelist, +static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { return -EIO; diff --git a/mm/memremap.c b/mm/memremap.c index 08cbf54fe0370..2f88f43d4a017 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -129,7 +129,7 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id) } mem_hotplug_done(); - untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range)); + untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true); pgmap_array_delete(range); } @@ -276,7 +276,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, if (!is_private) kasan_remove_zero_shadow(__va(range->start), range_len(range)); err_kasan: - untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range)); + untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range), true); err_pfn_remap: pgmap_array_delete(range); return error; diff --git a/mm/migrate.c b/mm/migrate.c index 98de7ce2b5768..5b40b9040ba60 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -60,6 +60,7 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) { + struct folio *folio = folio_get_nontail_page(page); const struct movable_operations *mops; /* @@ -71,11 +72,11 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) * the put_page() at the end of this block will take care of * release this page, thus avoiding a nasty leakage. */ - if (unlikely(!get_page_unless_zero(page))) + if (!folio) goto out; - if (unlikely(PageSlab(page))) - goto out_putpage; + if (unlikely(folio_test_slab(folio))) + goto out_putfolio; /* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */ smp_rmb(); /* @@ -83,12 +84,12 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) * we use non-atomic bitops on newly allocated page flags so * unconditionally grabbing the lock ruins page's owner side. */ - if (unlikely(!__PageMovable(page))) - goto out_putpage; + if (unlikely(!__folio_test_movable(folio))) + goto out_putfolio; /* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */ smp_rmb(); - if (unlikely(PageSlab(page))) - goto out_putpage; + if (unlikely(folio_test_slab(folio))) + goto out_putfolio; /* * As movable pages are not isolated from LRU lists, concurrent @@ -101,39 +102,39 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) * lets be sure we have the page lock * before proceeding with the movable page isolation steps. */ - if (unlikely(!trylock_page(page))) - goto out_putpage; + if (unlikely(!folio_trylock(folio))) + goto out_putfolio; - if (!PageMovable(page) || PageIsolated(page)) + if (!folio_test_movable(folio) || folio_test_isolated(folio)) goto out_no_isolated; - mops = page_movable_ops(page); - VM_BUG_ON_PAGE(!mops, page); + mops = folio_movable_ops(folio); + VM_BUG_ON_FOLIO(!mops, folio); - if (!mops->isolate_page(page, mode)) + if (!mops->isolate_page(&folio->page, mode)) goto out_no_isolated; /* Driver shouldn't use PG_isolated bit of page->flags */ - WARN_ON_ONCE(PageIsolated(page)); - SetPageIsolated(page); - unlock_page(page); + WARN_ON_ONCE(folio_test_isolated(folio)); + folio_set_isolated(folio); + folio_unlock(folio); return 0; out_no_isolated: - unlock_page(page); -out_putpage: - put_page(page); + folio_unlock(folio); +out_putfolio: + folio_put(folio); out: return -EBUSY; } -static void putback_movable_page(struct page *page) +static void putback_movable_folio(struct folio *folio) { - const struct movable_operations *mops = page_movable_ops(page); + const struct movable_operations *mops = folio_movable_ops(folio); - mops->putback_page(page); - ClearPageIsolated(page); + mops->putback_page(&folio->page); + folio_clear_isolated(folio); } /* @@ -146,33 +147,33 @@ static void putback_movable_page(struct page *page) */ void putback_movable_pages(struct list_head *l) { - struct page *page; - struct page *page2; + struct folio *folio; + struct folio *folio2; - list_for_each_entry_safe(page, page2, l, lru) { - if (unlikely(PageHuge(page))) { - putback_active_hugepage(page); + list_for_each_entry_safe(folio, folio2, l, lru) { + if (unlikely(folio_test_hugetlb(folio))) { + folio_putback_active_hugetlb(folio); continue; } - list_del(&page->lru); + list_del(&folio->lru); /* - * We isolated non-lru movable page so here we can use - * __PageMovable because LRU page's mapping cannot have + * We isolated non-lru movable folio so here we can use + * __PageMovable because LRU folio's mapping cannot have * PAGE_MAPPING_MOVABLE. */ - if (unlikely(__PageMovable(page))) { - VM_BUG_ON_PAGE(!PageIsolated(page), page); - lock_page(page); - if (PageMovable(page)) - putback_movable_page(page); + if (unlikely(__folio_test_movable(folio))) { + VM_BUG_ON_FOLIO(!folio_test_isolated(folio), folio); + folio_lock(folio); + if (folio_test_movable(folio)) + putback_movable_folio(folio); else - ClearPageIsolated(page); - unlock_page(page); - put_page(page); + folio_clear_isolated(folio); + folio_unlock(folio); + folio_put(folio); } else { - mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + - page_is_file_lru(page), -thp_nr_pages(page)); - putback_lru_page(page); + node_stat_mod_folio(folio, NR_ISOLATED_ANON + + folio_is_file_lru(folio), -folio_nr_pages(folio)); + folio_putback_lru(folio); } } } @@ -265,7 +266,7 @@ static bool remove_migration_pte(struct folio *folio, set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } if (vma->vm_flags & VM_LOCKED) - mlock_page_drain_local(); + mlock_drain_local(); trace_remove_migration_pte(pvmw.address, pte_val(pte), compound_order(new)); @@ -990,7 +991,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, goto out; } - mops = page_movable_ops(&src->page); + mops = folio_movable_ops(src); rc = mops->migrate_page(&dst->page, &src->page, mode); WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS && !folio_test_isolated(src)); @@ -1298,7 +1299,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ - putback_active_hugepage(hpage); + folio_putback_active_hugetlb(src); return MIGRATEPAGE_SUCCESS; } @@ -1383,7 +1384,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, folio_unlock(src); out: if (rc == MIGRATEPAGE_SUCCESS) - putback_active_hugepage(hpage); + folio_putback_active_hugetlb(src); else if (rc != -EAGAIN) list_move_tail(&src->lru, ret); @@ -1395,7 +1396,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (put_new_page) put_new_page(new_hpage, private); else - putback_active_hugepage(new_hpage); + folio_putback_active_hugetlb(dst); return rc; } @@ -1663,6 +1664,7 @@ struct page *alloc_migration_target(struct page *page, unsigned long private) struct migration_target_control *mtc; gfp_t gfp_mask; unsigned int order = 0; + struct folio *hugetlb_folio = NULL; struct folio *new_folio = NULL; int nid; int zidx; @@ -1677,7 +1679,9 @@ struct page *alloc_migration_target(struct page *page, unsigned long private) struct hstate *h = folio_hstate(folio); gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); - return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask); + hugetlb_folio = alloc_hugetlb_folio_nodemask(h, nid, + mtc->nmask, gfp_mask); + return &hugetlb_folio->page; } if (folio_test_large(folio)) { @@ -1773,7 +1777,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (PageHuge(page)) { if (PageHead(page)) { - err = isolate_hugetlb(page, pagelist); + err = isolate_hugetlb(page_folio(page), pagelist); if (!err) err = 1; } diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 721b2365dbca9..6c3740318a98c 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -306,7 +306,7 @@ static void migrate_vma_collect(struct migrate_vma *migrate) * private page mappings that won't be migrated. */ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0, - migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end, + migrate->vma->vm_mm, migrate->start, migrate->end, migrate->pgmap_owner); mmu_notifier_invalidate_range_start(&range); @@ -733,7 +733,7 @@ static void __migrate_device_pages(unsigned long *src_pfns, notified = true; mmu_notifier_range_init_owner(&range, - MMU_NOTIFY_MIGRATE, 0, migrate->vma, + MMU_NOTIFY_MIGRATE, 0, migrate->vma->vm_mm, addr, migrate->end, migrate->pgmap_owner); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/mlock.c b/mm/mlock.c index 7032f6dd0ce19..ed49459e343e5 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -28,12 +28,12 @@ #include "internal.h" -struct mlock_pvec { +struct mlock_fbatch { local_lock_t lock; - struct pagevec vec; + struct folio_batch fbatch; }; -static DEFINE_PER_CPU(struct mlock_pvec, mlock_pvec) = { +static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch) = { .lock = INIT_LOCAL_LOCK(lock), }; @@ -48,192 +48,192 @@ bool can_do_mlock(void) EXPORT_SYMBOL(can_do_mlock); /* - * Mlocked pages are marked with PageMlocked() flag for efficient testing + * Mlocked folios are marked with the PG_mlocked flag for efficient testing * in vmscan and, possibly, the fault path; and to support semi-accurate * statistics. * - * An mlocked page [PageMlocked(page)] is unevictable. As such, it will - * be placed on the LRU "unevictable" list, rather than the [in]active lists. - * The unevictable list is an LRU sibling list to the [in]active lists. - * PageUnevictable is set to indicate the unevictable state. + * An mlocked folio [folio_test_mlocked(folio)] is unevictable. As such, it + * will be ostensibly placed on the LRU "unevictable" list (actually no such + * list exists), rather than the [in]active lists. PG_unevictable is set to + * indicate the unevictable state. */ -static struct lruvec *__mlock_page(struct page *page, struct lruvec *lruvec) +static struct lruvec *__mlock_folio(struct folio *folio, struct lruvec *lruvec) { /* There is nothing more we can do while it's off LRU */ - if (!TestClearPageLRU(page)) + if (!folio_test_clear_lru(folio)) return lruvec; - lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); + lruvec = folio_lruvec_relock_irq(folio, lruvec); - if (unlikely(page_evictable(page))) { + if (unlikely(folio_evictable(folio))) { /* - * This is a little surprising, but quite possible: - * PageMlocked must have got cleared already by another CPU. - * Could this page be on the Unevictable LRU? I'm not sure, - * but move it now if so. + * This is a little surprising, but quite possible: PG_mlocked + * must have got cleared already by another CPU. Could this + * folio be unevictable? I'm not sure, but move it now if so. */ - if (PageUnevictable(page)) { - del_page_from_lru_list(page, lruvec); - ClearPageUnevictable(page); - add_page_to_lru_list(page, lruvec); + if (folio_test_unevictable(folio)) { + lruvec_del_folio(lruvec, folio); + folio_clear_unevictable(folio); + lruvec_add_folio(lruvec, folio); + __count_vm_events(UNEVICTABLE_PGRESCUED, - thp_nr_pages(page)); + folio_nr_pages(folio)); } goto out; } - if (PageUnevictable(page)) { - if (PageMlocked(page)) - page->mlock_count++; + if (folio_test_unevictable(folio)) { + if (folio_test_mlocked(folio)) + folio->mlock_count++; goto out; } - del_page_from_lru_list(page, lruvec); - ClearPageActive(page); - SetPageUnevictable(page); - page->mlock_count = !!PageMlocked(page); - add_page_to_lru_list(page, lruvec); - __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + folio_set_unevictable(folio); + folio->mlock_count = !!folio_test_mlocked(folio); + lruvec_add_folio(lruvec, folio); + __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio)); out: - SetPageLRU(page); + folio_set_lru(folio); return lruvec; } -static struct lruvec *__mlock_new_page(struct page *page, struct lruvec *lruvec) +static struct lruvec *__mlock_new_folio(struct folio *folio, struct lruvec *lruvec) { - VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); + lruvec = folio_lruvec_relock_irq(folio, lruvec); /* As above, this is a little surprising, but possible */ - if (unlikely(page_evictable(page))) + if (unlikely(folio_evictable(folio))) goto out; - SetPageUnevictable(page); - page->mlock_count = !!PageMlocked(page); - __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); + folio_set_unevictable(folio); + folio->mlock_count = !!folio_test_mlocked(folio); + __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio)); out: - add_page_to_lru_list(page, lruvec); - SetPageLRU(page); + lruvec_add_folio(lruvec, folio); + folio_set_lru(folio); return lruvec; } -static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec) +static struct lruvec *__munlock_folio(struct folio *folio, struct lruvec *lruvec) { - int nr_pages = thp_nr_pages(page); + int nr_pages = folio_nr_pages(folio); bool isolated = false; - if (!TestClearPageLRU(page)) + if (!folio_test_clear_lru(folio)) goto munlock; isolated = true; - lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); + lruvec = folio_lruvec_relock_irq(folio, lruvec); - if (PageUnevictable(page)) { + if (folio_test_unevictable(folio)) { /* Then mlock_count is maintained, but might undercount */ - if (page->mlock_count) - page->mlock_count--; - if (page->mlock_count) + if (folio->mlock_count) + folio->mlock_count--; + if (folio->mlock_count) goto out; } /* else assume that was the last mlock: reclaim will fix it if not */ munlock: - if (TestClearPageMlocked(page)) { - __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); - if (isolated || !PageUnevictable(page)) + if (folio_test_clear_mlocked(folio)) { + __zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages); + if (isolated || !folio_test_unevictable(folio)) __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); else __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); } - /* page_evictable() has to be checked *after* clearing Mlocked */ - if (isolated && PageUnevictable(page) && page_evictable(page)) { - del_page_from_lru_list(page, lruvec); - ClearPageUnevictable(page); - add_page_to_lru_list(page, lruvec); + /* folio_evictable() has to be checked *after* clearing Mlocked */ + if (isolated && folio_test_unevictable(folio) && folio_evictable(folio)) { + lruvec_del_folio(lruvec, folio); + folio_clear_unevictable(folio); + lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } out: if (isolated) - SetPageLRU(page); + folio_set_lru(folio); return lruvec; } /* - * Flags held in the low bits of a struct page pointer on the mlock_pvec. + * Flags held in the low bits of a struct folio pointer on the mlock_fbatch. */ -#define LRU_PAGE 0x1 -#define NEW_PAGE 0x2 -static inline struct page *mlock_lru(struct page *page) +#define LRU_FOLIO 0x1 +#define NEW_FOLIO 0x2 +static inline struct folio *mlock_lru(struct folio *folio) { - return (struct page *)((unsigned long)page + LRU_PAGE); + return (struct folio *)((unsigned long)folio + LRU_FOLIO); } -static inline struct page *mlock_new(struct page *page) +static inline struct folio *mlock_new(struct folio *folio) { - return (struct page *)((unsigned long)page + NEW_PAGE); + return (struct folio *)((unsigned long)folio + NEW_FOLIO); } /* - * mlock_pagevec() is derived from pagevec_lru_move_fn(): - * perhaps that can make use of such page pointer flags in future, - * but for now just keep it for mlock. We could use three separate - * pagevecs instead, but one feels better (munlocking a full pagevec - * does not need to drain mlocking pagevecs first). + * mlock_folio_batch() is derived from folio_batch_move_lru(): perhaps that can + * make use of such folio pointer flags in future, but for now just keep it for + * mlock. We could use three separate folio batches instead, but one feels + * better (munlocking a full folio batch does not need to drain mlocking folio + * batches first). */ -static void mlock_pagevec(struct pagevec *pvec) +static void mlock_folio_batch(struct folio_batch *fbatch) { struct lruvec *lruvec = NULL; unsigned long mlock; - struct page *page; + struct folio *folio; int i; - for (i = 0; i < pagevec_count(pvec); i++) { - page = pvec->pages[i]; - mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE); - page = (struct page *)((unsigned long)page - mlock); - pvec->pages[i] = page; + for (i = 0; i < folio_batch_count(fbatch); i++) { + folio = fbatch->folios[i]; + mlock = (unsigned long)folio & (LRU_FOLIO | NEW_FOLIO); + folio = (struct folio *)((unsigned long)folio - mlock); + fbatch->folios[i] = folio; - if (mlock & LRU_PAGE) - lruvec = __mlock_page(page, lruvec); - else if (mlock & NEW_PAGE) - lruvec = __mlock_new_page(page, lruvec); + if (mlock & LRU_FOLIO) + lruvec = __mlock_folio(folio, lruvec); + else if (mlock & NEW_FOLIO) + lruvec = __mlock_new_folio(folio, lruvec); else - lruvec = __munlock_page(page, lruvec); + lruvec = __munlock_folio(folio, lruvec); } if (lruvec) unlock_page_lruvec_irq(lruvec); - release_pages(pvec->pages, pvec->nr); - pagevec_reinit(pvec); + release_pages(fbatch->folios, fbatch->nr); + folio_batch_reinit(fbatch); } -void mlock_page_drain_local(void) +void mlock_drain_local(void) { - struct pagevec *pvec; + struct folio_batch *fbatch; - local_lock(&mlock_pvec.lock); - pvec = this_cpu_ptr(&mlock_pvec.vec); - if (pagevec_count(pvec)) - mlock_pagevec(pvec); - local_unlock(&mlock_pvec.lock); + local_lock(&mlock_fbatch.lock); + fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); + if (folio_batch_count(fbatch)) + mlock_folio_batch(fbatch); + local_unlock(&mlock_fbatch.lock); } -void mlock_page_drain_remote(int cpu) +void mlock_drain_remote(int cpu) { - struct pagevec *pvec; + struct folio_batch *fbatch; WARN_ON_ONCE(cpu_online(cpu)); - pvec = &per_cpu(mlock_pvec.vec, cpu); - if (pagevec_count(pvec)) - mlock_pagevec(pvec); + fbatch = &per_cpu(mlock_fbatch.fbatch, cpu); + if (folio_batch_count(fbatch)) + mlock_folio_batch(fbatch); } -bool need_mlock_page_drain(int cpu) +bool need_mlock_drain(int cpu) { - return pagevec_count(&per_cpu(mlock_pvec.vec, cpu)); + return folio_batch_count(&per_cpu(mlock_fbatch.fbatch, cpu)); } /** @@ -242,10 +242,10 @@ bool need_mlock_page_drain(int cpu) */ void mlock_folio(struct folio *folio) { - struct pagevec *pvec; + struct folio_batch *fbatch; - local_lock(&mlock_pvec.lock); - pvec = this_cpu_ptr(&mlock_pvec.vec); + local_lock(&mlock_fbatch.lock); + fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); if (!folio_test_set_mlocked(folio)) { int nr_pages = folio_nr_pages(folio); @@ -255,54 +255,54 @@ void mlock_folio(struct folio *folio) } folio_get(folio); - if (!pagevec_add(pvec, mlock_lru(&folio->page)) || + if (!folio_batch_add(fbatch, mlock_lru(folio)) || folio_test_large(folio) || lru_cache_disabled()) - mlock_pagevec(pvec); - local_unlock(&mlock_pvec.lock); + mlock_folio_batch(fbatch); + local_unlock(&mlock_fbatch.lock); } /** - * mlock_new_page - mlock a newly allocated page not yet on LRU - * @page: page to be mlocked, either a normal page or a THP head. + * mlock_new_folio - mlock a newly allocated folio not yet on LRU + * @folio: folio to be mlocked, either normal or a THP head. */ -void mlock_new_page(struct page *page) +void mlock_new_folio(struct folio *folio) { - struct pagevec *pvec; - int nr_pages = thp_nr_pages(page); + struct folio_batch *fbatch; + int nr_pages = folio_nr_pages(folio); + + local_lock(&mlock_fbatch.lock); + fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); + folio_set_mlocked(folio); - local_lock(&mlock_pvec.lock); - pvec = this_cpu_ptr(&mlock_pvec.vec); - SetPageMlocked(page); - mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + zone_stat_mod_folio(folio, NR_MLOCK, nr_pages); __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); - get_page(page); - if (!pagevec_add(pvec, mlock_new(page)) || - PageHead(page) || lru_cache_disabled()) - mlock_pagevec(pvec); - local_unlock(&mlock_pvec.lock); + folio_get(folio); + if (!folio_batch_add(fbatch, mlock_new(folio)) || + folio_test_large(folio) || lru_cache_disabled()) + mlock_folio_batch(fbatch); + local_unlock(&mlock_fbatch.lock); } /** - * munlock_page - munlock a page - * @page: page to be munlocked, either a normal page or a THP head. + * munlock_folio - munlock a folio + * @folio: folio to be munlocked, either normal or a THP head. */ -void munlock_page(struct page *page) +void munlock_folio(struct folio *folio) { - struct pagevec *pvec; + struct folio_batch *fbatch; - local_lock(&mlock_pvec.lock); - pvec = this_cpu_ptr(&mlock_pvec.vec); + local_lock(&mlock_fbatch.lock); + fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); /* - * TestClearPageMlocked(page) must be left to __munlock_page(), - * which will check whether the page is multiply mlocked. + * folio_test_clear_mlocked(folio) must be left to __munlock_folio(), + * which will check whether the folio is multiply mlocked. */ - - get_page(page); - if (!pagevec_add(pvec, page) || - PageHead(page) || lru_cache_disabled()) - mlock_pagevec(pvec); - local_unlock(&mlock_pvec.lock); + folio_get(folio); + if (!folio_batch_add(fbatch, folio) || + folio_test_large(folio) || lru_cache_disabled()) + mlock_folio_batch(fbatch); + local_unlock(&mlock_fbatch.lock); } static int mlock_pte_range(pmd_t *pmd, unsigned long addr, @@ -312,7 +312,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *start_pte, *pte; - struct page *page; + struct folio *folio; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -320,11 +320,11 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, goto out; if (is_huge_zero_pmd(*pmd)) goto out; - page = pmd_page(*pmd); + folio = page_folio(pmd_page(*pmd)); if (vma->vm_flags & VM_LOCKED) - mlock_folio(page_folio(page)); + mlock_folio(folio); else - munlock_page(page); + munlock_folio(folio); goto out; } @@ -332,15 +332,15 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) continue; - page = vm_normal_page(vma, addr, *pte); - if (!page || is_zone_device_page(page)) + folio = vm_normal_folio(vma, addr, *pte); + if (!folio || folio_is_zone_device(folio)) continue; - if (PageTransCompound(page)) + if (folio_test_large(folio)) continue; if (vma->vm_flags & VM_LOCKED) - mlock_folio(page_folio(page)); + mlock_folio(folio); else - munlock_page(page); + munlock_folio(folio); } pte_unmap(start_pte); out: @@ -370,9 +370,9 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, /* * There is a slight chance that concurrent page migration, * or page reclaim finding a page of this now-VM_LOCKED vma, - * will call mlock_vma_page() and raise page's mlock_count: + * will call mlock_vma_folio() and raise page's mlock_count: * double counting, leaving the page unevictable indefinitely. - * Communicate this danger to mlock_vma_page() with VM_IO, + * Communicate this danger to mlock_vma_folio() with VM_IO, * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas. * mmap_lock is held in write mode here, so this weird * combination should not be visible to other mmap_lock users; @@ -380,7 +380,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, */ if (newflags & VM_LOCKED) newflags |= VM_IO; - WRITE_ONCE(vma->vm_flags, newflags); + vm_flags_reset(vma, newflags); lru_add_drain(); walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); @@ -388,7 +388,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, if (newflags & VM_IO) { newflags &= ~VM_IO; - WRITE_ONCE(vma->vm_flags, newflags); + vm_flags_reset(vma, newflags); } } @@ -401,8 +401,9 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, * * For vmas that pass the filters, merge/split as appropriate. */ -static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end, vm_flags_t newflags) +static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; pgoff_t pgoff; @@ -417,22 +418,22 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, goto out; pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + *prev = vma_merge(vmi, mm, *prev, start, end, newflags, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*prev) { vma = *prev; goto success; } if (start != vma->vm_start) { - ret = split_vma(mm, vma, start, 1); + ret = split_vma(vmi, vma, start, 1); if (ret) goto out; } if (end != vma->vm_end) { - ret = split_vma(mm, vma, end, 0); + ret = split_vma(vmi, vma, end, 0); if (ret) goto out; } @@ -456,7 +457,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { /* No work to do, and mlocking twice would be wrong */ - vma->vm_flags = newflags; + vm_flags_reset(vma, newflags); } else { mlock_vma_pages_range(vma, start, end, newflags); } @@ -471,7 +472,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, unsigned long nstart, end, tmp; struct vm_area_struct *vma, *prev; int error; - MA_STATE(mas, ¤t->mm->mm_mt, start, start); + VMA_ITERATOR(vmi, current->mm, start); VM_BUG_ON(offset_in_page(start)); VM_BUG_ON(len != PAGE_ALIGN(len)); @@ -480,39 +481,37 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, return -EINVAL; if (end == start) return 0; - vma = mas_walk(&mas); + vma = vma_iter_load(&vmi); if (!vma) return -ENOMEM; + prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; - else - prev = mas_prev(&mas, 0); - for (nstart = start ; ; ) { - vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + nstart = start; + tmp = vma->vm_start; + for_each_vma_range(vmi, vma, end) { + vm_flags_t newflags; + + if (vma->vm_start != tmp) + return -ENOMEM; + newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= flags; - /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ tmp = vma->vm_end; if (tmp > end) tmp = end; - error = mlock_fixup(vma, &prev, nstart, tmp, newflags); + error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags); if (error) break; nstart = tmp; - if (nstart < prev->vm_end) - nstart = prev->vm_end; - if (nstart >= end) - break; - - vma = find_vma(prev->vm_mm, prev->vm_end); - if (!vma || vma->vm_start != nstart) { - error = -ENOMEM; - break; - } } + + if (vma_iter_end(&vmi) < end) + return -ENOMEM; + return error; } @@ -658,11 +657,11 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) */ static int apply_mlockall_flags(int flags) { - MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, current->mm, 0); struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; - current->mm->def_flags &= VM_LOCKED_CLEAR_MASK; + current->mm->def_flags &= ~VM_LOCKED_MASK; if (flags & MCL_FUTURE) { current->mm->def_flags |= VM_LOCKED; @@ -679,15 +678,15 @@ static int apply_mlockall_flags(int flags) to_add |= VM_LOCKONFAULT; } - mas_for_each(&mas, vma, ULONG_MAX) { + for_each_vma(vmi, vma) { vm_flags_t newflags; - newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= to_add; /* Ignore errors */ - mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); - mas_pause(&mas); + mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end, + newflags); cond_resched(); } out: diff --git a/mm/mmap.c b/mm/mmap.c index 425a9349e6108..20f21f0949ddb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -78,7 +78,7 @@ core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next, unsigned long start, - unsigned long end); + unsigned long end, bool mm_wr_locked); static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) { @@ -144,6 +144,24 @@ static void remove_vma(struct vm_area_struct *vma) vm_area_free(vma); } +static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, + unsigned long min) +{ + return mas_prev(&vmi->mas, min); +} + +static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, + unsigned long start, unsigned long end, gfp_t gfp) +{ + vmi->mas.index = start; + vmi->mas.last = end - 1; + mas_store_gfp(&vmi->mas, NULL, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + /* * check_brk_limits() - Use platform specific check of range & verify mlock * limits. @@ -162,10 +180,7 @@ static int check_brk_limits(unsigned long addr, unsigned long len) return mlock_future_check(current->mm, current->mm->def_flags, len); } -static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, - unsigned long newbrk, unsigned long oldbrk, - struct list_head *uf); -static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma, +static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, unsigned long addr, unsigned long request, unsigned long flags); SYSCALL_DEFINE1(brk, unsigned long, brk) { @@ -176,7 +191,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) bool populate; bool downgraded = false; LIST_HEAD(uf); - MA_STATE(mas, &mm->mm_mt, 0, 0); + struct vma_iterator vmi; if (mmap_write_lock_killable(mm)) return -EINTR; @@ -218,23 +233,23 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Always allow shrinking brk. - * do_brk_munmap() may downgrade mmap_lock to read. + * do_vma_munmap() may downgrade mmap_lock to read. */ if (brk <= mm->brk) { int ret; /* Search one past newbrk */ - mas_set(&mas, newbrk); - brkvma = mas_find(&mas, oldbrk); + vma_iter_init(&vmi, mm, newbrk); + brkvma = vma_find(&vmi, oldbrk); if (!brkvma || brkvma->vm_start >= oldbrk) goto out; /* mapping intersects with an existing non-brk vma. */ /* * mm->brk must be protected by write mmap_lock. - * do_brk_munmap() may downgrade the lock, so update it - * before calling do_brk_munmap(). + * do_vma_munmap() may downgrade the lock, so update it + * before calling do_vma_munmap(). */ mm->brk = brk; - ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf); + ret = do_vma_munmap(&vmi, brkvma, newbrk, oldbrk, &uf, true); if (ret == 1) { downgraded = true; goto success; @@ -252,14 +267,14 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) * Only check if the next VMA is within the stack_guard_gap of the * expansion area */ - mas_set(&mas, oldbrk); - next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap); + vma_iter_init(&vmi, mm, oldbrk); + next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap); if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; - brkvma = mas_prev(&mas, mm->start_brk); + brkvma = vma_prev_limit(&vmi, mm->start_brk); /* Ok, looks good - let it rip. */ - if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) + if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) goto out; mm->brk = brk; @@ -417,85 +432,218 @@ static void __vma_link_file(struct vm_area_struct *vma, flush_dcache_mmap_unlock(mapping); } +static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) +{ + VMA_ITERATOR(vmi, mm, 0); + struct address_space *mapping = NULL; + + if (vma_iter_prealloc(&vmi)) + return -ENOMEM; + + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + i_mmap_lock_write(mapping); + } + + vma_iter_store(&vmi, vma); + + if (mapping) { + __vma_link_file(vma, mapping); + i_mmap_unlock_write(mapping); + } + + mm->map_count++; + validate_mm(mm); + return 0; +} + /* - * vma_mas_store() - Store a VMA in the maple tree. - * @vma: The vm_area_struct - * @mas: The maple state - * - * Efficient way to store a VMA in the maple tree when the @mas has already - * walked to the correct location. - * - * Note: the end address is inclusive in the maple tree. + * init_multi_vma_prep() - Initializer for struct vma_prepare + * @vp: The vma_prepare struct + * @vma: The vma that will be altered once locked + * @next: The next vma if it is to be adjusted + * @remove: The first vma to be removed + * @remove2: The second vma to be removed */ -void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas) +static inline void init_multi_vma_prep(struct vma_prepare *vp, + struct vm_area_struct *vma, struct vm_area_struct *next, + struct vm_area_struct *remove, struct vm_area_struct *remove2) { - trace_vma_store(mas->tree, vma); - mas_set_range(mas, vma->vm_start, vma->vm_end - 1); - mas_store_prealloc(mas, vma); + memset(vp, 0, sizeof(struct vma_prepare)); + vp->vma = vma; + vp->anon_vma = vma->anon_vma; + vp->remove = remove; + vp->remove2 = remove2; + vp->adj_next = next; + if (!vp->anon_vma && next) + vp->anon_vma = next->anon_vma; + + vp->file = vma->vm_file; + if (vp->file) + vp->mapping = vma->vm_file->f_mapping; + } /* - * vma_mas_remove() - Remove a VMA from the maple tree. - * @vma: The vm_area_struct - * @mas: The maple state - * - * Efficient way to remove a VMA from the maple tree when the @mas has already - * been established and points to the correct location. - * Note: the end address is inclusive in the maple tree. + * init_vma_prep() - Initializer wrapper for vma_prepare struct + * @vp: The vma_prepare struct + * @vma: The vma that will be altered once locked */ -void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) +static inline void init_vma_prep(struct vma_prepare *vp, + struct vm_area_struct *vma) { - trace_vma_mas_szero(mas->tree, vma->vm_start, vma->vm_end - 1); - mas->index = vma->vm_start; - mas->last = vma->vm_end - 1; - mas_store_prealloc(mas, NULL); + init_multi_vma_prep(vp, vma, NULL, NULL, NULL); } + /* - * vma_mas_szero() - Set a given range to zero. Used when modifying a - * vm_area_struct start or end. - * - * @mas: The maple tree ma_state - * @start: The start address to zero - * @end: The end address to zero. + * vma_prepare() - Helper function for handling locking VMAs prior to altering + * @vp: The initialized vma_prepare struct */ -static inline void vma_mas_szero(struct ma_state *mas, unsigned long start, - unsigned long end) +static inline void vma_prepare(struct vma_prepare *vp) { - trace_vma_mas_szero(mas->tree, start, end - 1); - mas_set_range(mas, start, end - 1); - mas_store_prealloc(mas, NULL); + if (vp->file) { + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); + + if (vp->adj_next) + uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, + vp->adj_next->vm_end); + + i_mmap_lock_write(vp->mapping); + if (vp->insert && vp->insert->vm_file) { + /* + * Put into interval tree now, so instantiated pages + * are visible to arm/parisc __flush_dcache_page + * throughout; but we cannot insert into address + * space until vma start or end is updated. + */ + __vma_link_file(vp->insert, + vp->insert->vm_file->f_mapping); + } + } + + if (vp->anon_vma) { + anon_vma_lock_write(vp->anon_vma); + anon_vma_interval_tree_pre_update_vma(vp->vma); + if (vp->adj_next) + anon_vma_interval_tree_pre_update_vma(vp->adj_next); + } + + if (vp->file) { + flush_dcache_mmap_lock(vp->mapping); + vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); + if (vp->adj_next) + vma_interval_tree_remove(vp->adj_next, + &vp->mapping->i_mmap); + } + } -static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) -{ - MA_STATE(mas, &mm->mm_mt, 0, 0); - struct address_space *mapping = NULL; +/* + * vma_complete- Helper function for handling the unlocking after altering VMAs, + * or for inserting a VMA. + * + * @vp: The vma_prepare struct + * @vmi: The vma iterator + * @mm: The mm_struct + */ +static inline void vma_complete(struct vma_prepare *vp, + struct vma_iterator *vmi, struct mm_struct *mm) +{ + if (vp->file) { + if (vp->adj_next) + vma_interval_tree_insert(vp->adj_next, + &vp->mapping->i_mmap); + vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); + flush_dcache_mmap_unlock(vp->mapping); + } + + if (vp->remove && vp->file) { + __remove_shared_vm_struct(vp->remove, vp->file, vp->mapping); + if (vp->remove2) + __remove_shared_vm_struct(vp->remove2, vp->file, + vp->mapping); + } else if (vp->insert) { + /* + * split_vma has split insert from vma, and needs + * us to insert it before dropping the locks + * (it may either follow vma or precede it). + */ + vma_iter_store(vmi, vp->insert); + mm->map_count++; + } - if (mas_preallocate(&mas, vma, GFP_KERNEL)) - return -ENOMEM; + if (vp->anon_vma) { + anon_vma_interval_tree_post_update_vma(vp->vma); + if (vp->adj_next) + anon_vma_interval_tree_post_update_vma(vp->adj_next); + anon_vma_unlock_write(vp->anon_vma); + } - if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; - i_mmap_lock_write(mapping); + if (vp->file) { + i_mmap_unlock_write(vp->mapping); + uprobe_mmap(vp->vma); + + if (vp->adj_next) + uprobe_mmap(vp->adj_next); } - vma_mas_store(vma, &mas); + if (vp->remove) { +again: + if (vp->file) { + uprobe_munmap(vp->remove, vp->remove->vm_start, + vp->remove->vm_end); + fput(vp->file); + } + if (vp->remove->anon_vma) + anon_vma_merge(vp->vma, vp->remove); + mm->map_count--; + mpol_put(vma_policy(vp->remove)); + if (!vp->remove2) + WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); + vm_area_free(vp->remove); - if (mapping) { - __vma_link_file(vma, mapping); - i_mmap_unlock_write(mapping); + /* + * In mprotect's case 6 (see comments on vma_merge), + * we must remove the one after next as well. + */ + if (vp->remove2) { + vp->remove = vp->remove2; + vp->remove2 = NULL; + goto again; + } + } + if (vp->insert && vp->file) + uprobe_mmap(vp->insert); +} + +/* + * dup_anon_vma() - Helper function to duplicate anon_vma + * @dst: The destination VMA + * @src: The source VMA + * + * Returns: 0 on success. + */ +static inline int dup_anon_vma(struct vm_area_struct *dst, + struct vm_area_struct *src) +{ + /* + * Easily overlooked: when mprotect shifts the boundary, make sure the + * expanding vma has anon_vma set if the shrinking vma had, to cover any + * anon pages imported. + */ + if (src->anon_vma && !dst->anon_vma) { + dst->anon_vma = src->anon_vma; + return anon_vma_clone(dst, src); } - mm->map_count++; - validate_mm(mm); return 0; } /* * vma_expand - Expand an existing VMA * - * @mas: The maple state + * @vmi: The vma iterator * @vma: The vma to expand * @start: The start of the vma * @end: The exclusive end of the vma @@ -509,96 +657,46 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) * * Returns: 0 on success */ -inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *next) +int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *next) { - struct mm_struct *mm = vma->vm_mm; - struct address_space *mapping = NULL; - struct rb_root_cached *root = NULL; - struct anon_vma *anon_vma = vma->anon_vma; - struct file *file = vma->vm_file; bool remove_next = false; + struct vma_prepare vp; if (next && (vma != next) && (end == next->vm_end)) { - remove_next = true; - if (next->anon_vma && !vma->anon_vma) { - int error; + int ret; - anon_vma = next->anon_vma; - vma->anon_vma = anon_vma; - error = anon_vma_clone(vma, next); - if (error) - return error; - } + remove_next = true; + ret = dup_anon_vma(vma, next); + if (ret) + return ret; } + init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL); /* Not merging but overwriting any part of next is not handled. */ - VM_BUG_ON(next && !remove_next && next != vma && end > next->vm_start); + VM_WARN_ON(next && !vp.remove && + next != vma && end > next->vm_start); /* Only handles expanding */ - VM_BUG_ON(vma->vm_start < start || vma->vm_end > end); + VM_WARN_ON(vma->vm_start < start || vma->vm_end > end); - if (mas_preallocate(mas, vma, GFP_KERNEL)) + if (vma_iter_prealloc(vmi)) goto nomem; vma_adjust_trans_huge(vma, start, end, 0); + /* VMA iterator points to previous, so set to start if necessary */ + if (vma_iter_addr(vmi) != start) + vma_iter_set(vmi, start); - if (file) { - mapping = file->f_mapping; - root = &mapping->i_mmap; - uprobe_munmap(vma, vma->vm_start, vma->vm_end); - i_mmap_lock_write(mapping); - } - - if (anon_vma) { - anon_vma_lock_write(anon_vma); - anon_vma_interval_tree_pre_update_vma(vma); - } - - if (file) { - flush_dcache_mmap_lock(mapping); - vma_interval_tree_remove(vma, root); - } - + vma_prepare(&vp); vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; /* Note: mas must be pointing to the expanding VMA */ - vma_mas_store(vma, mas); - - if (file) { - vma_interval_tree_insert(vma, root); - flush_dcache_mmap_unlock(mapping); - } + vma_iter_store(vmi, vma); - /* Expanding over the next vma */ - if (remove_next && file) { - __remove_shared_vm_struct(next, file, mapping); - } - - if (anon_vma) { - anon_vma_interval_tree_post_update_vma(vma); - anon_vma_unlock_write(anon_vma); - } - - if (file) { - i_mmap_unlock_write(mapping); - uprobe_mmap(vma); - } - - if (remove_next) { - if (file) { - uprobe_munmap(next, next->vm_start, next->vm_end); - fput(file); - } - if (next->anon_vma) - anon_vma_merge(vma, next); - mm->map_count--; - mpol_put(vma_policy(next)); - vm_area_free(next); - } - - validate_mm(mm); + vma_complete(&vp, vmi, vma->vm_mm); + validate_mm(vma->vm_mm); return 0; nomem: @@ -606,256 +704,39 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, } /* - * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that - * is already present in an i_mmap tree without adjusting the tree. - * The following helper function should be used when such adjustments - * are necessary. The "insert" vma (if any) is to be inserted - * before we drop the necessary locks. + * vma_shrink() - Reduce an existing VMAs memory area + * @vmi: The vma iterator + * @vma: The VMA to modify + * @start: The new start + * @end: The new end + * + * Returns: 0 on success, -ENOMEM otherwise */ -int __vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, - struct vm_area_struct *expand) +int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff) { - struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next_next = NULL; /* uninit var warning */ - struct vm_area_struct *next = find_vma(mm, vma->vm_end); - struct vm_area_struct *orig_vma = vma; - struct address_space *mapping = NULL; - struct rb_root_cached *root = NULL; - struct anon_vma *anon_vma = NULL; - struct file *file = vma->vm_file; - bool vma_changed = false; - long adjust_next = 0; - int remove_next = 0; - MA_STATE(mas, &mm->mm_mt, 0, 0); - struct vm_area_struct *exporter = NULL, *importer = NULL; + struct vma_prepare vp; - if (next && !insert) { - if (end >= next->vm_end) { - /* - * vma expands, overlapping all the next, and - * perhaps the one after too (mprotect case 6). - * The only other cases that gets here are - * case 1, case 7 and case 8. - */ - if (next == expand) { - /* - * The only case where we don't expand "vma" - * and we expand "next" instead is case 8. - */ - VM_WARN_ON(end != next->vm_end); - /* - * remove_next == 3 means we're - * removing "vma" and that to do so we - * swapped "vma" and "next". - */ - remove_next = 3; - VM_WARN_ON(file != next->vm_file); - swap(vma, next); - } else { - VM_WARN_ON(expand != vma); - /* - * case 1, 6, 7, remove_next == 2 is case 6, - * remove_next == 1 is case 1 or 7. - */ - remove_next = 1 + (end > next->vm_end); - if (remove_next == 2) - next_next = find_vma(mm, next->vm_end); - - VM_WARN_ON(remove_next == 2 && - end != next_next->vm_end); - } + WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); - exporter = next; - importer = vma; - - /* - * If next doesn't have anon_vma, import from vma after - * next, if the vma overlaps with it. - */ - if (remove_next == 2 && !next->anon_vma) - exporter = next_next; - - } else if (end > next->vm_start) { - /* - * vma expands, overlapping part of the next: - * mprotect case 5 shifting the boundary up. - */ - adjust_next = (end - next->vm_start); - exporter = next; - importer = vma; - VM_WARN_ON(expand != importer); - } else if (end < vma->vm_end) { - /* - * vma shrinks, and !insert tells it's not - * split_vma inserting another: so it must be - * mprotect case 4 shifting the boundary down. - */ - adjust_next = -(vma->vm_end - end); - exporter = vma; - importer = next; - VM_WARN_ON(expand != importer); - } - - /* - * Easily overlooked: when mprotect shifts the boundary, - * make sure the expanding vma has anon_vma set if the - * shrinking vma had, to cover any anon pages imported. - */ - if (exporter && exporter->anon_vma && !importer->anon_vma) { - int error; - - importer->anon_vma = exporter->anon_vma; - error = anon_vma_clone(importer, exporter); - if (error) - return error; - } - } - - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (vma_iter_prealloc(vmi)) return -ENOMEM; - vma_adjust_trans_huge(orig_vma, start, end, adjust_next); - if (file) { - mapping = file->f_mapping; - root = &mapping->i_mmap; - uprobe_munmap(vma, vma->vm_start, vma->vm_end); - - if (adjust_next) - uprobe_munmap(next, next->vm_start, next->vm_end); - - i_mmap_lock_write(mapping); - if (insert && insert->vm_file) { - /* - * Put into interval tree now, so instantiated pages - * are visible to arm/parisc __flush_dcache_page - * throughout; but we cannot insert into address - * space until vma start or end is updated. - */ - __vma_link_file(insert, insert->vm_file->f_mapping); - } - } - - anon_vma = vma->anon_vma; - if (!anon_vma && adjust_next) - anon_vma = next->anon_vma; - if (anon_vma) { - VM_WARN_ON(adjust_next && next->anon_vma && - anon_vma != next->anon_vma); - anon_vma_lock_write(anon_vma); - anon_vma_interval_tree_pre_update_vma(vma); - if (adjust_next) - anon_vma_interval_tree_pre_update_vma(next); - } - - if (file) { - flush_dcache_mmap_lock(mapping); - vma_interval_tree_remove(vma, root); - if (adjust_next) - vma_interval_tree_remove(next, root); - } + init_vma_prep(&vp, vma); + vma_adjust_trans_huge(vma, start, end, 0); + vma_prepare(&vp); - if (start != vma->vm_start) { - if ((vma->vm_start < start) && - (!insert || (insert->vm_end != start))) { - vma_mas_szero(&mas, vma->vm_start, start); - VM_WARN_ON(insert && insert->vm_start > vma->vm_start); - } else { - vma_changed = true; - } - vma->vm_start = start; - } - if (end != vma->vm_end) { - if (vma->vm_end > end) { - if (!insert || (insert->vm_start != end)) { - vma_mas_szero(&mas, end, vma->vm_end); - mas_reset(&mas); - VM_WARN_ON(insert && - insert->vm_end < vma->vm_end); - } - } else { - vma_changed = true; - } - vma->vm_end = end; - } + if (vma->vm_start < start) + vma_iter_clear(vmi, vma->vm_start, start); - if (vma_changed) - vma_mas_store(vma, &mas); + if (vma->vm_end > end) + vma_iter_clear(vmi, end, vma->vm_end); + vma->vm_start = start; + vma->vm_end = end; vma->vm_pgoff = pgoff; - if (adjust_next) { - next->vm_start += adjust_next; - next->vm_pgoff += adjust_next >> PAGE_SHIFT; - vma_mas_store(next, &mas); - } - - if (file) { - if (adjust_next) - vma_interval_tree_insert(next, root); - vma_interval_tree_insert(vma, root); - flush_dcache_mmap_unlock(mapping); - } - - if (remove_next && file) { - __remove_shared_vm_struct(next, file, mapping); - if (remove_next == 2) - __remove_shared_vm_struct(next_next, file, mapping); - } else if (insert) { - /* - * split_vma has split insert from vma, and needs - * us to insert it before dropping the locks - * (it may either follow vma or precede it). - */ - mas_reset(&mas); - vma_mas_store(insert, &mas); - mm->map_count++; - } - - if (anon_vma) { - anon_vma_interval_tree_post_update_vma(vma); - if (adjust_next) - anon_vma_interval_tree_post_update_vma(next); - anon_vma_unlock_write(anon_vma); - } - - if (file) { - i_mmap_unlock_write(mapping); - uprobe_mmap(vma); - - if (adjust_next) - uprobe_mmap(next); - } - - if (remove_next) { -again: - if (file) { - uprobe_munmap(next, next->vm_start, next->vm_end); - fput(file); - } - if (next->anon_vma) - anon_vma_merge(vma, next); - mm->map_count--; - mpol_put(vma_policy(next)); - if (remove_next != 2) - BUG_ON(vma->vm_end < next->vm_end); - vm_area_free(next); - - /* - * In mprotect's case 6 (see comments on vma_merge), - * we must remove next_next too. - */ - if (remove_next == 2) { - remove_next = 1; - next = next_next; - goto again; - } - } - if (insert && file) - uprobe_mmap(insert); - - mas_destroy(&mas); - validate_mm(mm); - + vma_complete(&vp, vmi, vma->vm_mm); + validate_mm(vma->vm_mm); return 0; } @@ -864,9 +745,9 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * per-vma resources, so we don't attempt to merge those. */ static inline int is_mergeable_vma(struct vm_area_struct *vma, - struct file *file, unsigned long vm_flags, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) + struct file *file, unsigned long vm_flags, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -985,7 +866,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * It is important for case 8 that the vma NNNN overlapping the * region AAAA is never going to extended over XXXX. Instead XXXX must * be extended in region AAAA and NNNN must be removed. This way in - * all cases where vma_merge succeeds, the moment vma_adjust drops the + * all cases where vma_merge succeeds, the moment vma_merge drops the * rmap_locks, the properties of the merged vma will be already * correct for the whole merged range. Some of those properties like * vm_page_prot/vm_flags may be accessed by rmap_walks and they must @@ -995,8 +876,14 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * or other rmap walkers (if working on addresses beyond the "end" * parameter) may establish ptes with the wrong permissions of NNNN * instead of the right permissions of XXXX. + * + * In the code below: + * PPPP is represented by *prev + * NNNN is represented by *mid (and possibly equal to *next) + * XXXX is represented by *next or not represented at all. + * AAAA is not represented - it will be merged or the function will return NULL */ -struct vm_area_struct *vma_merge(struct mm_struct *mm, +struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, @@ -1005,11 +892,19 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct anon_vma_name *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; - struct vm_area_struct *mid, *next, *res; + pgoff_t vma_pgoff; + struct vm_area_struct *mid, *next, *res = NULL; + struct vm_area_struct *vma, *adjust, *remove, *remove2; int err = -1; bool merge_prev = false; bool merge_next = false; + bool vma_expanded = false; + struct vma_prepare vp; + unsigned long vma_end = end; + long adj_next = 0; + unsigned long vma_start = addr; + validate_mm(mm); /* * We later require that vma->vm_flags == vm_flags, * so this tests vma->vm_flags & VM_SPECIAL, too. @@ -1027,13 +922,18 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, VM_WARN_ON(mid && end > mid->vm_end); VM_WARN_ON(addr >= end); - /* Can we merge the predecessor? */ - if (prev && prev->vm_end == addr && - mpol_equal(vma_policy(prev), policy) && - can_vma_merge_after(prev, vm_flags, - anon_vma, file, pgoff, - vm_userfaultfd_ctx, anon_name)) { - merge_prev = true; + if (prev) { + res = prev; + vma = prev; + vma_start = prev->vm_start; + vma_pgoff = prev->vm_pgoff; + /* Can we merge the predecessor? */ + if (prev->vm_end == addr && mpol_equal(vma_policy(prev), policy) + && can_vma_merge_after(prev, vm_flags, anon_vma, file, + pgoff, vm_userfaultfd_ctx, anon_name)) { + merge_prev = true; + vma_prev(vmi); + } } /* Can we merge the successor? */ if (next && end == next->vm_start && @@ -1043,34 +943,87 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, vm_userfaultfd_ctx, anon_name)) { merge_next = true; } + + remove = remove2 = adjust = NULL; /* Can we merge both the predecessor and the successor? */ if (merge_prev && merge_next && - is_mergeable_anon_vma(prev->anon_vma, - next->anon_vma, NULL)) { /* cases 1, 6 */ - err = __vma_adjust(prev, prev->vm_start, - next->vm_end, prev->vm_pgoff, NULL, - prev); - res = prev; - } else if (merge_prev) { /* cases 2, 5, 7 */ - err = __vma_adjust(prev, prev->vm_start, - end, prev->vm_pgoff, NULL, prev); - res = prev; + is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { + remove = mid; /* case 1 */ + vma_end = next->vm_end; + err = dup_anon_vma(res, remove); + if (mid != next) { /* case 6 */ + remove2 = next; + if (!remove->anon_vma) + err = dup_anon_vma(res, remove2); + } + } else if (merge_prev) { + err = 0; /* case 2 */ + if (mid && end > mid->vm_start) { + err = dup_anon_vma(res, mid); + if (end == mid->vm_end) { /* case 7 */ + remove = mid; + } else { /* case 5 */ + adjust = mid; + adj_next = (end - mid->vm_start); + } + } } else if (merge_next) { - if (prev && addr < prev->vm_end) /* case 4 */ - err = __vma_adjust(prev, prev->vm_start, - addr, prev->vm_pgoff, NULL, next); - else /* cases 3, 8 */ - err = __vma_adjust(mid, addr, next->vm_end, - next->vm_pgoff - pglen, NULL, next); res = next; + if (prev && addr < prev->vm_end) { /* case 4 */ + vma_end = addr; + adjust = mid; + adj_next = -(vma->vm_end - addr); + err = dup_anon_vma(res, adjust); + } else { + vma = next; /* case 3 */ + vma_start = addr; + vma_end = next->vm_end; + vma_pgoff = mid->vm_pgoff; + err = 0; + if (mid != next) { /* case 8 */ + remove = mid; + err = dup_anon_vma(res, remove); + } + } } - /* - * Cannot merge with predecessor or successor or error in __vma_adjust? - */ + /* Cannot merge or error in anon_vma clone */ if (err) return NULL; + + if (vma_iter_prealloc(vmi)) + return NULL; + + vma_adjust_trans_huge(vma, vma_start, vma_end, adj_next); + init_multi_vma_prep(&vp, vma, adjust, remove, remove2); + VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && + vp.anon_vma != adjust->anon_vma); + + vma_prepare(&vp); + if (vma_start < vma->vm_start || vma_end > vma->vm_end) + vma_expanded = true; + + vma->vm_start = vma_start; + vma->vm_end = vma_end; + vma->vm_pgoff = vma_pgoff; + + if (vma_expanded) + vma_iter_store(vmi, vma); + + if (adj_next) { + adjust->vm_start += adj_next; + adjust->vm_pgoff += adj_next >> PAGE_SHIFT; + if (adj_next < 0) { + WARN_ON(vma_expanded); + vma_iter_store(vmi, next); + } + } + + vma_complete(&vp, vmi, mm); + vma_iter_free(vmi); + validate_mm(mm); khugepaged_enter_vma(res, vm_flags); + return res; } @@ -1558,8 +1511,8 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) * the correct alignment and offset, all from @info. Note: current->mm is used * for the search. * - * @info: The unmapped area information including the range (low_limit - - * hight_limit), the alignment offset and mask. + * @info: The unmapped area information including the range [low_limit - + * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ @@ -1585,11 +1538,11 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) /** * unmapped_area_topdown() - Find an area between the low_limit and the - * high_limit with * the correct alignment and offset at the highest available + * high_limit with the correct alignment and offset at the highest available * address, all from @info. Note: current->mm is used for the search. * - * @info: The unmapped area information including the range (low_limit - - * hight_limit), the alignment offset and mask. + * @info: The unmapped area information including the range [low_limit - + * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ @@ -1938,7 +1891,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Check that both stack segments have the same anon_vma? */ } - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ @@ -1981,7 +1934,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; /* Overwrite old entry in mtree. */ - vma_mas_store(vma, &mas); + mas_set_range(&mas, vma->vm_start, address - 1); + mas_store_prealloc(&mas, vma); anon_vma_interval_tree_post_update_vma(vma); spin_unlock(&mm->page_table_lock); @@ -2019,7 +1973,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) return -ENOMEM; } - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(&mas, GFP_KERNEL)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ @@ -2063,7 +2017,8 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) vma->vm_start = address; vma->vm_pgoff -= grow; /* Overwrite old entry in mtree. */ - vma_mas_store(vma, &mas); + mas_set_range(&mas, address, vma->vm_end - 1); + mas_store_prealloc(&mas, vma); anon_vma_interval_tree_post_update_vma(vma); spin_unlock(&mm->page_table_lock); @@ -2178,14 +2133,14 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, bool mm_wr_locked) { struct mmu_gather tlb; lru_add_drain(); tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); - unmap_vmas(&tlb, mt, vma, start, end); + unmap_vmas(&tlb, mt, vma, start, end, mm_wr_locked); free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); @@ -2194,13 +2149,19 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, /* * __split_vma() bypasses sysctl_max_map_count checking. We use this where it * has already been checked or doesn't make sense to fail. + * VMA Iterator will point to the end VMA. */ -int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, +int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { + struct vma_prepare vp; struct vm_area_struct *new; int err; - validate_mm_mt(mm); + + validate_mm_mt(vma->vm_mm); + + WARN_ON(vma->vm_start >= addr); + WARN_ON(vma->vm_end <= addr); if (vma->vm_ops && vma->vm_ops->may_split) { err = vma->vm_ops->may_split(vma, addr); @@ -2212,16 +2173,20 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (!new) return -ENOMEM; - if (new_below) + err = -ENOMEM; + if (vma_iter_prealloc(vmi)) + goto out_free_vma; + + if (new_below) { new->vm_end = addr; - else { + } else { new->vm_start = addr; new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } err = vma_dup_policy(vma, new); if (err) - goto out_free_vma; + goto out_free_vmi; err = anon_vma_clone(new, vma); if (err) @@ -2233,30 +2198,34 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); - if (new_below) - err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + - ((addr - new->vm_start) >> PAGE_SHIFT), new); - else - err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); + init_vma_prep(&vp, vma); + vp.insert = new; + vma_prepare(&vp); + + if (new_below) { + vma->vm_start = addr; + vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; + } else { + vma->vm_end = addr; + } + + /* vma_complete stores the new vma */ + vma_complete(&vp, vmi, vma->vm_mm); /* Success. */ - if (!err) - return 0; + if (new_below) + vma_next(vmi); + validate_mm_mt(vma->vm_mm); + return 0; - /* Avoid vm accounting in close() operation */ - new->vm_start = new->vm_end; - new->vm_pgoff = 0; - /* Clean everything up if vma_adjust failed. */ - if (new->vm_ops && new->vm_ops->close) - new->vm_ops->close(new); - if (new->vm_file) - fput(new->vm_file); - unlink_anon_vmas(new); - out_free_mpol: +out_free_mpol: mpol_put(vma_policy(new)); - out_free_vma: +out_free_vmi: + vma_iter_free(vmi); +out_free_vma: vm_area_free(new); - validate_mm_mt(mm); + validate_mm_mt(vma->vm_mm); return err; } @@ -2264,13 +2233,13 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, * Split a vma into two pieces at address 'addr', a new vma is allocated * either for the first part or the tail. */ -int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, +int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { - if (mm->map_count >= sysctl_max_map_count) + if (vma->vm_mm->map_count >= sysctl_max_map_count) return -ENOMEM; - return __split_vma(mm, vma, addr, new_below); + return __split_vma(vmi, vma, addr, new_below); } static inline int munmap_sidetree(struct vm_area_struct *vma, @@ -2287,8 +2256,8 @@ static inline int munmap_sidetree(struct vm_area_struct *vma, } /* - * do_mas_align_munmap() - munmap the aligned region from @start to @end. - * @mas: The maple_state, ideally set up to alter the correct tree location. + * do_vmi_align_munmap() - munmap the aligned region from @start to @end. + * @vmi: The vma iterator * @vma: The starting vm_area_struct * @mm: The mm_struct * @start: The aligned start address to munmap. @@ -2299,7 +2268,7 @@ static inline int munmap_sidetree(struct vm_area_struct *vma, * If @downgrade is true, check return code for potential release of the lock. */ static int -do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, +do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool downgrade) { @@ -2311,10 +2280,6 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN); mt_set_external_lock(&mt_detach, &mm->mmap_lock); - if (mas_preallocate(mas, vma, GFP_KERNEL)) - return -ENOMEM; - - mas->last = end - 1; /* * If we need to split any vma, do it now to save pain later. * @@ -2334,45 +2299,27 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) goto map_count_exceeded; - /* - * mas_pause() is not needed since mas->index needs to be set - * differently than vma->vm_end anyways. - */ - error = __split_vma(mm, vma, start, 0); + error = __split_vma(vmi, vma, start, 0); if (error) goto start_split_failed; - mas_set(mas, start); - vma = mas_walk(mas); + vma = vma_iter_load(vmi); } - prev = mas_prev(mas, 0); + prev = vma_prev(vmi); if (unlikely((!prev))) - mas_set(mas, start); + vma_iter_set(vmi, start); /* * Detach a range of VMAs from the mm. Using next as a temp variable as * it is always overwritten. */ - mas_for_each(mas, next, end - 1) { + for_each_vma_range(*vmi, next, end) { /* Does it split the end? */ if (next->vm_end > end) { - struct vm_area_struct *split; - - error = __split_vma(mm, next, end, 1); + error = __split_vma(vmi, next, end, 0); if (error) goto end_split_failed; - - mas_set(mas, end); - split = mas_prev(mas, 0); - error = munmap_sidetree(split, &mas_detach); - if (error) - goto munmap_sidetree_failed; - - count++; - if (vma == next) - vma = split; - break; } error = munmap_sidetree(next, &mas_detach); if (error) @@ -2385,9 +2332,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, #endif } - if (!next) - next = mas_next(mas, ULONG_MAX); - + next = vma_next(vmi); if (unlikely(uf)) { /* * If userfaultfd_unmap_prep returns an error the vmas @@ -2404,8 +2349,6 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, goto userfaultfd_error; } - /* Point of no return */ - mas_set_range(mas, start, end - 1); #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) /* Make sure no VMAs are about to be lost. */ { @@ -2413,19 +2356,23 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *vma_mas, *vma_test; int test_count = 0; + vma_iter_set(vmi, start); rcu_read_lock(); vma_test = mas_find(&test, end - 1); - mas_for_each(mas, vma_mas, end - 1) { + for_each_vma_range(*vmi, vma_mas, end) { BUG_ON(vma_mas != vma_test); test_count++; vma_test = mas_next(&test, end - 1); } rcu_read_unlock(); BUG_ON(count != test_count); - mas_set_range(mas, start, end - 1); } #endif - mas_store_prealloc(mas, NULL); + /* Point of no return */ + vma_iter_set(vmi, start); + if (vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL)) + return -ENOMEM; + mm->map_count -= count; /* * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or @@ -2441,7 +2388,11 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, mmap_write_downgrade(mm); } - unmap_region(mm, &mt_detach, vma, prev, next, start, end); + /* + * We can free page tables without write-locking mmap_lock because VMAs + * were isolated before we downgraded mmap_lock. + */ + unmap_region(mm, &mt_detach, vma, prev, next, start, end, !downgrade); /* Statistics and freeing VMAs */ mas_set(&mas_detach, start); remove_mt(mm, &mas_detach); @@ -2457,13 +2408,12 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, __mt_destroy(&mt_detach); start_split_failed: map_count_exceeded: - mas_destroy(mas); return error; } /* - * do_mas_munmap() - munmap a given range. - * @mas: The maple state + * do_vmi_munmap() - munmap a given range. + * @vmi: The vma iterator * @mm: The mm_struct * @start: The start address to munmap * @len: The length of the range to munmap @@ -2477,7 +2427,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, * * Returns: -EINVAL on failure, 1 on success and unlock, 0 otherwise. */ -int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, +int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool downgrade) { @@ -2495,11 +2445,11 @@ int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, arch_unmap(mm, start, end); /* Find the first overlapping VMA */ - vma = mas_find(mas, end - 1); + vma = vma_find(vmi, end); if (!vma) return 0; - return do_mas_align_munmap(mas, vma, mm, start, end, uf, downgrade); + return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade); } /* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. @@ -2511,9 +2461,9 @@ int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { - MA_STATE(mas, &mm->mm_mt, start, start); + VMA_ITERATOR(vmi, mm, start); - return do_mas_munmap(&mas, mm, start, len, uf, false); + return do_vmi_munmap(&vmi, mm, start, len, uf, false); } unsigned long mmap_region(struct file *file, unsigned long addr, @@ -2529,7 +2479,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long merge_start = addr, merge_end = end; pgoff_t vm_pgoff; int error; - MA_STATE(mas, &mm->mm_mt, addr, end - 1); + VMA_ITERATOR(vmi, mm, addr); /* Check against address space limit. */ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { @@ -2547,7 +2497,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } /* Unmap any existing mapping in the area */ - if (do_mas_munmap(&mas, mm, addr, len, uf, false)) + if (do_vmi_munmap(&vmi, mm, addr, len, uf, false)) return -ENOMEM; /* @@ -2560,8 +2510,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vm_flags |= VM_ACCOUNT; } - next = mas_next(&mas, ULONG_MAX); - prev = mas_prev(&mas, 0); + next = vma_next(&vmi); + prev = vma_prev(&vmi); if (vm_flags & VM_SPECIAL) goto cannot_expand; @@ -2589,13 +2539,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Actually expand, if possible */ if (vma && - !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) { + !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) { khugepaged_enter_vma(vma, vm_flags); goto expanded; } - mas.index = addr; - mas.last = end - 1; cannot_expand: /* * Determine the object being mapped and call the appropriate @@ -2608,9 +2556,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, goto unacct_error; } + vma_iter_set(&vmi, addr); vma->vm_start = addr; vma->vm_end = end; - vma->vm_flags = vm_flags; + vm_flags_init(vma, vm_flags); vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; @@ -2630,19 +2579,20 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * Expansion is handled above, merging is handled below. * Drivers should not alter the address of the VMA. */ - if (WARN_ON((addr != vma->vm_start))) { - error = -EINVAL; + error = -EINVAL; + if (WARN_ON((addr != vma->vm_start))) goto close_and_free_vma; - } - mas_reset(&mas); + vma_iter_set(&vmi, addr); /* * If vm_flags changed after call_mmap(), we should try merge * vma again as we may succeed this time. */ if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, - NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); + merge = vma_merge(&vmi, mm, prev, vma->vm_start, + vma->vm_end, vma->vm_flags, NULL, + vma->vm_file, vma->vm_pgoff, NULL, + NULL_VM_UFFD_CTX, NULL); if (merge) { /* * ->mmap() can change vma->vm_file and fput @@ -2669,9 +2619,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_set_anonymous(vma); } - /* Allow architectures to sanity-check the vm_flags */ - if (!arch_validate_flags(vma->vm_flags)) { - error = -EINVAL; + if (map_deny_write_exec(vma, vma->vm_flags)) { + error = -EACCES; if (file) goto close_and_free_vma; else if (vma->vm_file) @@ -2680,20 +2629,19 @@ unsigned long mmap_region(struct file *file, unsigned long addr, goto free_vma; } - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { - error = -ENOMEM; - if (file) - goto close_and_free_vma; - else if (vma->vm_file) - goto unmap_and_free_vma; - else - goto free_vma; - } + /* Allow architectures to sanity-check the vm_flags */ + error = -EINVAL; + if (!arch_validate_flags(vma->vm_flags)) + goto close_and_free_vma; + + error = -ENOMEM; + if (vma_iter_prealloc(&vmi)) + goto close_and_free_vma; if (vma->vm_file) i_mmap_lock_write(vma->vm_file->f_mapping); - vma_mas_store(vma, &mas); + vma_iter_store(&vmi, vma); mm->map_count++; if (vma->vm_file) { if (vma->vm_flags & VM_SHARED) @@ -2724,7 +2672,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + vm_flags_clear(vma, VM_LOCKED_MASK); else mm->locked_vm += (len >> PAGE_SHIFT); } @@ -2739,7 +2687,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * then new mapped in-place (which must be aimed as * a completely new data area). */ - vma->vm_flags |= VM_SOFTDIRTY; + vm_flags_set(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); @@ -2747,14 +2695,18 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return addr; close_and_free_vma: - if (vma->vm_ops && vma->vm_ops->close) + if (file && vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); + + if (file || vma->vm_file) { unmap_and_free_vma: - fput(vma->vm_file); - vma->vm_file = NULL; + fput(vma->vm_file); + vma->vm_file = NULL; - /* Undo any partial mapping done by a device driver. */ - unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end); + /* Undo any partial mapping done by a device driver. */ + unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start, + vma->vm_end, true); + } if (file && (vm_flags & VM_SHARED)) mapping_unmap_writable(file->f_mapping); free_vma: @@ -2771,12 +2723,12 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade) int ret; struct mm_struct *mm = current->mm; LIST_HEAD(uf); - MA_STATE(mas, &mm->mm_mt, start, start); + VMA_ITERATOR(vmi, mm, start); if (mmap_write_lock_killable(mm)) return -EINTR; - ret = do_mas_munmap(&mas, mm, start, len, &uf, downgrade); + ret = do_vmi_munmap(&vmi, mm, start, len, &uf, downgrade); /* * Returning 1 indicates mmap_lock is downgraded. * But 1 is not legal return value of vm_munmap() and munmap(), reset @@ -2889,33 +2841,34 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, } /* - * brk_munmap() - Unmap a parital vma. - * @mas: The maple tree state. - * @vma: The vma to be modified - * @newbrk: the start of the address to unmap - * @oldbrk: The end of the address to unmap + * do_vma_munmap() - Unmap a full or partial vma. + * @vmi: The vma iterator pointing at the vma + * @vma: The first vma to be munmapped + * @start: the start of the address to unmap + * @end: The end of the address to unmap * @uf: The userfaultfd list_head + * @downgrade: Attempt to downgrade or not * - * Returns: 1 on success. - * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if - * possible. + * Returns: 0 on success and not downgraded, 1 on success and downgraded. + * unmaps a VMA mapping when the vma iterator is already in position. + * Does not handle alignment. */ -static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, - unsigned long newbrk, unsigned long oldbrk, - struct list_head *uf) +int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *uf, bool downgrade) { struct mm_struct *mm = vma->vm_mm; int ret; - arch_unmap(mm, newbrk, oldbrk); - ret = do_mas_align_munmap(mas, vma, mm, newbrk, oldbrk, uf, true); + arch_unmap(mm, start, end); + ret = do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade); validate_mm_mt(mm); return ret; } /* * do_brk_flags() - Increase the brk vma if the flags match. - * @mas: The maple tree state. + * @vmi: The vma iterator * @addr: The start address * @len: The length of the increase * @vma: The vma, @@ -2925,10 +2878,11 @@ static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, * do not match then create a new anonymous VMA. Eventually we may be able to * do some brk-specific accounting here. */ -static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, +static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; + struct vma_prepare vp; validate_mm_mt(mm); /* @@ -2952,23 +2906,17 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, if (vma && vma->vm_end == addr && !vma_policy(vma) && can_vma_merge_after(vma, flags, NULL, NULL, addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) { - mas_set_range(mas, vma->vm_start, addr + len - 1); - if (mas_preallocate(mas, vma, GFP_KERNEL)) + if (vma_iter_prealloc(vmi)) goto unacct_fail; vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); - if (vma->anon_vma) { - anon_vma_lock_write(vma->anon_vma); - anon_vma_interval_tree_pre_update_vma(vma); - } + init_vma_prep(&vp, vma); + vma_prepare(&vp); vma->vm_end = addr + len; - vma->vm_flags |= VM_SOFTDIRTY; - mas_store_prealloc(mas, vma); + vm_flags_set(vma, VM_SOFTDIRTY); + vma_iter_store(vmi, vma); - if (vma->anon_vma) { - anon_vma_interval_tree_post_update_vma(vma); - anon_vma_unlock_write(vma->anon_vma); - } + vma_complete(&vp, vmi, mm); khugepaged_enter_vma(vma, flags); goto out; } @@ -2982,10 +2930,9 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, vma->vm_start = addr; vma->vm_end = addr + len; vma->vm_pgoff = addr >> PAGE_SHIFT; - vma->vm_flags = flags; + vm_flags_init(vma, flags); vma->vm_page_prot = vm_get_page_prot(flags); - mas_set_range(mas, vma->vm_start, addr + len - 1); - if (mas_store_gfp(mas, vma, GFP_KERNEL)) + if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; mm->map_count++; @@ -2995,7 +2942,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, mm->data_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); - vma->vm_flags |= VM_SOFTDIRTY; + vm_flags_set(vma, VM_SOFTDIRTY); validate_mm(mm); return 0; @@ -3014,7 +2961,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) int ret; bool populate; LIST_HEAD(uf); - MA_STATE(mas, &mm->mm_mt, addr, addr); + VMA_ITERATOR(vmi, mm, addr); len = PAGE_ALIGN(request); if (len < request) @@ -3033,12 +2980,12 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) if (ret) goto limits_failed; - ret = do_mas_munmap(&mas, mm, addr, len, &uf, 0); + ret = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0); if (ret) goto munmap_failed; - vma = mas_prev(&mas, 0); - ret = do_brk_flags(&mas, vma, addr, len, flags); + vma = vma_prev(&vmi); + ret = do_brk_flags(&vmi, vma, addr, len, flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); @@ -3086,7 +3033,7 @@ void exit_mmap(struct mm_struct *mm) tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ - unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX); + unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX, false); mmap_read_unlock(mm); /* @@ -3174,6 +3121,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma, *prev; bool faulted_in_anon_vma = true; + VMA_ITERATOR(vmi, mm, addr); validate_mm_mt(mm); /* @@ -3189,7 +3137,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ - new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, + new_vma = vma_merge(&vmi, mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (new_vma) { @@ -3394,8 +3342,8 @@ static struct vm_area_struct *__install_special_mapping( vma->vm_start = addr; vma->vm_end = addr + len; - vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + vm_flags_init(vma, (vm_flags | mm->def_flags | + VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = ops; diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index f45ff1b7626a6..50c0dde1354f4 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -1120,13 +1120,3 @@ void mmu_notifier_synchronize(void) synchronize_srcu(&srcu); } EXPORT_SYMBOL_GPL(mmu_notifier_synchronize); - -bool -mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range) -{ - if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA) - return false; - /* Return true if the vma still have the read flag set. */ - return range->vma->vm_flags & VM_READ; -} -EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only); diff --git a/mm/mprotect.c b/mm/mprotect.c index 92fc6f3fa5125..1d4843c97c2a1 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -398,7 +398,7 @@ static inline long change_pmd_range(struct mmu_gather *tlb, if (!range.start) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, - vma, vma->vm_mm, addr, end); + vma->vm_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } @@ -585,9 +585,9 @@ static const struct mm_walk_ops prot_none_walk_ops = { }; int -mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, - struct vm_area_struct **pprev, unsigned long start, - unsigned long end, unsigned long newflags) +mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, + struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long start, unsigned long end, unsigned long newflags) { struct mm_struct *mm = vma->vm_mm; unsigned long oldflags = vma->vm_flags; @@ -642,7 +642,7 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, * First try to merge with previous and/or next vma. */ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *pprev = vma_merge(mm, *pprev, start, end, newflags, + *pprev = vma_merge(vmi, mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (*pprev) { @@ -654,13 +654,13 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, *pprev = vma; if (start != vma->vm_start) { - error = split_vma(mm, vma, start, 1); + error = split_vma(vmi, vma, start, 1); if (error) goto fail; } if (end != vma->vm_end) { - error = split_vma(mm, vma, end, 0); + error = split_vma(vmi, vma, end, 0); if (error) goto fail; } @@ -670,7 +670,7 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, * vm_flags and vm_page_prot are protected by the mmap_lock * held in write mode. */ - vma->vm_flags = newflags; + vm_flags_reset(vma, newflags); if (vma_wants_manual_pte_write_upgrade(vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; vma_set_page_prot(vma); @@ -709,7 +709,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); struct mmu_gather tlb; - MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); + struct vma_iterator vmi; start = untagged_addr(start); @@ -741,8 +741,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey)) goto out; - mas_set(&mas, start); - vma = mas_find(&mas, ULONG_MAX); + vma_iter_init(&vmi, current->mm, start); + vma = vma_find(&vmi, end); error = -ENOMEM; if (!vma) goto out; @@ -765,18 +765,22 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } } + prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; - else - prev = mas_prev(&mas, 0); tlb_gather_mmu(&tlb, current->mm); - for (nstart = start ; ; ) { + nstart = start; + tmp = vma->vm_start; + for_each_vma_range(vmi, vma, end) { unsigned long mask_off_old_flags; unsigned long newflags; int new_vma_pkey; - /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + if (vma->vm_start != tmp) { + error = -ENOMEM; + break; + } /* Does the application expect PROT_READ to imply PROT_EXEC */ if (rier && (vma->vm_flags & VM_MAYEXEC)) @@ -799,6 +803,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len, break; } + if (map_deny_write_exec(vma, newflags)) { + error = -EACCES; + goto out; + } + /* Allow architectures to sanity-check the new flags */ if (!arch_validate_flags(newflags)) { error = -EINVAL; @@ -819,25 +828,18 @@ static int do_mprotect_pkey(unsigned long start, size_t len, break; } - error = mprotect_fixup(&tlb, vma, &prev, nstart, tmp, newflags); + error = mprotect_fixup(&vmi, &tlb, vma, &prev, nstart, tmp, newflags); if (error) break; nstart = tmp; - - if (nstart < prev->vm_end) - nstart = prev->vm_end; - if (nstart >= end) - break; - - vma = find_vma(current->mm, prev->vm_end); - if (!vma || vma->vm_start != nstart) { - error = -ENOMEM; - break; - } prot = reqprot; } tlb_finish_mmu(&tlb); + + if (vma_iter_end(&vmi) < end) + error = -ENOMEM; + out: mmap_write_unlock(current->mm); return error; diff --git a/mm/mremap.c b/mm/mremap.c index 930f65c315c02..411a85682b580 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -498,7 +498,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, new_addr, len); flush_cache_range(vma, old_addr, old_end); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, old_addr, old_end); mmu_notifier_invalidate_range_start(&range); @@ -580,11 +580,12 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long vm_flags = vma->vm_flags; unsigned long new_pgoff; unsigned long moved_len; - unsigned long excess = 0; + unsigned long account_start = 0; + unsigned long account_end = 0; unsigned long hiwater_vm; - int split = 0; int err = 0; bool need_rmap_locks; + struct vma_iterator vmi; /* * We'd prefer to avoid failure later on in do_munmap: @@ -661,11 +662,11 @@ static unsigned long move_vma(struct vm_area_struct *vma, /* Conceal VM_ACCOUNT so old reservation is not undone */ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) { - vma->vm_flags &= ~VM_ACCOUNT; - excess = vma->vm_end - vma->vm_start - old_len; - if (old_addr > vma->vm_start && - old_addr + old_len < vma->vm_end) - split = 1; + vm_flags_clear(vma, VM_ACCOUNT); + if (vma->vm_start < old_addr) + account_start = vma->vm_start; + if (vma->vm_end > old_addr + old_len) + account_end = vma->vm_end; } /* @@ -686,7 +687,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) { /* We always clear VM_LOCKED[ONFAULT] on the old vma */ - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + vm_flags_clear(vma, VM_LOCKED_MASK); /* * anon_vma links of the old vma is no longer needed after its page @@ -700,11 +701,12 @@ static unsigned long move_vma(struct vm_area_struct *vma, return new_addr; } - if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { + vma_iter_init(&vmi, mm, old_addr); + if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) { /* OOM: unable to split vma, just get accounts right */ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) vm_acct_memory(old_len >> PAGE_SHIFT); - excess = 0; + account_start = account_end = 0; } if (vm_flags & VM_LOCKED) { @@ -715,10 +717,14 @@ static unsigned long move_vma(struct vm_area_struct *vma, mm->hiwater_vm = hiwater_vm; /* Restore VM_ACCOUNT if one or two pieces of vma left */ - if (excess) { - vma->vm_flags |= VM_ACCOUNT; - if (split) - find_vma(mm, vma->vm_end)->vm_flags |= VM_ACCOUNT; + if (account_start) { + vma = vma_prev(&vmi); + vm_flags_set(vma, VM_ACCOUNT); + } + + if (account_end) { + vma = vma_next(&vmi); + vm_flags_set(vma, VM_ACCOUNT); } return new_addr; @@ -978,14 +984,14 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. - * do_mas_munmap does all the needed commit accounting, and + * do_vmi_munmap does all the needed commit accounting, and * downgrades mmap_lock to read if so directed. */ if (old_len >= new_len) { int retval; - MA_STATE(mas, &mm->mm_mt, addr + new_len, addr + new_len); + VMA_ITERATOR(vmi, mm, addr + new_len); - retval = do_mas_munmap(&mas, mm, addr + new_len, + retval = do_vmi_munmap(&vmi, mm, addr + new_len, old_len - new_len, &uf_unmap, true); /* Returning 1 indicates mmap_lock is downgraded to read. */ if (retval == 1) { @@ -1018,6 +1024,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long extension_end = addr + new_len; pgoff_t extension_pgoff = vma->vm_pgoff + ((extension_start - vma->vm_start) >> PAGE_SHIFT); + VMA_ITERATOR(vmi, mm, extension_start); if (vma->vm_flags & VM_ACCOUNT) { if (security_vm_enough_memory_mm(mm, pages)) { @@ -1042,12 +1049,12 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * when a vma would be actually removed due to a merge. */ if (!vma->vm_ops || !vma->vm_ops->close) { - vma = vma_merge(mm, vma, extension_start, extension_end, - vma->vm_flags, vma->anon_vma, vma->vm_file, - extension_pgoff, vma_policy(vma), + vma = vma_merge(&vmi, mm, vma, extension_start, + extension_end, vma->vm_flags, vma->anon_vma, + vma->vm_file, extension_pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); - } else if (vma_adjust(vma, vma->vm_start, addr + new_len, - vma->vm_pgoff, NULL)) { + } else if (vma_expand(&vmi, vma, vma->vm_start, + addr + new_len, vma->vm_pgoff, NULL)) { vma = NULL; } if (!vma) { diff --git a/mm/nommu.c b/mm/nommu.c index df1711acdf5b3..57ba243c6a37f 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -173,7 +173,7 @@ static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) mmap_write_lock(current->mm); vma = find_vma(current->mm, (unsigned long)ret); if (vma) - vma->vm_flags |= VM_USERMAP; + vm_flags_set(vma, VM_USERMAP); mmap_write_unlock(current->mm); } @@ -544,19 +544,6 @@ static void put_nommu_region(struct vm_region *region) __put_nommu_region(region); } -void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas) -{ - mas_set_range(mas, vma->vm_start, vma->vm_end - 1); - mas_store_prealloc(mas, vma); -} - -void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) -{ - mas->index = vma->vm_start; - mas->last = vma->vm_end - 1; - mas_store_prealloc(mas, NULL); -} - static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) { vma->vm_mm = mm; @@ -573,44 +560,6 @@ static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) } } -/* - * mas_add_vma_to_mm() - Maple state variant of add_mas_to_mm(). - * @mas: The maple state with preallocations. - * @mm: The mm_struct - * @vma: The vma to add - * - */ -static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm, - struct vm_area_struct *vma) -{ - BUG_ON(!vma->vm_region); - - setup_vma_to_mm(vma, mm); - mm->map_count++; - - /* add the VMA to the tree */ - vma_mas_store(vma, mas); -} - -/* - * add a VMA into a process's mm_struct in the appropriate place in the list - * and tree and add to the address space's page tree also if not an anonymous - * page - * - should be called with mm->mmap_lock held writelocked - */ -static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) -{ - MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); - - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { - pr_warn("Allocation of vma tree for process %d failed\n", - current->pid); - return -ENOMEM; - } - mas_add_vma_to_mm(&mas, mm, vma); - return 0; -} - static void cleanup_vma_from_mm(struct vm_area_struct *vma) { vma->vm_mm->map_count--; @@ -626,14 +575,15 @@ static void cleanup_vma_from_mm(struct vm_area_struct *vma) i_mmap_unlock_write(mapping); } } + /* * delete a VMA from its owning mm_struct and address space */ static int delete_vma_from_mm(struct vm_area_struct *vma) { - MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start); - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + if (vma_iter_prealloc(&vmi)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); return -ENOMEM; @@ -641,10 +591,9 @@ static int delete_vma_from_mm(struct vm_area_struct *vma) cleanup_vma_from_mm(vma); /* remove from the MM's tree and list */ - vma_mas_remove(vma, &mas); + vma_iter_clear(&vmi, vma->vm_start, vma->vm_end); return 0; } - /* * destroy a VMA record */ @@ -675,9 +624,9 @@ EXPORT_SYMBOL(find_vma_intersection); */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - MA_STATE(mas, &mm->mm_mt, addr, addr); + VMA_ITERATOR(vmi, mm, addr); - return mas_walk(&mas); + return vma_iter_load(&vmi); } EXPORT_SYMBOL(find_vma); @@ -709,9 +658,9 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, { struct vm_area_struct *vma; unsigned long end = addr + len; - MA_STATE(mas, &mm->mm_mt, addr, addr); + VMA_ITERATOR(vmi, mm, addr); - vma = mas_walk(&mas); + vma = vma_iter_load(&vmi); if (!vma) return NULL; if (vma->vm_start != addr) @@ -1001,7 +950,8 @@ static int do_mmap_private(struct vm_area_struct *vma, atomic_long_add(total, &mmap_pages_allocated); - region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; + vm_flags_set(vma, VM_MAPPED_COPY); + region->vm_flags = vma->vm_flags; region->vm_start = (unsigned long) base; region->vm_end = region->vm_start + len; region->vm_top = region->vm_start + (total << PAGE_SHIFT); @@ -1062,7 +1012,7 @@ unsigned long do_mmap(struct file *file, vm_flags_t vm_flags; unsigned long capabilities, result; int ret; - MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); + VMA_ITERATOR(vmi, current->mm, 0); *populate = 0; @@ -1091,14 +1041,14 @@ unsigned long do_mmap(struct file *file, if (!vma) goto error_getting_vma; - if (mas_preallocate(&mas, vma, GFP_KERNEL)) - goto error_maple_preallocate; + if (vma_iter_prealloc(&vmi)) + goto error_vma_iter_prealloc; region->vm_usage = 1; region->vm_flags = vm_flags; region->vm_pgoff = pgoff; - vma->vm_flags = vm_flags; + vm_flags_init(vma, vm_flags); vma->vm_pgoff = pgoff; if (file) { @@ -1162,7 +1112,7 @@ unsigned long do_mmap(struct file *file, vma->vm_end = start + len; if (pregion->vm_flags & VM_MAPPED_COPY) - vma->vm_flags |= VM_MAPPED_COPY; + vm_flags_set(vma, VM_MAPPED_COPY); else { ret = do_mmap_shared_file(vma); if (ret < 0) { @@ -1234,7 +1184,11 @@ unsigned long do_mmap(struct file *file, current->mm->total_vm += len >> PAGE_SHIFT; share: - mas_add_vma_to_mm(&mas, current->mm, vma); + BUG_ON(!vma->vm_region); + setup_vma_to_mm(vma, current->mm); + current->mm->map_count++; + /* add the VMA to the tree */ + vma_iter_store(&vmi, vma); /* we flush the region from the icache only when the first executable * mapping of it is made */ @@ -1250,7 +1204,7 @@ unsigned long do_mmap(struct file *file, error_just_free: up_write(&nommu_region_sem); error: - mas_destroy(&mas); + vma_iter_free(&vmi); if (region->vm_file) fput(region->vm_file); kmem_cache_free(vm_region_jar, region); @@ -1278,7 +1232,7 @@ unsigned long do_mmap(struct file *file, show_free_areas(0, NULL); return -ENOMEM; -error_maple_preallocate: +error_vma_iter_prealloc: kmem_cache_free(vm_region_jar, region); vm_area_free(vma); pr_warn("Allocation of vma tree for process %d failed\n", current->pid); @@ -1344,13 +1298,13 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) * split a vma into two pieces at address 'addr', a new vma is allocated either * for the first part or the tail. */ -int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, +int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vm_area_struct *new; struct vm_region *region; unsigned long npages; - MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); + struct mm_struct *mm; /* we're only permitted to split anonymous regions (these should have * only a single usage on the region) */ @@ -1369,10 +1323,10 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (!new) goto err_vma_dup; - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + if (vma_iter_prealloc(vmi)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); - goto err_mas_preallocate; + goto err_vmi_preallocate; } /* most fields are the same, copy all, and then fixup */ @@ -1406,13 +1360,11 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, setup_vma_to_mm(vma, mm); setup_vma_to_mm(new, mm); - mas_set_range(&mas, vma->vm_start, vma->vm_end - 1); - mas_store(&mas, vma); - vma_mas_store(new, &mas); + vma_iter_store(vmi, new); mm->map_count++; return 0; -err_mas_preallocate: +err_vmi_preallocate: vm_area_free(new); err_vma_dup: kmem_cache_free(vm_region_jar, region); @@ -1423,7 +1375,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, * shrink a VMA by removing the specified chunk from either the beginning or * the end */ -static int shrink_vma(struct mm_struct *mm, +static int vmi_shrink_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long from, unsigned long to) { @@ -1431,14 +1383,19 @@ static int shrink_vma(struct mm_struct *mm, /* adjust the VMA's pointers, which may reposition it in the MM's tree * and list */ - if (delete_vma_from_mm(vma)) + if (vma_iter_prealloc(vmi)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); return -ENOMEM; - if (from > vma->vm_start) + } + + if (from > vma->vm_start) { + vma_iter_clear(vmi, from, vma->vm_end); vma->vm_end = from; - else + } else { + vma_iter_clear(vmi, vma->vm_start, to); vma->vm_start = to; - if (add_vma_to_mm(mm, vma)) - return -ENOMEM; + } /* cut the backing region down to size */ region = vma->vm_region; @@ -1466,7 +1423,7 @@ static int shrink_vma(struct mm_struct *mm, */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { - MA_STATE(mas, &mm->mm_mt, start, start); + VMA_ITERATOR(vmi, mm, start); struct vm_area_struct *vma; unsigned long end; int ret = 0; @@ -1478,7 +1435,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list end = start + len; /* find the first potentially overlapping VMA */ - vma = mas_find(&mas, end - 1); + vma = vma_find(&vmi, end); if (!vma) { static int limit; if (limit < 5) { @@ -1497,7 +1454,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list return -EINVAL; if (end == vma->vm_end) goto erase_whole_vma; - vma = mas_next(&mas, end - 1); + vma = vma_find(&vmi, end); } while (vma); return -EINVAL; } else { @@ -1511,11 +1468,11 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list if (end != vma->vm_end && offset_in_page(end)) return -EINVAL; if (start != vma->vm_start && end != vma->vm_end) { - ret = split_vma(mm, vma, start, 1); + ret = split_vma(&vmi, vma, start, 1); if (ret < 0) return ret; } - return shrink_vma(mm, vma, start, end); + return vmi_shrink_vma(&vmi, vma, start, end); } erase_whole_vma: @@ -1645,7 +1602,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (addr != (pfn << PAGE_SHIFT)) return -EINVAL; - vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); return 0; } EXPORT_SYMBOL(remap_pfn_range); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1276e49b31b0a..044e1eed720ee 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -542,7 +542,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm) struct mmu_gather tlb; mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, - vma, mm, vma->vm_start, + mm, vma->vm_start, vma->vm_end); tlb_gather_mmu(&tlb, mm); if (mmu_notifier_invalidate_range_start_nonblock(&range)) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e91f94b3438be..516b1aa247e83 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2398,15 +2398,15 @@ int write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0; int error; - struct pagevec pvec; - int nr_pages; + struct folio_batch fbatch; + int nr_folios; pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; int range_whole = 0; xa_mark_t tag; - pagevec_init(&pvec); + folio_batch_init(&fbatch); if (wbc->range_cyclic) { index = mapping->writeback_index; /* prev offset */ end = -1; @@ -2426,17 +2426,18 @@ int write_cache_pages(struct address_space *mapping, while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag); - if (nr_pages == 0) + nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch); + + if (nr_folios == 0) break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; - done_index = page->index; + done_index = folio->index; - lock_page(page); + folio_lock(folio); /* * Page truncated or invalidated. We can freely skip it @@ -2446,30 +2447,30 @@ int write_cache_pages(struct address_space *mapping, * even if there is now a new, dirty page at the same * pagecache address. */ - if (unlikely(page->mapping != mapping)) { + if (unlikely(folio->mapping != mapping)) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } - if (PageWriteback(page)) { + if (folio_test_writeback(folio)) { if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); + folio_wait_writeback(folio); else goto continue_unlock; } - BUG_ON(PageWriteback(page)); - if (!clear_page_dirty_for_io(page)) + BUG_ON(folio_test_writeback(folio)); + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); - error = (*writepage)(page, wbc, data); + error = writepage(folio, wbc, data); if (unlikely(error)) { /* * Handle errors according to the type of @@ -2484,11 +2485,12 @@ int write_cache_pages(struct address_space *mapping, * the first error. */ if (error == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); + folio_unlock(folio); error = 0; } else if (wbc->sync_mode != WB_SYNC_ALL) { ret = error; - done_index = page->index + 1; + done_index = folio->index + + folio_nr_pages(folio); done = 1; break; } @@ -2508,7 +2510,7 @@ int write_cache_pages(struct address_space *mapping, break; } } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } @@ -2526,11 +2528,11 @@ int write_cache_pages(struct address_space *mapping, } EXPORT_SYMBOL(write_cache_pages); -static int writepage_cb(struct page *page, struct writeback_control *wbc, +static int writepage_cb(struct folio *folio, struct writeback_control *wbc, void *data) { struct address_space *mapping = data; - int ret = mapping->a_ops->writepage(page, wbc); + int ret = mapping->a_ops->writepage(&folio->page, wbc); mapping_set_error(mapping, ret); return ret; } @@ -2650,7 +2652,7 @@ static void folio_account_dirtied(struct folio *folio, struct bdi_writeback *wb; long nr = folio_nr_pages(folio); - inode_attach_wb(inode, &folio->page); + inode_attach_wb(inode, folio); wb = inode_to_wb(inode); __lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5514d84cc7120..5ebce58026f12 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -430,6 +430,8 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; +bool deferred_struct_pages __meminitdata; + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * During boot we initialize deferred pages on-demand, as needed, but once @@ -775,11 +777,13 @@ void free_compound_page(struct page *page) static void prep_compound_head(struct page *page, unsigned int order) { + struct folio *folio = (struct folio *)page; + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); set_compound_order(page, order); - atomic_set(compound_mapcount_ptr(page), -1); - atomic_set(subpages_mapcount_ptr(page), 0); - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(&folio->_entire_mapcount, -1); + atomic_set(&folio->_nr_pages_mapped, 0); + atomic_set(&folio->_pincount, 0); } static void prep_compound_tail(struct page *head, int tail_idx) @@ -805,7 +809,7 @@ void prep_compound_page(struct page *page, unsigned int order) void destroy_large_folio(struct folio *folio) { - enum compound_dtor_id dtor = folio_page(folio, 1)->compound_dtor; + enum compound_dtor_id dtor = folio->_folio_dtor; VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio); compound_page_dtors[dtor](&folio->page); @@ -1291,6 +1295,7 @@ static inline bool free_page_is_bad(struct page *page) static int free_tail_pages_check(struct page *head_page, struct page *page) { + struct folio *folio = (struct folio *)head_page; int ret = 1; /* @@ -1306,16 +1311,16 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) switch (page - head_page) { case 1: /* the first tail page: these may be in place of ->mapping */ - if (unlikely(head_compound_mapcount(head_page))) { - bad_page(page, "nonzero compound_mapcount"); + if (unlikely(folio_entire_mapcount(folio))) { + bad_page(page, "nonzero entire_mapcount"); goto out; } - if (unlikely(atomic_read(subpages_mapcount_ptr(head_page)))) { - bad_page(page, "nonzero subpages_mapcount"); + if (unlikely(atomic_read(&folio->_nr_pages_mapped))) { + bad_page(page, "nonzero nr_pages_mapped"); goto out; } - if (unlikely(head_compound_pincount(head_page))) { - bad_page(page, "nonzero compound_pincount"); + if (unlikely(atomic_read(&folio->_pincount))) { + bad_page(page, "nonzero pincount"); goto out; } break; @@ -2471,7 +2476,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && !should_skip_init(gfp_flags); bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS); - bool reset_tags = !zero_tags; + bool reset_tags = true; int i; set_page_private(page, 0); @@ -2498,7 +2503,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, * (which happens only when memory should be initialized as well). */ if (zero_tags) { - /* Initialize both memory and tags. */ + /* Initialize both memory and memory tags. */ for (i = 0; i != 1 << order; ++i) tag_clear_highpage(page + i); @@ -2516,14 +2521,15 @@ inline void post_alloc_hook(struct page *page, unsigned int order, } else { /* * KASAN decided to exclude this allocation from being - * poisoned due to sampling. Skip poisoning as well. + * (un)poisoned due to sampling. Make KASAN skip + * poisoning when the allocation is freed. */ SetPageSkipKASanPoison(page); } } /* - * If memory tags have not been set, reset the page tags to ensure - * page_address() dereferencing does not fault. + * If memory tags have not been set by KASAN, reset the page tags to + * ensure page_address() dereferencing does not fault. */ if (reset_tags) { for (i = 0; i != 1 << order; ++i) @@ -3721,10 +3727,20 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, * reserved for high-order atomic allocation, so order-0 * request should skip it. */ - if (order > 0 && alloc_flags & ALLOC_HARDER) + if (alloc_flags & ALLOC_HIGHATOMIC) page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); if (!page) { page = __rmqueue(zone, order, migratetype, alloc_flags); + + /* + * If the allocation fails, allow OOM handling access + * to HIGHATOMIC reserves as failing now is worse than + * failing a high-order atomic allocation in the + * future. + */ + if (!page && (alloc_flags & ALLOC_OOM)) + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (!page) { spin_unlock_irqrestore(&zone->lock, flags); return NULL; @@ -3954,15 +3970,14 @@ ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); static inline long __zone_watermark_unusable_free(struct zone *z, unsigned int order, unsigned int alloc_flags) { - const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); long unusable_free = (1 << order) - 1; /* - * If the caller does not have rights to ALLOC_HARDER then subtract - * the high-atomic reserves. This will over-estimate the size of the - * atomic reserve but it avoids a search. + * If the caller does not have rights to reserves below the min + * watermark then subtract the high-atomic reserves. This will + * over-estimate the size of the atomic reserve but it avoids a search. */ - if (likely(!alloc_harder)) + if (likely(!(alloc_flags & ALLOC_RESERVES))) unusable_free += z->nr_reserved_highatomic; #ifdef CONFIG_CMA @@ -3986,25 +4001,37 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, { long min = mark; int o; - const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); /* free_pages may go negative - that's OK */ free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); - if (alloc_flags & ALLOC_HIGH) - min -= min / 2; + if (unlikely(alloc_flags & ALLOC_RESERVES)) { + /* + * __GFP_HIGH allows access to 50% of the min reserve as well + * as OOM. + */ + if (alloc_flags & ALLOC_MIN_RESERVE) { + min -= min / 2; + + /* + * Non-blocking allocations (e.g. GFP_ATOMIC) can + * access more reserves than just __GFP_HIGH. Other + * non-blocking allocations requests such as GFP_NOWAIT + * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get + * access to the min reserve. + */ + if (alloc_flags & ALLOC_NON_BLOCK) + min -= min / 4; + } - if (unlikely(alloc_harder)) { /* - * OOM victims can try even harder than normal ALLOC_HARDER + * OOM victims can try even harder than the normal reserve * users on the grounds that it's definitely going to be in * the exit path shortly and free memory. Any allocation it * makes during the free path will be small and short-lived. */ if (alloc_flags & ALLOC_OOM) min -= min / 2; - else - min -= min / 4; } /* @@ -4038,8 +4065,10 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, return true; } #endif - if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC)) + if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) && + !free_area_empty(area, MIGRATE_HIGHATOMIC)) { return true; + } } return false; } @@ -4079,13 +4108,14 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags, free_pages)) return true; + /* - * Ignore watermark boosting for GFP_ATOMIC order-0 allocations + * Ignore watermark boosting for __GFP_HIGH order-0 allocations * when checking the min watermark. The min watermark is the * point where boosting is ignored so that kswapd is woken up * when below the low watermark. */ - if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost + if (unlikely(!order && (alloc_flags & ALLOC_MIN_RESERVE) && z->watermark_boost && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { mark = z->_watermark[WMARK_MIN]; return __zone_watermark_ok(z, order, mark, highest_zoneidx, @@ -4259,7 +4289,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, * Watermark failed for this zone, but see if we can * grow this zone if it contains deferred pages. */ - if (static_branch_unlikely(&deferred_pages)) { + if (deferred_pages_enabled()) { if (_deferred_grow_zone(zone, order)) goto try_this_zone; } @@ -4301,14 +4331,14 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, * If this is a high-order atomic allocation then check * if the pageblock should be reserved for the future */ - if (unlikely(order && (alloc_flags & ALLOC_HARDER))) + if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) reserve_highatomic_pageblock(page, zone, order); return page; } else { #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* Try again if zone has deferred pages */ - if (static_branch_unlikely(&deferred_pages)) { + if (deferred_pages_enabled()) { if (_deferred_grow_zone(zone, order)) goto try_this_zone; } @@ -4828,41 +4858,48 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, } static inline unsigned int -gfp_to_alloc_flags(gfp_t gfp_mask) +gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) { unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; /* - * __GFP_HIGH is assumed to be the same as ALLOC_HIGH + * __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD * to save two branches. */ - BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); + BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE); BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD); /* * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). + * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH). */ alloc_flags |= (__force int) (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); - if (gfp_mask & __GFP_ATOMIC) { + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { /* * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. */ - if (!(gfp_mask & __GFP_NOMEMALLOC)) - alloc_flags |= ALLOC_HARDER; + if (!(gfp_mask & __GFP_NOMEMALLOC)) { + alloc_flags |= ALLOC_NON_BLOCK; + + if (order > 0) + alloc_flags |= ALLOC_HIGHATOMIC; + } + /* - * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the - * comment for __cpuset_node_allowed(). + * Ignore cpuset mems for non-blocking __GFP_HIGH (probably + * GFP_ATOMIC) rather than fail, see the comment for + * __cpuset_node_allowed(). */ - alloc_flags &= ~ALLOC_CPUSET; + if (alloc_flags & ALLOC_MIN_RESERVE) + alloc_flags &= ~ALLOC_CPUSET; } else if (unlikely(rt_task(current)) && in_task()) - alloc_flags |= ALLOC_HARDER; + alloc_flags |= ALLOC_MIN_RESERVE; alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); @@ -5043,14 +5080,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int zonelist_iter_cookie; int reserve_flags; - /* - * We also sanity check to catch abuse of atomic reserves being used by - * callers that are not in atomic context. - */ - if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == - (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) - gfp_mask &= ~__GFP_ATOMIC; - restart: compaction_retries = 0; no_progress_loops = 0; @@ -5063,7 +5092,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * kswapd needs to be woken up, and to avoid the cost of setting up * alloc_flags precisely. So we do that now. */ - alloc_flags = gfp_to_alloc_flags(gfp_mask); + alloc_flags = gfp_to_alloc_flags(gfp_mask, order); /* * We need to recalculate the starting point for the zonelist iterator @@ -5291,12 +5320,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, WARN_ON_ONCE_GFP(costly_order, gfp_mask); /* - * Help non-failing allocations by giving them access to memory - * reserves but do not use ALLOC_NO_WATERMARKS because this + * Help non-failing allocations by giving some access to memory + * reserves normally used for high priority non-blocking + * allocations but do not use ALLOC_NO_WATERMARKS because this * could deplete whole memory reserves which would just make - * the situation worse + * the situation worse. */ - page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); + page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac); if (page) goto got_pg; @@ -6776,8 +6806,10 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone if (context == MEMINIT_EARLY) { if (overlap_memmap_init(zone, &pfn)) continue; - if (defer_init(nid, pfn, zone_end_pfn)) + if (defer_init(nid, pfn, zone_end_pfn)) { + deferred_struct_pages = true; break; + } } page = pfn_to_page(pfn); @@ -8584,7 +8616,7 @@ static int page_alloc_cpu_dead(unsigned int cpu) struct zone *zone; lru_add_drain_cpu(cpu); - mlock_page_drain_remote(cpu); + mlock_drain_remote(cpu); drain_pages(cpu); /* diff --git a/mm/page_ext.c b/mm/page_ext.c index 4ee522fd381cb..dc1626be458bf 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -71,6 +71,7 @@ static bool need_page_idle(void) } static struct page_ext_operations page_idle_ops __initdata = { .need = need_page_idle, + .need_shared_flags = true, }; #endif @@ -86,12 +87,12 @@ static struct page_ext_operations *page_ext_ops[] __initdata = { #endif }; -unsigned long page_ext_size = sizeof(struct page_ext); +unsigned long page_ext_size; static unsigned long total_usage; static struct page_ext *lookup_page_ext(const struct page *page); -bool early_page_ext; +bool early_page_ext __meminitdata; static int __init setup_early_page_ext(char *str) { early_page_ext = true; @@ -106,7 +107,16 @@ static bool __init invoke_need_callbacks(void) bool need = false; for (i = 0; i < entries; i++) { - if (page_ext_ops[i]->need && page_ext_ops[i]->need()) { + if (page_ext_ops[i]->need()) { + if (page_ext_ops[i]->need_shared_flags) { + page_ext_size = sizeof(struct page_ext); + break; + } + } + } + + for (i = 0; i < entries; i++) { + if (page_ext_ops[i]->need()) { page_ext_ops[i]->offset = page_ext_size; page_ext_size += page_ext_ops[i]->size; need = true; diff --git a/mm/page_io.c b/mm/page_io.c index 905d9fcc0c969..a805117f7fd73 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -27,7 +27,7 @@ #include #include "swap.h" -static void end_swap_bio_write(struct bio *bio) +static void __end_swap_bio_write(struct bio *bio) { struct page *page = bio_first_page_all(bio); @@ -48,13 +48,17 @@ static void end_swap_bio_write(struct bio *bio) ClearPageReclaim(page); } end_page_writeback(page); +} + +static void end_swap_bio_write(struct bio *bio) +{ + __end_swap_bio_write(bio); bio_put(bio); } -static void end_swap_bio_read(struct bio *bio) +static void __end_swap_bio_read(struct bio *bio) { struct page *page = bio_first_page_all(bio); - struct task_struct *waiter = bio->bi_private; if (bio->bi_status) { SetPageError(page); @@ -62,18 +66,16 @@ static void end_swap_bio_read(struct bio *bio) pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); - goto out; + } else { + SetPageUptodate(page); } - - SetPageUptodate(page); -out: unlock_page(page); - WRITE_ONCE(bio->bi_private, NULL); +} + +static void end_swap_bio_read(struct bio *bio) +{ + __end_swap_bio_read(bio); bio_put(bio); - if (waiter) { - blk_wake_io_task(waiter); - put_task_struct(waiter); - } } int generic_swapfile_activate(struct swap_info_struct *sis, @@ -180,11 +182,11 @@ int generic_swapfile_activate(struct swap_info_struct *sis, int swap_writepage(struct page *page, struct writeback_control *wbc) { struct folio *folio = page_folio(page); - int ret = 0; + int ret; if (folio_free_swap(folio)) { folio_unlock(folio); - goto out; + return 0; } /* * Arch code may have to preserve more data than just the page @@ -194,17 +196,16 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) if (ret) { folio_mark_dirty(folio); folio_unlock(folio); - goto out; + return ret; } if (frontswap_store(&folio->page) == 0) { folio_start_writeback(folio); folio_unlock(folio); folio_end_writeback(folio); - goto out; + return 0; } - ret = __swap_writepage(&folio->page, wbc); -out: - return ret; + __swap_writepage(&folio->page, wbc); + return 0; } static inline void count_swpout_vm_event(struct page *page) @@ -291,7 +292,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret) mempool_free(sio, sio_pool); } -static int swap_writepage_fs(struct page *page, struct writeback_control *wbc) +static void swap_writepage_fs(struct page *page, struct writeback_control *wbc) { struct swap_iocb *sio = NULL; struct swap_info_struct *sis = page_swap_info(page); @@ -328,30 +329,33 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc) } if (wbc->swap_plug) *wbc->swap_plug = sio; - - return 0; } -int __swap_writepage(struct page *page, struct writeback_control *wbc) +static void swap_writepage_bdev_sync(struct page *page, + struct writeback_control *wbc, struct swap_info_struct *sis) { - struct bio *bio; - int ret; - struct swap_info_struct *sis = page_swap_info(page); + struct bio_vec bv; + struct bio bio; - VM_BUG_ON_PAGE(!PageSwapCache(page), page); - /* - * ->flags can be updated non-atomicially (scan_swap_map_slots), - * but that will never affect SWP_FS_OPS, so the data_race - * is safe. - */ - if (data_race(sis->flags & SWP_FS_OPS)) - return swap_writepage_fs(page, wbc); + bio_init(&bio, sis->bdev, &bv, 1, + REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc)); + bio.bi_iter.bi_sector = swap_page_sector(page); + bio_add_page(&bio, page, thp_size(page), 0); - ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); - if (!ret) { - count_swpout_vm_event(page); - return 0; - } + bio_associate_blkg_from_page(&bio, page); + count_swpout_vm_event(page); + + set_page_writeback(page); + unlock_page(page); + + submit_bio_wait(&bio); + __end_swap_bio_write(&bio); +} + +static void swap_writepage_bdev_async(struct page *page, + struct writeback_control *wbc, struct swap_info_struct *sis) +{ + struct bio *bio; bio = bio_alloc(sis->bdev, 1, REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc), @@ -365,8 +369,24 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc) set_page_writeback(page); unlock_page(page); submit_bio(bio); +} - return 0; +void __swap_writepage(struct page *page, struct writeback_control *wbc) +{ + struct swap_info_struct *sis = page_swap_info(page); + + VM_BUG_ON_PAGE(!PageSwapCache(page), page); + /* + * ->flags can be updated non-atomicially (scan_swap_map_slots), + * but that will never affect SWP_FS_OPS, so the data_race + * is safe. + */ + if (data_race(sis->flags & SWP_FS_OPS)) + swap_writepage_fs(page, wbc); + else if (sis->flags & SWP_SYNCHRONOUS_IO) + swap_writepage_bdev_sync(page, wbc, sis); + else + swap_writepage_bdev_async(page, wbc, sis); } void swap_write_unplug(struct swap_iocb *sio) @@ -444,11 +464,41 @@ static void swap_readpage_fs(struct page *page, *plug = sio; } -int swap_readpage(struct page *page, bool synchronous, - struct swap_iocb **plug) +static void swap_readpage_bdev_sync(struct page *page, + struct swap_info_struct *sis) +{ + struct bio_vec bv; + struct bio bio; + + bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ); + bio.bi_iter.bi_sector = swap_page_sector(page); + bio_add_page(&bio, page, thp_size(page), 0); + /* + * Keep this task valid during swap readpage because the oom killer may + * attempt to access it in the page fault retry time check. + */ + get_task_struct(current); + count_vm_event(PSWPIN); + submit_bio_wait(&bio); + __end_swap_bio_read(&bio); + put_task_struct(current); +} + +static void swap_readpage_bdev_async(struct page *page, + struct swap_info_struct *sis) { struct bio *bio; - int ret = 0; + + bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); + bio->bi_iter.bi_sector = swap_page_sector(page); + bio->bi_end_io = end_swap_bio_read; + bio_add_page(bio, page, thp_size(page), 0); + count_vm_event(PSWPIN); + submit_bio(bio); +} + +void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) +{ struct swap_info_struct *sis = page_swap_info(page); bool workingset = PageWorkingset(page); unsigned long pflags; @@ -472,55 +522,19 @@ int swap_readpage(struct page *page, bool synchronous, if (frontswap_load(page) == 0) { SetPageUptodate(page); unlock_page(page); - goto out; - } - - if (data_race(sis->flags & SWP_FS_OPS)) { + } else if (data_race(sis->flags & SWP_FS_OPS)) { swap_readpage_fs(page, plug); - goto out; - } - - if (sis->flags & SWP_SYNCHRONOUS_IO) { - ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); - if (!ret) { - count_vm_event(PSWPIN); - goto out; - } - } - - ret = 0; - bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); - bio->bi_iter.bi_sector = swap_page_sector(page); - bio->bi_end_io = end_swap_bio_read; - bio_add_page(bio, page, thp_size(page), 0); - /* - * Keep this task valid during swap readpage because the oom killer may - * attempt to access it in the page fault retry time check. - */ - if (synchronous) { - get_task_struct(current); - bio->bi_private = current; - } - count_vm_event(PSWPIN); - bio_get(bio); - submit_bio(bio); - while (synchronous) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!READ_ONCE(bio->bi_private)) - break; - - blk_io_schedule(); + } else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) { + swap_readpage_bdev_sync(page, sis); + } else { + swap_readpage_bdev_async(page, sis); } - __set_current_state(TASK_RUNNING); - bio_put(bio); -out: if (workingset) { delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); } delayacct_swapin_end(); - return ret; } void __swap_read_unplug(struct swap_iocb *sio) diff --git a/mm/page_owner.c b/mm/page_owner.c index 2d27f532df4c1..220cdeddc2950 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -48,7 +48,7 @@ static int __init early_page_owner_param(char *buf) int ret = kstrtobool(buf, &page_owner_enabled); if (page_owner_enabled) - stack_depot_want_early_init(); + stack_depot_request_early_init(); return ret; } @@ -99,6 +99,7 @@ struct page_ext_operations page_owner_ops = { .size = sizeof(struct page_owner), .need = need_page_owner, .init = init_page_owner, + .need_shared_flags = true, }; static inline struct page_owner *get_page_owner(struct page_ext *page_ext) @@ -162,6 +163,7 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext, { struct page_owner *page_owner; int i; + u64 ts_nsec = local_clock(); for (i = 0; i < (1 << order); i++) { page_owner = get_page_owner(page_ext); @@ -171,7 +173,7 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext, page_owner->last_migrate_reason = -1; page_owner->pid = current->pid; page_owner->tgid = current->tgid; - page_owner->ts_nsec = local_clock(); + page_owner->ts_nsec = ts_nsec; strscpy(page_owner->comm, current->comm, sizeof(page_owner->comm)); __set_bit(PAGE_EXT_OWNER, &page_ext->flags); diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 93e633c1d5877..25d8610c00429 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -45,6 +45,7 @@ struct page_ext_operations page_table_check_ops = { .size = sizeof(struct page_table_check), .need = need_page_table_check, .init = init_page_table_check, + .need_shared_flags = false, }; static struct page_table_check *get_page_table_check(struct page_ext *page_ext) diff --git a/mm/readahead.c b/mm/readahead.c index b10f0cf81d804..47afbca1d122e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -801,21 +801,25 @@ void readahead_expand(struct readahead_control *ractl, /* Expand the leading edge downwards */ while (ractl->_index > new_index) { unsigned long index = ractl->_index - 1; - struct page *page = xa_load(&mapping->i_pages, index); + struct folio *folio = xa_load(&mapping->i_pages, index); - if (page && !xa_is_value(page)) - return; /* Page apparently present */ + if (folio && !xa_is_value(folio)) + return; /* Folio apparently present */ - page = __page_cache_alloc(gfp_mask); - if (!page) + folio = filemap_alloc_folio(gfp_mask, 0); + if (!folio) return; - if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { - put_page(page); + if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { + folio_put(folio); return; } - + if (unlikely(folio_test_workingset(folio)) && + !ractl->_workingset) { + ractl->_workingset = true; + psi_memstall_enter(&ractl->_pflags); + } ractl->_nr_pages++; - ractl->_index = page->index; + ractl->_index = folio->index; } new_len += new_start - readahead_pos(ractl); @@ -824,19 +828,20 @@ void readahead_expand(struct readahead_control *ractl, /* Expand the trailing edge upwards */ while (ractl->_nr_pages < new_nr_pages) { unsigned long index = ractl->_index + ractl->_nr_pages; - struct page *page = xa_load(&mapping->i_pages, index); + struct folio *folio = xa_load(&mapping->i_pages, index); - if (page && !xa_is_value(page)) - return; /* Page apparently present */ + if (folio && !xa_is_value(folio)) + return; /* Folio apparently present */ - page = __page_cache_alloc(gfp_mask); - if (!page) + folio = filemap_alloc_folio(gfp_mask, 0); + if (!folio) return; - if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { - put_page(page); + if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { + folio_put(folio); return; } - if (unlikely(PageWorkingset(page)) && !ractl->_workingset) { + if (unlikely(folio_test_workingset(folio)) && + !ractl->_workingset) { ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } diff --git a/mm/rmap.c b/mm/rmap.c index ab74e0547a528..8287f2cc327d3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -262,11 +262,12 @@ static inline void unlock_anon_vma_root(struct anon_vma *root) * Attach the anon_vmas from src to dst. * Returns 0 on success, -ENOMEM on failure. * - * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and - * anon_vma_fork(). The first three want an exact copy of src, while the last - * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent - * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call, - * we can identify this case by checking (!dst->anon_vma && src->anon_vma). + * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(), + * copy_vma() and anon_vma_fork(). The first four want an exact copy of src, + * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to + * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before + * call, we can identify this case by checking (!dst->anon_vma && + * src->anon_vma). * * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find * and reuse existing anon_vma which has no vmas and only one child anon_vma. @@ -944,9 +945,8 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) * We have to assume the worse case ie pmd for invalidation. Note that * the folio can not be freed from this function. */ - mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, - 0, vma, vma->vm_mm, address, - vma_address_end(pvmw)); + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, + vma->vm_mm, address, vma_address_end(pvmw)); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(pvmw)) { @@ -1079,26 +1079,26 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, return page_vma_mkclean_one(&pvmw); } -int total_compound_mapcount(struct page *head) +int folio_total_mapcount(struct folio *folio) { - int mapcount = head_compound_mapcount(head); - int nr_subpages; + int mapcount = folio_entire_mapcount(folio); + int nr_pages; int i; - /* In the common case, avoid the loop when no subpages mapped by PTE */ - if (head_subpages_mapcount(head) == 0) + /* In the common case, avoid the loop when no pages mapped by PTE */ + if (folio_nr_pages_mapped(folio) == 0) return mapcount; /* - * Add all the PTE mappings of those subpages mapped by PTE. - * Limit the loop, knowing that only subpages_mapcount are mapped? + * Add all the PTE mappings of those pages mapped by PTE. + * Limit the loop to folio_nr_pages_mapped()? * Perhaps: given all the raciness, that may be a good or a bad idea. */ - nr_subpages = thp_nr_pages(head); - for (i = 0; i < nr_subpages; i++) - mapcount += atomic_read(&head[i]._mapcount); + nr_pages = folio_nr_pages(folio); + for (i = 0; i < nr_pages; i++) + mapcount += atomic_read(&folio_page(folio, i)->_mapcount); /* But each of those _mapcounts was based on -1 */ - mapcount += nr_subpages; + mapcount += nr_pages; return mapcount; } @@ -1132,19 +1132,20 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) /** * __page_set_anon_rmap - set up new anonymous rmap - * @page: Page or Hugepage to add to rmap + * @folio: Folio which contains page. + * @page: Page to add to rmap. * @vma: VM area to add page to. * @address: User virtual address of the mapping * @exclusive: the page is exclusively owned by the current process */ -static void __page_set_anon_rmap(struct page *page, +static void __page_set_anon_rmap(struct folio *folio, struct page *page, struct vm_area_struct *vma, unsigned long address, int exclusive) { struct anon_vma *anon_vma = vma->anon_vma; BUG_ON(!anon_vma); - if (PageAnon(page)) + if (folio_test_anon(folio)) goto out; /* @@ -1156,14 +1157,14 @@ static void __page_set_anon_rmap(struct page *page, anon_vma = anon_vma->root; /* - * page_idle does a lockless/optimistic rmap scan on page->mapping. + * page_idle does a lockless/optimistic rmap scan on folio->mapping. * Make sure the compiler doesn't split the stores of anon_vma and * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code * could mistake the mapping for a struct address_space and crash. */ anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; - WRITE_ONCE(page->mapping, (struct address_space *) anon_vma); - page->index = linear_page_index(vma, address); + WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma); + folio->index = linear_page_index(vma, address); out: if (exclusive) SetPageAnonExclusive(page); @@ -1208,10 +1209,11 @@ static void __page_check_anon_rmap(struct page *page, * and to ensure that PageAnon is not being upgraded racily to PageKsm * (but PageKsm is never downgraded to PageAnon). */ -void page_add_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address, rmap_t flags) +void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, + unsigned long address, rmap_t flags) { - atomic_t *mapped; + struct folio *folio = page_folio(page); + atomic_t *mapped = &folio->_nr_pages_mapped; int nr = 0, nr_pmdmapped = 0; bool compound = flags & RMAP_COMPOUND; bool first = true; @@ -1220,21 +1222,19 @@ void page_add_anon_rmap(struct page *page, if (likely(!compound)) { first = atomic_inc_and_test(&page->_mapcount); nr = first; - if (first && PageCompound(page)) { - mapped = subpages_mapcount_ptr(compound_head(page)); + if (first && folio_test_large(folio)) { nr = atomic_inc_return_relaxed(mapped); nr = (nr < COMPOUND_MAPPED); } - } else if (PageTransHuge(page)) { + } else if (folio_test_pmd_mappable(folio)) { /* That test is redundant: it's for safety or to optimize out */ - first = atomic_inc_and_test(compound_mapcount_ptr(page)); + first = atomic_inc_and_test(&folio->_entire_mapcount); if (first) { - mapped = subpages_mapcount_ptr(page); nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { - nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + nr_pmdmapped = folio_nr_pages(folio); + nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) nr = 0; @@ -1249,58 +1249,57 @@ void page_add_anon_rmap(struct page *page, VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); if (nr_pmdmapped) - __mod_lruvec_page_state(page, NR_ANON_THPS, nr_pmdmapped); + __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped); if (nr) - __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); + __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); - if (likely(!PageKsm(page))) { - /* address might be in next vma when migration races vma_adjust */ + if (likely(!folio_test_ksm(folio))) { + /* address might be in next vma when migration races vma_merge */ if (first) - __page_set_anon_rmap(page, vma, address, + __page_set_anon_rmap(folio, page, vma, address, !!(flags & RMAP_EXCLUSIVE)); else __page_check_anon_rmap(page, vma, address); } - mlock_vma_page(page, vma, compound); + mlock_vma_folio(folio, vma, compound); } /** - * page_add_new_anon_rmap - add mapping to a new anonymous page - * @page: the page to add the mapping to + * folio_add_new_anon_rmap - Add mapping to a new anonymous folio. + * @folio: The folio to add the mapping to. * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped * - * If it's a compound page, it is accounted as a compound page. As the page - * is new, it's assume to get mapped exclusively by a single process. - * - * Same as page_add_anon_rmap but must only be called on *new* pages. + * Like page_add_anon_rmap() but must only be called on *new* folios. * This means the inc-and-test can be bypassed. - * Page does not have to be locked. + * The folio does not have to be locked. + * + * If the folio is large, it is accounted as a THP. As the folio + * is new, it's assumed to be mapped exclusively by a single process. */ -void page_add_new_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address) +void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, + unsigned long address) { int nr; VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); - __SetPageSwapBacked(page); + __folio_set_swapbacked(folio); - if (likely(!PageCompound(page))) { + if (likely(!folio_test_pmd_mappable(folio))) { /* increment count (starts at -1) */ - atomic_set(&page->_mapcount, 0); + atomic_set(&folio->_mapcount, 0); nr = 1; } else { - VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* increment count (starts at -1) */ - atomic_set(compound_mapcount_ptr(page), 0); - atomic_set(subpages_mapcount_ptr(page), COMPOUND_MAPPED); - nr = thp_nr_pages(page); - __mod_lruvec_page_state(page, NR_ANON_THPS, nr); + atomic_set(&folio->_entire_mapcount, 0); + atomic_set(&folio->_nr_pages_mapped, COMPOUND_MAPPED); + nr = folio_nr_pages(folio); + __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr); } - __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); - __page_set_anon_rmap(page, vma, address, 1); + __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); + __page_set_anon_rmap(folio, &folio->page, vma, address, 1); } /** @@ -1311,10 +1310,11 @@ void page_add_new_anon_rmap(struct page *page, * * The caller needs to hold the pte lock. */ -void page_add_file_rmap(struct page *page, - struct vm_area_struct *vma, bool compound) +void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, + bool compound) { - atomic_t *mapped; + struct folio *folio = page_folio(page); + atomic_t *mapped = &folio->_nr_pages_mapped; int nr = 0, nr_pmdmapped = 0; bool first; @@ -1324,21 +1324,19 @@ void page_add_file_rmap(struct page *page, if (likely(!compound)) { first = atomic_inc_and_test(&page->_mapcount); nr = first; - if (first && PageCompound(page)) { - mapped = subpages_mapcount_ptr(compound_head(page)); + if (first && folio_test_large(folio)) { nr = atomic_inc_return_relaxed(mapped); nr = (nr < COMPOUND_MAPPED); } - } else if (PageTransHuge(page)) { + } else if (folio_test_pmd_mappable(folio)) { /* That test is redundant: it's for safety or to optimize out */ - first = atomic_inc_and_test(compound_mapcount_ptr(page)); + first = atomic_inc_and_test(&folio->_entire_mapcount); if (first) { - mapped = subpages_mapcount_ptr(page); nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { - nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + nr_pmdmapped = folio_nr_pages(folio); + nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) nr = 0; @@ -1350,12 +1348,12 @@ void page_add_file_rmap(struct page *page, } if (nr_pmdmapped) - __mod_lruvec_page_state(page, PageSwapBacked(page) ? + __lruvec_stat_mod_folio(folio, folio_test_swapbacked(folio) ? NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped); if (nr) - __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); + __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr); - mlock_vma_page(page, vma, compound); + mlock_vma_folio(folio, vma, compound); } /** @@ -1366,19 +1364,21 @@ void page_add_file_rmap(struct page *page, * * The caller needs to hold the pte lock. */ -void page_remove_rmap(struct page *page, - struct vm_area_struct *vma, bool compound) +void page_remove_rmap(struct page *page, struct vm_area_struct *vma, + bool compound) { - atomic_t *mapped; + struct folio *folio = page_folio(page); + atomic_t *mapped = &folio->_nr_pages_mapped; int nr = 0, nr_pmdmapped = 0; bool last; + enum node_stat_item idx; VM_BUG_ON_PAGE(compound && !PageHead(page), page); /* Hugetlb pages are not counted in NR_*MAPPED */ - if (unlikely(PageHuge(page))) { + if (unlikely(folio_test_hugetlb(folio))) { /* hugetlb pages are always mapped with pmds */ - atomic_dec(compound_mapcount_ptr(page)); + atomic_dec(&folio->_entire_mapcount); return; } @@ -1386,21 +1386,19 @@ void page_remove_rmap(struct page *page, if (likely(!compound)) { last = atomic_add_negative(-1, &page->_mapcount); nr = last; - if (last && PageCompound(page)) { - mapped = subpages_mapcount_ptr(compound_head(page)); + if (last && folio_test_large(folio)) { nr = atomic_dec_return_relaxed(mapped); nr = (nr < COMPOUND_MAPPED); } - } else if (PageTransHuge(page)) { + } else if (folio_test_pmd_mappable(folio)) { /* That test is redundant: it's for safety or to optimize out */ - last = atomic_add_negative(-1, compound_mapcount_ptr(page)); + last = atomic_add_negative(-1, &folio->_entire_mapcount); if (last) { - mapped = subpages_mapcount_ptr(page); nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped); if (likely(nr < COMPOUND_MAPPED)) { - nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + nr_pmdmapped = folio_nr_pages(folio); + nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of another remove and an add? */ if (unlikely(nr < 0)) nr = 0; @@ -1412,32 +1410,37 @@ void page_remove_rmap(struct page *page, } if (nr_pmdmapped) { - __mod_lruvec_page_state(page, PageAnon(page) ? NR_ANON_THPS : - (PageSwapBacked(page) ? NR_SHMEM_PMDMAPPED : - NR_FILE_PMDMAPPED), -nr_pmdmapped); + if (folio_test_anon(folio)) + idx = NR_ANON_THPS; + else if (folio_test_swapbacked(folio)) + idx = NR_SHMEM_PMDMAPPED; + else + idx = NR_FILE_PMDMAPPED; + __lruvec_stat_mod_folio(folio, idx, -nr_pmdmapped); } if (nr) { - __mod_lruvec_page_state(page, PageAnon(page) ? NR_ANON_MAPPED : - NR_FILE_MAPPED, -nr); + idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; + __lruvec_stat_mod_folio(folio, idx, -nr); + /* - * Queue anon THP for deferred split if at least one small - * page of the compound page is unmapped, but at least one - * small page is still mapped. + * Queue anon THP for deferred split if at least one + * page of the folio is unmapped and at least one page + * is still mapped. */ - if (PageTransCompound(page) && PageAnon(page)) + if (folio_test_pmd_mappable(folio) && folio_test_anon(folio)) if (!compound || nr < nr_pmdmapped) - deferred_split_huge_page(compound_head(page)); + deferred_split_folio(folio); } /* - * It would be tidy to reset PageAnon mapping when fully unmapped, - * but that might overwrite a racing page_add_anon_rmap - * which increments mapcount after us but sets mapping - * before us: so leave the reset to free_pages_prepare, - * and remember that it's only reliable while mapped. + * It would be tidy to reset folio_test_anon mapping when fully + * unmapped, but that might overwrite a racing page_add_anon_rmap + * which increments mapcount after us but sets mapping before us: + * so leave the reset to free_pages_prepare, and remember that + * it's only reliable while mapped. */ - munlock_vma_page(page, vma, compound); + munlock_vma_folio(folio, vma, compound); } /* @@ -1475,7 +1478,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * try_to_unmap() must hold a reference on the folio. */ range.end = vma_address_end(&pvmw); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, range.end); if (folio_test_hugetlb(folio)) { /* @@ -1709,17 +1712,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, page_vma_mapped_walk_done(&pvmw); break; } - /* - * Note: We *don't* remember if the page was mapped - * exclusively in the swap pte if the architecture - * doesn't support __HAVE_ARCH_PTE_SWP_EXCLUSIVE. In - * that case, swapin code has to re-determine that - * manually and might detect the page as possibly - * shared, for example, if there are other references on - * the page or if the page is under writeback. We made - * sure that there are no GUP pins on the page that - * would rely on it, so for GUP pins this is fine. - */ if (list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); if (list_empty(&mm->mmlist)) @@ -1763,7 +1755,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) - mlock_page_drain_local(); + mlock_drain_local(); folio_put(folio); } @@ -1850,7 +1842,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * try_to_unmap() must hold a reference on the page. */ range.end = vma_address_end(&pvmw); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, range.end); if (folio_test_hugetlb(folio)) { /* @@ -2104,7 +2096,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) - mlock_page_drain_local(); + mlock_drain_local(); folio_put(folio); } @@ -2180,7 +2172,7 @@ static bool page_make_device_exclusive_one(struct folio *folio, swp_entry_t entry; pte_t swp_pte; - mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma, + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, address, min(vma->vm_end, address + folio_size(folio)), args->owner); @@ -2527,27 +2519,28 @@ void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { + struct folio *folio = page_folio(page); struct anon_vma *anon_vma = vma->anon_vma; int first; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); BUG_ON(!anon_vma); - /* address might be in next vma when migration races vma_adjust */ - first = atomic_inc_and_test(compound_mapcount_ptr(page)); + /* address might be in next vma when migration races vma_merge */ + first = atomic_inc_and_test(&folio->_entire_mapcount); VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); if (first) - __page_set_anon_rmap(page, vma, address, + __page_set_anon_rmap(folio, page, vma, address, !!(flags & RMAP_EXCLUSIVE)); } -void hugepage_add_new_anon_rmap(struct page *page, +void hugepage_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address) { BUG_ON(address < vma->vm_start || address >= vma->vm_end); /* increment count (starts at -1) */ - atomic_set(compound_mapcount_ptr(page), 0); - ClearHPageRestoreReserve(page); - __page_set_anon_rmap(page, vma, address, 1); + atomic_set(&folio->_entire_mapcount, 0); + folio_clear_hugetlb_restore_reserve(folio); + __page_set_anon_rmap(folio, &folio->page, vma, address, 1); } #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/mm/secretmem.c b/mm/secretmem.c index afcf46e99cda5..0b502625cd30e 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -128,7 +128,7 @@ static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) return -EAGAIN; - vma->vm_flags |= VM_LOCKED | VM_DONTDUMP; + vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP); vma->vm_ops = &secretmem_vm_ops; return 0; @@ -190,7 +190,7 @@ static struct vfsmount *secretmem_mnt; static struct file *secretmem_file_create(unsigned long flags) { - struct file *file = ERR_PTR(-ENOMEM); + struct file *file; struct inode *inode; const char *anon_name = "[secretmem]"; const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name)); diff --git a/mm/shmem.c b/mm/shmem.c index f186a69b05c73..28f3c699c8ce7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -468,15 +468,14 @@ static bool shmem_confirm_swap(struct address_space *mapping, static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; -bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, - pgoff_t index, bool shmem_huge_force) +bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, + struct mm_struct *mm, unsigned long vm_flags) { loff_t i_size; if (!S_ISREG(inode->i_mode)) return false; - if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) + if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags))) return false; if (shmem_huge == SHMEM_HUGE_DENY) return false; @@ -493,7 +492,7 @@ bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, return true; fallthrough; case SHMEM_HUGE_ADVISE: - if (vma && (vma->vm_flags & VM_HUGEPAGE)) + if (mm && (vm_flags & VM_HUGEPAGE)) return true; fallthrough; default: @@ -676,8 +675,8 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY -bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, - pgoff_t index, bool shmem_huge_force) +bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, + struct mm_struct *mm, unsigned long vm_flags) { return false; } @@ -1068,7 +1067,7 @@ static int shmem_getattr(struct mnt_idmap *idmap, STATX_ATTR_NODUMP); generic_fillattr(idmap, inode, stat); - if (shmem_is_huge(NULL, inode, 0, false)) + if (shmem_is_huge(inode, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { @@ -1926,7 +1925,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, return 0; } - if (!shmem_is_huge(vma, inode, index, false)) + if (!shmem_is_huge(inode, index, false, + vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0)) goto alloc_nohuge; huge_gfp = vma_thp_gfp_mask(vma); @@ -2304,7 +2304,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return ret; /* arm64 - allow memory tagging on RAM-based files */ - vma->vm_flags |= VM_MTE_ALLOWED; + vm_flags_set(vma, VM_MTE_ALLOWED); file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ @@ -2579,33 +2579,23 @@ shmem_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); struct inode *inode = mapping->host; if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); - if (!PageUptodate(page)) { - struct page *head = compound_head(page); - if (PageTransCompound(page)) { - int i; - - for (i = 0; i < HPAGE_PMD_NR; i++) { - if (head + i == page) - continue; - clear_highpage(head + i); - flush_dcache_page(head + i); - } - } - if (copied < PAGE_SIZE) { - unsigned from = pos & (PAGE_SIZE - 1); - zero_user_segments(page, 0, from, - from + copied, PAGE_SIZE); + if (!folio_test_uptodate(folio)) { + if (copied < folio_size(folio)) { + size_t from = offset_in_folio(folio, pos); + folio_zero_segments(folio, 0, from, + from + copied, folio_size(folio)); } - SetPageUptodate(head); + folio_mark_uptodate(folio); } - set_page_dirty(page); - unlock_page(page); - put_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); return copied; } diff --git a/mm/slub.c b/mm/slub.c index d5f20c0620049..05a7d139bedcc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1592,7 +1592,7 @@ static int __init setup_slub_debug(char *str) } else { slab_list_specified = true; if (flags & SLAB_STORE_USER) - stack_depot_want_early_init(); + stack_depot_request_early_init(); } } @@ -1611,7 +1611,7 @@ static int __init setup_slub_debug(char *str) out: slub_debug = global_flags; if (slub_debug & SLAB_STORE_USER) - stack_depot_want_early_init(); + stack_depot_request_early_init(); if (slub_debug != 0 || slub_debug_string) static_branch_enable(&slub_debug_enabled); else diff --git a/mm/sparse.c b/mm/sparse.c index 2779b419ef2a2..fb7aeb1899a45 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -318,6 +318,7 @@ size_t mem_section_usage_size(void) return sizeof(struct mem_section_usage) + usemap_size(); } +#ifdef CONFIG_MEMORY_HOTREMOVE static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat) { #ifndef CONFIG_NUMA @@ -328,7 +329,6 @@ static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat) #endif } -#ifdef CONFIG_MEMORY_HOTREMOVE static struct mem_section_usage * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, unsigned long size) diff --git a/mm/swap.c b/mm/swap.c index e54e2a252e27e..2a51faa34e641 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -201,7 +201,7 @@ static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) * Is an smp_mb__after_atomic() still required here, before * folio_evictable() tests the mlocked flag, to rule out the possibility * of stranding an evictable folio on an unevictable LRU? I think - * not, because __munlock_page() only clears the mlocked flag + * not, because __munlock_folio() only clears the mlocked flag * while the LRU lock is held. * * (That is not true of __page_cache_release(), and not necessarily @@ -216,7 +216,7 @@ static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) folio_set_unevictable(folio); /* * folio->mlock_count = !!folio_test_mlocked(folio)? - * But that leaves __mlock_page() in doubt whether another + * But that leaves __mlock_folio() in doubt whether another * actor has already counted the mlock or not. Err on the * safe side, underestimate, let page reclaim fix it, rather * than leaving a page on the unevictable LRU indefinitely. @@ -562,7 +562,7 @@ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) - mlock_new_page(&folio->page); + mlock_new_folio(folio); else folio_add_lru(folio); } @@ -781,7 +781,7 @@ void lru_add_drain(void) local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); - mlock_page_drain_local(); + mlock_drain_local(); } /* @@ -796,7 +796,7 @@ static void lru_add_and_bh_lrus_drain(void) lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); invalidate_bh_lrus_cpu(); - mlock_page_drain_local(); + mlock_drain_local(); } void lru_add_drain_cpu_zone(struct zone *zone) @@ -805,7 +805,7 @@ void lru_add_drain_cpu_zone(struct zone *zone) lru_add_drain_cpu(smp_processor_id()); drain_local_pages(zone); local_unlock(&cpu_fbatches.lock); - mlock_page_drain_local(); + mlock_drain_local(); } #ifdef CONFIG_SMP @@ -828,7 +828,7 @@ static bool cpu_needs_drain(unsigned int cpu) folio_batch_count(&fbatches->lru_deactivate) || folio_batch_count(&fbatches->lru_lazyfree) || folio_batch_count(&fbatches->activate) || - need_mlock_page_drain(cpu) || + need_mlock_drain(cpu) || has_bh_in_lru(cpu, NULL); } @@ -1115,16 +1115,6 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) fbatch->nr = j; } -unsigned pagevec_lookup_range_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, pgoff_t end, - xa_mark_t tag) -{ - pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, - PAGEVEC_SIZE, pvec->pages); - return pagevec_count(pvec); -} -EXPORT_SYMBOL(pagevec_lookup_range_tag); - /* * Perform any setup for the swap system */ diff --git a/mm/swap.h b/mm/swap.h index f78065c8ef524..7c033d793f15f 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -8,8 +8,7 @@ /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; -int swap_readpage(struct page *page, bool do_poll, - struct swap_iocb **plug); +void swap_readpage(struct page *page, bool do_poll, struct swap_iocb **plug); void __swap_read_unplug(struct swap_iocb *plug); static inline void swap_read_unplug(struct swap_iocb *plug) { @@ -18,7 +17,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug) } void swap_write_unplug(struct swap_iocb *sio); int swap_writepage(struct page *page, struct writeback_control *wbc); -int __swap_writepage(struct page *page, struct writeback_control *wbc); +void __swap_writepage(struct page *page, struct writeback_control *wbc); /* linux/mm/swap_state.c */ /* One swap address space for each 64M swap space */ @@ -64,10 +63,9 @@ static inline unsigned int folio_swap_flags(struct folio *folio) } #else /* CONFIG_SWAP */ struct swap_iocb; -static inline int swap_readpage(struct page *page, bool do_poll, - struct swap_iocb **plug) +static inline void swap_readpage(struct page *page, bool do_poll, + struct swap_iocb **plug) { - return 0; } static inline void swap_write_unplug(struct swap_iocb *sio) { diff --git a/mm/swap_state.c b/mm/swap_state.c index cb9aaa00951d9..7a003d8abb37b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -94,6 +94,8 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, unsigned long i, nr = folio_nr_pages(folio); void *old; + xas_set_update(&xas, workingset_update_node); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); @@ -145,6 +147,8 @@ void __delete_from_swap_cache(struct folio *folio, pgoff_t idx = swp_offset(entry); XA_STATE(xas, &address_space->i_pages, idx); + xas_set_update(&xas, workingset_update_node); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); @@ -252,6 +256,8 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, struct address_space *address_space = swap_address_space(entry); XA_STATE(xas, &address_space->i_pages, curr); + xas_set_update(&xas, workingset_update_node); + xa_lock_irq(&address_space->i_pages); xas_for_each(&xas, old, end) { if (!xa_is_value(old)) diff --git a/mm/swapfile.c b/mm/swapfile.c index 5f0e468e5a34d..99143875d6f02 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1098,8 +1098,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) spin_unlock(&si->lock); if (n_ret || size == SWAPFILE_CLUSTER) goto check_out; - pr_debug("scan_swap_map of si %d failed to find offset\n", - si->type); cond_resched(); spin_lock(&swap_avail_lock); @@ -3079,7 +3077,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (p->bdev && bdev_stable_writes(p->bdev)) p->flags |= SWP_STABLE_WRITES; - if (p->bdev && p->bdev->bd_disk->fops->rw_page) + if (p->bdev && bdev_synchronous(p->bdev)) p->flags |= SWP_SYNCHRONOUS_IO; if (p->bdev && bdev_nonrot(p->bdev)) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 428e0bee5c9c7..73190fc08845d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -89,17 +89,6 @@ struct vfree_deferred { }; static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); -static void __vunmap(const void *, int); - -static void free_work(struct work_struct *w) -{ - struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); - struct llist_node *t, *llnode; - - llist_for_each_safe(llnode, t, llist_del_all(&p->list)) - __vunmap((void *)llnode, 1); -} - /*** Page table manipulation functions ***/ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, @@ -1589,7 +1578,8 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend, - int node, gfp_t gfp_mask) + int node, gfp_t gfp_mask, + unsigned long va_flags) { struct vmap_area *va; unsigned long freed; @@ -1635,6 +1625,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, va->va_start = addr; va->va_end = addr + size; va->vm = NULL; + va->flags = va_flags; spin_lock(&vmap_area_lock); insert_vmap_area(va, &vmap_area_root, &vmap_area_list); @@ -1913,6 +1904,10 @@ static struct vmap_area *find_unlink_vmap_area(unsigned long addr) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) +#define VMAP_RAM 0x1 /* indicates vm_map_ram area*/ +#define VMAP_BLOCK 0x2 /* mark out the vmap_block sub-type*/ +#define VMAP_FLAGS_MASK 0x3 + struct vmap_block_queue { spinlock_t lock; struct list_head free; @@ -1922,6 +1917,7 @@ struct vmap_block { spinlock_t lock; struct vmap_area *va; unsigned long free, dirty; + DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS); unsigned long dirty_min, dirty_max; /*< dirty range */ struct list_head free_list; struct rcu_head rcu_head; @@ -1987,7 +1983,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, VMALLOC_START, VMALLOC_END, - node, gfp_mask); + node, gfp_mask, + VMAP_RAM|VMAP_BLOCK); if (IS_ERR(va)) { kfree(vb); return ERR_CAST(va); @@ -1998,10 +1995,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) vb->va = va; /* At least something should be left free */ BUG_ON(VMAP_BBMAP_BITS <= (1UL << order)); + bitmap_zero(vb->used_map, VMAP_BBMAP_BITS); vb->free = VMAP_BBMAP_BITS - (1UL << order); vb->dirty = 0; vb->dirty_min = VMAP_BBMAP_BITS; vb->dirty_max = 0; + bitmap_set(vb->used_map, 0, (1UL << order)); INIT_LIST_HEAD(&vb->free_list); vb_idx = addr_to_vb_idx(va->va_start); @@ -2111,6 +2110,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) pages_off = VMAP_BBMAP_BITS - vb->free; vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); vb->free -= 1UL << order; + bitmap_set(vb->used_map, pages_off, (1UL << order)); if (vb->free == 0) { spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); @@ -2144,6 +2144,9 @@ static void vb_free(unsigned long addr, unsigned long size) order = get_order(size); offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr)); + spin_lock(&vb->lock); + bitmap_clear(vb->used_map, offset, (1UL << order)); + spin_unlock(&vb->lock); vunmap_range_noflush(addr, addr + size); @@ -2290,7 +2293,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) } else { struct vmap_area *va; va = alloc_vmap_area(size, PAGE_SIZE, - VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); + VMALLOC_START, VMALLOC_END, + node, GFP_KERNEL, VMAP_RAM); if (IS_ERR(va)) return NULL; @@ -2434,48 +2438,6 @@ static void vmap_init_free_space(void) } } -void __init vmalloc_init(void) -{ - struct vmap_area *va; - struct vm_struct *tmp; - int i; - - /* - * Create the cache for vmap_area objects. - */ - vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); - - for_each_possible_cpu(i) { - struct vmap_block_queue *vbq; - struct vfree_deferred *p; - - vbq = &per_cpu(vmap_block_queue, i); - spin_lock_init(&vbq->lock); - INIT_LIST_HEAD(&vbq->free); - p = &per_cpu(vfree_deferred, i); - init_llist_head(&p->list); - INIT_WORK(&p->wq, free_work); - } - - /* Import existing vmlist entries. */ - for (tmp = vmlist; tmp; tmp = tmp->next) { - va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); - if (WARN_ON_ONCE(!va)) - continue; - - va->va_start = (unsigned long)tmp->addr; - va->va_end = va->va_start + tmp->size; - va->vm = tmp; - insert_vmap_area(va, &vmap_area_root, &vmap_area_list); - } - - /* - * Now we can initialize a free vmap space. - */ - vmap_init_free_space(); - vmap_initialized = true; -} - static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { @@ -2530,7 +2492,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, if (!(flags & VM_NO_GUARD)) size += PAGE_SIZE; - va = alloc_vmap_area(size, align, start, end, node, gfp_mask); + va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0); if (IS_ERR(va)) { kfree(area); return NULL; @@ -2609,20 +2571,6 @@ struct vm_struct *find_vm_area(const void *addr) return va->vm; } -static struct vm_struct *__remove_vm_area(struct vmap_area *va) -{ - struct vm_struct *vm; - - if (!va || !va->vm) - return NULL; - - vm = va->vm; - kasan_free_module_shadow(vm); - free_unmap_vmap_area(va); - - return vm; -} - /** * remove_vm_area - find and remove a continuous kernel virtual area * @addr: base address @@ -2635,10 +2583,27 @@ static struct vm_struct *__remove_vm_area(struct vmap_area *va) */ struct vm_struct *remove_vm_area(const void *addr) { + struct vmap_area *va; + struct vm_struct *vm; + might_sleep(); - return __remove_vm_area( - find_unlink_vmap_area((unsigned long) addr)); + if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", + addr)) + return NULL; + + va = find_unlink_vmap_area((unsigned long)addr); + if (!va || !va->vm) + return NULL; + vm = va->vm; + + debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm)); + debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm)); + kasan_free_module_shadow(vm); + kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm)); + + free_unmap_vmap_area(va); + return vm; } static inline void set_area_direct_map(const struct vm_struct *area, @@ -2652,38 +2617,23 @@ static inline void set_area_direct_map(const struct vm_struct *area, set_direct_map(area->pages[i]); } -/* Handle removing and resetting vm mappings related to the VA's vm_struct. */ -static void va_remove_mappings(struct vmap_area *va, int deallocate_pages) +/* + * Flush the vm mapping and reset the direct map. + */ +static void vm_reset_perms(struct vm_struct *area) { - struct vm_struct *area = va->vm; unsigned long start = ULONG_MAX, end = 0; unsigned int page_order = vm_area_page_order(area); - int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; int flush_dmap = 0; int i; - __remove_vm_area(va); - - /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ - if (!flush_reset) - return; - /* - * If not deallocating pages, just do the flush of the VM area and - * return. - */ - if (!deallocate_pages) { - vm_unmap_aliases(); - return; - } - - /* - * If execution gets here, flush the vm mapping and reset the direct - * map. Find the start and end range of the direct mappings to make sure + * Find the start and end range of the direct mappings to make sure that * the vm_unmap_aliases() flush includes the direct map. */ for (i = 0; i < area->nr_pages; i += 1U << page_order) { unsigned long addr = (unsigned long)page_address(area->pages[i]); + if (addr) { unsigned long page_size; @@ -2704,68 +2654,13 @@ static void va_remove_mappings(struct vmap_area *va, int deallocate_pages) set_area_direct_map(area, set_direct_map_default_noflush); } -static void __vunmap(const void *addr, int deallocate_pages) +static void delayed_vfree_work(struct work_struct *w) { - struct vm_struct *area; - struct vmap_area *va; - - if (!addr) - return; - - if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", - addr)) - return; - - va = find_unlink_vmap_area((unsigned long)addr); - if (unlikely(!va)) { - WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", - addr); - return; - } - - area = va->vm; - debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); - debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); - - kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); - - va_remove_mappings(va, deallocate_pages); - - if (deallocate_pages) { - int i; - - for (i = 0; i < area->nr_pages; i++) { - struct page *page = area->pages[i]; - - BUG_ON(!page); - mod_memcg_page_state(page, MEMCG_VMALLOC, -1); - /* - * High-order allocs for huge vmallocs are split, so - * can be freed as an array of order-0 allocations - */ - __free_pages(page, 0); - cond_resched(); - } - atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); - - kvfree(area->pages); - } - - kfree(area); -} - -static inline void __vfree_deferred(const void *addr) -{ - /* - * Use raw_cpu_ptr() because this can be called from preemptible - * context. Preemption is absolutely fine here, because the llist_add() - * implementation is lockless, so it works even if we are adding to - * another cpu's list. schedule_work() should be fine with this too. - */ - struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); + struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); + struct llist_node *t, *llnode; - if (llist_add((struct llist_node *)addr, &p->list)) - schedule_work(&p->wq); + llist_for_each_safe(llnode, t, llist_del_all(&p->list)) + vfree(llnode); } /** @@ -2777,21 +2672,19 @@ static inline void __vfree_deferred(const void *addr) */ void vfree_atomic(const void *addr) { - BUG_ON(in_nmi()); + struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); + BUG_ON(in_nmi()); kmemleak_free(addr); - if (!addr) - return; - __vfree_deferred(addr); -} - -static void __vfree(const void *addr) -{ - if (unlikely(in_interrupt())) - __vfree_deferred(addr); - else - __vunmap(addr, 1); + /* + * Use raw_cpu_ptr() because this can be called from preemptible + * context. Preemption is absolutely fine here, because the llist_add() + * implementation is lockless, so it works even if we are adding to + * another cpu's list. schedule_work() should be fine with this too. + */ + if (addr && llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); } /** @@ -2813,16 +2706,45 @@ static void __vfree(const void *addr) */ void vfree(const void *addr) { - BUG_ON(in_nmi()); + struct vm_struct *vm; + int i; - kmemleak_free(addr); + if (unlikely(in_interrupt())) { + vfree_atomic(addr); + return; + } - might_sleep_if(!in_interrupt()); + BUG_ON(in_nmi()); + kmemleak_free(addr); + might_sleep(); if (!addr) return; - __vfree(addr); + vm = remove_vm_area(addr); + if (unlikely(!vm)) { + WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", + addr); + return; + } + + if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) + vm_reset_perms(vm); + for (i = 0; i < vm->nr_pages; i++) { + struct page *page = vm->pages[i]; + + BUG_ON(!page); + mod_memcg_page_state(page, MEMCG_VMALLOC, -1); + /* + * High-order allocs for huge vmallocs are split, so + * can be freed as an array of order-0 allocations + */ + __free_pages(page, 0); + cond_resched(); + } + atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); + kvfree(vm->pages); + kfree(vm); } EXPORT_SYMBOL(vfree); @@ -2837,10 +2759,20 @@ EXPORT_SYMBOL(vfree); */ void vunmap(const void *addr) { + struct vm_struct *vm; + BUG_ON(in_interrupt()); might_sleep(); - if (addr) - __vunmap(addr, 0); + + if (!addr) + return; + vm = remove_vm_area(addr); + if (unlikely(!vm)) { + WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n", + addr); + return; + } + kfree(vm); } EXPORT_SYMBOL(vunmap); @@ -2868,6 +2800,9 @@ void *vmap(struct page **pages, unsigned int count, might_sleep(); + if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS)) + return NULL; + /* * Your top guard is someone else's bottom guard. Not having a top * guard compromises someone else's mappings too. @@ -3086,7 +3021,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* * If not enough pages were obtained to accomplish an - * allocation request, free them via __vfree() if any. + * allocation request, free them via vfree() if any. */ if (area->nr_pages != nr_small_pages) { warn_alloc(gfp_mask, NULL, @@ -3126,7 +3061,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - __vfree(area->addr); + vfree(area->addr); return NULL; } @@ -3529,6 +3464,65 @@ static int aligned_vread(char *buf, char *addr, unsigned long count) return copied; } +static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags) +{ + char *start; + struct vmap_block *vb; + unsigned long offset; + unsigned int rs, re, n; + + /* + * If it's area created by vm_map_ram() interface directly, but + * not further subdividing and delegating management to vmap_block, + * handle it here. + */ + if (!(flags & VMAP_BLOCK)) { + aligned_vread(buf, addr, count); + return; + } + + /* + * Area is split into regions and tracked with vmap_block, read out + * each region and zero fill the hole between regions. + */ + vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr)); + + spin_lock(&vb->lock); + if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) { + spin_unlock(&vb->lock); + memset(buf, 0, count); + return; + } + for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) { + if (!count) + break; + start = vmap_block_vaddr(vb->va->va_start, rs); + while (addr < start) { + if (count == 0) + break; + *buf = '\0'; + buf++; + addr++; + count--; + } + /*it could start reading from the middle of used region*/ + offset = offset_in_page(addr); + n = ((re - rs + 1) << PAGE_SHIFT) - offset; + if (n > count) + n = count; + aligned_vread(buf, start+offset, n); + + buf += n; + addr += n; + count -= n; + } + spin_unlock(&vb->lock); + + /* zero-fill the left dirty or free regions */ + if (count) + memset(buf, 0, count); +} + /** * vread() - read vmalloc area in a safe way. * @buf: buffer for reading data @@ -3559,7 +3553,7 @@ long vread(char *buf, char *addr, unsigned long count) struct vm_struct *vm; char *vaddr, *buf_start = buf; unsigned long buflen = count; - unsigned long n; + unsigned long n, size, flags; addr = kasan_reset_tag(addr); @@ -3580,12 +3574,21 @@ long vread(char *buf, char *addr, unsigned long count) if (!count) break; - if (!va->vm) + vm = va->vm; + flags = va->flags & VMAP_FLAGS_MASK; + + if (!vm && !flags) continue; - vm = va->vm; - vaddr = (char *) vm->addr; - if (addr >= vaddr + get_vm_area_size(vm)) + if (vm && (vm->flags & VM_UNINITIALIZED)) + continue; + /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ + smp_rmb(); + + vaddr = (char *) va->va_start; + size = vm ? get_vm_area_size(vm) : va_size(va); + + if (addr >= vaddr + size) continue; while (addr < vaddr) { if (count == 0) @@ -3595,10 +3598,13 @@ long vread(char *buf, char *addr, unsigned long count) addr++; count--; } - n = vaddr + get_vm_area_size(vm) - addr; + n = vaddr + size - addr; if (n > count) n = count; - if (!(vm->flags & VM_IOREMAP)) + + if (flags & VMAP_RAM) + vmap_ram_vread(buf, addr, n, flags); + else if (!(vm->flags & VM_IOREMAP)) aligned_vread(buf, addr, n); else /* IOREMAP area is treated as memory hole */ memset(buf, 0, n); @@ -3676,7 +3682,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, size -= PAGE_SIZE; } while (size > 0); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); return 0; } @@ -4144,11 +4150,7 @@ static int s_show(struct seq_file *m, void *p) va = list_entry(p, struct vmap_area, list); - /* - * s_show can encounter race with remove_vm_area, !vm on behalf - * of vmap area is being tear down or vm_map_ram allocation. - */ - if (!va->vm) { + if (!va->vm && (va->flags & VMAP_RAM)) { seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", (void *)va->va_start, (void *)va->va_end, va->va_end - va->va_start); @@ -4221,3 +4223,46 @@ static int __init proc_vmalloc_init(void) module_init(proc_vmalloc_init); #endif + +void __init vmalloc_init(void) +{ + struct vmap_area *va; + struct vm_struct *tmp; + int i; + + /* + * Create the cache for vmap_area objects. + */ + vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); + + for_each_possible_cpu(i) { + struct vmap_block_queue *vbq; + struct vfree_deferred *p; + + vbq = &per_cpu(vmap_block_queue, i); + spin_lock_init(&vbq->lock); + INIT_LIST_HEAD(&vbq->free); + p = &per_cpu(vfree_deferred, i); + init_llist_head(&p->list); + INIT_WORK(&p->wq, delayed_vfree_work); + } + + /* Import existing vmlist entries. */ + for (tmp = vmlist; tmp; tmp = tmp->next) { + va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (WARN_ON_ONCE(!va)) + continue; + + va->va_start = (unsigned long)tmp->addr; + va->va_end = va->va_start + tmp->size; + va->vm = tmp; + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); + } + + /* + * Now we can initialize a free vmap space. + */ + vmap_init_free_space(); + vmap_initialized = true; +} + diff --git a/mm/vmscan.c b/mm/vmscan.c index 394ff4962cbcf..ac51150d2d364 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3233,6 +3233,98 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; } +/****************************************************************************** + * Bloom filters + ******************************************************************************/ + +/* + * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when + * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of + * bits in a bitmap, k is the number of hash functions and n is the number of + * inserted items. + * + * Page table walkers use one of the two filters to reduce their search space. + * To get rid of non-leaf entries that no longer have enough leaf entries, the + * aging uses the double-buffering technique to flip to the other filter each + * time it produces a new generation. For non-leaf entries that have enough + * leaf entries, the aging carries them over to the next generation in + * walk_pmd_range(); the eviction also report them when walking the rmap + * in lru_gen_look_around(). + * + * For future optimizations: + * 1. It's not necessary to keep both filters all the time. The spare one can be + * freed after the RCU grace period and reallocated if needed again. + * 2. And when reallocating, it's worth scaling its size according to the number + * of inserted entries in the other filter, to reduce the memory overhead on + * small systems and false positives on large systems. + * 3. Jenkins' hash function is an alternative to Knuth's. + */ +#define BLOOM_FILTER_SHIFT 15 + +static inline int filter_gen_from_seq(unsigned long seq) +{ + return seq % NR_BLOOM_FILTERS; +} + +static void get_item_key(void *item, int *key) +{ + u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); + + BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); + + key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); + key[1] = hash >> BLOOM_FILTER_SHIFT; +} + +static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(lruvec->mm_state.filters[gen]); + if (!filter) + return true; + + get_item_key(item, key); + + return test_bit(key[0], filter) && test_bit(key[1], filter); +} + +static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(lruvec->mm_state.filters[gen]); + if (!filter) + return; + + get_item_key(item, key); + + if (!test_bit(key[0], filter)) + set_bit(key[0], filter); + if (!test_bit(key[1], filter)) + set_bit(key[1], filter); +} + +static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) +{ + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = lruvec->mm_state.filters[gen]; + if (filter) { + bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); + return; + } + + filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + WRITE_ONCE(lruvec->mm_state.filters[gen], filter); +} + /****************************************************************************** * mm_struct list ******************************************************************************/ @@ -3352,94 +3444,6 @@ void lru_gen_migrate_mm(struct mm_struct *mm) } #endif -/* - * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when - * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of - * bits in a bitmap, k is the number of hash functions and n is the number of - * inserted items. - * - * Page table walkers use one of the two filters to reduce their search space. - * To get rid of non-leaf entries that no longer have enough leaf entries, the - * aging uses the double-buffering technique to flip to the other filter each - * time it produces a new generation. For non-leaf entries that have enough - * leaf entries, the aging carries them over to the next generation in - * walk_pmd_range(); the eviction also report them when walking the rmap - * in lru_gen_look_around(). - * - * For future optimizations: - * 1. It's not necessary to keep both filters all the time. The spare one can be - * freed after the RCU grace period and reallocated if needed again. - * 2. And when reallocating, it's worth scaling its size according to the number - * of inserted entries in the other filter, to reduce the memory overhead on - * small systems and false positives on large systems. - * 3. Jenkins' hash function is an alternative to Knuth's. - */ -#define BLOOM_FILTER_SHIFT 15 - -static inline int filter_gen_from_seq(unsigned long seq) -{ - return seq % NR_BLOOM_FILTERS; -} - -static void get_item_key(void *item, int *key) -{ - u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); - - BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); - - key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); - key[1] = hash >> BLOOM_FILTER_SHIFT; -} - -static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) -{ - unsigned long *filter; - int gen = filter_gen_from_seq(seq); - - filter = lruvec->mm_state.filters[gen]; - if (filter) { - bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); - return; - } - - filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), - __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); - WRITE_ONCE(lruvec->mm_state.filters[gen], filter); -} - -static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -{ - int key[2]; - unsigned long *filter; - int gen = filter_gen_from_seq(seq); - - filter = READ_ONCE(lruvec->mm_state.filters[gen]); - if (!filter) - return; - - get_item_key(item, key); - - if (!test_bit(key[0], filter)) - set_bit(key[0], filter); - if (!test_bit(key[1], filter)) - set_bit(key[1], filter); -} - -static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) -{ - int key[2]; - unsigned long *filter; - int gen = filter_gen_from_seq(seq); - - filter = READ_ONCE(lruvec->mm_state.filters[gen]); - if (!filter) - return true; - - get_item_key(item, key); - - return test_bit(key[0], filter) && test_bit(key[1], filter); -} - static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) { int i; @@ -3995,8 +3999,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) -static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, - struct mm_walk *args, unsigned long *bitmap, unsigned long *start) +static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, + struct mm_walk *args, unsigned long *bitmap, unsigned long *first) { int i; pmd_t *pmd; @@ -4009,18 +4013,19 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area VM_WARN_ON_ONCE(pud_leaf(*pud)); /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ - if (*start == -1) { - *start = next; + if (*first == -1) { + *first = addr; + bitmap_zero(bitmap, MIN_LRU_BATCH); return; } - i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start); + i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first); if (i && i <= MIN_LRU_BATCH) { __set_bit(i - 1, bitmap); return; } - pmd = pmd_offset(pud, *start); + pmd = pmd_offset(pud, *first); ptl = pmd_lockptr(args->mm, pmd); if (!spin_trylock(ptl)) @@ -4031,15 +4036,16 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area do { unsigned long pfn; struct folio *folio; - unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start; + + /* don't round down the first address */ + addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first; pfn = get_pmd_pfn(pmd[i], vma, addr); if (pfn == -1) goto next; if (!pmd_trans_huge(pmd[i])) { - if (arch_has_hw_nonleaf_pmd_young() && - get_cap(LRU_GEN_NONLEAF_YOUNG)) + if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) pmdp_test_and_clear_young(vma, addr, pmd + i); goto next; } @@ -4068,12 +4074,11 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area arch_leave_lazy_mmu_mode(); spin_unlock(ptl); done: - *start = -1; - bitmap_zero(bitmap, MIN_LRU_BATCH); + *first = -1; } #else -static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, - struct mm_walk *args, unsigned long *bitmap, unsigned long *start) +static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma, + struct mm_walk *args, unsigned long *bitmap, unsigned long *first) { } #endif @@ -4086,9 +4091,9 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, unsigned long next; unsigned long addr; struct vm_area_struct *vma; - unsigned long pos = -1; + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; + unsigned long first = -1; struct lru_gen_mm_walk *walk = args->private; - unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; VM_WARN_ON_ONCE(pud_leaf(*pud)); @@ -4127,18 +4132,17 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) continue; - walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); continue; } #endif walk->mm_stats[MM_NONLEAF_TOTAL]++; - if (arch_has_hw_nonleaf_pmd_young() && - get_cap(LRU_GEN_NONLEAF_YOUNG)) { + if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) { if (!pmd_young(val)) continue; - walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); } if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) @@ -4155,7 +4159,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); } - walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); + walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) goto restart; @@ -4475,6 +4479,10 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, return true; } +/****************************************************************************** + * working set protection + ******************************************************************************/ + static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) { int gen, type, zone; @@ -4565,6 +4573,10 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) } } +/****************************************************************************** + * rmap/PT walk feedback + ******************************************************************************/ + /* * This function exploits spatial locality when shrink_folio_list() walks the * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If @@ -4575,13 +4587,12 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { int i; - pte_t *pte; unsigned long start; unsigned long end; - unsigned long addr; struct lru_gen_mm_walk *walk; int young = 0; - unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; + pte_t *pte = pvmw->pte; + unsigned long addr = pvmw->address; struct folio *folio = pfn_folio(pvmw->pfn); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); @@ -4598,25 +4609,28 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) /* avoid taking the LRU lock under the PTL when possible */ walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; - start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); - end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; + start = max(addr & PMD_MASK, pvmw->vma->vm_start); + end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { - if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) + if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2) end = start + MIN_LRU_BATCH * PAGE_SIZE; - else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) + else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2) start = end - MIN_LRU_BATCH * PAGE_SIZE; else { - start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; - end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; + start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2; + end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2; } } - pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; + /* folio_update_gen() requires stable folio_memcg() */ + if (!mem_cgroup_trylock_pages(memcg)) + return; - rcu_read_lock(); arch_enter_lazy_mmu_mode(); + pte -= (addr - start) / PAGE_SIZE; + for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { unsigned long pfn; @@ -4641,58 +4655,171 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) !folio_test_swapcache(folio))) folio_mark_dirty(folio); + if (walk) { + old_gen = folio_update_gen(folio, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(walk, folio, old_gen, new_gen); + + continue; + } + old_gen = folio_lru_gen(folio); if (old_gen < 0) folio_set_referenced(folio); else if (old_gen != new_gen) - __set_bit(i, bitmap); + folio_activate(folio); } arch_leave_lazy_mmu_mode(); - rcu_read_unlock(); + mem_cgroup_unlock_pages(); /* feedback from rmap walkers to page table walkers */ if (suitable_to_scan(i, young)) update_bloom_filter(lruvec, max_seq, pvmw->pmd); +} - if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { - for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { - folio = pfn_folio(pte_pfn(pte[i])); - folio_activate(folio); - } - return; +/****************************************************************************** + * memcg LRU + ******************************************************************************/ + +/* see the comment on MEMCG_NR_GENS */ +enum { + MEMCG_LRU_NOP, + MEMCG_LRU_HEAD, + MEMCG_LRU_TAIL, + MEMCG_LRU_OLD, + MEMCG_LRU_YOUNG, +}; + +#ifdef CONFIG_MEMCG + +static int lru_gen_memcg_seg(struct lruvec *lruvec) +{ + return READ_ONCE(lruvec->lrugen.seg); +} + +static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) +{ + int seg; + int old, new; + int bin = get_random_u32_below(MEMCG_NR_BINS); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); + + seg = 0; + new = old = lruvec->lrugen.gen; + + /* see the comment on MEMCG_NR_GENS */ + if (op == MEMCG_LRU_HEAD) + seg = MEMCG_LRU_HEAD; + else if (op == MEMCG_LRU_TAIL) + seg = MEMCG_LRU_TAIL; + else if (op == MEMCG_LRU_OLD) + new = get_memcg_gen(pgdat->memcg_lru.seq); + else if (op == MEMCG_LRU_YOUNG) + new = get_memcg_gen(pgdat->memcg_lru.seq + 1); + else + VM_WARN_ON_ONCE(true); + + hlist_nulls_del_rcu(&lruvec->lrugen.list); + + if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) + hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); + else + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); + + pgdat->memcg_lru.nr_memcgs[old]--; + pgdat->memcg_lru.nr_memcgs[new]++; + + lruvec->lrugen.gen = new; + WRITE_ONCE(lruvec->lrugen.seg, seg); + + if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); + + spin_unlock(&pgdat->memcg_lru.lock); +} + +void lru_gen_online_memcg(struct mem_cgroup *memcg) +{ + int gen; + int nid; + int bin = get_random_u32_below(MEMCG_NR_BINS); + + for_each_node(nid) { + struct pglist_data *pgdat = NODE_DATA(nid); + struct lruvec *lruvec = get_lruvec(memcg, nid); + + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); + + gen = get_memcg_gen(pgdat->memcg_lru.seq); + + hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); + pgdat->memcg_lru.nr_memcgs[gen]++; + + lruvec->lrugen.gen = gen; + + spin_unlock(&pgdat->memcg_lru.lock); } +} - /* folio_update_gen() requires stable folio_memcg() */ - if (!mem_cgroup_trylock_pages(memcg)) - return; +void lru_gen_offline_memcg(struct mem_cgroup *memcg) +{ + int nid; - if (!walk) { - spin_lock_irq(&lruvec->lru_lock); - new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); } +} - for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { - folio = pfn_folio(pte_pfn(pte[i])); - if (folio_memcg_rcu(folio) != memcg) - continue; +void lru_gen_release_memcg(struct mem_cgroup *memcg) +{ + int gen; + int nid; - old_gen = folio_update_gen(folio, new_gen); - if (old_gen < 0 || old_gen == new_gen) - continue; + for_each_node(nid) { + struct pglist_data *pgdat = NODE_DATA(nid); + struct lruvec *lruvec = get_lruvec(memcg, nid); - if (walk) - update_batch_size(walk, folio, old_gen, new_gen); - else - lru_gen_update_size(lruvec, folio, old_gen, new_gen); + spin_lock(&pgdat->memcg_lru.lock); + + VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); + + gen = lruvec->lrugen.gen; + + hlist_nulls_del_rcu(&lruvec->lrugen.list); + pgdat->memcg_lru.nr_memcgs[gen]--; + + if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) + WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); + + spin_unlock(&pgdat->memcg_lru.lock); } +} - if (!walk) - spin_unlock_irq(&lruvec->lru_lock); +void lru_gen_soft_reclaim(struct lruvec *lruvec) +{ + /* see the comment on MEMCG_NR_GENS */ + if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD) + lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD); +} - mem_cgroup_unlock_pages(); +#else /* !CONFIG_MEMCG */ + +static int lru_gen_memcg_seg(struct lruvec *lruvec) +{ + return 0; } +#endif + /****************************************************************************** * the eviction ******************************************************************************/ @@ -5385,53 +5512,6 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control * pgdat->kswapd_failures = 0; } -#ifdef CONFIG_MEMCG -void lru_gen_rotate_memcg(struct lruvec *lruvec, int op) -{ - int seg; - int old, new; - int bin = get_random_u32_below(MEMCG_NR_BINS); - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - - spin_lock(&pgdat->memcg_lru.lock); - - VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); - - seg = 0; - new = old = lruvec->lrugen.gen; - - /* see the comment on MEMCG_NR_GENS */ - if (op == MEMCG_LRU_HEAD) - seg = MEMCG_LRU_HEAD; - else if (op == MEMCG_LRU_TAIL) - seg = MEMCG_LRU_TAIL; - else if (op == MEMCG_LRU_OLD) - new = get_memcg_gen(pgdat->memcg_lru.seq); - else if (op == MEMCG_LRU_YOUNG) - new = get_memcg_gen(pgdat->memcg_lru.seq + 1); - else - VM_WARN_ON_ONCE(true); - - hlist_nulls_del_rcu(&lruvec->lrugen.list); - - if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD) - hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); - else - hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]); - - pgdat->memcg_lru.nr_memcgs[old]--; - pgdat->memcg_lru.nr_memcgs[new]++; - - lruvec->lrugen.gen = new; - WRITE_ONCE(lruvec->lrugen.seg, seg); - - if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq)) - WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); - - spin_unlock(&pgdat->memcg_lru.lock); -} -#endif - /****************************************************************************** * state change ******************************************************************************/ @@ -6061,12 +6141,17 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) int i; int nid; + VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo)); + for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); + VM_WARN_ON_ONCE(lruvec->mm_state.nr_walkers); VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, sizeof(lruvec->lrugen.nr_pages))); + lruvec->lrugen.list.next = LIST_POISON1; + for (i = 0; i < NR_BLOOM_FILTERS; i++) { bitmap_free(lruvec->mm_state.filters[i]); lruvec->mm_state.filters[i] = NULL; @@ -6074,67 +6159,6 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) } } -void lru_gen_online_memcg(struct mem_cgroup *memcg) -{ - int gen; - int nid; - int bin = get_random_u32_below(MEMCG_NR_BINS); - - for_each_node(nid) { - struct pglist_data *pgdat = NODE_DATA(nid); - struct lruvec *lruvec = get_lruvec(memcg, nid); - - spin_lock(&pgdat->memcg_lru.lock); - - VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list)); - - gen = get_memcg_gen(pgdat->memcg_lru.seq); - - hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]); - pgdat->memcg_lru.nr_memcgs[gen]++; - - lruvec->lrugen.gen = gen; - - spin_unlock(&pgdat->memcg_lru.lock); - } -} - -void lru_gen_offline_memcg(struct mem_cgroup *memcg) -{ - int nid; - - for_each_node(nid) { - struct lruvec *lruvec = get_lruvec(memcg, nid); - - lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD); - } -} - -void lru_gen_release_memcg(struct mem_cgroup *memcg) -{ - int gen; - int nid; - - for_each_node(nid) { - struct pglist_data *pgdat = NODE_DATA(nid); - struct lruvec *lruvec = get_lruvec(memcg, nid); - - spin_lock(&pgdat->memcg_lru.lock); - - VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list)); - - gen = lruvec->lrugen.gen; - - hlist_nulls_del_rcu(&lruvec->lrugen.list); - pgdat->memcg_lru.nr_memcgs[gen]--; - - if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq)) - WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1); - - spin_unlock(&pgdat->memcg_lru.lock); - } -} - #endif /* CONFIG_MEMCG */ static int __init init_lru_gen(void) diff --git a/mm/workingset.c b/mm/workingset.c index f194d13beabbd..00c6f4d9d9be5 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -657,11 +657,14 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out; } - if (!spin_trylock(&mapping->host->i_lock)) { - xa_unlock(&mapping->i_pages); - spin_unlock_irq(lru_lock); - ret = LRU_RETRY; - goto out; + /* For page cache we need to hold i_lock */ + if (mapping->host != NULL) { + if (!spin_trylock(&mapping->host->i_lock)) { + xa_unlock(&mapping->i_pages); + spin_unlock_irq(lru_lock); + ret = LRU_RETRY; + goto out; + } } list_lru_isolate(lru, item); @@ -683,9 +686,11 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, out_invalid: xa_unlock_irq(&mapping->i_pages); - if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); - spin_unlock(&mapping->host->i_lock); + if (mapping->host != NULL) { + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); + } ret = LRU_REMOVED_RETRY; out: cond_resched(); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 9d27d9b00bceb..3aed46ab7e6cb 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -73,13 +73,6 @@ */ #define ZS_ALIGN 8 -/* - * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single) - * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N. - */ -#define ZS_MAX_ZSPAGE_ORDER 2 -#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) - #define ZS_HANDLE_SIZE (sizeof(unsigned long)) /* @@ -136,10 +129,13 @@ #define HUGE_BITS 1 #define FULLNESS_BITS 2 #define CLASS_BITS 8 -#define ISOLATED_BITS 3 +#define ISOLATED_BITS 5 #define MAGIC_VAL_BITS 8 #define MAX(a, b) ((a) >= (b) ? (a) : (b)) + +#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(CONFIG_ZSMALLOC_CHAIN_SIZE, UL)) + /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ #define ZS_MIN_ALLOC_SIZE \ MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) @@ -822,42 +818,6 @@ static enum fullness_group fix_fullness_group(struct size_class *class, return newfg; } -/* - * We have to decide on how many pages to link together - * to form a zspage for each size class. This is important - * to reduce wastage due to unusable space left at end of - * each zspage which is given as: - * wastage = Zp % class_size - * usage = Zp - wastage - * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... - * - * For example, for size class of 3/8 * PAGE_SIZE, we should - * link together 3 PAGE_SIZE sized pages to form a zspage - * since then we can perfectly fit in 8 such objects. - */ -static int get_pages_per_zspage(int class_size) -{ - int i, max_usedpc = 0; - /* zspage order which gives maximum used size per KB */ - int max_usedpc_order = 1; - - for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { - int zspage_size; - int waste, usedpc; - - zspage_size = i * PAGE_SIZE; - waste = zspage_size % class_size; - usedpc = (zspage_size - waste) * 100 / zspage_size; - - if (usedpc > max_usedpc) { - max_usedpc = usedpc; - max_usedpc_order = i; - } - } - - return max_usedpc_order; -} - static struct zspage *get_zspage(struct page *page) { struct zspage *zspage = (struct zspage *)page_private(page); @@ -2401,6 +2361,27 @@ static int zs_register_shrinker(struct zs_pool *pool) pool->name); } +static int calculate_zspage_chain_size(int class_size) +{ + int i, min_waste = INT_MAX; + int chain_size = 1; + + if (is_power_of_2(class_size)) + return chain_size; + + for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { + int waste; + + waste = (i * PAGE_SIZE) % class_size; + if (waste < min_waste) { + min_waste = waste; + chain_size = i; + } + } + + return chain_size; +} + /** * zs_create_pool - Creates an allocation pool to work from. * @name: pool name to be created @@ -2445,7 +2426,7 @@ struct zs_pool *zs_create_pool(const char *name) size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; if (size > ZS_MAX_ALLOC_SIZE) size = ZS_MAX_ALLOC_SIZE; - pages_per_zspage = get_pages_per_zspage(size); + pages_per_zspage = calculate_zspage_chain_size(size); objs_per_zspage = pages_per_zspage * PAGE_SIZE / size; /* diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c07ee4a025baf..288693981b006 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1891,10 +1891,10 @@ int tcp_mmap(struct file *file, struct socket *sock, { if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; - vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); + vm_flags_clear(vma, VM_MAYWRITE | VM_MAYEXEC); /* Instruct vm_insert_page() to not mmap_read_lock(mm) */ - vma->vm_flags |= VM_MIXEDMAP; + vm_flags_set(vma, VM_MIXEDMAP); vma->vm_ops = &tcp_vm_ops; return 0; diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 0a0e4c283f02e..cfac54cbafdf9 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -1012,7 +1012,7 @@ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, mask = flow_mask_find(tbl, new); if (!mask) { - /* Allocate a new mask if none exsits. */ + /* Allocate a new mask if none exists. */ mask = mask_alloc(); if (!mask) return -ENOMEM; diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter index f9553f60a14a8..36303afa9dfc3 100755 --- a/scripts/bloat-o-meter +++ b/scripts/bloat-o-meter @@ -80,8 +80,7 @@ def calc(oldfile, newfile, format): if d<0: shrink, down = shrink+1, down-d delta.append((d, name)) - delta.sort() - delta.reverse() + delta.sort(reverse=True) return grow, shrink, add, remove, up, down, delta, old, new, otot, ntot def print_result(symboltype, symbolformat): diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 78cc595b98ce1..bd44d12965c98 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -823,7 +823,9 @@ sub find_standard_signature { "get_state_synchronize_sched" => "get_state_synchronize_rcu", "cond_synchronize_sched" => "cond_synchronize_rcu", "kmap" => "kmap_local_page", + "kunmap" => "kunmap_local", "kmap_atomic" => "kmap_local_page", + "kunmap_atomic" => "kunmap_local", ); #Create a search pattern for all these strings to speed up a loop below @@ -3142,21 +3144,33 @@ sub process { if ($sign_off =~ /^co-developed-by:$/i) { if ($email eq $author) { WARN("BAD_SIGN_OFF", - "Co-developed-by: should not be used to attribute nominal patch author '$author'\n" . "$here\n" . $rawline); + "Co-developed-by: should not be used to attribute nominal patch author '$author'\n" . $herecurr); } if (!defined $lines[$linenr]) { WARN("BAD_SIGN_OFF", - "Co-developed-by: must be immediately followed by Signed-off-by:\n" . "$here\n" . $rawline); - } elsif ($rawlines[$linenr] !~ /^\s*signed-off-by:\s*(.*)/i) { + "Co-developed-by: must be immediately followed by Signed-off-by:\n" . $herecurr); + } elsif ($rawlines[$linenr] !~ /^signed-off-by:\s*(.*)/i) { WARN("BAD_SIGN_OFF", - "Co-developed-by: must be immediately followed by Signed-off-by:\n" . "$here\n" . $rawline . "\n" .$rawlines[$linenr]); + "Co-developed-by: must be immediately followed by Signed-off-by:\n" . $herecurr . $rawlines[$linenr] . "\n"); } elsif ($1 ne $email) { WARN("BAD_SIGN_OFF", - "Co-developed-by and Signed-off-by: name/email do not match \n" . "$here\n" . $rawline . "\n" .$rawlines[$linenr]); + "Co-developed-by and Signed-off-by: name/email do not match\n" . $herecurr . $rawlines[$linenr] . "\n"); + } + } + +# check if Reported-by: is followed by a Link: + if ($sign_off =~ /^reported(?:|-and-tested)-by:$/i) { + if (!defined $lines[$linenr]) { + WARN("BAD_REPORTED_BY_LINK", + "Reported-by: should be immediately followed by Link: to the report\n" . $herecurr . $rawlines[$linenr] . "\n"); + } elsif ($rawlines[$linenr] !~ m{^link:\s*https?://}i) { + WARN("BAD_REPORTED_BY_LINK", + "Reported-by: should be immediately followed by Link: with a URL to the report\n" . $herecurr . $rawlines[$linenr] . "\n"); } } } + # Check Fixes: styles is correct if (!$in_header_lines && $line =~ /^\s*fixes:?\s*(?:commit\s*)?[0-9a-f]{5,}\b/i) { @@ -3250,6 +3264,18 @@ sub process { $commit_log_possible_stack_dump = 0; } +# Check for odd tags before a URI/URL + if ($in_commit_log && + $line =~ /^\s*(\w+):\s*http/ && $1 ne 'Link') { + if ($1 =~ /^v(?:ersion)?\d+/i) { + WARN("COMMIT_LOG_VERSIONING", + "Patch version information should be after the --- line\n" . $herecurr); + } else { + WARN("COMMIT_LOG_USE_LINK", + "Unknown link reference '$1:', use 'Link:' instead\n" . $herecurr); + } + } + # Check for lines starting with a # if ($in_commit_log && $line =~ /^#/) { if (WARN("COMMIT_COMMENT_SYMBOL", @@ -3725,7 +3751,7 @@ sub process { } # check for embedded filenames - if ($rawline =~ /^\+.*\Q$realfile\E/) { + if ($rawline =~ /^\+.*\b\Q$realfile\E\b/) { WARN("EMBEDDED_FILENAME", "It's generally not useful to have the filename in the file\n" . $herecurr); } diff --git a/scripts/gdb/linux/mm.py b/scripts/gdb/linux/mm.py new file mode 100644 index 0000000000000..30d837f3dfae1 --- /dev/null +++ b/scripts/gdb/linux/mm.py @@ -0,0 +1,222 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# gdb helper commands and functions for Linux kernel debugging +# +# routines to introspect page table +# +# Authors: +# Dmitrii Bundin +# + +import gdb + +from linux import utils + +PHYSICAL_ADDRESS_MASK = gdb.parse_and_eval('0xfffffffffffff') + + +def page_mask(level=1): + # 4KB + if level == 1: + return gdb.parse_and_eval('(u64) ~0xfff') + # 2MB + elif level == 2: + return gdb.parse_and_eval('(u64) ~0x1fffff') + # 1GB + elif level == 3: + return gdb.parse_and_eval('(u64) ~0x3fffffff') + else: + raise Exception(f'Unknown page level: {level}') + + +#page_offset_base in case CONFIG_DYNAMIC_MEMORY_LAYOUT is disabled +POB_NO_DYNAMIC_MEM_LAYOUT = '0xffff888000000000' +def _page_offset_base(): + pob_symbol = gdb.lookup_global_symbol('page_offset_base') + pob = pob_symbol.name if pob_symbol else POB_NO_DYNAMIC_MEM_LAYOUT + return gdb.parse_and_eval(pob) + + +def is_bit_defined_tupled(data, offset): + return offset, bool(data >> offset & 1) + +def content_tupled(data, bit_start, bit_end): + return (bit_start, bit_end), data >> bit_start & ((1 << (1 + bit_end - bit_start)) - 1) + +def entry_va(level, phys_addr, translating_va): + def start_bit(level): + if level == 5: + return 48 + elif level == 4: + return 39 + elif level == 3: + return 30 + elif level == 2: + return 21 + elif level == 1: + return 12 + else: + raise Exception(f'Unknown level {level}') + + entry_offset = ((translating_va >> start_bit(level)) & 511) * 8 + entry_va = _page_offset_base() + phys_addr + entry_offset + return entry_va + +class Cr3(): + def __init__(self, cr3, page_levels): + self.cr3 = cr3 + self.page_levels = page_levels + self.page_level_write_through = is_bit_defined_tupled(cr3, 3) + self.page_level_cache_disabled = is_bit_defined_tupled(cr3, 4) + self.next_entry_physical_address = cr3 & PHYSICAL_ADDRESS_MASK & page_mask() + + def next_entry(self, va): + next_level = self.page_levels + return PageHierarchyEntry(entry_va(next_level, self.next_entry_physical_address, va), next_level) + + def mk_string(self): + return f"""\ +cr3: + {'cr3 binary data': <30} {hex(self.cr3)} + {'next entry physical address': <30} {hex(self.next_entry_physical_address)} + --- + {'bit' : <4} {self.page_level_write_through[0]: <10} {'page level write through': <30} {self.page_level_write_through[1]} + {'bit' : <4} {self.page_level_cache_disabled[0]: <10} {'page level cache disabled': <30} {self.page_level_cache_disabled[1]} +""" + + +class PageHierarchyEntry(): + def __init__(self, address, level): + data = int.from_bytes( + memoryview(gdb.selected_inferior().read_memory(address, 8)), + "little" + ) + if level == 1: + self.is_page = True + self.entry_present = is_bit_defined_tupled(data, 0) + self.read_write = is_bit_defined_tupled(data, 1) + self.user_access_allowed = is_bit_defined_tupled(data, 2) + self.page_level_write_through = is_bit_defined_tupled(data, 3) + self.page_level_cache_disabled = is_bit_defined_tupled(data, 4) + self.entry_was_accessed = is_bit_defined_tupled(data, 5) + self.dirty = is_bit_defined_tupled(data, 6) + self.pat = is_bit_defined_tupled(data, 7) + self.global_translation = is_bit_defined_tupled(data, 8) + self.page_physical_address = data & PHYSICAL_ADDRESS_MASK & page_mask(level) + self.next_entry_physical_address = None + self.hlat_restart_with_ordinary = is_bit_defined_tupled(data, 11) + self.protection_key = content_tupled(data, 59, 62) + self.executed_disable = is_bit_defined_tupled(data, 63) + else: + page_size = is_bit_defined_tupled(data, 7) + page_size_bit = page_size[1] + self.is_page = page_size_bit + self.entry_present = is_bit_defined_tupled(data, 0) + self.read_write = is_bit_defined_tupled(data, 1) + self.user_access_allowed = is_bit_defined_tupled(data, 2) + self.page_level_write_through = is_bit_defined_tupled(data, 3) + self.page_level_cache_disabled = is_bit_defined_tupled(data, 4) + self.entry_was_accessed = is_bit_defined_tupled(data, 5) + self.page_size = page_size + self.dirty = is_bit_defined_tupled( + data, 6) if page_size_bit else None + self.global_translation = is_bit_defined_tupled( + data, 8) if page_size_bit else None + self.pat = is_bit_defined_tupled( + data, 12) if page_size_bit else None + self.page_physical_address = data & PHYSICAL_ADDRESS_MASK & page_mask(level) if page_size_bit else None + self.next_entry_physical_address = None if page_size_bit else data & PHYSICAL_ADDRESS_MASK & page_mask() + self.hlat_restart_with_ordinary = is_bit_defined_tupled(data, 11) + self.protection_key = content_tupled(data, 59, 62) if page_size_bit else None + self.executed_disable = is_bit_defined_tupled(data, 63) + self.address = address + self.page_entry_binary_data = data + self.page_hierarchy_level = level + + def next_entry(self, va): + if self.is_page or not self.entry_present[1]: + return None + + next_level = self.page_hierarchy_level - 1 + return PageHierarchyEntry(entry_va(next_level, self.next_entry_physical_address, va), next_level) + + + def mk_string(self): + if not self.entry_present[1]: + return f"""\ +level {self.page_hierarchy_level}: + {'entry address': <30} {hex(self.address)} + {'page entry binary data': <30} {hex(self.page_entry_binary_data)} + --- + PAGE ENTRY IS NOT PRESENT! +""" + elif self.is_page: + def page_size_line(ps_bit, ps, level): + return "" if level == 1 else f"{'bit': <3} {ps_bit: <5} {'page size': <30} {ps}" + + return f"""\ +level {self.page_hierarchy_level}: + {'entry address': <30} {hex(self.address)} + {'page entry binary data': <30} {hex(self.page_entry_binary_data)} + {'page size': <30} {'1GB' if self.page_hierarchy_level == 3 else '2MB' if self.page_hierarchy_level == 2 else '4KB' if self.page_hierarchy_level == 1 else 'Unknown page size for level:' + self.page_hierarchy_level} + {'page physical address': <30} {hex(self.page_physical_address)} + --- + {'bit': <4} {self.entry_present[0]: <10} {'entry present': <30} {self.entry_present[1]} + {'bit': <4} {self.read_write[0]: <10} {'read/write access allowed': <30} {self.read_write[1]} + {'bit': <4} {self.user_access_allowed[0]: <10} {'user access allowed': <30} {self.user_access_allowed[1]} + {'bit': <4} {self.page_level_write_through[0]: <10} {'page level write through': <30} {self.page_level_write_through[1]} + {'bit': <4} {self.page_level_cache_disabled[0]: <10} {'page level cache disabled': <30} {self.page_level_cache_disabled[1]} + {'bit': <4} {self.entry_was_accessed[0]: <10} {'entry has been accessed': <30} {self.entry_was_accessed[1]} + {"" if self.page_hierarchy_level == 1 else f"{'bit': <4} {self.page_size[0]: <10} {'page size': <30} {self.page_size[1]}"} + {'bit': <4} {self.dirty[0]: <10} {'page dirty': <30} {self.dirty[1]} + {'bit': <4} {self.global_translation[0]: <10} {'global translation': <30} {self.global_translation[1]} + {'bit': <4} {self.hlat_restart_with_ordinary[0]: <10} {'restart to ordinary': <30} {self.hlat_restart_with_ordinary[1]} + {'bit': <4} {self.pat[0]: <10} {'pat': <30} {self.pat[1]} + {'bits': <4} {str(self.protection_key[0]): <10} {'protection key': <30} {self.protection_key[1]} + {'bit': <4} {self.executed_disable[0]: <10} {'execute disable': <30} {self.executed_disable[1]} +""" + else: + return f"""\ +level {self.page_hierarchy_level}: + {'entry address': <30} {hex(self.address)} + {'page entry binary data': <30} {hex(self.page_entry_binary_data)} + {'next entry physical address': <30} {hex(self.next_entry_physical_address)} + --- + {'bit': <4} {self.entry_present[0]: <10} {'entry present': <30} {self.entry_present[1]} + {'bit': <4} {self.read_write[0]: <10} {'read/write access allowed': <30} {self.read_write[1]} + {'bit': <4} {self.user_access_allowed[0]: <10} {'user access allowed': <30} {self.user_access_allowed[1]} + {'bit': <4} {self.page_level_write_through[0]: <10} {'page level write through': <30} {self.page_level_write_through[1]} + {'bit': <4} {self.page_level_cache_disabled[0]: <10} {'page level cache disabled': <30} {self.page_level_cache_disabled[1]} + {'bit': <4} {self.entry_was_accessed[0]: <10} {'entry has been accessed': <30} {self.entry_was_accessed[1]} + {'bit': <4} {self.page_size[0]: <10} {'page size': <30} {self.page_size[1]} + {'bit': <4} {self.hlat_restart_with_ordinary[0]: <10} {'restart to ordinary': <30} {self.hlat_restart_with_ordinary[1]} + {'bit': <4} {self.executed_disable[0]: <10} {'execute disable': <30} {self.executed_disable[1]} +""" + + +class TranslateVM(gdb.Command): + """Prints the entire paging structure used to translate a given virtual address. + +Having an address space of the currently executed process translates the virtual address +and prints detailed information of all paging structure levels used for the transaltion. +Currently supported arch: x86""" + + def __init__(self): + super(TranslateVM, self).__init__('translate-vm', gdb.COMMAND_USER) + + def invoke(self, arg, from_tty): + if utils.is_target_arch("x86"): + vm_address = gdb.parse_and_eval(f'{arg}') + cr3_data = gdb.parse_and_eval('$cr3') + cr4 = gdb.parse_and_eval('$cr4') + page_levels = 5 if cr4 & (1 << 12) else 4 + page_entry = Cr3(cr3_data, page_levels) + while page_entry: + gdb.write(page_entry.mk_string()) + page_entry = page_entry.next_entry(vm_address) + else: + gdb.GdbError("Virtual address translation is not" + "supported for this arch") + + +TranslateVM() diff --git a/scripts/gdb/vmlinux-gdb.py b/scripts/gdb/vmlinux-gdb.py index 3e8d3669f0ce0..3a5b44cd6bfe4 100644 --- a/scripts/gdb/vmlinux-gdb.py +++ b/scripts/gdb/vmlinux-gdb.py @@ -37,3 +37,4 @@ import linux.clk import linux.genpd import linux.device + import linux.mm diff --git a/scripts/spelling.txt b/scripts/spelling.txt index ded8bcfc02475..f8bd6178d17b0 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -65,6 +65,7 @@ acumulative||accumulative acumulator||accumulator acutally||actually adapater||adapter +adderted||asserted addional||additional additionaly||additionally additonal||additional @@ -122,6 +123,7 @@ alue||value ambigious||ambiguous ambigous||ambiguous amoung||among +amount of times||number of times amout||amount amplifer||amplifier amplifyer||amplifier @@ -287,6 +289,7 @@ capapbilities||capabilities caputure||capture carefuly||carefully cariage||carriage +casued||caused catagory||category cehck||check challange||challenge @@ -370,6 +373,7 @@ conbination||combination conditionaly||conditionally conditon||condition condtion||condition +condtional||conditional conected||connected conector||connector configration||configuration @@ -423,6 +427,7 @@ cound||could couter||counter coutner||counter cryptocraphic||cryptographic +cummulative||cumulative cunter||counter curently||currently cylic||cyclic @@ -625,8 +630,10 @@ exeuction||execution existance||existence existant||existent exixt||exist +exsits||exists exlcude||exclude exlcusive||exclusive +exlusive||exclusive exmaple||example expecially||especially experies||expires @@ -664,11 +671,13 @@ feauture||feature feautures||features fetaure||feature fetaures||features +fetcing||fetching fileystem||filesystem fimrware||firmware fimware||firmware firmare||firmware firmaware||firmware +firtly||firstly firware||firmware firwmare||firmware finanize||finalize @@ -838,6 +847,7 @@ integrety||integrity integrey||integrity intendet||intended intented||intended +interal||internal interanl||internal interchangable||interchangeable interferring||interfering @@ -1023,6 +1033,7 @@ negotation||negotiation nerver||never nescessary||necessary nessessary||necessary +none existent||non-existent noticable||noticeable notication||notification notications||notifications @@ -1044,6 +1055,7 @@ occured||occurred occurence||occurrence occure||occurred occuring||occurring +ocurrence||occurrence offser||offset offet||offset offlaod||offload @@ -1055,6 +1067,7 @@ omitt||omit ommiting||omitting ommitted||omitted onself||oneself +onthe||on the ony||only openning||opening operatione||operation @@ -1121,6 +1134,7 @@ perfomring||performing periperal||peripheral peripherial||peripheral permissons||permissions +permited||permitted peroid||period persistance||persistence persistant||persistent @@ -1334,6 +1348,7 @@ sacrifying||sacrificing safly||safely safty||safety satify||satisfy +satisifed||satisfied savable||saveable scaleing||scaling scaned||scanned @@ -1558,6 +1573,7 @@ tunning||tuning ture||true tyep||type udpate||update +updtes||updates uesd||used unknwon||unknown uknown||unknown @@ -1614,6 +1630,7 @@ unuseful||useless unvalid||invalid upate||update upsupported||unsupported +upto||up to useable||usable usefule||useful usefull||useful diff --git a/scripts/tags.sh b/scripts/tags.sh index 14d0209893e9b..da4d404cd1f09 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -271,10 +271,12 @@ exuberant() --$CTAGS_EXTRA=+fq --c-kinds=+px --fields=+iaS --langmap=c:+.h \ "${regex[@]}" - setup_regex exuberant kconfig - all_kconfigs | xargs $1 -a \ - --langdef=kconfig --language-force=kconfig "${regex[@]}" - + KCONFIG_ARGS=() + if ! $1 --list-languages | grep -iq kconfig; then + setup_regex exuberant kconfig + KCONFIG_ARGS=(--langdef=kconfig --language-force=kconfig "${regex[@]}") + fi + all_kconfigs | xargs $1 -a "${KCONFIG_ARGS[@]}" } emacs() diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 0a6894cdc54d9..18498979a6409 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -262,7 +262,7 @@ static int sel_mmap_handle_status(struct file *filp, if (vma->vm_flags & VM_WRITE) return -EPERM; /* disallow mprotect() turns it into writable */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); return remap_pfn_range(vma, vma->vm_start, page_to_pfn(status), @@ -506,13 +506,13 @@ static int sel_mmap_policy(struct file *filp, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHARED) { /* do not allow mprotect to make mapping writable */ - vma->vm_flags &= ~VM_MAYWRITE; + vm_flags_clear(vma, VM_MAYWRITE); if (vma->vm_flags & VM_WRITE) return -EACCES; } - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &sel_mmap_policy_ops; return 0; diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index ac2efeb63a396..728c211142d14 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -2910,7 +2910,7 @@ static int snd_pcm_oss_mmap(struct file *file, struct vm_area_struct *area) } /* set VM_READ access as well to fix memset() routines that do reads before writes (to improve performance) */ - area->vm_flags |= VM_READ; + vm_flags_set(area, VM_READ); if (substream == NULL) return -ENXIO; runtime = substream->runtime; diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 9c122e757efe8..331380c2438bc 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -3675,8 +3675,9 @@ static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file return -EINVAL; area->vm_ops = &snd_pcm_vm_ops_status; area->vm_private_data = substream; - area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; - area->vm_flags &= ~(VM_WRITE | VM_MAYWRITE); + vm_flags_mod(area, VM_DONTEXPAND | VM_DONTDUMP, + VM_WRITE | VM_MAYWRITE); + return 0; } @@ -3712,7 +3713,7 @@ static int snd_pcm_mmap_control(struct snd_pcm_substream *substream, struct file return -EINVAL; area->vm_ops = &snd_pcm_vm_ops_control; area->vm_private_data = substream; - area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP); return 0; } @@ -3828,7 +3829,7 @@ static const struct vm_operations_struct snd_pcm_vm_ops_data_fault = { int snd_pcm_lib_default_mmap(struct snd_pcm_substream *substream, struct vm_area_struct *area) { - area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP); if (!substream->ops->page && !snd_dma_buffer_mmap(snd_pcm_get_dma_buf(substream), area)) return 0; diff --git a/sound/soc/fsl/fsl-asoc-card.c b/sound/soc/fsl/fsl-asoc-card.c index 8099a829c304b..cdfca9fd1eb00 100644 --- a/sound/soc/fsl/fsl-asoc-card.c +++ b/sound/soc/fsl/fsl-asoc-card.c @@ -811,7 +811,7 @@ static int fsl_asoc_card_probe(struct platform_device *pdev) priv->card.num_links = 1; if (asrc_pdev) { - /* DPCM DAI Links only if ASRC exsits */ + /* DPCM DAI Links only if ASRC exists */ priv->dai_link[1].cpus->of_node = asrc_np; priv->dai_link[1].platforms->of_node = asrc_np; priv->dai_link[2].codecs->dai_name = codec_dai_name; diff --git a/sound/soc/pxa/mmp-sspa.c b/sound/soc/pxa/mmp-sspa.c index fb5a4390443fe..b3c1744eff91a 100644 --- a/sound/soc/pxa/mmp-sspa.c +++ b/sound/soc/pxa/mmp-sspa.c @@ -404,7 +404,7 @@ static int mmp_pcm_mmap(struct snd_soc_component *component, struct snd_pcm_substream *substream, struct vm_area_struct *vma) { - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); return remap_pfn_range(vma, vma->vm_start, substream->dma_buffer.addr >> PAGE_SHIFT, diff --git a/sound/usb/usx2y/us122l.c b/sound/usb/usx2y/us122l.c index e558931cce16e..709ccad972e2f 100644 --- a/sound/usb/usx2y/us122l.c +++ b/sound/usb/usx2y/us122l.c @@ -224,9 +224,9 @@ static int usb_stream_hwdep_mmap(struct snd_hwdep *hw, } area->vm_ops = &usb_stream_hwdep_vm_ops; - area->vm_flags |= VM_DONTDUMP; + vm_flags_set(area, VM_DONTDUMP); if (!read) - area->vm_flags |= VM_DONTEXPAND; + vm_flags_set(area, VM_DONTEXPAND); area->vm_private_data = us122l; atomic_inc(&us122l->mmap_count); out: diff --git a/sound/usb/usx2y/usX2Yhwdep.c b/sound/usb/usx2y/usX2Yhwdep.c index c29da0341bc5b..4937ede0b5d70 100644 --- a/sound/usb/usx2y/usX2Yhwdep.c +++ b/sound/usb/usx2y/usX2Yhwdep.c @@ -61,7 +61,7 @@ static int snd_us428ctls_mmap(struct snd_hwdep *hw, struct file *filp, struct vm } area->vm_ops = &us428ctls_vm_ops; - area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP); area->vm_private_data = hw->private_data; return 0; } diff --git a/sound/usb/usx2y/usx2yhwdeppcm.c b/sound/usb/usx2y/usx2yhwdeppcm.c index 767a227d54da4..36f2e31168fb0 100644 --- a/sound/usb/usx2y/usx2yhwdeppcm.c +++ b/sound/usb/usx2y/usx2yhwdeppcm.c @@ -706,7 +706,7 @@ static int snd_usx2y_hwdep_pcm_mmap(struct snd_hwdep *hw, struct file *filp, str return -ENODEV; area->vm_ops = &snd_usx2y_hwdep_pcm_vm_ops; - area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP); area->vm_private_data = hw->private_data; return 0; } diff --git a/tools/mm/Makefile b/tools/mm/Makefile index 9860622cbb151..6c1da51f4177c 100644 --- a/tools/mm/Makefile +++ b/tools/mm/Makefile @@ -8,8 +8,8 @@ TARGETS=page-types slabinfo page_owner_sort LIB_DIR = ../lib/api LIBS = $(LIB_DIR)/libapi.a -CFLAGS = -Wall -Wextra -I../lib/ -LDFLAGS = $(LIBS) +CFLAGS += -Wall -Wextra -I../lib/ +LDFLAGS += $(LIBS) all: $(TARGETS) diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 8ae0a15352936..f3029742b800f 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -653,7 +653,6 @@ static const struct { { "__GFP_HIGHMEM", "HM" }, { "GFP_DMA32", "D32" }, { "__GFP_HIGH", "H" }, - { "__GFP_ATOMIC", "_A" }, { "__GFP_IO", "I" }, { "__GFP_FS", "F" }, { "__GFP_NOWARN", "NWR" }, diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 1f36bc1c5d362..958ee9bdb3166 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35342,7 +35342,7 @@ static noinline void check_prealloc(struct maple_tree *mt) for (i = 0; i <= max; i++) mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35351,18 +35351,18 @@ static noinline void check_prealloc(struct maple_tree *mt) allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35370,25 +35370,25 @@ static noinline void check_prealloc(struct maple_tree *mt) mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); ma_free_rcu(mn); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); ma_free_rcu(mn); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35397,12 +35397,12 @@ static noinline void check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); mas_push_node(&mas, mn); MT_BUG_ON(mt, mas_allocated(&mas) != allocated); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35410,21 +35410,21 @@ static noinline void check_prealloc(struct maple_tree *mt) mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mas_store_prealloc(&mas, ptr); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35432,14 +35432,14 @@ static noinline void check_prealloc(struct maple_tree *mt) mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); mt_set_non_kernel(1); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL & GFP_NOWAIT) == 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated != 0); mas_destroy(&mas); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35447,7 +35447,7 @@ static noinline void check_prealloc(struct maple_tree *mt) mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); mt_set_non_kernel(1); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0); + MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL & GFP_NOWAIT) == 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated != 0); diff --git a/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh b/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh index 48b7af6b022cb..f3ffeb1343cf2 100644 --- a/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh +++ b/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh @@ -10,7 +10,7 @@ dmesg -C for file in "$DBGFS/"* do - echo "$(basename "$f")" > "$DBGFS/rm_contexts" + (echo "$(basename "$f")" > "$DBGFS/rm_contexts") &> /dev/null if dmesg | grep -q BUG then dmesg diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index a00336ffdcad4..bcd4734ca0943 100644 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -24,7 +24,7 @@ ensure_write_fail() content=$2 reason=$3 - if echo "$content" > "$file" + if (echo "$content" > "$file") 2> /dev/null then echo "writing $content to $file succeed ($fail_reason)" echo "expected failure because $reason" diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 402e434ecc61c..c31d952cff68f 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -60,6 +60,7 @@ TEST_GEN_PROGS += soft-dirty TEST_GEN_PROGS += split_huge_page_test TEST_GEN_FILES += ksm_tests TEST_GEN_PROGS += ksm_functional_tests +TEST_GEN_PROGS += mdwe_test ifeq ($(MACHINE),x86_64) CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32) diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index d8b5b49304121..6acbd08e73e7b 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -27,6 +27,8 @@ static int ksm_fd; static int ksm_full_scans_fd; +static int ksm_zero_pages_fd; +static int ksm_use_zero_pages_fd; static int pagemap_fd; static size_t pagesize; @@ -57,6 +59,22 @@ static bool range_maps_duplicates(char *addr, unsigned long size) return false; } +static bool check_ksm_zero_pages_count(unsigned long zero_size) +{ + unsigned long pages_expected = zero_size / (4 * KiB); + char buf[20]; + ssize_t read_size; + unsigned long ksm_zero_pages; + + read_size = pread(ksm_zero_pages_fd, buf, sizeof(buf) - 1, 0); + if (read_size < 0) + return -errno; + buf[read_size] = 0; + ksm_zero_pages = strtol(buf, NULL, 10); + + return ksm_zero_pages == pages_expected; +} + static long ksm_get_full_scans(void) { char buf[10]; @@ -70,15 +88,12 @@ static long ksm_get_full_scans(void) return strtol(buf, NULL, 10); } -static int ksm_merge(void) +static int wait_two_full_scans(void) { long start_scans, end_scans; - /* Wait for two full scans such that any possible merging happened. */ start_scans = ksm_get_full_scans(); if (start_scans < 0) - return start_scans; - if (write(ksm_fd, "1", 1) != 1) return -errno; do { end_scans = ksm_get_full_scans(); @@ -89,6 +104,34 @@ static int ksm_merge(void) return 0; } +static inline int ksm_merge(void) +{ + /* Wait for two full scans such that any possible merging happened. */ + if (write(ksm_fd, "1", 1) != 1) + return -errno; + return wait_two_full_scans(); +} + +static inline int make_cow(char *map, char val, unsigned long size) +{ + + memset(map, val, size); + return wait_two_full_scans(); +} + +static int unmerge_zero_page(char *start, unsigned long size) +{ + int ret; + + ret = madvise(start, size, MADV_UNMERGEABLE); + if (ret) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + return ret; + } + + return wait_two_full_scans(); +} + static char *mmap_and_merge_range(char val, unsigned long size) { char *map; @@ -146,6 +189,56 @@ static void test_unmerge(void) munmap(map, size); } +static void test_unmerge_zero_pages(void) +{ + const unsigned int size = 2 * MiB; + char *map; + + ksft_print_msg("[RUN] %s\n", __func__); + + /* Confirm the interfaces*/ + ksm_zero_pages_fd = open("/sys/kernel/mm/ksm/zero_pages_sharing", O_RDONLY); + if (ksm_zero_pages_fd < 0) { + ksft_test_result_skip("open(\"/sys/kernel/mm/ksm/zero_pages_sharing\") failed\n"); + return; + } + ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR); + if (ksm_use_zero_pages_fd < 0) { + ksft_test_result_skip("open \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n"); + return; + } + if (write(ksm_use_zero_pages_fd, "1", 1) != 1) { + ksft_test_result_skip("write \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n"); + return; + } + + /* Mmap zero pages*/ + map = mmap_and_merge_range(0x00, size); + + /* Case 1: make Writing on ksm zero pages (COW) */ + if (make_cow(map, 0xcf, size / 2)) { + ksft_test_result_fail("COW failed\n"); + goto unmap; + } + ksft_test_result(check_ksm_zero_pages_count(size / 2), + "zero page count react to cow\n"); + + /* Case 2: Call madvise(xxx, MADV_UNMERGEABLE)*/ + if (unmerge_zero_page(map + size / 2, size / 4)) { + ksft_test_result_fail("unmerge_zero_page failed\n"); + goto unmap; + } + ksft_test_result(check_ksm_zero_pages_count(size / 4), + "zero page count react to unmerge\n"); + + /*Check if ksm pages are really unmerged */ + ksft_test_result(!range_maps_duplicates(map + size / 2, size / 4), + "KSM zero pages were unmerged\n"); + +unmap: + munmap(map, size); +} + static void test_unmerge_discarded(void) { const unsigned int size = 2 * MiB; @@ -261,11 +354,13 @@ int main(int argc, char **argv) ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY); if (ksm_full_scans_fd < 0) ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n"); + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); if (pagemap_fd < 0) ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); test_unmerge(); + test_unmerge_zero_pages(); test_unmerge_discarded(); #ifdef __NR_userfaultfd test_unmerge_uffd_wp(); diff --git a/tools/testing/selftests/mm/mdwe_test.c b/tools/testing/selftests/mm/mdwe_test.c new file mode 100644 index 0000000000000..f466a099f1bf7 --- /dev/null +++ b/tools/testing/selftests/mm/mdwe_test.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifdef __aarch64__ +#include +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +#ifndef __aarch64__ +# define PROT_BTI 0 +#endif + +TEST(prctl_flags) +{ + EXPECT_LT(prctl(PR_SET_MDWE, 7L, 0L, 0L, 0L), 0); + EXPECT_LT(prctl(PR_SET_MDWE, 0L, 7L, 0L, 0L), 0); + EXPECT_LT(prctl(PR_SET_MDWE, 0L, 0L, 7L, 0L), 0); + EXPECT_LT(prctl(PR_SET_MDWE, 0L, 0L, 0L, 7L), 0); + + EXPECT_LT(prctl(PR_GET_MDWE, 7L, 0L, 0L, 0L), 0); + EXPECT_LT(prctl(PR_GET_MDWE, 0L, 7L, 0L, 0L), 0); + EXPECT_LT(prctl(PR_GET_MDWE, 0L, 0L, 7L, 0L), 0); + EXPECT_LT(prctl(PR_GET_MDWE, 0L, 0L, 0L, 7L), 0); +} + +FIXTURE(mdwe) +{ + void *p; + int flags; + size_t size; + pid_t pid; +}; + +FIXTURE_VARIANT(mdwe) +{ + bool enabled; + bool forked; +}; + +FIXTURE_VARIANT_ADD(mdwe, stock) +{ + .enabled = false, + .forked = false, +}; + +FIXTURE_VARIANT_ADD(mdwe, enabled) +{ + .enabled = true, + .forked = false, +}; + +FIXTURE_VARIANT_ADD(mdwe, forked) +{ + .enabled = true, + .forked = true, +}; + +FIXTURE_SETUP(mdwe) +{ + int ret, status; + + self->p = NULL; + self->flags = MAP_SHARED | MAP_ANONYMOUS; + self->size = getpagesize(); + + if (!variant->enabled) + return; + + ret = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0L, 0L, 0L); + ASSERT_EQ(ret, 0) { + TH_LOG("PR_SET_MDWE failed or unsupported"); + } + + ret = prctl(PR_GET_MDWE, 0L, 0L, 0L, 0L); + ASSERT_EQ(ret, 1); + + if (variant->forked) { + self->pid = fork(); + ASSERT_GE(self->pid, 0) { + TH_LOG("fork failed\n"); + } + + if (self->pid > 0) { + ret = waitpid(self->pid, &status, 0); + ASSERT_TRUE(WIFEXITED(status)); + exit(WEXITSTATUS(status)); + } + } +} + +FIXTURE_TEARDOWN(mdwe) +{ + if (self->p && self->p != MAP_FAILED) + munmap(self->p, self->size); +} + +TEST_F(mdwe, mmap_READ_EXEC) +{ + self->p = mmap(NULL, self->size, PROT_READ | PROT_EXEC, self->flags, 0, 0); + EXPECT_NE(self->p, MAP_FAILED); +} + +TEST_F(mdwe, mmap_WRITE_EXEC) +{ + self->p = mmap(NULL, self->size, PROT_WRITE | PROT_EXEC, self->flags, 0, 0); + if (variant->enabled) { + EXPECT_EQ(self->p, MAP_FAILED); + } else { + EXPECT_NE(self->p, MAP_FAILED); + } +} + +TEST_F(mdwe, mprotect_stay_EXEC) +{ + int ret; + + self->p = mmap(NULL, self->size, PROT_READ | PROT_EXEC, self->flags, 0, 0); + ASSERT_NE(self->p, MAP_FAILED); + + ret = mprotect(self->p, self->size, PROT_READ | PROT_EXEC); + EXPECT_EQ(ret, 0); +} + +TEST_F(mdwe, mprotect_add_EXEC) +{ + int ret; + + self->p = mmap(NULL, self->size, PROT_READ, self->flags, 0, 0); + ASSERT_NE(self->p, MAP_FAILED); + + ret = mprotect(self->p, self->size, PROT_READ | PROT_EXEC); + if (variant->enabled) { + EXPECT_LT(ret, 0); + } else { + EXPECT_EQ(ret, 0); + } +} + +TEST_F(mdwe, mprotect_WRITE_EXEC) +{ + int ret; + + self->p = mmap(NULL, self->size, PROT_WRITE, self->flags, 0, 0); + ASSERT_NE(self->p, MAP_FAILED); + + ret = mprotect(self->p, self->size, PROT_WRITE | PROT_EXEC); + if (variant->enabled) { + EXPECT_LT(ret, 0); + } else { + EXPECT_EQ(ret, 0); + } +} + +TEST_F(mdwe, mmap_FIXED) +{ + void *p, *p2; + + p2 = mmap(NULL, self->size, PROT_READ | PROT_EXEC, self->flags, 0, 0); + self->p = mmap(NULL, self->size, PROT_READ, self->flags, 0, 0); + ASSERT_NE(self->p, MAP_FAILED); + + p = mmap(self->p + self->size, self->size, PROT_READ | PROT_EXEC, + self->flags | MAP_FIXED, 0, 0); + if (variant->enabled) { + EXPECT_EQ(p, MAP_FAILED); + } else { + EXPECT_EQ(p, self->p); + } +} + +TEST_F(mdwe, arm64_BTI) +{ + int ret; + +#ifdef __aarch64__ + if (!(getauxval(AT_HWCAP2) & HWCAP2_BTI)) +#endif + SKIP(return, "HWCAP2_BTI not supported"); + + self->p = mmap(NULL, self->size, PROT_EXEC, self->flags, 0, 0); + ASSERT_NE(self->p, MAP_FAILED); + + ret = mprotect(self->p, self->size, PROT_EXEC | PROT_BTI); + EXPECT_EQ(ret, 0); +} + +TEST_HARNESS_MAIN