From 3709d67ce30699cb0f0af52fbf022573d6ba3204 Mon Sep 17 00:00:00 2001 From: Chuyi Zhou Date: Wed, 13 Jul 2022 14:20:09 +0800 Subject: [PATCH 001/352] mm/compaction: fix set skip in fast_find_migrateblock When we successfully find a pageblock in fast_find_migrateblock(), the block will be set skip-flag through set_pageblock_skip(). However, when entering isolate_migratepages_block(), the whole pageblock will be skipped due to the branch 'if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages))'. Eventually we will goto isolate_abort and isolate nothing. That makes fast_find_migrateblock useless. In this patch, when we find a suitable pageblock in fast_find_migrateblock, we do noting but let isolate_migratepages_block to set skip flag to the pageblock after scan it. Normally, we would isolate some pages from the fast-find block. I use mmtest/thpscale-madvhugepage test it. Here is the result: baseline patch Amean fault-both-1 1331.66 ( 0.00%) 1261.04 * 5.30%* Amean fault-both-3 1383.95 ( 0.00%) 1191.69 * 13.89%* Amean fault-both-5 1568.13 ( 0.00%) 1445.20 * 7.84%* Amean fault-both-7 1819.62 ( 0.00%) 1555.13 * 14.54%* Amean fault-both-12 1106.96 ( 0.00%) 1149.43 * -3.84%* Amean fault-both-18 2196.93 ( 0.00%) 1875.77 * 14.62%* Amean fault-both-24 2642.69 ( 0.00%) 2671.21 * -1.08%* Amean fault-both-30 2901.89 ( 0.00%) 2857.32 * 1.54%* Amean fault-both-32 3747.00 ( 0.00%) 3479.23 * 7.15%* Link: https://lkml.kernel.org/r/20220713062009.597255-1-zhouchuyi@bytedance.com Fixes: 70b44595eafe9 ("mm, compaction: use free lists to quickly locate a migration source") Signed-off-by: zhouchuyi Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 262c4676b32c1..f72907c7cfefd 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1853,7 +1853,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) pfn = cc->zone->zone_start_pfn; cc->fast_search_fail = 0; found_block = true; - set_pageblock_skip(freepage); break; } } From 6708fe6bec50261a49d2ed770c08d0ef2c0469e3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 26 Sep 2022 19:58:18 -0700 Subject: [PATCH 002/352] mm: discard __GFP_ATOMIC __GFP_ATOMIC serves little purpose. Its main effect is to set ALLOC_HARDER which adds a few little boosts to increase the chance of an allocation succeeding, one of which is to lower the water-mark at which it will succeed. It is *always* paired with __GFP_HIGH which sets ALLOC_HIGH which also adjusts this watermark. It is probable that other users of __GFP_HIGH should benefit from the other little bonuses that __GFP_ATOMIC gets. __GFP_ATOMIC also gives a warning if used with __GFP_DIRECT_RECLAIM. There is little point to this. We already get a might_sleep() warning if __GFP_DIRECT_RECLAIM is set. __GFP_ATOMIC allows the "watermark_boost" to be side-stepped. It is probable that testing ALLOC_HARDER is a better fit here. __GFP_ATOMIC is used by tegra-smmu.c to check if the allocation might sleep. This should test __GFP_DIRECT_RECLAIM instead. This patch: - removes __GFP_ATOMIC - causes __GFP_HIGH to set ALLOC_HARDER unless __GFP_NOMEMALLOC is set (as well as ALLOC_HIGH). - makes other adjustments as suggested by the above. The net result is not change to GFP_ATOMIC allocations. Other allocations that use __GFP_HIGH will benefit from a few different extra privileges. This affects: xen, dm, md, ntfs3 the vermillion frame buffer hibernation ksm swap all of which likely produce more benefit than cost if these selected allocation are more likely to succeed quickly. Link: https://lkml.kernel.org/r/163712397076.13692.4727608274002939094@noble.neil.brown.name Signed-off-by: NeilBrown Reviewed-by: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Thierry Reding Cc: Mel Gorman Signed-off-by: Andrew Morton --- Documentation/mm/balance.rst | 2 +- drivers/iommu/tegra-smmu.c | 4 ++-- include/linux/gfp_types.h | 12 ++++-------- include/trace/events/mmflags.h | 1 - lib/test_printf.c | 8 ++++---- mm/internal.h | 2 +- mm/page_alloc.c | 16 ++++------------ tools/perf/builtin-kmem.c | 1 - 8 files changed, 16 insertions(+), 30 deletions(-) diff --git a/Documentation/mm/balance.rst b/Documentation/mm/balance.rst index 6a1fadf3e1735..e38e9d83c1c72 100644 --- a/Documentation/mm/balance.rst +++ b/Documentation/mm/balance.rst @@ -6,7 +6,7 @@ Memory Balancing Started Jan 2000 by Kanoj Sarcar -Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as +Memory balancing is needed for !__GFP_HIGH and !__GFP_KSWAPD_RECLAIM as well as for non __GFP_IO allocations. The first reason why a caller may avoid reclaim is that the caller can not diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 2a8de975fe63b..555ae07ce0277 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -671,12 +671,12 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as, * allocate page in a sleeping context if GFP flags permit. Hence * spinlock needs to be unlocked and re-locked after allocation. */ - if (!(gfp & __GFP_ATOMIC)) + if (gfp & __GFP_DIRECT_RECLAIM) spin_unlock_irqrestore(&as->lock, *flags); page = alloc_page(gfp | __GFP_DMA | __GFP_ZERO); - if (!(gfp & __GFP_ATOMIC)) + if (gfp & __GFP_DIRECT_RECLAIM) spin_lock_irqsave(&as->lock, *flags); /* diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index d88c46ca82e17..5088637fe5c2d 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -31,7 +31,7 @@ typedef unsigned int __bitwise gfp_t; #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u #define ___GFP_ZERO 0x100u -#define ___GFP_ATOMIC 0x200u +/* 0x200u unused */ #define ___GFP_DIRECT_RECLAIM 0x400u #define ___GFP_KSWAPD_RECLAIM 0x800u #define ___GFP_WRITE 0x1000u @@ -116,11 +116,8 @@ typedef unsigned int __bitwise gfp_t; * * %__GFP_HIGH indicates that the caller is high-priority and that granting * the request is necessary before the system can make forward progress. - * For example, creating an IO context to clean pages. - * - * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is - * high priority. Users are typically interrupt handlers. This may be - * used in conjunction with %__GFP_HIGH + * For example creating an IO context to clean pages and requests + * from atomic context. * * %__GFP_MEMALLOC allows access to all memory. This should only be used when * the caller guarantees the allocation will allow more memory to be freed @@ -135,7 +132,6 @@ typedef unsigned int __bitwise gfp_t; * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. * This takes precedence over the %__GFP_MEMALLOC flag if both are set. */ -#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) #define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) #define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC) #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) @@ -329,7 +325,7 @@ typedef unsigned int __bitwise gfp_t; * version does not attempt reclaim/compaction at all and is by default used * in page fault path, while the non-light is used by khugepaged. */ -#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) +#define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index e87cb2b80ed3c..11524cda4a955 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -31,7 +31,6 @@ gfpflag_string(__GFP_HIGHMEM), \ gfpflag_string(GFP_DMA32), \ gfpflag_string(__GFP_HIGH), \ - gfpflag_string(__GFP_ATOMIC), \ gfpflag_string(__GFP_IO), \ gfpflag_string(__GFP_FS), \ gfpflag_string(__GFP_NOWARN), \ diff --git a/lib/test_printf.c b/lib/test_printf.c index 4bd15a593fbd9..fe13de1bed5fe 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -686,17 +686,17 @@ flags(void) gfp = GFP_ATOMIC|__GFP_DMA; test("GFP_ATOMIC|GFP_DMA", "%pGg", &gfp); - gfp = __GFP_ATOMIC; - test("__GFP_ATOMIC", "%pGg", &gfp); + gfp = __GFP_HIGH; + test("__GFP_HIGH", "%pGg", &gfp); /* Any flags not translated by the table should remain numeric */ gfp = ~__GFP_BITS_MASK; snprintf(cmp_buffer, BUF_SIZE, "%#lx", (unsigned long) gfp); test(cmp_buffer, "%pGg", &gfp); - snprintf(cmp_buffer, BUF_SIZE, "__GFP_ATOMIC|%#lx", + snprintf(cmp_buffer, BUF_SIZE, "__GFP_HIGH|%#lx", (unsigned long) gfp); - gfp |= __GFP_ATOMIC; + gfp |= __GFP_HIGH; test(cmp_buffer, "%pGg", &gfp); kfree(cmp_buffer); diff --git a/mm/internal.h b/mm/internal.h index 0f106a3982e73..971e09ddf3518 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -24,7 +24,7 @@ struct folio_batch; #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC|__GFP_NOLOCKDEP) + __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 44f3c93643161..e65a9ca06f3b4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4058,12 +4058,12 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, free_pages)) return true; /* - * Ignore watermark boosting for GFP_ATOMIC order-0 allocations + * Ignore watermark boosting for GFP_HIGH order-0 allocations * when checking the min watermark. The min watermark is the * point where boosting is ignored so that kswapd is woken up * when below the low watermark. */ - if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost + if (unlikely(!order && (alloc_flags & ALLOC_HARDER) && z->watermark_boost && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { mark = z->_watermark[WMARK_MIN]; return __zone_watermark_ok(z, order, mark, highest_zoneidx, @@ -4822,12 +4822,12 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). + * set both ALLOC_HARDER (unless __GFP_NOMEMALLOC) and ALLOC_HIGH. */ alloc_flags |= (__force int) (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); - if (gfp_mask & __GFP_ATOMIC) { + if (gfp_mask & __GFP_HIGH) { /* * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. @@ -5021,14 +5021,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int zonelist_iter_cookie; int reserve_flags; - /* - * We also sanity check to catch abuse of atomic reserves being used by - * callers that are not in atomic context. - */ - if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == - (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) - gfp_mask &= ~__GFP_ATOMIC; - restart: compaction_retries = 0; no_progress_loops = 0; diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index ebfab2ca17024..4a06d83f2ac5a 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -640,7 +640,6 @@ static const struct { { "__GFP_HIGHMEM", "HM" }, { "GFP_DMA32", "D32" }, { "__GFP_HIGH", "H" }, - { "__GFP_ATOMIC", "_A" }, { "__GFP_IO", "I" }, { "__GFP_FS", "F" }, { "__GFP_NOWARN", "NWR" }, From bcae60aef61850012701e3bba0ebff1c34d9829c Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Tue, 30 Aug 2022 21:18:37 -0700 Subject: [PATCH 003/352] mm: change release_pages() to use unsigned long for npages Patch series "convert most filesystems to pin_user_pages_fast()", v2. This converts the iomap core and bio_release_pages() to pin_user_pages_fast(), also referred to as FOLL_PIN here. The conversion is temporarily guarded by CONFIG_BLK_USE_PIN_USER_PAGES_FOR_DIO. In the future (not part of this series), when we are certain that all filesystems have converted their Direct IO paths to FOLL_PIN, then we can do the final step, which is to get rid of CONFIG_BLK_USE_PIN_USER_PAGES_FOR_DIO and search-and-replace the dio_w_*() functions with their final names (see bvec.h changes). I'd like to get this part committed at some point, because it seems to work well already. And this will help get the remaining items, below, converted. Status: although many filesystems have been converted, some remain to be investigated. These include (you can recreate this list by grepping for iov_iter_get_pages): cephfs cifs 9P RDS net/core: datagram.c, skmsg.c net/tls fs/splice.c This patch (of 7): The various callers of release_pages() are passing in either various types (signed or unsigned) and lengths (int or long) of integers, for the second argument (number of pages). To make this conversion accurate and to avoid having to check for overflow (or deal with type conversion warnings), let's just change release_pages() to accept an unsigned long for the number of pages. Also change the name of the argument, from "nr" to "npages", for clarity, as long as that line is being changed anyway. Link: https://lkml.kernel.org/r/20220831041843.973026-1-jhubbard@nvidia.com Link: https://lkml.kernel.org/r/20220831041843.973026-2-jhubbard@nvidia.com Signed-off-by: John Hubbard Cc: Alexander Viro Cc: Anna Schumaker Cc: Christoph Hellwig Cc: Darrick J. Wong Cc: David Hildenbrand Cc: Jan Kara Cc: Jens Axboe Cc: Logan Gunthorpe Cc: Miklos Szeredi Cc: Trond Myklebust Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- mm/swap.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index e56dd8f7eae19..b06a3de67eed4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1177,7 +1177,7 @@ static inline void folio_put_refs(struct folio *folio, int refs) __folio_put(folio); } -void release_pages(struct page **pages, int nr); +void release_pages(struct page **pages, unsigned long npages); /** * folios_put - Decrement the reference count on an array of folios. diff --git a/mm/swap.c b/mm/swap.c index 0a3871a70952f..f1e89ca53faf9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -975,15 +975,15 @@ void lru_cache_disable(void) * Decrement the reference count on all the pages in @pages. If it * fell to zero, remove the page from the LRU and free it. */ -void release_pages(struct page **pages, int nr) +void release_pages(struct page **pages, unsigned long npages) { - int i; + unsigned long i; LIST_HEAD(pages_to_free); struct lruvec *lruvec = NULL; unsigned long flags = 0; unsigned int lock_batch; - for (i = 0; i < nr; i++) { + for (i = 0; i < npages; i++) { struct folio *folio = page_folio(pages[i]); /* From d5f1d5bf49955e1cbc2692dc63b05a092a2d5be9 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Tue, 30 Aug 2022 21:18:38 -0700 Subject: [PATCH 004/352] mm/gup: introduce pin_user_page() pin_user_page() is an externally-usable version of try_grab_page(), but with semantics that match get_page(), so that it can act as a drop-in replacement for get_page(). Specifically, pin_user_page() has a void return type. pin_user_page() elevates a page's refcount using FOLL_PIN rules. This means that the caller must release the page via unpin_user_page(). Link: https://lkml.kernel.org/r/20220831041843.973026-3-jhubbard@nvidia.com Signed-off-by: John Hubbard Cc: Alexander Viro Cc: Anna Schumaker Cc: Christoph Hellwig Cc: Darrick J. Wong Cc: David Hildenbrand Cc: Jan Kara Cc: Jens Axboe Cc: Logan Gunthorpe Cc: Miklos Szeredi Cc: Trond Myklebust Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + mm/gup.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index b06a3de67eed4..bb0426faeb32f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1943,6 +1943,7 @@ long pin_user_pages_remote(struct mm_struct *mm, long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); +void pin_user_page(struct page *page); long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); diff --git a/mm/gup.c b/mm/gup.c index fe195d47de74a..bfded683e64e0 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -3281,6 +3281,56 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(pin_user_pages); +/** + * pin_user_page() - apply a FOLL_PIN reference to a file-backed page that the + * caller already owns. + * + * @page: the page to be pinned. + * + * pin_user_page() elevates a page's refcount using FOLL_PIN rules. This means + * that the caller must release the page via unpin_user_page(). + * + * pin_user_page() is intended as a drop-in replacement for get_page(). This + * provides a way for callers to do a subsequent unpin_user_page() on the + * affected page. However, it is only intended for use by callers (file systems, + * block/bio) that have a file-backed page. Anonymous pages are not expected nor + * supported, and will generate a warning. + * + * pin_user_page() may also be thought of as an externally-usable version of + * try_grab_page(), but with semantics that match get_page(), so that it can act + * as a drop-in replacement for get_page(). + * + * IMPORTANT: The caller must release the page via unpin_user_page(). + * + */ +void pin_user_page(struct page *page) +{ + struct folio *folio = page_folio(page); + + WARN_ON_ONCE(folio_ref_count(folio) <= 0); + + /* + * This function is only intended for file-backed callers, who already + * have a page reference. + */ + WARN_ON_ONCE(PageAnon(page)); + + /* + * Similar to try_grab_page(): be sure to *also* + * increment the normal page refcount field at least once, + * so that the page really is pinned. + */ + if (folio_test_large(folio)) { + folio_ref_add(folio, 1); + atomic_add(1, folio_pincount_ptr(folio)); + } else { + folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); + } + + node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1); +} +EXPORT_SYMBOL(pin_user_page); + /* * pin_user_pages_unlocked() is the FOLL_PIN variant of * get_user_pages_unlocked(). Behavior is the same, except that this one sets From 1300e4984255396c1bfc81b731db9147280222c3 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Tue, 30 Aug 2022 21:18:39 -0700 Subject: [PATCH 005/352] block: add dio_w_*() wrappers for pin, unpin user pages Background: The Direct IO part of the block infrastructure is being changed to use pin_user_page*() and unpin_user_page*() calls, in place of a mix of get_user_pages_fast(), get_page(), and put_page(). These have to be changed over all at the same time, for block, bio, and all filesystems. However, most filesystems can be changed via iomap and core filesystem routines, so let's get that in place, and then continue on with converting the remaining filesystems (9P, CIFS) and anything else that feeds pages into bio that ultimately get released via bio_release_pages(). Add a new config parameter, CONFIG_BLK_USE_PIN_USER_PAGES_FOR_DIO, and dio_w_*() wrapper functions. The dio_w_ prefix was chosen for uniqueness, so as to ease a subsequent kernel-wide rename via search-and-replace. Together, these allow the developer to choose between these sets of routines, for Direct IO code paths: a) pin_user_pages_fast() pin_user_page() unpin_user_page() b) get_user_pages_fast() get_page() put_page() CONFIG_BLK_USE_PIN_USER_PAGES_FOR_DIO is a temporary setting, and may be deleted once the conversion is complete. In the meantime, developers can enable this in order to try out each filesystem. Please remember that these /proc/vmstat items (below) should normally contain the same values as each other, except during the middle of pin/unpin operations. As such, they can be helpful when monitoring test runs: nr_foll_pin_acquired nr_foll_pin_released Link: https://lkml.kernel.org/r/20220831041843.973026-4-jhubbard@nvidia.com Signed-off-by: John Hubbard Cc: Alexander Viro Cc: Anna Schumaker Cc: Christoph Hellwig Cc: Darrick J. Wong Cc: David Hildenbrand Cc: Jan Kara Cc: Jens Axboe Cc: Logan Gunthorpe Cc: Miklos Szeredi Cc: Trond Myklebust Signed-off-by: Andrew Morton --- block/Kconfig | 24 ++++++++++++++++++++++++ include/linux/bvec.h | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/block/Kconfig b/block/Kconfig index 444c5ab3b67e2..d4fdd606d1387 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -48,6 +48,30 @@ config BLK_DEV_BSG_COMMON config BLK_ICQ bool +config BLK_USE_PIN_USER_PAGES_FOR_DIO + bool "DEVELOPERS ONLY: Enable pin_user_pages() for Direct IO" if EXPERT + default n + + help + For Direct IO code, retain the pages via calls to + pin_user_pages_fast(), instead of via get_user_pages_fast(). + Likewise, use pin_user_page() instead of get_page(). And then + release such pages via unpin_user_page(), instead of + put_page(). + + This is a temporary setting, which will be deleted once the + conversion is completed, reviewed, and tested. In the meantime, + developers can enable this in order to try out each filesystem. + For that, it's best to monitor these /proc/vmstat items: + + nr_foll_pin_acquired + nr_foll_pin_released + + ...to ensure that they remain equal, when "at rest". + + Say yes here ONLY if are actively developing or testing the + block layer or filesystems with pin_user_pages_fast(). + config BLK_DEV_BSGLIB bool "Block layer SG support v4 helper lib" select BLK_DEV_BSG_COMMON diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 35c25dff651a5..952d869702d20 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -241,4 +241,41 @@ static inline void *bvec_virt(struct bio_vec *bvec) return page_address(bvec->bv_page) + bvec->bv_offset; } +#ifdef CONFIG_BLK_USE_PIN_USER_PAGES_FOR_DIO +#define dio_w_pin_user_pages_fast(s, n, p, f) pin_user_pages_fast(s, n, p, f) +#define dio_w_pin_user_page(p) pin_user_page(p) +#define dio_w_iov_iter_pin_pages(i, p, m, n, s) iov_iter_pin_pages(i, p, m, n, s) +#define dio_w_iov_iter_pin_pages_alloc(i, p, m, s) iov_iter_pin_pages_alloc(i, p, m, s) +#define dio_w_unpin_user_page(p) unpin_user_page(p) +#define dio_w_unpin_user_pages(p, n) unpin_user_pages(p, n) +#define dio_w_unpin_user_pages_dirty_lock(p, n, d) unpin_user_pages_dirty_lock(p, n, d) + +#else +#define dio_w_pin_user_pages_fast(s, n, p, f) get_user_pages_fast(s, n, p, f) +#define dio_w_pin_user_page(p) get_page(p) +#define dio_w_iov_iter_pin_pages(i, p, m, n, s) iov_iter_get_pages2(i, p, m, n, s) +#define dio_w_iov_iter_pin_pages_alloc(i, p, m, s) iov_iter_get_pages_alloc2(i, p, m, s) +#define dio_w_unpin_user_page(p) put_page(p) + +static inline void dio_w_unpin_user_pages(struct page **pages, + unsigned long npages) +{ + release_pages(pages, npages); +} + +static inline void dio_w_unpin_user_pages_dirty_lock(struct page **pages, + unsigned long npages, + bool make_dirty) +{ + unsigned long i; + + for (i = 0; i < npages; i++) { + if (make_dirty) + set_page_dirty_lock(pages[i]); + put_page(pages[i]); + } +} + +#endif + #endif /* __LINUX_BVEC_H */ From 652590d3a5d3cefc9b8c8020ddef3bf1663a2b4c Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Tue, 30 Aug 2022 21:18:40 -0700 Subject: [PATCH 006/352] iov_iter: new iov_iter_pin_pages*() routines Provide two new wrapper routines that are intended for user space pages only: iov_iter_pin_pages() iov_iter_pin_pages_alloc() Internally, these routines call pin_user_pages_fast(), instead of get_user_pages_fast(), for user_backed_iter(i) and iov_iter_bvec(i) cases. As always, callers must use unpin_user_pages() or a suitable FOLL_PIN variant, to release the pages, if they actually were acquired via pin_user_pages_fast(). This is a prerequisite to converting bio/block layers over to use pin_user_pages_fast(). Link: https://lkml.kernel.org/r/20220831041843.973026-5-jhubbard@nvidia.com Signed-off-by: John Hubbard Cc: Alexander Viro Cc: Anna Schumaker Cc: Christoph Hellwig Cc: Darrick J. Wong Cc: David Hildenbrand Cc: Jan Kara Cc: Jens Axboe Cc: Logan Gunthorpe Cc: Miklos Szeredi Cc: Trond Myklebust Signed-off-by: Andrew Morton --- include/linux/uio.h | 4 +++ lib/iov_iter.c | 86 +++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 84 insertions(+), 6 deletions(-) diff --git a/include/linux/uio.h b/include/linux/uio.h index 5896af36199cb..e26908e443d1f 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -251,6 +251,10 @@ ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start); +ssize_t iov_iter_pin_pages(struct iov_iter *i, struct page **pages, + size_t maxsize, unsigned int maxpages, size_t *start); +ssize_t iov_iter_pin_pages_alloc(struct iov_iter *i, struct page ***pages, + size_t maxsize, size_t *start); int iov_iter_npages(const struct iov_iter *i, int maxpages); void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 4b7fce72e3e52..c63ce0eadfcb6 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1425,9 +1425,31 @@ static struct page *first_bvec_segment(const struct iov_iter *i, return page; } +enum pages_alloc_internal_flags { + USE_FOLL_GET, + MAYBE_USE_FOLL_PIN +}; + +/* + * Pins pages, either via get_page(), or via pin_user_page*(). The caller is + * responsible for tracking which pinning mechanism was used here, and releasing + * pages via the appropriate call: put_page() or unpin_user_page(). + * + * The way to figure that out is: + * + * a) If how_to_pin == FOLL_GET, then this routine will always pin via + * get_page(). + * + * b) If how_to_pin == MAYBE_USE_FOLL_PIN, then this routine will pin via + * pin_user_page*() for either user_backed_iter(i) cases, or + * iov_iter_is_bvec(i) cases. However, for the other cases (pipe, + * xarray), pages will be pinned via get_page(). + */ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, - unsigned int maxpages, size_t *start) + unsigned int maxpages, size_t *start, + enum pages_alloc_internal_flags how_to_pin) + { unsigned int n; @@ -1454,7 +1476,12 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, n = want_pages_array(pages, maxsize, *start, maxpages); if (!n) return -ENOMEM; - res = get_user_pages_fast(addr, n, gup_flags, *pages); + + if (how_to_pin == MAYBE_USE_FOLL_PIN) + res = pin_user_pages_fast(addr, n, gup_flags, *pages); + else + res = get_user_pages_fast(addr, n, gup_flags, *pages); + if (unlikely(res <= 0)) return res; maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start); @@ -1470,8 +1497,13 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, if (!n) return -ENOMEM; p = *pages; - for (int k = 0; k < n; k++) - get_page(p[k] = page + k); + for (int k = 0; k < n; k++) { + p[k] = page + k; + if (how_to_pin == MAYBE_USE_FOLL_PIN) + pin_user_page(p[k]); + else + get_page(p[k]); + } maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start); i->count -= maxsize; i->iov_offset += maxsize; @@ -1497,10 +1529,29 @@ ssize_t iov_iter_get_pages2(struct iov_iter *i, return 0; BUG_ON(!pages); - return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start); + return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start, + USE_FOLL_GET); } EXPORT_SYMBOL(iov_iter_get_pages2); +/* + * A FOLL_PIN variant that calls pin_user_pages_fast() instead of + * get_user_pages_fast(). + */ +ssize_t iov_iter_pin_pages(struct iov_iter *i, + struct page **pages, size_t maxsize, unsigned int maxpages, + size_t *start) +{ + if (!maxpages) + return 0; + if (WARN_ON_ONCE(!pages)) + return -EINVAL; + + return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start, + MAYBE_USE_FOLL_PIN); +} +EXPORT_SYMBOL(iov_iter_pin_pages); + ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start) @@ -1509,7 +1560,8 @@ ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, *pages = NULL; - len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start); + len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start, + USE_FOLL_GET); if (len <= 0) { kvfree(*pages); *pages = NULL; @@ -1518,6 +1570,28 @@ ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, } EXPORT_SYMBOL(iov_iter_get_pages_alloc2); +/* + * A FOLL_PIN variant that calls pin_user_pages_fast() instead of + * get_user_pages_fast(). + */ +ssize_t iov_iter_pin_pages_alloc(struct iov_iter *i, + struct page ***pages, size_t maxsize, + size_t *start) +{ + ssize_t len; + + *pages = NULL; + + len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start, + MAYBE_USE_FOLL_PIN); + if (len <= 0) { + kvfree(*pages); + *pages = NULL; + } + return len; +} +EXPORT_SYMBOL(iov_iter_pin_pages_alloc); + size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i) { From d2e677f8212f52429793aa7e7320b7038cc9af72 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Tue, 30 Aug 2022 21:18:41 -0700 Subject: [PATCH 007/352] block, bio, fs: convert most filesystems to pin_user_pages_fast() Use dio_w_*() wrapper calls, in place of get_user_pages_fast(), get_page() and put_page(). This converts the Direct IO parts of most filesystems over to using FOLL_PIN (pin_user_page*()) page pinning. Link: https://lkml.kernel.org/r/20220831041843.973026-6-jhubbard@nvidia.com Signed-off-by: John Hubbard Cc: Alexander Viro Cc: Anna Schumaker Cc: Christoph Hellwig Cc: Darrick J. Wong Cc: David Hildenbrand Cc: Jan Kara Cc: Jens Axboe Cc: Logan Gunthorpe Cc: Miklos Szeredi Cc: Trond Myklebust Signed-off-by: Andrew Morton --- block/bio.c | 27 ++++++++++++++------------- block/blk-map.c | 7 ++++--- fs/direct-io.c | 40 ++++++++++++++++++++-------------------- fs/iomap/direct-io.c | 2 +- 4 files changed, 39 insertions(+), 37 deletions(-) diff --git a/block/bio.c b/block/bio.c index 3d3a2678fea25..6c6110f7054e1 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1125,7 +1125,7 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty) bio_for_each_segment_all(bvec, bio, iter_all) { if (mark_dirty && !PageCompound(bvec->bv_page)) set_page_dirty_lock(bvec->bv_page); - put_page(bvec->bv_page); + dio_w_unpin_user_page(bvec->bv_page); } } EXPORT_SYMBOL_GPL(__bio_release_pages); @@ -1162,7 +1162,7 @@ static int bio_iov_add_page(struct bio *bio, struct page *page, } if (same_page) - put_page(page); + dio_w_unpin_user_page(page); return 0; } @@ -1176,7 +1176,7 @@ static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page, queue_max_zone_append_sectors(q), &same_page) != len) return -EINVAL; if (same_page) - put_page(page); + dio_w_unpin_user_page(page); return 0; } @@ -1187,10 +1187,10 @@ static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page, * @bio: bio to add pages to * @iter: iov iterator describing the region to be mapped * - * Pins pages from *iter and appends them to @bio's bvec array. The - * pages will have to be released using put_page() when done. - * For multi-segment *iter, this function only adds pages from the - * next non-empty segment of the iov iterator. + * Pins pages from *iter and appends them to @bio's bvec array. The pages will + * have to be released using dio_w_unpin_user_page when done. For multi-segment + * *iter, this function only adds pages from the next non-empty segment of the + * iov iterator. */ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { @@ -1218,8 +1218,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) * result to ensure the bio's total size is correct. The remainder of * the iov data will be picked up in the next bio iteration. */ - size = iov_iter_get_pages2(iter, pages, UINT_MAX - bio->bi_iter.bi_size, - nr_pages, &offset); + size = dio_w_iov_iter_pin_pages(iter, pages, + UINT_MAX - bio->bi_iter.bi_size, + nr_pages, &offset); if (unlikely(size <= 0)) return size ? size : -EFAULT; @@ -1252,7 +1253,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) iov_iter_revert(iter, left); out: while (i < nr_pages) - put_page(pages[i++]); + dio_w_unpin_user_page(pages[i++]); return ret; } @@ -1444,9 +1445,9 @@ void bio_set_pages_dirty(struct bio *bio) * have been written out during the direct-IO read. So we take another ref on * the BIO and re-dirty the pages in process context. * - * It is expected that bio_check_pages_dirty() will wholly own the BIO from - * here on. It will run one put_page() against each page and will run one - * bio_put() against the BIO. + * It is expected that bio_check_pages_dirty() will wholly own the BIO from here + * on. It will run one dio_w_unpin_user_page() against each page and will run + * one bio_put() against the BIO. */ static void bio_dirty_fn(struct work_struct *work); diff --git a/block/blk-map.c b/block/blk-map.c index 7196a6b64c802..4e333ad9776d2 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -254,7 +254,8 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, size_t offs, added = 0; int npages; - bytes = iov_iter_get_pages_alloc2(iter, &pages, LONG_MAX, &offs); + bytes = dio_w_iov_iter_pin_pages_alloc(iter, &pages, LONG_MAX, + &offs); if (unlikely(bytes <= 0)) { ret = bytes ? bytes : -EFAULT; goto out_unmap; @@ -276,7 +277,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (!bio_add_hw_page(rq->q, bio, page, n, offs, max_sectors, &same_page)) { if (same_page) - put_page(page); + dio_w_unpin_user_page(page); break; } @@ -289,7 +290,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, * release the pages we didn't map into the bio, if any */ while (j < npages) - put_page(pages[j++]); + dio_w_unpin_user_page(pages[j++]); kvfree(pages); /* couldn't stuff something into bio? */ if (bytes) { diff --git a/fs/direct-io.c b/fs/direct-io.c index f669163d5860f..05c044c55374e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -169,8 +169,8 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) const enum req_op dio_op = dio->opf & REQ_OP_MASK; ssize_t ret; - ret = iov_iter_get_pages2(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES, - &sdio->from); + ret = dio_w_iov_iter_pin_pages(sdio->iter, dio->pages, LONG_MAX, + DIO_PAGES, &sdio->from); if (ret < 0 && sdio->blocks_available && dio_op == REQ_OP_WRITE) { struct page *page = ZERO_PAGE(0); @@ -181,7 +181,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) */ if (dio->page_errors == 0) dio->page_errors = ret; - get_page(page); + dio_w_pin_user_page(page); dio->pages[0] = page; sdio->head = 0; sdio->tail = 1; @@ -197,7 +197,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) sdio->to = ((ret - 1) & (PAGE_SIZE - 1)) + 1; return 0; } - return ret; + return ret; } /* @@ -324,7 +324,7 @@ static void dio_aio_complete_work(struct work_struct *work) static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio); /* - * Asynchronous IO callback. + * Asynchronous IO callback. */ static void dio_bio_end_aio(struct bio *bio) { @@ -449,7 +449,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) { while (sdio->head < sdio->tail) - put_page(dio->pages[sdio->head++]); + dio_w_unpin_user_page(dio->pages[sdio->head++]); } /* @@ -716,7 +716,7 @@ static inline int dio_bio_add_page(struct dio_submit *sdio) */ if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE) sdio->pages_in_io--; - get_page(sdio->cur_page); + dio_w_pin_user_page(sdio->cur_page); sdio->final_block_in_bio = sdio->cur_page_block + (sdio->cur_page_len >> sdio->blkbits); ret = 0; @@ -725,7 +725,7 @@ static inline int dio_bio_add_page(struct dio_submit *sdio) } return ret; } - + /* * Put cur_page under IO. The section of cur_page which is described by * cur_page_offset,cur_page_len is put into a BIO. The section of cur_page @@ -787,7 +787,7 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, * An autonomous function to put a chunk of a page under deferred IO. * * The caller doesn't actually know (or care) whether this piece of page is in - * a BIO, or is under IO or whatever. We just take care of all possible + * a BIO, or is under IO or whatever. We just take care of all possible * situations here. The separation between the logic of do_direct_IO() and * that of submit_page_section() is important for clarity. Please don't break. * @@ -832,13 +832,13 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, */ if (sdio->cur_page) { ret = dio_send_cur_page(dio, sdio, map_bh); - put_page(sdio->cur_page); + dio_w_unpin_user_page(sdio->cur_page); sdio->cur_page = NULL; if (ret) return ret; } - get_page(page); /* It is in dio */ + dio_w_pin_user_page(page); /* It is in dio */ sdio->cur_page = page; sdio->cur_page_offset = offset; sdio->cur_page_len = len; @@ -853,7 +853,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, ret = dio_send_cur_page(dio, sdio, map_bh); if (sdio->bio) dio_bio_submit(dio, sdio); - put_page(sdio->cur_page); + dio_w_unpin_user_page(sdio->cur_page); sdio->cur_page = NULL; } return ret; @@ -890,7 +890,7 @@ static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio, * We need to zero out part of an fs block. It is either at the * beginning or the end of the fs block. */ - if (end) + if (end) this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks; this_chunk_bytes = this_chunk_blocks << sdio->blkbits; @@ -954,7 +954,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, ret = get_more_blocks(dio, sdio, map_bh); if (ret) { - put_page(page); + dio_w_unpin_user_page(page); goto out; } if (!buffer_mapped(map_bh)) @@ -999,7 +999,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, /* AKPM: eargh, -ENOTBLK is a hack */ if (dio_op == REQ_OP_WRITE) { - put_page(page); + dio_w_unpin_user_page(page); return -ENOTBLK; } @@ -1012,7 +1012,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, if (sdio->block_in_file >= i_size_aligned >> blkbits) { /* We hit eof */ - put_page(page); + dio_w_unpin_user_page(page); goto out; } zero_user(page, from, 1 << blkbits); @@ -1052,7 +1052,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, sdio->next_block_for_io, map_bh); if (ret) { - put_page(page); + dio_w_unpin_user_page(page); goto out; } sdio->next_block_for_io += this_chunk_blocks; @@ -1067,8 +1067,8 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, break; } - /* Drop the ref which was taken in get_user_pages() */ - put_page(page); + /* Drop the ref which was taken in [get|pin]_user_pages() */ + dio_w_unpin_user_page(page); } out: return ret; @@ -1288,7 +1288,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, ret2 = dio_send_cur_page(dio, &sdio, &map_bh); if (retval == 0) retval = ret2; - put_page(sdio.cur_page); + dio_w_unpin_user_page(sdio.cur_page); sdio.cur_page = NULL; } if (sdio.bio) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 4eb559a16c9ed..fc7763c418d12 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -202,7 +202,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - get_page(page); + dio_w_pin_user_page(page); __bio_add_page(bio, page, len, 0); iomap_dio_submit_bio(iter, dio, bio, pos); } From bcf9242578b639e93a10f143b214938dd9155a0a Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Tue, 30 Aug 2022 21:18:42 -0700 Subject: [PATCH 008/352] NFS: direct-io: convert to FOLL_PIN pages Convert the NFS Direct IO layer to use pin_user_pages_fast() and unpin_user_page(), instead of get_user_pages_fast() and put_page(). The user of pin_user_pages_fast() depends upon: 1) CONFIG_BLK_USE_PIN_USER_PAGES_FOR_DIO, and 2) User-space-backed pages: user_backed_iter(i) == true Link: https://lkml.kernel.org/r/20220831041843.973026-7-jhubbard@nvidia.com Signed-off-by: John Hubbard Cc: Alexander Viro Cc: Anna Schumaker Cc: Christoph Hellwig Cc: Darrick J. Wong Cc: David Hildenbrand Cc: Jan Kara Cc: Jens Axboe Cc: Logan Gunthorpe Cc: Miklos Szeredi Cc: Trond Myklebust Signed-off-by: Andrew Morton --- fs/nfs/direct.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 1707f46b1335c..71b794f39ee26 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -142,11 +142,13 @@ int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter) return 0; } -static void nfs_direct_release_pages(struct page **pages, unsigned int npages) +static void nfs_direct_release_pages(struct iov_iter *iter, struct page **pages, + unsigned int npages) { - unsigned int i; - for (i = 0; i < npages; i++) - put_page(pages[i]); + if (user_backed_iter(iter) || iov_iter_is_bvec(iter)) + dio_w_unpin_user_pages(pages, npages); + else + release_pages(pages, npages); } void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, @@ -332,11 +334,11 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, size_t pgbase; unsigned npages, i; - result = iov_iter_get_pages_alloc2(iter, &pagevec, + result = dio_w_iov_iter_pin_pages_alloc(iter, &pagevec, rsize, &pgbase); if (result < 0) break; - + bytes = result; npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE; for (i = 0; i < npages; i++) { @@ -362,7 +364,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, pos += req_len; dreq->bytes_left -= req_len; } - nfs_direct_release_pages(pagevec, npages); + nfs_direct_release_pages(iter, pagevec, npages); kvfree(pagevec); if (result < 0) break; @@ -791,8 +793,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, size_t pgbase; unsigned npages, i; - result = iov_iter_get_pages_alloc2(iter, &pagevec, - wsize, &pgbase); + result = dio_w_iov_iter_pin_pages_alloc(iter, &pagevec, + wsize, &pgbase); if (result < 0) break; @@ -829,7 +831,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, pos += req_len; dreq->bytes_left -= req_len; } - nfs_direct_release_pages(pagevec, npages); + nfs_direct_release_pages(iter, pagevec, npages); kvfree(pagevec); if (result < 0) break; From f38acb2e55a2fd551c3b3acc77cf955eb6f472a0 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Tue, 30 Aug 2022 21:18:43 -0700 Subject: [PATCH 009/352] fuse: convert direct IO paths to use FOLL_PIN Convert the fuse filesystem to use pin_user_pages_fast() and unpin_user_page(), instead of get_user_pages_fast() and put_page(). The user of pin_user_pages_fast() depends upon: 1) CONFIG_BLK_USE_PIN_USER_PAGES_FOR_DIO, and 2) User-space-backed pages or ITER_BVEC pages. Link: https://lkml.kernel.org/r/20220831041843.973026-8-jhubbard@nvidia.com Signed-off-by: John Hubbard Cc: Alexander Viro Cc: Anna Schumaker Cc: Christoph Hellwig Cc: Darrick J. Wong Cc: David Hildenbrand Cc: Jan Kara Cc: Jens Axboe Cc: Logan Gunthorpe Cc: Miklos Szeredi Cc: Trond Myklebust Signed-off-by: Andrew Morton --- fs/fuse/dev.c | 11 +++++++++-- fs/fuse/file.c | 32 +++++++++++++++++++++----------- fs/fuse/fuse_i.h | 1 + 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b4a6e0a1b945a..a1c6a152323f9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -675,7 +675,12 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) flush_dcache_page(cs->pg); set_page_dirty_lock(cs->pg); } - put_page(cs->pg); + if (!cs->pipebufs && + (user_backed_iter(cs->iter) || iov_iter_is_bvec(cs->iter))) + dio_w_unpin_user_page(cs->pg); + + else + put_page(cs->pg); } cs->pg = NULL; } @@ -730,7 +735,9 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) } } else { size_t off; - err = iov_iter_get_pages2(cs->iter, &page, PAGE_SIZE, 1, &off); + + err = dio_w_iov_iter_pin_pages(cs->iter, &page, PAGE_SIZE, 1, + &off); if (err < 0) return err; BUG_ON(!err); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 1a3afd469e3a9..01da38928d0b6 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -625,14 +625,19 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, } static void fuse_release_user_pages(struct fuse_args_pages *ap, - bool should_dirty) + bool should_dirty, bool is_user_or_bvec) { unsigned int i; - for (i = 0; i < ap->num_pages; i++) { - if (should_dirty) - set_page_dirty_lock(ap->pages[i]); - put_page(ap->pages[i]); + if (is_user_or_bvec) { + dio_w_unpin_user_pages_dirty_lock(ap->pages, ap->num_pages, + should_dirty); + } else { + for (i = 0; i < ap->num_pages; i++) { + if (should_dirty) + set_page_dirty_lock(ap->pages[i]); + put_page(ap->pages[i]); + } } } @@ -733,7 +738,7 @@ static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args, struct fuse_io_priv *io = ia->io; ssize_t pos = -1; - fuse_release_user_pages(&ia->ap, io->should_dirty); + fuse_release_user_pages(&ia->ap, io->should_dirty, io->is_user_or_bvec); if (err) { /* Nothing */ @@ -1414,10 +1419,10 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, while (nbytes < *nbytesp && ap->num_pages < max_pages) { unsigned npages; size_t start; - ret = iov_iter_get_pages2(ii, &ap->pages[ap->num_pages], - *nbytesp - nbytes, - max_pages - ap->num_pages, - &start); + ret = dio_w_iov_iter_pin_pages(ii, &ap->pages[ap->num_pages], + *nbytesp - nbytes, + max_pages - ap->num_pages, + &start); if (ret < 0) break; @@ -1483,6 +1488,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, fl_owner_t owner = current->files; size_t nbytes = min(count, nmax); + /* For use in fuse_release_user_pages(): */ + io->is_user_or_bvec = user_backed_iter(iter) || + iov_iter_is_bvec(iter); + err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write, max_pages); if (err && !nbytes) @@ -1498,7 +1507,8 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, } if (!io->async || nres < 0) { - fuse_release_user_pages(&ia->ap, io->should_dirty); + fuse_release_user_pages(&ia->ap, io->should_dirty, + io->is_user_or_bvec); fuse_io_free(ia); } ia = NULL; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 488b460e046f4..6ee7f72e29ebf 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -290,6 +290,7 @@ struct fuse_io_priv { struct kiocb *iocb; struct completion *done; bool blocking; + bool is_user_or_bvec; }; #define FUSE_IO_PRIV_SYNC(i) \ From c22866647d25f97c2dff6f04fb5e2f51e225731e Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:45 +0800 Subject: [PATCH 010/352] mm: introduce common struct mm_slot Patch series "add common struct mm_slot and use it in THP and KSM", v2. At present, both THP and KSM module have similar structures mm_slot for organizing and recording the information required for scanning mm, and each defines the following exactly the same operation functions: - alloc_mm_slot - free_mm_slot - get_mm_slot - insert_to_mm_slots_hash In order to de-duplicate these codes, this patchset introduces a common struct mm_slot, and lets THP and KSM to use it. This patch (of 7): At present, both THP and KSM module have similar structures mm_slot for organizing and recording the information required for scanning mm, and each defines the following exactly the same operation functions: - alloc_mm_slot - free_mm_slot - get_mm_slot - insert_to_mm_slots_hash In order to de-duplicate these codes, this patch introduces a common struct mm_slot, and subsequent patches will let THP and KSM to use it. Link: https://lkml.kernel.org/r/20220831031951.43152-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20220831031951.43152-2-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/mm_slot.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 mm/mm_slot.h diff --git a/mm/mm_slot.h b/mm/mm_slot.h new file mode 100644 index 0000000000000..83f18ed1c4bde --- /dev/null +++ b/mm/mm_slot.h @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef _LINUX_MM_SLOT_H +#define _LINUX_MM_SLOT_H + +#include +#include + +/* + * struct mm_slot - hash lookup from mm to mm_slot + * @hash: link to the mm_slots hash list + * @mm_node: link into the mm_slots list + * @mm: the mm that this information is valid for + */ +struct mm_slot { + struct hlist_node hash; + struct list_head mm_node; + struct mm_struct *mm; +}; + +#define mm_slot_entry(ptr, type, member) \ + container_of(ptr, type, member) + +static inline void *mm_slot_alloc(struct kmem_cache *cache) +{ + if (!cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(cache, GFP_KERNEL); +} + +static inline void mm_slot_free(struct kmem_cache *cache, void *objp) +{ + kmem_cache_free(cache, objp); +} + +#define mm_slot_lookup(_hashtable, _mm) \ +({ \ + struct mm_slot *tmp_slot, *mm_slot = NULL; \ + \ + hash_for_each_possible(_hashtable, tmp_slot, hash, (unsigned long)_mm) \ + if (_mm == tmp_slot->mm) { \ + mm_slot = tmp_slot; \ + break; \ + } \ + \ + mm_slot; \ +}) + +#define mm_slot_insert(_hashtable, _mm, _mm_slot) \ +({ \ + _mm_slot->mm = _mm; \ + hash_add(_hashtable, &_mm_slot->hash, (unsigned long)_mm); \ +}) + +#endif /* _LINUX_MM_SLOT_H */ From 66ed82a276a294f801b5a0a37c7313ffed8d668c Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:46 +0800 Subject: [PATCH 011/352] mm: thp: convert to use common struct mm_slot Rename private struct mm_slot to struct khugepaged_mm_slot and convert to use common struct mm_slot with no functional change. Link: https://lkml.kernel.org/r/20220831031951.43152-3-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/khugepaged.c | 121 ++++++++++++++++++++---------------------------- 1 file changed, 51 insertions(+), 70 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7c13d65aeb14e..2e0cf7616a8ef 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -23,6 +23,7 @@ #include #include #include "internal.h" +#include "mm_slot.h" enum scan_result { SCAN_FAIL, @@ -99,17 +100,13 @@ struct collapse_control { }; /** - * struct mm_slot - hash lookup from mm to mm_slot - * @hash: hash collision list - * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head - * @mm: the mm that this information is valid for + * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned + * @slot: hash lookup from mm to mm_slot * @nr_pte_mapped_thp: number of pte mapped THP * @pte_mapped_thp: address array corresponding pte mapped THP */ -struct mm_slot { - struct hlist_node hash; - struct list_head mm_node; - struct mm_struct *mm; +struct khugepaged_mm_slot { + struct mm_slot slot; /* pte-mapped THP in this mm */ int nr_pte_mapped_thp; @@ -126,7 +123,7 @@ struct mm_slot { */ struct khugepaged_scan { struct list_head mm_head; - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; unsigned long address; }; @@ -390,8 +387,9 @@ int hugepage_madvise(struct vm_area_struct *vma, int __init khugepaged_init(void) { mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", - sizeof(struct mm_slot), - __alignof__(struct mm_slot), 0, NULL); + sizeof(struct khugepaged_mm_slot), + __alignof__(struct khugepaged_mm_slot), + 0, NULL); if (!mm_slot_cache) return -ENOMEM; @@ -408,36 +406,6 @@ void __init khugepaged_destroy(void) kmem_cache_destroy(mm_slot_cache); } -static inline struct mm_slot *alloc_mm_slot(void) -{ - if (!mm_slot_cache) /* initialization failed */ - return NULL; - return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); -} - -static inline void free_mm_slot(struct mm_slot *mm_slot) -{ - kmem_cache_free(mm_slot_cache, mm_slot); -} - -static struct mm_slot *get_mm_slot(struct mm_struct *mm) -{ - struct mm_slot *mm_slot; - - hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) - if (mm == mm_slot->mm) - return mm_slot; - - return NULL; -} - -static void insert_to_mm_slots_hash(struct mm_struct *mm, - struct mm_slot *mm_slot) -{ - mm_slot->mm = mm; - hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); -} - static inline int hpage_collapse_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; @@ -445,28 +413,31 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm) void __khugepaged_enter(struct mm_struct *mm) { - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; int wakeup; - mm_slot = alloc_mm_slot(); + mm_slot = mm_slot_alloc(mm_slot_cache); if (!mm_slot) return; + slot = &mm_slot->slot; + /* __khugepaged_exit() must not run from under us */ VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); return; } spin_lock(&khugepaged_mm_lock); - insert_to_mm_slots_hash(mm, mm_slot); + mm_slot_insert(mm_slots_hash, mm, slot); /* * Insert just behind the scanning cursor, to let the area settle * down a little. */ wakeup = list_empty(&khugepaged_scan.mm_head); - list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); + list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head); spin_unlock(&khugepaged_mm_lock); mmgrab(mm); @@ -486,21 +457,23 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, void __khugepaged_exit(struct mm_struct *mm) { - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; int free = 0; spin_lock(&khugepaged_mm_lock); - mm_slot = get_mm_slot(mm); + slot = mm_slot_lookup(mm_slots_hash, mm); + mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); + hash_del(&slot->hash); + list_del(&slot->mm_node); free = 1; } spin_unlock(&khugepaged_mm_lock); if (free) { clear_bit(MMF_VM_HUGEPAGE, &mm->flags); - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); mmdrop(mm); } else if (mm_slot) { /* @@ -1318,16 +1291,17 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, return result; } -static void collect_mm_slot(struct mm_slot *mm_slot) +static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) { - struct mm_struct *mm = mm_slot->mm; + struct mm_slot *slot = &mm_slot->slot; + struct mm_struct *mm = slot->mm; lockdep_assert_held(&khugepaged_mm_lock); if (hpage_collapse_test_exit(mm)) { /* free mm_slot */ - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); + hash_del(&slot->hash); + list_del(&slot->mm_node); /* * Not strictly needed because the mm exited already. @@ -1336,7 +1310,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) */ /* khugepaged_mm_lock actually not necessary for the below */ - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); mmdrop(mm); } } @@ -1349,12 +1323,14 @@ static void collect_mm_slot(struct mm_slot *mm_slot) static void khugepaged_add_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) { - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; VM_BUG_ON(addr & ~HPAGE_PMD_MASK); spin_lock(&khugepaged_mm_lock); - mm_slot = get_mm_slot(mm); + slot = mm_slot_lookup(mm_slots_hash, mm); + mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; spin_unlock(&khugepaged_mm_lock); @@ -1486,9 +1462,10 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) goto drop_hpage; } -static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) { - struct mm_struct *mm = mm_slot->mm; + struct mm_slot *slot = &mm_slot->slot; + struct mm_struct *mm = slot->mm; int i; if (likely(mm_slot->nr_pte_mapped_thp == 0)) @@ -2051,7 +2028,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, __acquires(&khugepaged_mm_lock) { struct vma_iterator vmi; - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; int progress = 0; @@ -2060,18 +2038,20 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, lockdep_assert_held(&khugepaged_mm_lock); *result = SCAN_FAIL; - if (khugepaged_scan.mm_slot) + if (khugepaged_scan.mm_slot) { mm_slot = khugepaged_scan.mm_slot; - else { - mm_slot = list_entry(khugepaged_scan.mm_head.next, + slot = &mm_slot->slot; + } else { + slot = list_entry(khugepaged_scan.mm_head.next, struct mm_slot, mm_node); + mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); khugepaged_scan.address = 0; khugepaged_scan.mm_slot = mm_slot; } spin_unlock(&khugepaged_mm_lock); khugepaged_collapse_pte_mapped_thps(mm_slot); - mm = mm_slot->mm; + mm = slot->mm; /* * Don't wait for semaphore (to avoid long wait times). Just move to * the next mm on the list. @@ -2166,10 +2146,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, * khugepaged runs here, khugepaged_exit will find * mm_slot not pointing to the exiting mm. */ - if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { - khugepaged_scan.mm_slot = list_entry( - mm_slot->mm_node.next, - struct mm_slot, mm_node); + if (slot->mm_node.next != &khugepaged_scan.mm_head) { + slot = list_entry(slot->mm_node.next, + struct mm_slot, mm_node); + khugepaged_scan.mm_slot = + mm_slot_entry(slot, struct khugepaged_mm_slot, slot); khugepaged_scan.address = 0; } else { khugepaged_scan.mm_slot = NULL; @@ -2264,7 +2245,7 @@ static void khugepaged_wait_work(void) static int khugepaged(void *none) { - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; set_freezable(); set_user_nice(current, MAX_NICE); From 6b1d6be007516c25ce7a6d742f912a72a555b774 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 1 Sep 2022 15:44:42 +0800 Subject: [PATCH 012/352] mm: thp: fix build error with CONFIG_SHMEM disabled Fix the following build error by changing function parameter type from struct mm_sot * to struct khugepaged_mm_slot *: mm/khugepaged.c: In function 'khugepaged_scan_mm_slot': >> mm/khugepaged.c:2056:45: error: passing argument 1 of 'khugepaged_collapse_pte_mapped_thps' from incompatible pointer type [-Werror=incompatible-pointer-types] 2056 | khugepaged_collapse_pte_mapped_thps(mm_slot); | ^~~~~~~ | | | struct khugepaged_mm_slot * mm/khugepaged.c:2023:65: note: expected 'struct mm_slot *' but argument is of type 'struct khugepaged_mm_slot *' 2023 | static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) | ~~~~~~~~~~~~~~~~^~~~~~~ cc1: some warnings being treated as errors Link: https://lkml.kernel.org/r/639fa8d5-8e5b-2333-69dc-40ed46219364@bytedance.com Signed-off-by: Qi Zheng Reported-by: kernel test robot Signed-off-by: Andrew Morton --- mm/khugepaged.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 2e0cf7616a8ef..1e59fe7bfae36 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2017,7 +2017,7 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, BUILD_BUG(); } -static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) { } #endif From 80c18aeafd14c42af48435b54787f7f95b6dc12c Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:47 +0800 Subject: [PATCH 013/352] ksm: remove redundant declarations in ksm.h Currently, for struct stable_node, no one uses it in both the include/linux/ksm.h file and the file that contains it. For struct mem_cgroup, it's also not used in ksm.h. So they're all redundant, just remove them. Link: https://lkml.kernel.org/r/20220831031951.43152-4-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/ksm.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 0b4f17418f64c..7e232ba59b865 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -15,9 +15,6 @@ #include #include -struct stable_node; -struct mem_cgroup; - #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); From bc6a2828a963dd8179a277a609ab09ac7fc4aa87 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:48 +0800 Subject: [PATCH 014/352] ksm: add the ksm prefix to the names of the ksm private structures In order to prevent the name of the private structure of ksm from being the same as the name of the common structure used in subsequent patches, prefix their names with ksm in advance. Link: https://lkml.kernel.org/r/20220831031951.43152-5-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- Documentation/mm/ksm.rst | 2 +- mm/ksm.c | 216 +++++++++++++++++++-------------------- 2 files changed, 109 insertions(+), 109 deletions(-) diff --git a/Documentation/mm/ksm.rst b/Documentation/mm/ksm.rst index 9e37add068e64..f83cfbc12f4ca 100644 --- a/Documentation/mm/ksm.rst +++ b/Documentation/mm/ksm.rst @@ -26,7 +26,7 @@ tree. If a KSM page is shared between less than ``max_page_sharing`` VMAs, the node of the stable tree that represents such KSM page points to a -list of struct rmap_item and the ``page->mapping`` of the +list of struct ksm_rmap_item and the ``page->mapping`` of the KSM page points to the stable tree node. When the sharing passes this threshold, KSM adds a second dimension to diff --git a/mm/ksm.c b/mm/ksm.c index 0cd2f4b623345..de61946106ce4 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -82,7 +82,7 @@ * different KSM page copy of that content * * Internally, the regular nodes, "dups" and "chains" are represented - * using the same struct stable_node structure. + * using the same struct ksm_stable_node structure. * * In addition to the stable tree, KSM uses a second data structure called the * unstable tree: this tree holds pointers to pages which have been found to @@ -112,16 +112,16 @@ */ /** - * struct mm_slot - ksm information per mm that is being scanned + * struct ksm_mm_slot - ksm information per mm that is being scanned * @link: link to the mm_slots hash list * @mm_list: link into the mm_slots list, rooted in ksm_mm_head * @rmap_list: head for this mm_slot's singly-linked list of rmap_items * @mm: the mm that this information is valid for */ -struct mm_slot { +struct ksm_mm_slot { struct hlist_node link; struct list_head mm_list; - struct rmap_item *rmap_list; + struct ksm_rmap_item *rmap_list; struct mm_struct *mm; }; @@ -135,14 +135,14 @@ struct mm_slot { * There is only the one ksm_scan instance of this cursor structure. */ struct ksm_scan { - struct mm_slot *mm_slot; + struct ksm_mm_slot *mm_slot; unsigned long address; - struct rmap_item **rmap_list; + struct ksm_rmap_item **rmap_list; unsigned long seqnr; }; /** - * struct stable_node - node of the stable rbtree + * struct ksm_stable_node - node of the stable rbtree * @node: rb node of this ksm page in the stable tree * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list * @hlist_dup: linked into the stable_node->hlist with a stable_node chain @@ -153,7 +153,7 @@ struct ksm_scan { * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ -struct stable_node { +struct ksm_stable_node { union { struct rb_node node; /* when node of stable tree */ struct { /* when listed for migration */ @@ -182,7 +182,7 @@ struct stable_node { }; /** - * struct rmap_item - reverse mapping item for virtual addresses + * struct ksm_rmap_item - reverse mapping item for virtual addresses * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree * @nid: NUMA node id of unstable tree in which linked (may not match page) @@ -193,8 +193,8 @@ struct stable_node { * @head: pointer to stable_node heading this list in the stable tree * @hlist: link into hlist of rmap_items hanging off that stable_node */ -struct rmap_item { - struct rmap_item *rmap_list; +struct ksm_rmap_item { + struct ksm_rmap_item *rmap_list; union { struct anon_vma *anon_vma; /* when stable */ #ifdef CONFIG_NUMA @@ -207,7 +207,7 @@ struct rmap_item { union { struct rb_node node; /* when node of unstable tree */ struct { /* when listed from stable tree */ - struct stable_node *head; + struct ksm_stable_node *head; struct hlist_node hlist; }; }; @@ -230,7 +230,7 @@ static LIST_HEAD(migrate_nodes); #define MM_SLOTS_HASH_BITS 10 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); -static struct mm_slot ksm_mm_head = { +static struct ksm_mm_slot ksm_mm_head = { .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), }; static struct ksm_scan ksm_scan = { @@ -298,21 +298,21 @@ static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait); static DEFINE_MUTEX(ksm_thread_mutex); static DEFINE_SPINLOCK(ksm_mmlist_lock); -#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\ +#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\ sizeof(struct __struct), __alignof__(struct __struct),\ (__flags), NULL) static int __init ksm_slab_init(void) { - rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); + rmap_item_cache = KSM_KMEM_CACHE(ksm_rmap_item, 0); if (!rmap_item_cache) goto out; - stable_node_cache = KSM_KMEM_CACHE(stable_node, 0); + stable_node_cache = KSM_KMEM_CACHE(ksm_stable_node, 0); if (!stable_node_cache) goto out_free1; - mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); + mm_slot_cache = KSM_KMEM_CACHE(ksm_mm_slot, 0); if (!mm_slot_cache) goto out_free2; @@ -334,18 +334,18 @@ static void __init ksm_slab_free(void) mm_slot_cache = NULL; } -static __always_inline bool is_stable_node_chain(struct stable_node *chain) +static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain) { return chain->rmap_hlist_len == STABLE_NODE_CHAIN; } -static __always_inline bool is_stable_node_dup(struct stable_node *dup) +static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup) { return dup->head == STABLE_NODE_DUP_HEAD; } -static inline void stable_node_chain_add_dup(struct stable_node *dup, - struct stable_node *chain) +static inline void stable_node_chain_add_dup(struct ksm_stable_node *dup, + struct ksm_stable_node *chain) { VM_BUG_ON(is_stable_node_dup(dup)); dup->head = STABLE_NODE_DUP_HEAD; @@ -354,14 +354,14 @@ static inline void stable_node_chain_add_dup(struct stable_node *dup, ksm_stable_node_dups++; } -static inline void __stable_node_dup_del(struct stable_node *dup) +static inline void __stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(!is_stable_node_dup(dup)); hlist_del(&dup->hlist_dup); ksm_stable_node_dups--; } -static inline void stable_node_dup_del(struct stable_node *dup) +static inline void stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(is_stable_node_chain(dup)); if (is_stable_node_dup(dup)) @@ -373,9 +373,9 @@ static inline void stable_node_dup_del(struct stable_node *dup) #endif } -static inline struct rmap_item *alloc_rmap_item(void) +static inline struct ksm_rmap_item *alloc_rmap_item(void) { - struct rmap_item *rmap_item; + struct ksm_rmap_item *rmap_item; rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); @@ -384,7 +384,7 @@ static inline struct rmap_item *alloc_rmap_item(void) return rmap_item; } -static inline void free_rmap_item(struct rmap_item *rmap_item) +static inline void free_rmap_item(struct ksm_rmap_item *rmap_item) { ksm_rmap_items--; rmap_item->mm->ksm_rmap_items--; @@ -392,7 +392,7 @@ static inline void free_rmap_item(struct rmap_item *rmap_item) kmem_cache_free(rmap_item_cache, rmap_item); } -static inline struct stable_node *alloc_stable_node(void) +static inline struct ksm_stable_node *alloc_stable_node(void) { /* * The allocation can take too long with GFP_KERNEL when memory is under @@ -402,28 +402,28 @@ static inline struct stable_node *alloc_stable_node(void) return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH); } -static inline void free_stable_node(struct stable_node *stable_node) +static inline void free_stable_node(struct ksm_stable_node *stable_node) { VM_BUG_ON(stable_node->rmap_hlist_len && !is_stable_node_chain(stable_node)); kmem_cache_free(stable_node_cache, stable_node); } -static inline struct mm_slot *alloc_mm_slot(void) +static inline struct ksm_mm_slot *alloc_mm_slot(void) { if (!mm_slot_cache) /* initialization failed */ return NULL; return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); } -static inline void free_mm_slot(struct mm_slot *mm_slot) +static inline void free_mm_slot(struct ksm_mm_slot *mm_slot) { kmem_cache_free(mm_slot_cache, mm_slot); } -static struct mm_slot *get_mm_slot(struct mm_struct *mm) +static struct ksm_mm_slot *get_mm_slot(struct mm_struct *mm) { - struct mm_slot *slot; + struct ksm_mm_slot *slot; hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm) if (slot->mm == mm) @@ -433,7 +433,7 @@ static struct mm_slot *get_mm_slot(struct mm_struct *mm) } static void insert_to_mm_slots_hash(struct mm_struct *mm, - struct mm_slot *mm_slot) + struct ksm_mm_slot *mm_slot) { mm_slot->mm = mm; hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm); @@ -529,7 +529,7 @@ static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, return vma; } -static void break_cow(struct rmap_item *rmap_item) +static void break_cow(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; @@ -548,7 +548,7 @@ static void break_cow(struct rmap_item *rmap_item) mmap_read_unlock(mm); } -static struct page *get_mergeable_page(struct rmap_item *rmap_item) +static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; @@ -589,10 +589,10 @@ static inline int get_kpfn_nid(unsigned long kpfn) return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn)); } -static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, +static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup, struct rb_root *root) { - struct stable_node *chain = alloc_stable_node(); + struct ksm_stable_node *chain = alloc_stable_node(); VM_BUG_ON(is_stable_node_chain(dup)); if (likely(chain)) { INIT_HLIST_HEAD(&chain->hlist); @@ -622,7 +622,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, return chain; } -static inline void free_stable_node_chain(struct stable_node *chain, +static inline void free_stable_node_chain(struct ksm_stable_node *chain, struct rb_root *root) { rb_erase(&chain->node, root); @@ -630,9 +630,9 @@ static inline void free_stable_node_chain(struct stable_node *chain, ksm_stable_node_chains--; } -static void remove_node_from_stable_tree(struct stable_node *stable_node) +static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node) { - struct rmap_item *rmap_item; + struct ksm_rmap_item *rmap_item; /* check it's not STABLE_NODE_CHAIN or negative */ BUG_ON(stable_node->rmap_hlist_len < 0); @@ -694,7 +694,7 @@ enum get_ksm_page_flags { * a page to put something that might look like our key in page->mapping. * is on its way to being freed; but it is an anomaly to bear in mind. */ -static struct page *get_ksm_page(struct stable_node *stable_node, +static struct page *get_ksm_page(struct ksm_stable_node *stable_node, enum get_ksm_page_flags flags) { struct page *page; @@ -773,10 +773,10 @@ static struct page *get_ksm_page(struct stable_node *stable_node, * Removing rmap_item from stable or unstable tree. * This function will clean the information from the stable/unstable tree. */ -static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) +static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item) { if (rmap_item->address & STABLE_FLAG) { - struct stable_node *stable_node; + struct ksm_stable_node *stable_node; struct page *page; stable_node = rmap_item->head; @@ -823,10 +823,10 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) cond_resched(); /* we're called from many long loops */ } -static void remove_trailing_rmap_items(struct rmap_item **rmap_list) +static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) { while (*rmap_list) { - struct rmap_item *rmap_item = *rmap_list; + struct ksm_rmap_item *rmap_item = *rmap_list; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); free_rmap_item(rmap_item); @@ -863,18 +863,18 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, return err; } -static inline struct stable_node *folio_stable_node(struct folio *folio) +static inline struct ksm_stable_node *folio_stable_node(struct folio *folio) { return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL; } -static inline struct stable_node *page_stable_node(struct page *page) +static inline struct ksm_stable_node *page_stable_node(struct page *page) { return folio_stable_node(page_folio(page)); } static inline void set_page_stable_node(struct page *page, - struct stable_node *stable_node) + struct ksm_stable_node *stable_node) { VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page); page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); @@ -884,7 +884,7 @@ static inline void set_page_stable_node(struct page *page, /* * Only called through the sysfs control interface: */ -static int remove_stable_node(struct stable_node *stable_node) +static int remove_stable_node(struct ksm_stable_node *stable_node) { struct page *page; int err; @@ -922,10 +922,10 @@ static int remove_stable_node(struct stable_node *stable_node) return err; } -static int remove_stable_node_chain(struct stable_node *stable_node, +static int remove_stable_node_chain(struct ksm_stable_node *stable_node, struct rb_root *root) { - struct stable_node *dup; + struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { @@ -949,14 +949,14 @@ static int remove_stable_node_chain(struct stable_node *stable_node, static int remove_all_stable_nodes(void) { - struct stable_node *stable_node, *next; + struct ksm_stable_node *stable_node, *next; int nid; int err = 0; for (nid = 0; nid < ksm_nr_node_ids; nid++) { while (root_stable_tree[nid].rb_node) { stable_node = rb_entry(root_stable_tree[nid].rb_node, - struct stable_node, node); + struct ksm_stable_node, node); if (remove_stable_node_chain(stable_node, root_stable_tree + nid)) { err = -EBUSY; @@ -975,14 +975,14 @@ static int remove_all_stable_nodes(void) static int unmerge_and_remove_all_rmap_items(void) { - struct mm_slot *mm_slot; + struct ksm_mm_slot *mm_slot; struct mm_struct *mm; struct vm_area_struct *vma; int err = 0; spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next, - struct mm_slot, mm_list); + struct ksm_mm_slot, mm_list); spin_unlock(&ksm_mmlist_lock); for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; @@ -1007,7 +1007,7 @@ static int unmerge_and_remove_all_rmap_items(void) spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, - struct mm_slot, mm_list); + struct ksm_mm_slot, mm_list); if (ksm_test_exit(mm)) { hash_del(&mm_slot->link); list_del(&mm_slot->mm_list); @@ -1295,7 +1295,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ -static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, +static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item, struct page *page, struct page *kpage) { struct mm_struct *mm = rmap_item->mm; @@ -1332,9 +1332,9 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, * Note that this function upgrades page to ksm page: if one of the pages * is already a ksm page, try_to_merge_with_ksm_page should be used. */ -static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, +static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item, struct page *page, - struct rmap_item *tree_rmap_item, + struct ksm_rmap_item *tree_rmap_item, struct page *tree_page) { int err; @@ -1354,7 +1354,7 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, } static __always_inline -bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset) +bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset) { VM_BUG_ON(stable_node->rmap_hlist_len < 0); /* @@ -1368,17 +1368,17 @@ bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset) } static __always_inline -bool is_page_sharing_candidate(struct stable_node *stable_node) +bool is_page_sharing_candidate(struct ksm_stable_node *stable_node) { return __is_page_sharing_candidate(stable_node, 0); } -static struct page *stable_node_dup(struct stable_node **_stable_node_dup, - struct stable_node **_stable_node, +static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup, + struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { - struct stable_node *dup, *found = NULL, *stable_node = *_stable_node; + struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; struct page *_tree_page, *tree_page = NULL; int nr = 0; @@ -1492,7 +1492,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup, return tree_page; } -static struct stable_node *stable_node_dup_any(struct stable_node *stable_node, +static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stable_node, struct rb_root *root) { if (!is_stable_node_chain(stable_node)) @@ -1519,12 +1519,12 @@ static struct stable_node *stable_node_dup_any(struct stable_node *stable_node, * function and will be overwritten in all cases, the caller doesn't * need to initialize it. */ -static struct page *__stable_node_chain(struct stable_node **_stable_node_dup, - struct stable_node **_stable_node, +static struct page *__stable_node_chain(struct ksm_stable_node **_stable_node_dup, + struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { - struct stable_node *stable_node = *_stable_node; + struct ksm_stable_node *stable_node = *_stable_node; if (!is_stable_node_chain(stable_node)) { if (is_page_sharing_candidate(stable_node)) { *_stable_node_dup = stable_node; @@ -1541,18 +1541,18 @@ static struct page *__stable_node_chain(struct stable_node **_stable_node_dup, prune_stale_stable_nodes); } -static __always_inline struct page *chain_prune(struct stable_node **s_n_d, - struct stable_node **s_n, +static __always_inline struct page *chain_prune(struct ksm_stable_node **s_n_d, + struct ksm_stable_node **s_n, struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, true); } -static __always_inline struct page *chain(struct stable_node **s_n_d, - struct stable_node *s_n, +static __always_inline struct page *chain(struct ksm_stable_node **s_n_d, + struct ksm_stable_node *s_n, struct rb_root *root) { - struct stable_node *old_stable_node = s_n; + struct ksm_stable_node *old_stable_node = s_n; struct page *tree_page; tree_page = __stable_node_chain(s_n_d, &s_n, root, false); @@ -1576,8 +1576,8 @@ static struct page *stable_tree_search(struct page *page) struct rb_root *root; struct rb_node **new; struct rb_node *parent; - struct stable_node *stable_node, *stable_node_dup, *stable_node_any; - struct stable_node *page_node; + struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any; + struct ksm_stable_node *page_node; page_node = page_stable_node(page); if (page_node && page_node->head != &migrate_nodes) { @@ -1597,7 +1597,7 @@ static struct page *stable_tree_search(struct page *page) int ret; cond_resched(); - stable_node = rb_entry(*new, struct stable_node, node); + stable_node = rb_entry(*new, struct ksm_stable_node, node); stable_node_any = NULL; tree_page = chain_prune(&stable_node_dup, &stable_node, root); /* @@ -1820,14 +1820,14 @@ static struct page *stable_tree_search(struct page *page) * This function returns the stable tree node just allocated on success, * NULL otherwise. */ -static struct stable_node *stable_tree_insert(struct page *kpage) +static struct ksm_stable_node *stable_tree_insert(struct page *kpage) { int nid; unsigned long kpfn; struct rb_root *root; struct rb_node **new; struct rb_node *parent; - struct stable_node *stable_node, *stable_node_dup, *stable_node_any; + struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any; bool need_chain = false; kpfn = page_to_pfn(kpage); @@ -1842,7 +1842,7 @@ static struct stable_node *stable_tree_insert(struct page *kpage) int ret; cond_resched(); - stable_node = rb_entry(*new, struct stable_node, node); + stable_node = rb_entry(*new, struct ksm_stable_node, node); stable_node_any = NULL; tree_page = chain(&stable_node_dup, stable_node, root); if (!stable_node_dup) { @@ -1911,7 +1911,7 @@ static struct stable_node *stable_tree_insert(struct page *kpage) rb_insert_color(&stable_node_dup->node, root); } else { if (!is_stable_node_chain(stable_node)) { - struct stable_node *orig = stable_node; + struct ksm_stable_node *orig = stable_node; /* chain is missing so create it */ stable_node = alloc_stable_node_chain(orig, root); if (!stable_node) { @@ -1940,7 +1940,7 @@ static struct stable_node *stable_tree_insert(struct page *kpage) * the same walking algorithm in an rbtree. */ static -struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, +struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item, struct page *page, struct page **tree_pagep) { @@ -1954,12 +1954,12 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, new = &root->rb_node; while (*new) { - struct rmap_item *tree_rmap_item; + struct ksm_rmap_item *tree_rmap_item; struct page *tree_page; int ret; cond_resched(); - tree_rmap_item = rb_entry(*new, struct rmap_item, node); + tree_rmap_item = rb_entry(*new, struct ksm_rmap_item, node); tree_page = get_mergeable_page(tree_rmap_item); if (!tree_page) return NULL; @@ -2011,8 +2011,8 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, * rmap_items hanging off a given node of the stable tree, all sharing * the same ksm page. */ -static void stable_tree_append(struct rmap_item *rmap_item, - struct stable_node *stable_node, +static void stable_tree_append(struct ksm_rmap_item *rmap_item, + struct ksm_stable_node *stable_node, bool max_page_sharing_bypass) { /* @@ -2054,12 +2054,12 @@ static void stable_tree_append(struct rmap_item *rmap_item, * @page: the page that we are searching identical page to. * @rmap_item: the reverse mapping into the virtual address of this page */ -static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) +static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; - struct rmap_item *tree_rmap_item; + struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; - struct stable_node *stable_node; + struct ksm_stable_node *stable_node; struct page *kpage; unsigned int checksum; int err; @@ -2215,11 +2215,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) } } -static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, - struct rmap_item **rmap_list, +static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, + struct ksm_rmap_item **rmap_list, unsigned long addr) { - struct rmap_item *rmap_item; + struct ksm_rmap_item *rmap_item; while (*rmap_list) { rmap_item = *rmap_list; @@ -2244,12 +2244,12 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, return rmap_item; } -static struct rmap_item *scan_get_next_rmap_item(struct page **page) +static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) { struct mm_struct *mm; - struct mm_slot *slot; + struct ksm_mm_slot *slot; struct vm_area_struct *vma; - struct rmap_item *rmap_item; + struct ksm_rmap_item *rmap_item; struct vma_iterator vmi; int nid; @@ -2277,7 +2277,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) * so prune them once before each full scan. */ if (!ksm_merge_across_nodes) { - struct stable_node *stable_node, *next; + struct ksm_stable_node *stable_node, *next; struct page *page; list_for_each_entry_safe(stable_node, next, @@ -2294,7 +2294,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) root_unstable_tree[nid] = RB_ROOT; spin_lock(&ksm_mmlist_lock); - slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); + slot = list_entry(slot->mm_list.next, struct ksm_mm_slot, mm_list); ksm_scan.mm_slot = slot; spin_unlock(&ksm_mmlist_lock); /* @@ -2368,7 +2368,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(slot->mm_list.next, - struct mm_slot, mm_list); + struct ksm_mm_slot, mm_list); if (ksm_scan.address == 0) { /* * We've completed a full scan of all vmas, holding mmap_lock @@ -2414,7 +2414,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) */ static void ksm_do_scan(unsigned int scan_npages) { - struct rmap_item *rmap_item; + struct ksm_rmap_item *rmap_item; struct page *page; while (scan_npages-- && likely(!freezing(current))) { @@ -2518,7 +2518,7 @@ EXPORT_SYMBOL_GPL(ksm_madvise); int __ksm_enter(struct mm_struct *mm) { - struct mm_slot *mm_slot; + struct ksm_mm_slot *mm_slot; int needs_wakeup; mm_slot = alloc_mm_slot(); @@ -2557,7 +2557,7 @@ int __ksm_enter(struct mm_struct *mm) void __ksm_exit(struct mm_struct *mm) { - struct mm_slot *mm_slot; + struct ksm_mm_slot *mm_slot; int easy_to_free = 0; /* @@ -2635,8 +2635,8 @@ struct page *ksm_might_need_to_copy(struct page *page, void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { - struct stable_node *stable_node; - struct rmap_item *rmap_item; + struct ksm_stable_node *stable_node; + struct ksm_rmap_item *rmap_item; int search_new_forks = 0; VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio); @@ -2706,7 +2706,7 @@ void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) #ifdef CONFIG_MIGRATION void folio_migrate_ksm(struct folio *newfolio, struct folio *folio) { - struct stable_node *stable_node; + struct ksm_stable_node *stable_node; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio); @@ -2739,7 +2739,7 @@ static void wait_while_offlining(void) } } -static bool stable_node_dup_remove_range(struct stable_node *stable_node, +static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn) { @@ -2755,12 +2755,12 @@ static bool stable_node_dup_remove_range(struct stable_node *stable_node, return false; } -static bool stable_node_chain_remove_range(struct stable_node *stable_node, +static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn, struct rb_root *root) { - struct stable_node *dup; + struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { @@ -2784,14 +2784,14 @@ static bool stable_node_chain_remove_range(struct stable_node *stable_node, static void ksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { - struct stable_node *stable_node, *next; + struct ksm_stable_node *stable_node, *next; struct rb_node *node; int nid; for (nid = 0; nid < ksm_nr_node_ids; nid++) { node = rb_first(root_stable_tree + nid); while (node) { - stable_node = rb_entry(node, struct stable_node, node); + stable_node = rb_entry(node, struct ksm_stable_node, node); if (stable_node_chain_remove_range(stable_node, start_pfn, end_pfn, root_stable_tree + From c3dd9346b7837672e2fa154c61127fa9ee2aaf41 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:49 +0800 Subject: [PATCH 015/352] ksm: convert ksm_mm_slot.mm_list to ksm_mm_slot.mm_node In order to use common struct mm_slot, convert ksm_mm_slot.mm_list to ksm_mm_slot.mm_node in advance, no functional change. Link: https://lkml.kernel.org/r/20220831031951.43152-6-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/ksm.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index de61946106ce4..f9cd502233f09 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -114,13 +114,13 @@ /** * struct ksm_mm_slot - ksm information per mm that is being scanned * @link: link to the mm_slots hash list - * @mm_list: link into the mm_slots list, rooted in ksm_mm_head + * @mm_node: link into the mm_slots list, rooted in ksm_mm_head * @rmap_list: head for this mm_slot's singly-linked list of rmap_items * @mm: the mm that this information is valid for */ struct ksm_mm_slot { struct hlist_node link; - struct list_head mm_list; + struct list_head mm_node; struct ksm_rmap_item *rmap_list; struct mm_struct *mm; }; @@ -231,7 +231,7 @@ static LIST_HEAD(migrate_nodes); static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct ksm_mm_slot ksm_mm_head = { - .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), + .mm_node = LIST_HEAD_INIT(ksm_mm_head.mm_node), }; static struct ksm_scan ksm_scan = { .mm_slot = &ksm_mm_head, @@ -981,8 +981,8 @@ static int unmerge_and_remove_all_rmap_items(void) int err = 0; spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next, - struct ksm_mm_slot, mm_list); + ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_node.next, + struct ksm_mm_slot, mm_node); spin_unlock(&ksm_mmlist_lock); for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; @@ -1006,11 +1006,11 @@ static int unmerge_and_remove_all_rmap_items(void) mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, - struct ksm_mm_slot, mm_list); + ksm_scan.mm_slot = list_entry(mm_slot->mm_node.next, + struct ksm_mm_slot, mm_node); if (ksm_test_exit(mm)) { hash_del(&mm_slot->link); - list_del(&mm_slot->mm_list); + list_del(&mm_slot->mm_node); spin_unlock(&ksm_mmlist_lock); free_mm_slot(mm_slot); @@ -2253,7 +2253,7 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) struct vma_iterator vmi; int nid; - if (list_empty(&ksm_mm_head.mm_list)) + if (list_empty(&ksm_mm_head.mm_node)) return NULL; slot = ksm_scan.mm_slot; @@ -2294,7 +2294,7 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) root_unstable_tree[nid] = RB_ROOT; spin_lock(&ksm_mmlist_lock); - slot = list_entry(slot->mm_list.next, struct ksm_mm_slot, mm_list); + slot = list_entry(slot->mm_node.next, struct ksm_mm_slot, mm_node); ksm_scan.mm_slot = slot; spin_unlock(&ksm_mmlist_lock); /* @@ -2367,8 +2367,8 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) remove_trailing_rmap_items(ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(slot->mm_list.next, - struct ksm_mm_slot, mm_list); + ksm_scan.mm_slot = list_entry(slot->mm_node.next, + struct ksm_mm_slot, mm_node); if (ksm_scan.address == 0) { /* * We've completed a full scan of all vmas, holding mmap_lock @@ -2380,7 +2380,7 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) * mmap_lock then protects against race with MADV_MERGEABLE). */ hash_del(&slot->link); - list_del(&slot->mm_list); + list_del(&slot->mm_node); spin_unlock(&ksm_mmlist_lock); free_mm_slot(slot); @@ -2429,7 +2429,7 @@ static void ksm_do_scan(unsigned int scan_npages) static int ksmd_should_run(void) { - return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list); + return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_node); } static int ksm_scan_thread(void *nothing) @@ -2526,7 +2526,7 @@ int __ksm_enter(struct mm_struct *mm) return -ENOMEM; /* Check ksm_run too? Would need tighter locking */ - needs_wakeup = list_empty(&ksm_mm_head.mm_list); + needs_wakeup = list_empty(&ksm_mm_head.mm_node); spin_lock(&ksm_mmlist_lock); insert_to_mm_slots_hash(mm, mm_slot); @@ -2541,9 +2541,9 @@ int __ksm_enter(struct mm_struct *mm) * missed: then we might as well insert at the end of the list. */ if (ksm_run & KSM_RUN_UNMERGE) - list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list); + list_add_tail(&mm_slot->mm_node, &ksm_mm_head.mm_node); else - list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); + list_add_tail(&mm_slot->mm_node, &ksm_scan.mm_slot->mm_node); spin_unlock(&ksm_mmlist_lock); set_bit(MMF_VM_MERGEABLE, &mm->flags); @@ -2574,11 +2574,11 @@ void __ksm_exit(struct mm_struct *mm) if (mm_slot && ksm_scan.mm_slot != mm_slot) { if (!mm_slot->rmap_list) { hash_del(&mm_slot->link); - list_del(&mm_slot->mm_list); + list_del(&mm_slot->mm_node); easy_to_free = 1; } else { - list_move(&mm_slot->mm_list, - &ksm_scan.mm_slot->mm_list); + list_move(&mm_slot->mm_node, + &ksm_scan.mm_slot->mm_node); } } spin_unlock(&ksm_mmlist_lock); From 157b4a4cdf2179f3c1a002a2f95735980089ef3b Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:50 +0800 Subject: [PATCH 016/352] ksm: convert ksm_mm_slot.link to ksm_mm_slot.hash In order to use common struct mm_slot, convert ksm_mm_slot.link to ksm_mm_slot.hash in advance, no functional change. Link: https://lkml.kernel.org/r/20220831031951.43152-7-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/ksm.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index f9cd502233f09..9300e7a48e887 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -113,13 +113,13 @@ /** * struct ksm_mm_slot - ksm information per mm that is being scanned - * @link: link to the mm_slots hash list + * @hash: link to the mm_slots hash list * @mm_node: link into the mm_slots list, rooted in ksm_mm_head * @rmap_list: head for this mm_slot's singly-linked list of rmap_items * @mm: the mm that this information is valid for */ struct ksm_mm_slot { - struct hlist_node link; + struct hlist_node hash; struct list_head mm_node; struct ksm_rmap_item *rmap_list; struct mm_struct *mm; @@ -425,7 +425,7 @@ static struct ksm_mm_slot *get_mm_slot(struct mm_struct *mm) { struct ksm_mm_slot *slot; - hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm) + hash_for_each_possible(mm_slots_hash, slot, hash, (unsigned long)mm) if (slot->mm == mm) return slot; @@ -436,7 +436,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, struct ksm_mm_slot *mm_slot) { mm_slot->mm = mm; - hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm); + hash_add(mm_slots_hash, &mm_slot->hash, (unsigned long)mm); } /* @@ -1009,7 +1009,7 @@ static int unmerge_and_remove_all_rmap_items(void) ksm_scan.mm_slot = list_entry(mm_slot->mm_node.next, struct ksm_mm_slot, mm_node); if (ksm_test_exit(mm)) { - hash_del(&mm_slot->link); + hash_del(&mm_slot->hash); list_del(&mm_slot->mm_node); spin_unlock(&ksm_mmlist_lock); @@ -2379,7 +2379,7 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) * or when all VM_MERGEABLE areas have been unmapped (and * mmap_lock then protects against race with MADV_MERGEABLE). */ - hash_del(&slot->link); + hash_del(&slot->hash); list_del(&slot->mm_node); spin_unlock(&ksm_mmlist_lock); @@ -2573,7 +2573,7 @@ void __ksm_exit(struct mm_struct *mm) mm_slot = get_mm_slot(mm); if (mm_slot && ksm_scan.mm_slot != mm_slot) { if (!mm_slot->rmap_list) { - hash_del(&mm_slot->link); + hash_del(&mm_slot->hash); list_del(&mm_slot->mm_node); easy_to_free = 1; } else { From a98e8754dfa0614aa7b776beebd453d9c384f78a Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:51 +0800 Subject: [PATCH 017/352] ksm: convert to use common struct mm_slot Convert to use common struct mm_slot, no functional change. Link: https://lkml.kernel.org/r/20220831031951.43152-8-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/ksm.c | 132 +++++++++++++++++++++++-------------------------------- 1 file changed, 56 insertions(+), 76 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 9300e7a48e887..c3edb5836a441 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -42,6 +42,7 @@ #include #include "internal.h" +#include "mm_slot.h" #ifdef CONFIG_NUMA #define NUMA(x) (x) @@ -113,16 +114,12 @@ /** * struct ksm_mm_slot - ksm information per mm that is being scanned - * @hash: link to the mm_slots hash list - * @mm_node: link into the mm_slots list, rooted in ksm_mm_head + * @slot: hash lookup from mm to mm_slot * @rmap_list: head for this mm_slot's singly-linked list of rmap_items - * @mm: the mm that this information is valid for */ struct ksm_mm_slot { - struct hlist_node hash; - struct list_head mm_node; + struct mm_slot slot; struct ksm_rmap_item *rmap_list; - struct mm_struct *mm; }; /** @@ -231,7 +228,7 @@ static LIST_HEAD(migrate_nodes); static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct ksm_mm_slot ksm_mm_head = { - .mm_node = LIST_HEAD_INIT(ksm_mm_head.mm_node), + .slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node), }; static struct ksm_scan ksm_scan = { .mm_slot = &ksm_mm_head, @@ -409,36 +406,6 @@ static inline void free_stable_node(struct ksm_stable_node *stable_node) kmem_cache_free(stable_node_cache, stable_node); } -static inline struct ksm_mm_slot *alloc_mm_slot(void) -{ - if (!mm_slot_cache) /* initialization failed */ - return NULL; - return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); -} - -static inline void free_mm_slot(struct ksm_mm_slot *mm_slot) -{ - kmem_cache_free(mm_slot_cache, mm_slot); -} - -static struct ksm_mm_slot *get_mm_slot(struct mm_struct *mm) -{ - struct ksm_mm_slot *slot; - - hash_for_each_possible(mm_slots_hash, slot, hash, (unsigned long)mm) - if (slot->mm == mm) - return slot; - - return NULL; -} - -static void insert_to_mm_slots_hash(struct mm_struct *mm, - struct ksm_mm_slot *mm_slot) -{ - mm_slot->mm = mm; - hash_add(mm_slots_hash, &mm_slot->hash, (unsigned long)mm); -} - /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, @@ -976,20 +943,22 @@ static int remove_all_stable_nodes(void) static int unmerge_and_remove_all_rmap_items(void) { struct ksm_mm_slot *mm_slot; + struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; int err = 0; spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_node.next, - struct ksm_mm_slot, mm_node); + slot = list_entry(ksm_mm_head.slot.mm_node.next, + struct mm_slot, mm_node); + ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); spin_unlock(&ksm_mmlist_lock); for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { - VMA_ITERATOR(vmi, mm_slot->mm, 0); + VMA_ITERATOR(vmi, mm_slot->slot.mm, 0); - mm = mm_slot->mm; + mm = mm_slot->slot.mm; mmap_read_lock(mm); for_each_vma(vmi, vma) { if (ksm_test_exit(mm)) @@ -1006,14 +975,15 @@ static int unmerge_and_remove_all_rmap_items(void) mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(mm_slot->mm_node.next, - struct ksm_mm_slot, mm_node); + slot = list_entry(mm_slot->slot.mm_node.next, + struct mm_slot, mm_node); + ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_test_exit(mm)) { - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); + hash_del(&mm_slot->slot.hash); + list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmdrop(mm); } else @@ -2235,7 +2205,7 @@ static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, rmap_item = alloc_rmap_item(); if (rmap_item) { /* It has already been zeroed */ - rmap_item->mm = mm_slot->mm; + rmap_item->mm = mm_slot->slot.mm; rmap_item->mm->ksm_rmap_items++; rmap_item->address = addr; rmap_item->rmap_list = *rmap_list; @@ -2247,17 +2217,18 @@ static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) { struct mm_struct *mm; - struct ksm_mm_slot *slot; + struct ksm_mm_slot *mm_slot; + struct mm_slot *slot; struct vm_area_struct *vma; struct ksm_rmap_item *rmap_item; struct vma_iterator vmi; int nid; - if (list_empty(&ksm_mm_head.mm_node)) + if (list_empty(&ksm_mm_head.slot.mm_node)) return NULL; - slot = ksm_scan.mm_slot; - if (slot == &ksm_mm_head) { + mm_slot = ksm_scan.mm_slot; + if (mm_slot == &ksm_mm_head) { /* * A number of pages can hang around indefinitely on per-cpu * pagevecs, raised page count preventing write_protect_page @@ -2294,20 +2265,23 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) root_unstable_tree[nid] = RB_ROOT; spin_lock(&ksm_mmlist_lock); - slot = list_entry(slot->mm_node.next, struct ksm_mm_slot, mm_node); - ksm_scan.mm_slot = slot; + slot = list_entry(mm_slot->slot.mm_node.next, + struct mm_slot, mm_node); + mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); + ksm_scan.mm_slot = mm_slot; spin_unlock(&ksm_mmlist_lock); /* * Although we tested list_empty() above, a racing __ksm_exit * of the last mm on the list may have removed it since then. */ - if (slot == &ksm_mm_head) + if (mm_slot == &ksm_mm_head) return NULL; next_mm: ksm_scan.address = 0; - ksm_scan.rmap_list = &slot->rmap_list; + ksm_scan.rmap_list = &mm_slot->rmap_list; } + slot = &mm_slot->slot; mm = slot->mm; vma_iter_init(&vmi, mm, ksm_scan.address); @@ -2337,7 +2311,7 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) if (PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); - rmap_item = get_next_rmap_item(slot, + rmap_item = get_next_rmap_item(mm_slot, ksm_scan.rmap_list, ksm_scan.address); if (rmap_item) { ksm_scan.rmap_list = @@ -2358,7 +2332,7 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) if (ksm_test_exit(mm)) { no_vmas: ksm_scan.address = 0; - ksm_scan.rmap_list = &slot->rmap_list; + ksm_scan.rmap_list = &mm_slot->rmap_list; } /* * Nuke all the rmap_items that are above this current rmap: @@ -2367,8 +2341,9 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) remove_trailing_rmap_items(ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(slot->mm_node.next, - struct ksm_mm_slot, mm_node); + slot = list_entry(mm_slot->slot.mm_node.next, + struct mm_slot, mm_node); + ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_scan.address == 0) { /* * We've completed a full scan of all vmas, holding mmap_lock @@ -2379,11 +2354,11 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) * or when all VM_MERGEABLE areas have been unmapped (and * mmap_lock then protects against race with MADV_MERGEABLE). */ - hash_del(&slot->hash); - list_del(&slot->mm_node); + hash_del(&mm_slot->slot.hash); + list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); - free_mm_slot(slot); + mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmap_read_unlock(mm); mmdrop(mm); @@ -2400,8 +2375,8 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) } /* Repeat until we've completed scanning the whole list */ - slot = ksm_scan.mm_slot; - if (slot != &ksm_mm_head) + mm_slot = ksm_scan.mm_slot; + if (mm_slot != &ksm_mm_head) goto next_mm; ksm_scan.seqnr++; @@ -2429,7 +2404,7 @@ static void ksm_do_scan(unsigned int scan_npages) static int ksmd_should_run(void) { - return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_node); + return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node); } static int ksm_scan_thread(void *nothing) @@ -2519,17 +2494,20 @@ EXPORT_SYMBOL_GPL(ksm_madvise); int __ksm_enter(struct mm_struct *mm) { struct ksm_mm_slot *mm_slot; + struct mm_slot *slot; int needs_wakeup; - mm_slot = alloc_mm_slot(); + mm_slot = mm_slot_alloc(mm_slot_cache); if (!mm_slot) return -ENOMEM; + slot = &mm_slot->slot; + /* Check ksm_run too? Would need tighter locking */ - needs_wakeup = list_empty(&ksm_mm_head.mm_node); + needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node); spin_lock(&ksm_mmlist_lock); - insert_to_mm_slots_hash(mm, mm_slot); + mm_slot_insert(mm_slots_hash, mm, slot); /* * When KSM_RUN_MERGE (or KSM_RUN_STOP), * insert just behind the scanning cursor, to let the area settle @@ -2541,9 +2519,9 @@ int __ksm_enter(struct mm_struct *mm) * missed: then we might as well insert at the end of the list. */ if (ksm_run & KSM_RUN_UNMERGE) - list_add_tail(&mm_slot->mm_node, &ksm_mm_head.mm_node); + list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node); else - list_add_tail(&mm_slot->mm_node, &ksm_scan.mm_slot->mm_node); + list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); set_bit(MMF_VM_MERGEABLE, &mm->flags); @@ -2558,6 +2536,7 @@ int __ksm_enter(struct mm_struct *mm) void __ksm_exit(struct mm_struct *mm) { struct ksm_mm_slot *mm_slot; + struct mm_slot *slot; int easy_to_free = 0; /* @@ -2570,21 +2549,22 @@ void __ksm_exit(struct mm_struct *mm) */ spin_lock(&ksm_mmlist_lock); - mm_slot = get_mm_slot(mm); + slot = mm_slot_lookup(mm_slots_hash, mm); + mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (mm_slot && ksm_scan.mm_slot != mm_slot) { if (!mm_slot->rmap_list) { - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); + hash_del(&slot->hash); + list_del(&slot->mm_node); easy_to_free = 1; } else { - list_move(&mm_slot->mm_node, - &ksm_scan.mm_slot->mm_node); + list_move(&slot->mm_node, + &ksm_scan.mm_slot->slot.mm_node); } } spin_unlock(&ksm_mmlist_lock); if (easy_to_free) { - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmdrop(mm); } else if (mm_slot) { From d3072aba7ce6bf57896b18fc4528214e3130ca3c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:45:57 +0100 Subject: [PATCH 018/352] mm/vmscan: fix a lot of comments Patch series "MM folio changes for 6.1", v2. My focus this round has been on shmem. I believe it is now fully converted to folios. Of course, shmem interacts with a lot of the swap cache and other parts of the kernel, so there are patches all over the MM. This patch series survives a round of xfstests on tmpfs, which is nice, but hardly an exhaustive test. Hugh was nice enough to run a round of tests on it and found a bug which is fixed in this edition. This patch (of 57): A lot of comments mention pages when they should say folios. Fix them up. Link: https://lkml.kernel.org/r/20220902194653.1739778-1-willy@infradead.org Link: https://lkml.kernel.org/r/20220902194653.1739778-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/vmscan.c | 253 ++++++++++++++++++++++++++-------------------------- 1 file changed, 125 insertions(+), 128 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 3ba9423b141de..d040b47f54305 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -90,7 +90,7 @@ struct scan_control { unsigned long anon_cost; unsigned long file_cost; - /* Can active pages be deactivated as part of reclaim? */ + /* Can active folios be deactivated as part of reclaim? */ #define DEACTIVATE_ANON 1 #define DEACTIVATE_FILE 2 unsigned int may_deactivate:2; @@ -100,10 +100,10 @@ struct scan_control { /* Writepage batching in laptop mode; RECLAIM_WRITE */ unsigned int may_writepage:1; - /* Can mapped pages be reclaimed? */ + /* Can mapped folios be reclaimed? */ unsigned int may_unmap:1; - /* Can pages be swapped as part of reclaim? */ + /* Can folios be swapped as part of reclaim? */ unsigned int may_swap:1; /* Proactive reclaim invoked by userspace through memory.reclaim */ @@ -128,7 +128,7 @@ struct scan_control { /* There is easily reclaimable cold cache in the current node */ unsigned int cache_trim_mode:1; - /* The file pages on the current node are dangerously low */ + /* The file folios on the current node are dangerously low */ unsigned int file_is_tiny:1; /* Always discard instead of demoting to lower tier memory */ @@ -146,7 +146,7 @@ struct scan_control { /* Scan (total_size >> priority) pages at once */ s8 priority; - /* The highest zone to isolate pages for reclaim from */ + /* The highest zone to isolate folios for reclaim from */ s8 reclaim_idx; /* This context's GFP mask */ @@ -454,7 +454,7 @@ static bool cgroup_reclaim(struct scan_control *sc) * * The normal page dirty throttling mechanism in balance_dirty_pages() is * completely broken with the legacy memcg and direct stalling in - * shrink_page_list() is used for throttling instead, which lacks all the + * shrink_folio_list() is used for throttling instead, which lacks all the * niceties such as fairness, adaptive pausing, bandwidth proportional * allocation and configurability. * @@ -575,9 +575,9 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, } /* - * This misses isolated pages which are not accounted for to save counters. + * This misses isolated folios which are not accounted for to save counters. * As the data only determines if reclaim or compaction continues, it is - * not expected that isolated pages will be a dominating factor. + * not expected that isolated folios will be a dominating factor. */ unsigned long zone_reclaimable_pages(struct zone *zone) { @@ -1050,9 +1050,9 @@ void drop_slab(void) static inline int is_page_cache_freeable(struct folio *folio) { /* - * A freeable page cache page is referenced only by the caller - * that isolated the page, the page cache and optional buffer - * heads at page->private. + * A freeable page cache folio is referenced only by the caller + * that isolated the folio, the page cache and optional filesystem + * private data at folio->private. */ return folio_ref_count(folio) - folio_test_private(folio) == 1 + folio_nr_pages(folio); @@ -1092,8 +1092,8 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat) return true; /* - * If there are a lot of dirty/writeback pages then do not - * throttle as throttling will occur when the pages cycle + * If there are a lot of dirty/writeback folios then do not + * throttle as throttling will occur when the folios cycle * towards the end of the LRU if still under writeback. */ for (i = 0; i < MAX_NR_ZONES; i++) { @@ -1136,7 +1136,7 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) * short. Failing to make progress or waiting on writeback are * potentially long-lived events so use a longer timeout. This is shaky * logic as a failure to make progress could be due to anything from - * writeback to a slow device to excessive references pages at the tail + * writeback to a slow device to excessive referenced folios at the tail * of the inactive LRU. */ switch(reason) { @@ -1182,8 +1182,8 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) } /* - * Account for pages written if tasks are throttled waiting on dirty - * pages to clean. If enough pages have been cleaned since throttling + * Account for folios written if tasks are throttled waiting on dirty + * folios to clean. If enough folios have been cleaned since throttling * started then wakeup the throttled tasks. */ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, @@ -1209,18 +1209,18 @@ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, /* possible outcome of pageout() */ typedef enum { - /* failed to write page out, page is locked */ + /* failed to write folio out, folio is locked */ PAGE_KEEP, - /* move page to the active list, page is locked */ + /* move folio to the active list, folio is locked */ PAGE_ACTIVATE, - /* page has been sent to the disk successfully, page is unlocked */ + /* folio has been sent to the disk successfully, folio is unlocked */ PAGE_SUCCESS, - /* page is clean and locked */ + /* folio is clean and locked */ PAGE_CLEAN, } pageout_t; /* - * pageout is called by shrink_page_list() for each dirty page. + * pageout is called by shrink_folio_list() for each dirty folio. * Calls ->writepage(). */ static pageout_t pageout(struct folio *folio, struct address_space *mapping, @@ -1294,7 +1294,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, } /* - * Same as remove_mapping, but if the page is removed from the mapping, it + * Same as remove_mapping, but if the folio is removed from the mapping, it * gets returned with a refcount of 0. */ static int __remove_mapping(struct address_space *mapping, struct folio *folio, @@ -1310,34 +1310,34 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); /* - * The non racy check for a busy page. + * The non racy check for a busy folio. * * Must be careful with the order of the tests. When someone has - * a ref to the page, it may be possible that they dirty it then - * drop the reference. So if PageDirty is tested before page_count - * here, then the following race may occur: + * a ref to the folio, it may be possible that they dirty it then + * drop the reference. So if the dirty flag is tested before the + * refcount here, then the following race may occur: * * get_user_pages(&page); * [user mapping goes away] * write_to(page); - * !PageDirty(page) [good] - * SetPageDirty(page); - * put_page(page); - * !page_count(page) [good, discard it] + * !folio_test_dirty(folio) [good] + * folio_set_dirty(folio); + * folio_put(folio); + * !refcount(folio) [good, discard it] * * [oops, our write_to data is lost] * * Reversing the order of the tests ensures such a situation cannot - * escape unnoticed. The smp_rmb is needed to ensure the page->flags - * load is not satisfied before that of page->_refcount. + * escape unnoticed. The smp_rmb is needed to ensure the folio->flags + * load is not satisfied before that of folio->_refcount. * - * Note that if SetPageDirty is always performed via set_page_dirty, + * Note that if the dirty flag is always set via folio_mark_dirty, * and thus under the i_pages lock, then this ordering is not required. */ refcount = 1 + folio_nr_pages(folio); if (!folio_ref_freeze(folio, refcount)) goto cannot_free; - /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */ + /* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */ if (unlikely(folio_test_dirty(folio))) { folio_ref_unfreeze(folio, refcount); goto cannot_free; @@ -1368,7 +1368,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, * back. * * We also don't store shadows for DAX mappings because the - * only page cache pages found in these are zero pages + * only page cache folios found in these are zero pages * covering holes, and because we don't want to mix DAX * exceptional entries and shadow exceptional entries in the * same address_space. @@ -1436,14 +1436,14 @@ void folio_putback_lru(struct folio *folio) folio_put(folio); /* drop ref from isolate */ } -enum page_references { - PAGEREF_RECLAIM, - PAGEREF_RECLAIM_CLEAN, - PAGEREF_KEEP, - PAGEREF_ACTIVATE, +enum folio_references { + FOLIOREF_RECLAIM, + FOLIOREF_RECLAIM_CLEAN, + FOLIOREF_KEEP, + FOLIOREF_ACTIVATE, }; -static enum page_references folio_check_references(struct folio *folio, +static enum folio_references folio_check_references(struct folio *folio, struct scan_control *sc) { int referenced_ptes, referenced_folio; @@ -1458,11 +1458,11 @@ static enum page_references folio_check_references(struct folio *folio, * Let the folio, now marked Mlocked, be moved to the unevictable list. */ if (vm_flags & VM_LOCKED) - return PAGEREF_ACTIVATE; + return FOLIOREF_ACTIVATE; /* rmap lock contention: rotate */ if (referenced_ptes == -1) - return PAGEREF_KEEP; + return FOLIOREF_KEEP; if (referenced_ptes) { /* @@ -1482,34 +1482,34 @@ static enum page_references folio_check_references(struct folio *folio, folio_set_referenced(folio); if (referenced_folio || referenced_ptes > 1) - return PAGEREF_ACTIVATE; + return FOLIOREF_ACTIVATE; /* * Activate file-backed executable folios after first usage. */ if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) - return PAGEREF_ACTIVATE; + return FOLIOREF_ACTIVATE; - return PAGEREF_KEEP; + return FOLIOREF_KEEP; } /* Reclaim if clean, defer dirty folios to writeback */ if (referenced_folio && folio_is_file_lru(folio)) - return PAGEREF_RECLAIM_CLEAN; + return FOLIOREF_RECLAIM_CLEAN; - return PAGEREF_RECLAIM; + return FOLIOREF_RECLAIM; } -/* Check if a page is dirty or under writeback */ +/* Check if a folio is dirty or under writeback */ static void folio_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback) { struct address_space *mapping; /* - * Anonymous pages are not handled by flushers and must be written + * Anonymous folios are not handled by flushers and must be written * from reclaim context. Do not stall reclaim based on them. - * MADV_FREE anonymous pages are put into inactive file list too. + * MADV_FREE anonymous folios are put into inactive file list too. * They could be mistakenly treated as file lru. So further anon * test is needed. */ @@ -1564,11 +1564,10 @@ static struct page *alloc_demote_page(struct page *page, unsigned long private) } /* - * Take pages on @demote_list and attempt to demote them to - * another node. Pages which are not demoted are left on - * @demote_pages. + * Take folios on @demote_folios and attempt to demote them to another node. + * Folios which are not demoted are left on @demote_folios. */ -static unsigned int demote_page_list(struct list_head *demote_pages, +static unsigned int demote_folio_list(struct list_head *demote_folios, struct pglist_data *pgdat) { int target_nid = next_demotion_node(pgdat->node_id); @@ -1587,7 +1586,7 @@ static unsigned int demote_page_list(struct list_head *demote_pages, .nmask = &allowed_mask }; - if (list_empty(demote_pages)) + if (list_empty(demote_folios)) return 0; if (target_nid == NUMA_NO_NODE) @@ -1596,7 +1595,7 @@ static unsigned int demote_page_list(struct list_head *demote_pages, node_get_allowed_targets(pgdat, &allowed_mask); /* Demotion ignores all cpuset and mempolicy settings */ - migrate_pages(demote_pages, alloc_demote_page, NULL, + migrate_pages(demote_folios, alloc_demote_page, NULL, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); @@ -1625,17 +1624,15 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) } /* - * shrink_page_list() returns the number of reclaimed pages + * shrink_folio_list() returns the number of reclaimed pages */ -static unsigned int shrink_page_list(struct list_head *page_list, - struct pglist_data *pgdat, - struct scan_control *sc, - struct reclaim_stat *stat, - bool ignore_references) -{ - LIST_HEAD(ret_pages); - LIST_HEAD(free_pages); - LIST_HEAD(demote_pages); +static unsigned int shrink_folio_list(struct list_head *folio_list, + struct pglist_data *pgdat, struct scan_control *sc, + struct reclaim_stat *stat, bool ignore_references) +{ + LIST_HEAD(ret_folios); + LIST_HEAD(free_folios); + LIST_HEAD(demote_folios); unsigned int nr_reclaimed = 0; unsigned int pgactivate = 0; bool do_demote_pass; @@ -1646,16 +1643,16 @@ static unsigned int shrink_page_list(struct list_head *page_list, do_demote_pass = can_demote(pgdat->node_id, sc); retry: - while (!list_empty(page_list)) { + while (!list_empty(folio_list)) { struct address_space *mapping; struct folio *folio; - enum page_references references = PAGEREF_RECLAIM; + enum folio_references references = FOLIOREF_RECLAIM; bool dirty, writeback; unsigned int nr_pages; cond_resched(); - folio = lru_to_folio(page_list); + folio = lru_to_folio(folio_list); list_del(&folio->lru); if (!folio_trylock(folio)) @@ -1779,7 +1776,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, folio_unlock(folio); folio_wait_writeback(folio); /* then go back and try same folio again */ - list_add_tail(&folio->lru, page_list); + list_add_tail(&folio->lru, folio_list); continue; } } @@ -1788,13 +1785,13 @@ static unsigned int shrink_page_list(struct list_head *page_list, references = folio_check_references(folio, sc); switch (references) { - case PAGEREF_ACTIVATE: + case FOLIOREF_ACTIVATE: goto activate_locked; - case PAGEREF_KEEP: + case FOLIOREF_KEEP: stat->nr_ref_keep += nr_pages; goto keep_locked; - case PAGEREF_RECLAIM: - case PAGEREF_RECLAIM_CLEAN: + case FOLIOREF_RECLAIM: + case FOLIOREF_RECLAIM_CLEAN: ; /* try to reclaim the folio below */ } @@ -1804,7 +1801,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, */ if (do_demote_pass && (thp_migration_supported() || !folio_test_large(folio))) { - list_add(&folio->lru, &demote_pages); + list_add(&folio->lru, &demote_folios); folio_unlock(folio); continue; } @@ -1831,7 +1828,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, */ if (!folio_entire_mapcount(folio) && split_folio_to_list(folio, - page_list)) + folio_list)) goto activate_locked; } if (!add_to_swap(folio)) { @@ -1839,7 +1836,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, goto activate_locked_split; /* Fallback to swap normal pages */ if (split_folio_to_list(folio, - page_list)) + folio_list)) goto activate_locked; #ifdef CONFIG_TRANSPARENT_HUGEPAGE count_vm_event(THP_SWPOUT_FALLBACK); @@ -1851,7 +1848,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, } else if (folio_test_swapbacked(folio) && folio_test_large(folio)) { /* Split shmem folio */ - if (split_folio_to_list(folio, page_list)) + if (split_folio_to_list(folio, folio_list)) goto keep_locked; } @@ -1916,7 +1913,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, goto activate_locked; } - if (references == PAGEREF_RECLAIM_CLEAN) + if (references == FOLIOREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs(folio, sc->gfp_mask)) goto keep_locked; @@ -2029,13 +2026,13 @@ static unsigned int shrink_page_list(struct list_head *page_list, nr_reclaimed += nr_pages; /* - * Is there need to periodically free_page_list? It would + * Is there need to periodically free_folio_list? It would * appear not as the counts should be low */ if (unlikely(folio_test_large(folio))) destroy_large_folio(folio); else - list_add(&folio->lru, &free_pages); + list_add(&folio->lru, &free_folios); continue; activate_locked_split: @@ -2063,29 +2060,29 @@ static unsigned int shrink_page_list(struct list_head *page_list, keep_locked: folio_unlock(folio); keep: - list_add(&folio->lru, &ret_pages); + list_add(&folio->lru, &ret_folios); VM_BUG_ON_FOLIO(folio_test_lru(folio) || folio_test_unevictable(folio), folio); } - /* 'page_list' is always empty here */ + /* 'folio_list' is always empty here */ /* Migrate folios selected for demotion */ - nr_reclaimed += demote_page_list(&demote_pages, pgdat); - /* Folios that could not be demoted are still in @demote_pages */ - if (!list_empty(&demote_pages)) { - /* Folios which weren't demoted go back on @page_list for retry: */ - list_splice_init(&demote_pages, page_list); + nr_reclaimed += demote_folio_list(&demote_folios, pgdat); + /* Folios that could not be demoted are still in @demote_folios */ + if (!list_empty(&demote_folios)) { + /* Folios which weren't demoted go back on @folio_list for retry: */ + list_splice_init(&demote_folios, folio_list); do_demote_pass = false; goto retry; } pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; - mem_cgroup_uncharge_list(&free_pages); + mem_cgroup_uncharge_list(&free_folios); try_to_unmap_flush(); - free_unref_page_list(&free_pages); + free_unref_page_list(&free_folios); - list_splice(&ret_pages, page_list); + list_splice(&ret_folios, folio_list); count_vm_events(PGACTIVATE, pgactivate); if (plug) @@ -2094,7 +2091,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, } unsigned int reclaim_clean_pages_from_list(struct zone *zone, - struct list_head *folio_list) + struct list_head *folio_list) { struct scan_control sc = { .gfp_mask = GFP_KERNEL, @@ -2122,7 +2119,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, * change in the future. */ noreclaim_flag = memalloc_noreclaim_save(); - nr_reclaimed = shrink_page_list(&clean_folios, zone->zone_pgdat, &sc, + nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc, &stat, true); memalloc_noreclaim_restore(noreclaim_flag); @@ -2181,7 +2178,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, * * returns how many pages were moved onto *@dst. */ -static unsigned long isolate_lru_pages(unsigned long nr_to_scan, +static unsigned long isolate_lru_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, enum lru_list lru) @@ -2288,8 +2285,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * * Context: * - * (1) Must be called with an elevated refcount on the page. This is a - * fundamental difference from isolate_lru_pages() (which is called + * (1) Must be called with an elevated refcount on the folio. This is a + * fundamental difference from isolate_lru_folios() (which is called * without a stable reference). * (2) The lru_lock must not be held. * (3) Interrupts must be enabled. @@ -2361,13 +2358,13 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, } /* - * move_pages_to_lru() moves folios from private @list to appropriate LRU list. + * move_folios_to_lru() moves folios from private @list to appropriate LRU list. * On return, @list is reused as a list of folios to be freed by the caller. * * Returns the number of pages moved to the given lruvec. */ -static unsigned int move_pages_to_lru(struct lruvec *lruvec, - struct list_head *list) +static unsigned int move_folios_to_lru(struct lruvec *lruvec, + struct list_head *list) { int nr_pages, nr_moved = 0; LIST_HEAD(folios_to_free); @@ -2387,7 +2384,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec, /* * The folio_set_lru needs to be kept here for list integrity. * Otherwise: - * #0 move_pages_to_lru #1 release_pages + * #0 move_folios_to_lru #1 release_pages * if (!folio_put_testzero()) * if (folio_put_testzero()) * !lru //skip lru_lock @@ -2444,11 +2441,11 @@ static int current_may_throttle(void) * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages */ -static unsigned long -shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, - struct scan_control *sc, enum lru_list lru) +static unsigned long shrink_inactive_list(unsigned long nr_to_scan, + struct lruvec *lruvec, struct scan_control *sc, + enum lru_list lru) { - LIST_HEAD(page_list); + LIST_HEAD(folio_list); unsigned long nr_scanned; unsigned int nr_reclaimed = 0; unsigned long nr_taken; @@ -2475,7 +2472,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&lruvec->lru_lock); - nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, + nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); @@ -2490,10 +2487,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false); + nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); spin_lock_irq(&lruvec->lru_lock); - move_pages_to_lru(lruvec, &page_list); + move_folios_to_lru(lruvec, &folio_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; @@ -2504,16 +2501,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_unlock_irq(&lruvec->lru_lock); lru_note_cost(lruvec, file, stat.nr_pageout); - mem_cgroup_uncharge_list(&page_list); - free_unref_page_list(&page_list); + mem_cgroup_uncharge_list(&folio_list); + free_unref_page_list(&folio_list); /* - * If dirty pages are scanned that are not queued for IO, it + * If dirty folios are scanned that are not queued for IO, it * implies that flushers are not doing their job. This can - * happen when memory pressure pushes dirty pages to the end of + * happen when memory pressure pushes dirty folios to the end of * the LRU before the dirty limits are breached and the dirty * data has expired. It can also happen when the proportion of - * dirty pages grows not through writes but through memory + * dirty folios grows not through writes but through memory * pressure reclaiming all the clean cache. And in some cases, * the flushers simply cannot keep up with the allocation * rate. Nudge the flusher threads in case they are asleep. @@ -2572,7 +2569,7 @@ static void shrink_active_list(unsigned long nr_to_scan, spin_lock_irq(&lruvec->lru_lock); - nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, + nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); @@ -2632,8 +2629,8 @@ static void shrink_active_list(unsigned long nr_to_scan, */ spin_lock_irq(&lruvec->lru_lock); - nr_activate = move_pages_to_lru(lruvec, &l_active); - nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); + nr_activate = move_folios_to_lru(lruvec, &l_active); + nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); /* Keep all free folios in l_active list */ list_splice(&l_inactive, &l_active); @@ -2649,7 +2646,7 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_deactivate, nr_rotated, sc->priority, file); } -static unsigned int reclaim_page_list(struct list_head *page_list, +static unsigned int reclaim_folio_list(struct list_head *folio_list, struct pglist_data *pgdat) { struct reclaim_stat dummy_stat; @@ -2663,9 +2660,9 @@ static unsigned int reclaim_page_list(struct list_head *page_list, .no_demotion = 1, }; - nr_reclaimed = shrink_page_list(page_list, pgdat, &sc, &dummy_stat, false); - while (!list_empty(page_list)) { - folio = lru_to_folio(page_list); + nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false); + while (!list_empty(folio_list)) { + folio = lru_to_folio(folio_list); list_del(&folio->lru); folio_putback_lru(folio); } @@ -2695,11 +2692,11 @@ unsigned long reclaim_pages(struct list_head *folio_list) continue; } - nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid)); + nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); nid = folio_nid(lru_to_folio(folio_list)); } while (!list_empty(folio_list)); - nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid)); + nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); memalloc_noreclaim_restore(noreclaim_flag); @@ -2729,13 +2726,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, * but large enough to avoid thrashing the aggregate readahead window. * * Both inactive lists should also be large enough that each inactive - * page has a chance to be referenced again before it is reclaimed. + * folio has a chance to be referenced again before it is reclaimed. * * If that fails and refaulting is observed, the inactive list grows. * - * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages + * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios * on this LRU, maintained by the pageout code. An inactive_ratio - * of 3 means 3:1 or 25% of the pages are kept on the inactive list. + * of 3 means 3:1 or 25% of the folios are kept on the inactive list. * * total target max * memory ratio inactive @@ -2884,8 +2881,8 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) * Determine how aggressively the anon and file LRU lists should be * scanned. * - * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan - * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan + * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan + * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan */ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) @@ -2900,7 +2897,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long ap, fp; enum lru_list lru; - /* If we have no swap space, do not bother scanning anon pages. */ + /* If we have no swap space, do not bother scanning anon folios. */ if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { scan_balance = SCAN_FILE; goto out; From 7a740879db14854db1eb0f6f1ca30a798a84c0d4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 2 Sep 2022 15:39:50 -0700 Subject: [PATCH 019/352] mm-vmscan-fix-a-lot-of-comments-vs-mglru fixups for mglru additions Cc: "Matthew Wilcox (Oracle)" Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index d040b47f54305..9ce6cc74d9eae 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3644,7 +3644,7 @@ static int folio_update_gen(struct folio *folio, int gen) do { /* lru_gen_del_folio() has isolated this page? */ if (!(old_flags & LRU_GEN_MASK)) { - /* for shrink_page_list() */ + /* for shrink_folio_list() */ new_flags = old_flags | BIT(PG_referenced); continue; } @@ -4571,7 +4571,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) } /* - * This function exploits spatial locality when shrink_page_list() walks the + * This function exploits spatial locality when shrink_folio_list() walks the * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If * the scan was done cacheline efficiently, it adds the PMD entry pointing to * the PTE table to the Bloom filter. This forms a feedback loop between the @@ -4792,7 +4792,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca if (!folio_test_referenced(folio)) set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); - /* for shrink_page_list() */ + /* for shrink_folio_list() */ folio_clear_reclaim(folio); folio_clear_referenced(folio); @@ -4995,7 +4995,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap if (list_empty(&list)) return scanned; - reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); + reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false); list_for_each_entry(folio, &list, lru) { /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ @@ -5012,7 +5012,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap spin_lock_irq(&lruvec->lru_lock); - move_pages_to_lru(lruvec, &list); + move_folios_to_lru(lruvec, &list); walk = current->reclaim_state->mm_walk; if (walk && walk->batched) From ae0b72ec3a112090fff6407ba1e1c5a130941bcd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:45:58 +0100 Subject: [PATCH 020/352] mm: add the first tail page to struct folio Some of the static checkers get confused by extracting the page from the folio and referring to fields in the first tail page. Adding these fields to struct folio lets us avoid doing that. It has the risk that people will refer to those fields without checking that the folio is actually a large folio, so prefix them with underscores and document the preferred function to use instead. Link: https://lkml.kernel.org/r/20220902194653.1739778-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8f30f262431c9..5c87d0f292a23 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -245,6 +245,13 @@ struct page { * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. + * @_flags_1: For large folios, additional page flags. + * @__head: Points to the folio. Do not use. + * @_folio_dtor: Which destructor to use for this folio. + * @_folio_order: Do not use directly, call folio_order(). + * @_total_mapcount: Do not use directly, call folio_entire_mapcount(). + * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). + * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). * * A folio is a physically, virtually and logically contiguous set * of bytes. It is a power-of-two in size, and it is aligned to that @@ -283,9 +290,17 @@ struct folio { }; struct page page; }; + unsigned long _flags_1; + unsigned long __head; + unsigned char _folio_dtor; + unsigned char _folio_order; + atomic_t _total_mapcount; + atomic_t _pincount; +#ifdef CONFIG_64BIT + unsigned int _folio_nr_pages; +#endif }; -static_assert(sizeof(struct page) == sizeof(struct folio)); #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl)) FOLIO_MATCH(flags, flags); @@ -300,6 +315,19 @@ FOLIO_MATCH(_refcount, _refcount); FOLIO_MATCH(memcg_data, memcg_data); #endif #undef FOLIO_MATCH +#define FOLIO_MATCH(pg, fl) \ + static_assert(offsetof(struct folio, fl) == \ + offsetof(struct page, pg) + sizeof(struct page)) +FOLIO_MATCH(flags, _flags_1); +FOLIO_MATCH(compound_head, __head); +FOLIO_MATCH(compound_dtor, _folio_dtor); +FOLIO_MATCH(compound_order, _folio_order); +FOLIO_MATCH(compound_mapcount, _total_mapcount); +FOLIO_MATCH(compound_pincount, _pincount); +#ifdef CONFIG_64BIT +FOLIO_MATCH(compound_nr, _folio_nr_pages); +#endif +#undef FOLIO_MATCH static inline atomic_t *folio_mapcount_ptr(struct folio *folio) { From 81d05d2a1f10b2b8c901d53a49c2703d6c57e571 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:45:59 +0100 Subject: [PATCH 021/352] mm: reimplement folio_order() and folio_nr_pages() Instead of calling compound_order() and compound_nr_pages(), use the folio directly. Saves 1905 bytes from mm/filemap.o due to folio_test_large() now being a cheaper check than PageHead(). Link: https://lkml.kernel.org/r/20220902194653.1739778-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index bb0426faeb32f..364bcadb4d202 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -729,7 +729,9 @@ static inline unsigned int compound_order(struct page *page) */ static inline unsigned int folio_order(struct folio *folio) { - return compound_order(&folio->page); + if (!folio_test_large(folio)) + return 0; + return folio->_folio_order; } #include @@ -1659,7 +1661,13 @@ static inline void set_page_links(struct page *page, enum zone_type zone, */ static inline long folio_nr_pages(struct folio *folio) { - return compound_nr(&folio->page); + if (!folio_test_large(folio)) + return 1; +#ifdef CONFIG_64BIT + return folio->_folio_nr_pages; +#else + return 1L << folio->_folio_order; +#endif } /** From df1176ce63b663ce8810d0f64deedd29348ca033 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:00 +0100 Subject: [PATCH 022/352] mm: add split_folio() This wrapper removes a need to use split_huge_page(&folio->page). Convert two callers. Link: https://lkml.kernel.org/r/20220902194653.1739778-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 5 +++++ mm/shmem.c | 2 +- mm/truncate.c | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 38265f9f782e9..a1341fdcf666d 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -444,6 +444,11 @@ static inline int split_folio_to_list(struct folio *folio, return split_huge_page_to_list(&folio->page, list); } +static inline int split_folio(struct folio *folio) +{ + return split_folio_to_list(folio, NULL); +} + /* * archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to * limitations in the implementation like arm64 MTE can override this to diff --git a/mm/shmem.c b/mm/shmem.c index 42e5888bf84d8..674bde8b30850 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -629,7 +629,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, goto move_back; } - ret = split_huge_page(&folio->page); + ret = split_folio(folio); folio_unlock(folio); folio_put(folio); diff --git a/mm/truncate.c b/mm/truncate.c index 0b0708bf935f3..c0be77e5c0083 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -240,7 +240,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) folio_invalidate(folio, offset, length); if (!folio_test_large(folio)) return true; - if (split_huge_page(&folio->page) == 0) + if (split_folio(folio) == 0) return true; if (folio_test_dirty(folio)) return false; From 8bec5af5242036ec47b11da004ac779ed0f809e6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:01 +0100 Subject: [PATCH 023/352] mm: add folio_add_lru_vma() Convert lru_cache_add_inactive_or_unevictable() to folio_add_lru_vma() and add a compatibility wrapper. Link: https://lkml.kernel.org/r/20220902194653.1739778-6-willy@infradead.org Signed-off-by: "Matthew Wilcox (Oracle)" Signed-off-by: Andrew Morton --- include/linux/swap.h | 10 +++++----- mm/folio-compat.c | 6 ++++++ mm/swap.c | 19 +++++++++---------- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 6308150b234a4..2ede1e3695d9b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -379,11 +379,11 @@ extern unsigned long totalreserve_pages; /* linux/mm/swap.c */ -extern void lru_note_cost(struct lruvec *lruvec, bool file, - unsigned int nr_pages); -extern void lru_note_cost_folio(struct folio *); -extern void folio_add_lru(struct folio *); -extern void lru_cache_add(struct page *); +void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages); +void lru_note_cost_folio(struct folio *); +void folio_add_lru(struct folio *); +void folio_add_lru_vma(struct folio *, struct vm_area_struct *); +void lru_cache_add(struct page *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 458618c7302c3..e1e23b4947d73 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -88,6 +88,12 @@ void lru_cache_add(struct page *page) } EXPORT_SYMBOL(lru_cache_add); +void lru_cache_add_inactive_or_unevictable(struct page *page, + struct vm_area_struct *vma) +{ + folio_add_lru_vma(page_folio(page), vma); +} + int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp) { diff --git a/mm/swap.c b/mm/swap.c index f1e89ca53faf9..b0485950068b3 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -537,22 +537,21 @@ void folio_add_lru(struct folio *folio) EXPORT_SYMBOL(folio_add_lru); /** - * lru_cache_add_inactive_or_unevictable - * @page: the page to be added to LRU - * @vma: vma in which page is mapped for determining reclaimability + * folio_add_lru_vma() - Add a folio to the appropate LRU list for this VMA. + * @folio: The folio to be added to the LRU. + * @vma: VMA in which the folio is mapped. * - * Place @page on the inactive or unevictable LRU list, depending on its - * evictability. + * If the VMA is mlocked, @folio is added to the unevictable list. + * Otherwise, it is treated the same way as folio_add_lru(). */ -void lru_cache_add_inactive_or_unevictable(struct page *page, - struct vm_area_struct *vma) +void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) { - VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) - mlock_new_page(page); + mlock_new_page(&folio->page); else - lru_cache_add(page); + folio_add_lru(folio); } /* From d282bb472c423c18869002448fd4341506068f0e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:02 +0100 Subject: [PATCH 024/352] shmem: convert shmem_writepage() to use a folio throughout Even though we will split any large folio that comes in, write the code to handle large folios so as to not leave a trap for whoever tries to handle large folios in the swap cache. Link: https://lkml.kernel.org/r/20220902194653.1739778-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 674bde8b30850..3d2d35728793b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1328,17 +1328,18 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages, * and its shmem_writeback() needs them to be split when swapping. */ - if (PageTransCompound(page)) { + if (folio_test_large(folio)) { /* Ensure the subpages are still dirty */ - SetPageDirty(page); + folio_test_set_dirty(folio); if (split_huge_page(page) < 0) goto redirty; - ClearPageDirty(page); + folio = page_folio(page); + folio_clear_dirty(folio); } - BUG_ON(!PageLocked(page)); - mapping = page->mapping; - index = page->index; + BUG_ON(!folio_test_locked(folio)); + mapping = folio->mapping; + index = folio->index; inode = mapping->host; info = SHMEM_I(inode); if (info->flags & VM_LOCKED) @@ -1361,15 +1362,15 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) /* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC * value into swapfile.c, the only way we can correctly account for a - * fallocated page arriving here is now to initialize it and write it. + * fallocated folio arriving here is now to initialize it and write it. * - * That's okay for a page already fallocated earlier, but if we have + * That's okay for a folio already fallocated earlier, but if we have * not yet completed the fallocation, then (a) we want to keep track - * of this page in case we have to undo it, and (b) it may not be a + * of this folio in case we have to undo it, and (b) it may not be a * good idea to continue anyway, once we're pushing into swap. So - * reactivate the page, and let shmem_fallocate() quit when too many. + * reactivate the folio, and let shmem_fallocate() quit when too many. */ - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { if (inode->i_private) { struct shmem_falloc *shmem_falloc; spin_lock(&inode->i_lock); @@ -1385,9 +1386,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (shmem_falloc) goto redirty; } - clear_highpage(page); - flush_dcache_page(page); - SetPageUptodate(page); + folio_zero_range(folio, 0, folio_size(folio)); + flush_dcache_folio(folio); + folio_mark_uptodate(folio); } swap = folio_alloc_swap(folio); @@ -1396,7 +1397,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) /* * Add inode to shmem_unuse()'s list of swapped-out inodes, - * if it's not already there. Do it now before the page is + * if it's not already there. Do it now before the folio is * moved to swap cache, when its pagelock no longer protects * the inode from eviction. But don't unlock the mutex until * we've incremented swapped, because shmem_unuse_inode() will @@ -1406,7 +1407,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (list_empty(&info->swaplist)) list_add(&info->swaplist, &shmem_swaplist); - if (add_to_swap_cache(page, swap, + if (add_to_swap_cache(&folio->page, swap, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, NULL) == 0) { spin_lock_irq(&info->lock); @@ -1415,21 +1416,21 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) spin_unlock_irq(&info->lock); swap_shmem_alloc(swap); - shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); + shmem_delete_from_page_cache(&folio->page, swp_to_radix_entry(swap)); mutex_unlock(&shmem_swaplist_mutex); - BUG_ON(page_mapped(page)); - swap_writepage(page, wbc); + BUG_ON(folio_mapped(folio)); + swap_writepage(&folio->page, wbc); return 0; } mutex_unlock(&shmem_swaplist_mutex); - put_swap_page(page, swap); + put_swap_page(&folio->page, swap); redirty: - set_page_dirty(page); + folio_mark_dirty(folio); if (wbc->for_reclaim) - return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */ - unlock_page(page); + return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ + folio_unlock(folio); return 0; } From 1c548f3bac09527046cd39e27a96f54bbc4e9d82 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:03 +0100 Subject: [PATCH 025/352] shmem: convert shmem_delete_from_page_cache() to take a folio Remove the assertion that the page is not Compound as this function now handles large folios correctly. Link: https://lkml.kernel.org/r/20220902194653.1739778-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 3d2d35728793b..9e851fc876016 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -763,23 +763,22 @@ static int shmem_add_to_page_cache(struct folio *folio, } /* - * Like delete_from_page_cache, but substitutes swap for page. + * Like delete_from_page_cache, but substitutes swap for @folio. */ -static void shmem_delete_from_page_cache(struct page *page, void *radswap) +static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; + long nr = folio_nr_pages(folio); int error; - VM_BUG_ON_PAGE(PageCompound(page), page); - xa_lock_irq(&mapping->i_pages); - error = shmem_replace_entry(mapping, page->index, page, radswap); - page->mapping = NULL; - mapping->nrpages--; - __dec_lruvec_page_state(page, NR_FILE_PAGES); - __dec_lruvec_page_state(page, NR_SHMEM); + error = shmem_replace_entry(mapping, folio->index, folio, radswap); + folio->mapping = NULL; + mapping->nrpages -= nr; + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); + __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); xa_unlock_irq(&mapping->i_pages); - put_page(page); + folio_put(folio); BUG_ON(error); } @@ -1416,7 +1415,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) spin_unlock_irq(&info->lock); swap_shmem_alloc(swap); - shmem_delete_from_page_cache(&folio->page, swp_to_radix_entry(swap)); + shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap)); mutex_unlock(&shmem_swaplist_mutex); BUG_ON(folio_mapped(folio)); From 0d364381d45a1889a8243dc193ab4c2ce31cb1e0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:04 +0100 Subject: [PATCH 026/352] shmem: convert shmem_replace_page() to use folios throughout Introduce folio_set_swap_entry() to abstract how both folio->private and swp_entry_t work. Use swap_address_space() directly instead of indirecting through folio_mapping(). Include an assertion that the old folio is not large as we only allocate a single-page folio to replace it. Use folio_put_refs() instead of calling folio_put() twice. Link: https://lkml.kernel.org/r/20220902194653.1739778-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 ++++ mm/shmem.c | 67 +++++++++++++++++++++----------------------- 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 2ede1e3695d9b..61e13d1a4caba 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -355,6 +355,11 @@ static inline swp_entry_t folio_swap_entry(struct folio *folio) return entry; } +static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry) +{ + folio->private = (void *)entry.val; +} + /* linux/mm/workingset.c */ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg); diff --git a/mm/shmem.c b/mm/shmem.c index 9e851fc876016..4113f1b9d4a82 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1560,12 +1560,6 @@ static struct folio *shmem_alloc_folio(gfp_t gfp, return folio; } -static struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - return &shmem_alloc_folio(gfp, info, index)->page; -} - static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode, pgoff_t index, bool huge) { @@ -1617,51 +1611,49 @@ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) static int shmem_replace_page(struct page **pagep, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct page *oldpage, *newpage; struct folio *old, *new; struct address_space *swap_mapping; swp_entry_t entry; pgoff_t swap_index; int error; - oldpage = *pagep; - entry.val = page_private(oldpage); + old = page_folio(*pagep); + entry = folio_swap_entry(old); swap_index = swp_offset(entry); - swap_mapping = page_mapping(oldpage); + swap_mapping = swap_address_space(entry); /* * We have arrived here because our zones are constrained, so don't * limit chance of success by further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; - newpage = shmem_alloc_page(gfp, info, index); - if (!newpage) + VM_BUG_ON_FOLIO(folio_test_large(old), old); + new = shmem_alloc_folio(gfp, info, index); + if (!new) return -ENOMEM; - get_page(newpage); - copy_highpage(newpage, oldpage); - flush_dcache_page(newpage); + folio_get(new); + folio_copy(new, old); + flush_dcache_folio(new); - __SetPageLocked(newpage); - __SetPageSwapBacked(newpage); - SetPageUptodate(newpage); - set_page_private(newpage, entry.val); - SetPageSwapCache(newpage); + __folio_set_locked(new); + __folio_set_swapbacked(new); + folio_mark_uptodate(new); + folio_set_swap_entry(new, entry); + folio_set_swapcache(new); /* * Our caller will very soon move newpage out of swapcache, but it's * a nice clean interface for us to replace oldpage by newpage there. */ xa_lock_irq(&swap_mapping->i_pages); - error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage); + error = shmem_replace_entry(swap_mapping, swap_index, old, new); if (!error) { - old = page_folio(oldpage); - new = page_folio(newpage); mem_cgroup_migrate(old, new); - __inc_lruvec_page_state(newpage, NR_FILE_PAGES); - __inc_lruvec_page_state(newpage, NR_SHMEM); - __dec_lruvec_page_state(oldpage, NR_FILE_PAGES); - __dec_lruvec_page_state(oldpage, NR_SHMEM); + __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1); + __lruvec_stat_mod_folio(new, NR_SHMEM, 1); + __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1); + __lruvec_stat_mod_folio(old, NR_SHMEM, -1); } xa_unlock_irq(&swap_mapping->i_pages); @@ -1671,18 +1663,17 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * both PageSwapCache and page_private after getting page lock; * but be defensive. Reverse old to newpage for clear and free. */ - oldpage = newpage; + old = new; } else { - lru_cache_add(newpage); - *pagep = newpage; + folio_add_lru(new); + *pagep = &new->page; } - ClearPageSwapCache(oldpage); - set_page_private(oldpage, 0); + folio_clear_swapcache(old); + old->private = NULL; - unlock_page(oldpage); - put_page(oldpage); - put_page(oldpage); + folio_unlock(old); + folio_put_refs(old, 2); return error; } @@ -2383,6 +2374,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, } #ifdef CONFIG_USERFAULTFD +static struct page *shmem_alloc_page(gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) +{ + return &shmem_alloc_folio(gfp, info, index)->page; +} + int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, From 138459f0ba3437645f882264e01b118799c4eb76 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:05 +0100 Subject: [PATCH 027/352] mm/swapfile: remove page_swapcount() By restructuring folio_swapped(), it can use swap_swapcount() instead of page_swapcount(). It's even a little more efficient. Link: https://lkml.kernel.org/r/20220902194653.1739778-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 46 +++++++++++++--------------------------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 469d9af86be2f..e0aaeac5c829d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1431,30 +1431,6 @@ void swapcache_free_entries(swp_entry_t *entries, int n) spin_unlock(&p->lock); } -/* - * How many references to page are currently swapped out? - * This does not give an exact answer when swap count is continued, - * but does include the high COUNT_CONTINUED flag to allow for that. - */ -static int page_swapcount(struct page *page) -{ - int count = 0; - struct swap_info_struct *p; - struct swap_cluster_info *ci; - swp_entry_t entry; - unsigned long offset; - - entry.val = page_private(page); - p = _swap_info_get(entry); - if (p) { - offset = swp_offset(entry); - ci = lock_cluster_or_swap_info(p, offset); - count = swap_count(p->swap_map[offset]); - unlock_cluster_or_swap_info(p, ci); - } - return count; -} - int __swap_count(swp_entry_t entry) { struct swap_info_struct *si; @@ -1469,11 +1445,16 @@ int __swap_count(swp_entry_t entry) return count; } +/* + * How many references to @entry are currently swapped out? + * This does not give an exact answer when swap count is continued, + * but does include the high COUNT_CONTINUED flag to allow for that. + */ static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { - int count = 0; pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; + int count; ci = lock_cluster_or_swap_info(si, offset); count = swap_count(si->swap_map[offset]); @@ -1574,17 +1555,16 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, static bool folio_swapped(struct folio *folio) { - swp_entry_t entry; - struct swap_info_struct *si; + swp_entry_t entry = folio_swap_entry(folio); + struct swap_info_struct *si = _swap_info_get(entry); + + if (!si) + return false; if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) - return page_swapcount(&folio->page) != 0; + return swap_swapcount(si, entry) != 0; - entry = folio_swap_entry(folio); - si = _swap_info_get(entry); - if (si) - return swap_page_trans_huge_swapped(si, entry); - return false; + return swap_page_trans_huge_swapped(si, entry); } /* From 13b80e58ae224f25a3feb6efe0125ea2ac4c8703 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:06 +0100 Subject: [PATCH 028/352] mm/swapfile: convert try_to_free_swap() to folio_free_swap() Add kernel-doc for folio_free_swap() and make it return bool. Add a try_to_free_swap() compatibility wrapper. Link: https://lkml.kernel.org/r/20220902194653.1739778-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 ++++++ mm/folio-compat.c | 7 +++++++ mm/swapfile.c | 32 ++++++++++++++++++-------------- mm/vmscan.c | 2 +- 4 files changed, 32 insertions(+), 15 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 61e13d1a4caba..dac6308d878e9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -490,6 +490,7 @@ static inline long get_nr_swap_pages(void) extern void si_swapinfo(struct sysinfo *); swp_entry_t folio_alloc_swap(struct folio *folio); +bool folio_free_swap(struct folio *folio); extern void put_swap_page(struct page *page, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); @@ -606,6 +607,11 @@ static inline swp_entry_t folio_alloc_swap(struct folio *folio) return entry; } +static inline bool folio_free_swap(struct folio *folio) +{ + return false; +} + static inline int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) diff --git a/mm/folio-compat.c b/mm/folio-compat.c index e1e23b4947d73..06d47f00609b5 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -146,3 +146,10 @@ void putback_lru_page(struct page *page) { folio_putback_lru(page_folio(page)); } + +#ifdef CONFIG_SWAP +int try_to_free_swap(struct page *page) +{ + return folio_free_swap(page_folio(page)); +} +#endif diff --git a/mm/swapfile.c b/mm/swapfile.c index e0aaeac5c829d..f2a446799a393 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1567,43 +1567,47 @@ static bool folio_swapped(struct folio *folio) return swap_page_trans_huge_swapped(si, entry); } -/* - * If swap is getting full, or if there are no more mappings of this page, - * then try_to_free_swap is called to free its swap space. +/** + * folio_free_swap() - Free the swap space used for this folio. + * @folio: The folio to remove. + * + * If swap is getting full, or if there are no more mappings of this folio, + * then call folio_free_swap to free its swap space. + * + * Return: true if we were able to release the swap space. */ -int try_to_free_swap(struct page *page) +bool folio_free_swap(struct folio *folio) { - struct folio *folio = page_folio(page); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (!folio_test_swapcache(folio)) - return 0; + return false; if (folio_test_writeback(folio)) - return 0; + return false; if (folio_swapped(folio)) - return 0; + return false; /* * Once hibernation has begun to create its image of memory, - * there's a danger that one of the calls to try_to_free_swap() + * there's a danger that one of the calls to folio_free_swap() * - most probably a call from __try_to_reclaim_swap() while * hibernation is allocating its own swap pages for the image, * but conceivably even a call from memory reclaim - will free - * the swap from a page which has already been recorded in the - * image as a clean swapcache page, and then reuse its swap for + * the swap from a folio which has already been recorded in the + * image as a clean swapcache folio, and then reuse its swap for * another page of the image. On waking from hibernation, the - * original page might be freed under memory pressure, then + * original folio might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * * Hibernation suspends storage while it is writing the image * to disk so check that here. */ if (pm_suspended_storage()) - return 0; + return false; delete_from_swap_cache(folio); folio_set_dirty(folio); - return 1; + return true; } /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 9ce6cc74d9eae..9268e64590e4d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2049,7 +2049,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, if (folio_test_swapcache(folio) && (mem_cgroup_swap_full(&folio->page) || folio_test_mlocked(folio))) - try_to_free_swap(&folio->page); + folio_free_swap(folio); VM_BUG_ON_FOLIO(folio_test_active(folio), folio); if (!folio_test_mlocked(folio)) { int type = folio_is_file_lru(folio); From 3c7636e629b1060f67b1820e9a302d8862bedb58 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:07 +0100 Subject: [PATCH 029/352] mm/swap: convert __read_swap_cache_async() to use a folio Remove a few hidden (and one visible) calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap_state.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 41afa6d45b239..b1e181fc52687 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -411,7 +411,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, bool *new_page_allocated) { struct swap_info_struct *si; - struct page *page; + struct folio *folio; void *shadow = NULL; *new_page_allocated = false; @@ -426,11 +426,11 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, si = get_swap_device(entry); if (!si) return NULL; - page = find_get_page(swap_address_space(entry), - swp_offset(entry)); + folio = filemap_get_folio(swap_address_space(entry), + swp_offset(entry)); put_swap_device(si); - if (page) - return page; + if (folio) + return folio_file_page(folio, swp_offset(entry)); /* * Just skip read ahead for unused swap slot. @@ -448,8 +448,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will * cause any racers to loop around until we add it to cache. */ - page = alloc_page_vma(gfp_mask, vma, addr); - if (!page) + folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false); + if (!folio) return NULL; /* @@ -459,7 +459,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (!err) break; - put_page(page); + folio_put(folio); if (err != -EEXIST) return NULL; @@ -477,30 +477,30 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * The swap entry is ours to swap in. Prepare the new page. */ - __SetPageLocked(page); - __SetPageSwapBacked(page); + __folio_set_locked(folio); + __folio_set_swapbacked(folio); - if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry)) + if (mem_cgroup_swapin_charge_page(&folio->page, NULL, gfp_mask, entry)) goto fail_unlock; /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) + if (add_to_swap_cache(&folio->page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; mem_cgroup_swapin_uncharge_swap(entry); if (shadow) - workingset_refault(page_folio(page), shadow); + workingset_refault(folio, shadow); - /* Caller will initiate read into locked page */ - lru_cache_add(page); + /* Caller will initiate read into locked folio */ + folio_add_lru(folio); *new_page_allocated = true; - return page; + return &folio->page; fail_unlock: - put_swap_page(page, entry); - unlock_page(page); - put_page(page); + put_swap_page(&folio->page, entry); + folio_unlock(folio); + folio_put(folio); return NULL; } From 43cf8aad86357d01ec5161ade49881f047341e40 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:08 +0100 Subject: [PATCH 030/352] mm/swap: convert add_to_swap_cache() to take a folio With all callers using folios, we can convert add_to_swap_cache() to take a folio and use it throughout. Link: https://lkml.kernel.org/r/20220902194653.1739778-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 2 +- mm/swap.h | 4 ++-- mm/swap_state.c | 34 +++++++++++++++++----------------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 4113f1b9d4a82..ced76c229b960 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1406,7 +1406,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (list_empty(&info->swaplist)) list_add(&info->swaplist, &shmem_swaplist); - if (add_to_swap_cache(&folio->page, swap, + if (add_to_swap_cache(folio, swap, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, NULL) == 0) { spin_lock_irq(&info->lock); diff --git a/mm/swap.h b/mm/swap.h index 0ffa5b478051a..29e38f3d82d03 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -32,7 +32,7 @@ extern struct address_space *swapper_spaces[]; void show_swap_cache_info(void); bool add_to_swap(struct folio *folio); void *get_shadow_from_swap_cache(swp_entry_t entry); -int add_to_swap_cache(struct page *page, swp_entry_t entry, +int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp, void **shadowp); void __delete_from_swap_cache(struct folio *folio, swp_entry_t entry, void *shadow); @@ -122,7 +122,7 @@ static inline void *get_shadow_from_swap_cache(swp_entry_t entry) return NULL; } -static inline int add_to_swap_cache(struct page *page, swp_entry_t entry, +static inline int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp_mask, void **shadowp) { return -1; diff --git a/mm/swap_state.c b/mm/swap_state.c index b1e181fc52687..ecf1accc2fb18 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -85,21 +85,21 @@ void *get_shadow_from_swap_cache(swp_entry_t entry) * add_to_swap_cache resembles filemap_add_folio on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ -int add_to_swap_cache(struct page *page, swp_entry_t entry, +int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp, void **shadowp) { struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); - XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); - unsigned long i, nr = thp_nr_pages(page); + XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); + unsigned long i, nr = folio_nr_pages(folio); void *old; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageSwapCache(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); - page_ref_add(page, nr); - SetPageSwapCache(page); + folio_ref_add(folio, nr); + folio_set_swapcache(folio); do { xas_lock_irq(&xas); @@ -107,19 +107,19 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, if (xas_error(&xas)) goto unlock; for (i = 0; i < nr; i++) { - VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); + VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio); old = xas_load(&xas); if (xa_is_value(old)) { if (shadowp) *shadowp = old; } - set_page_private(page + i, entry.val + i); - xas_store(&xas, page); + set_page_private(folio_page(folio, i), entry.val + i); + xas_store(&xas, folio); xas_next(&xas); } address_space->nrpages += nr; - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); - __mod_lruvec_page_state(page, NR_SWAPCACHE, nr); + __node_stat_mod_folio(folio, NR_FILE_PAGES, nr); + __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); @@ -127,8 +127,8 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, if (!xas_error(&xas)) return 0; - ClearPageSwapCache(page); - page_ref_sub(page, nr); + folio_clear_swapcache(folio); + folio_ref_sub(folio, nr); return xas_error(&xas); } @@ -194,7 +194,7 @@ bool add_to_swap(struct folio *folio) /* * Add it to the swap cache. */ - err = add_to_swap_cache(&folio->page, entry, + err = add_to_swap_cache(folio, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL); if (err) /* @@ -484,7 +484,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, goto fail_unlock; /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(&folio->page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) + if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; mem_cgroup_swapin_uncharge_swap(entry); From 50eaac08886e56d9dc595aad94d98c80e2890a01 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:09 +0100 Subject: [PATCH 031/352] mm/swap: convert put_swap_page() to put_swap_folio() With all callers now using a folio, we can convert this function. Link: https://lkml.kernel.org/r/20220902194653.1739778-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- mm/shmem.c | 2 +- mm/swap_slots.c | 2 +- mm/swap_state.c | 6 +++--- mm/swapfile.c | 4 ++-- mm/vmscan.c | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index dac6308d878e9..42cbef554de68 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -491,7 +491,7 @@ static inline long get_nr_swap_pages(void) extern void si_swapinfo(struct sysinfo *); swp_entry_t folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); -extern void put_swap_page(struct page *page, swp_entry_t entry); +void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); extern int add_swap_count_continuation(swp_entry_t, gfp_t); @@ -576,7 +576,7 @@ static inline void swap_free(swp_entry_t swp) { } -static inline void put_swap_page(struct page *page, swp_entry_t swp) +static inline void put_swap_folio(struct folio *folio, swp_entry_t swp) { } diff --git a/mm/shmem.c b/mm/shmem.c index ced76c229b960..56cabf9bb947b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1424,7 +1424,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } mutex_unlock(&shmem_swaplist_mutex); - put_swap_page(&folio->page, swap); + put_swap_folio(folio, swap); redirty: folio_mark_dirty(folio); if (wbc->for_reclaim) diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 10b94d64cc257..0bec1f705f8e0 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -343,7 +343,7 @@ swp_entry_t folio_alloc_swap(struct folio *folio) get_swap_pages(1, &entry, 1); out: if (mem_cgroup_try_charge_swap(folio, entry)) { - put_swap_page(&folio->page, entry); + put_swap_folio(folio, entry); entry.val = 0; } return entry; diff --git a/mm/swap_state.c b/mm/swap_state.c index ecf1accc2fb18..ea354efd37356 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -218,7 +218,7 @@ bool add_to_swap(struct folio *folio) return true; fail: - put_swap_page(&folio->page, entry); + put_swap_folio(folio, entry); return false; } @@ -237,7 +237,7 @@ void delete_from_swap_cache(struct folio *folio) __delete_from_swap_cache(folio, entry, NULL); xa_unlock_irq(&address_space->i_pages); - put_swap_page(&folio->page, entry); + put_swap_folio(folio, entry); folio_ref_sub(folio, folio_nr_pages(folio)); } @@ -498,7 +498,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return &folio->page; fail_unlock: - put_swap_page(&folio->page, entry); + put_swap_folio(folio, entry); folio_unlock(folio); folio_put(folio); return NULL; diff --git a/mm/swapfile.c b/mm/swapfile.c index f2a446799a393..aafe739dc2a62 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1332,7 +1332,7 @@ void swap_free(swp_entry_t entry) /* * Called after dropping swapcache to decrease refcnt to swap entries. */ -void put_swap_page(struct page *page, swp_entry_t entry) +void put_swap_folio(struct folio *folio, swp_entry_t entry) { unsigned long offset = swp_offset(entry); unsigned long idx = offset / SWAPFILE_CLUSTER; @@ -1341,7 +1341,7 @@ void put_swap_page(struct page *page, swp_entry_t entry) unsigned char *map; unsigned int i, free_entries = 0; unsigned char val; - int size = swap_entry_size(thp_nr_pages(page)); + int size = swap_entry_size(folio_nr_pages(folio)); si = _swap_info_get(entry); if (!si) diff --git a/mm/vmscan.c b/mm/vmscan.c index 9268e64590e4d..1707e3bfcfe42 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1352,7 +1352,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, mem_cgroup_swapout(folio, swap); __delete_from_swap_cache(folio, swap, shadow); xa_unlock_irq(&mapping->i_pages); - put_swap_page(&folio->page, swap); + put_swap_folio(folio, swap); } else { void (*free_folio)(struct folio *); From 5094ca1d4ac6183f72f5507e8ae61a272a54a5dc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:10 +0100 Subject: [PATCH 032/352] mm: convert do_swap_page() to use a folio Removes quite a lot of calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 57 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index e49faa0a1f9a6..04f54abdf9d2d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3724,6 +3724,7 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + struct folio *folio; struct page *page = NULL, *swapcache; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; @@ -3768,19 +3769,23 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page = lookup_swap_cache(entry, vma, vmf->address); swapcache = page; + if (page) + folio = page_folio(page); if (!page) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { /* skip swapcache */ - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, - vmf->address); - if (page) { - __SetPageLocked(page); - __SetPageSwapBacked(page); + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, + vma, vmf->address, false); + page = &folio->page; + if (folio) { + __folio_set_locked(folio); + __folio_set_swapbacked(folio); if (mem_cgroup_swapin_charge_page(page, - vma->vm_mm, GFP_KERNEL, entry)) { + vma->vm_mm, GFP_KERNEL, + entry)) { ret = VM_FAULT_OOM; goto out_page; } @@ -3788,20 +3793,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) shadow = get_shadow_from_swap_cache(entry); if (shadow) - workingset_refault(page_folio(page), - shadow); + workingset_refault(folio, shadow); - lru_cache_add(page); + folio_add_lru(folio); /* To provide entry to swap_readpage() */ - set_page_private(page, entry.val); + folio_set_swap_entry(folio, entry); swap_readpage(page, true, NULL); - set_page_private(page, 0); + folio->private = NULL; } } else { page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); swapcache = page; + if (page) + folio = page_folio(page); } if (!page) { @@ -3844,7 +3850,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * swapcache, we need to check that the page's swap has not * changed. */ - if (unlikely(!PageSwapCache(page) || + if (unlikely(!folio_test_swapcache(folio) || page_private(page) != entry.val)) goto out_page; @@ -3859,6 +3865,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page = swapcache; goto out_page; } + folio = page_folio(page); /* * If we want to map a page that's in the swapcache writable, we @@ -3867,7 +3874,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * pagevecs if required. */ if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache && - !PageKsm(page) && !PageLRU(page)) + !folio_test_ksm(folio) && !folio_test_lru(folio)) lru_add_drain(); } @@ -3881,7 +3888,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) goto out_nomap; - if (unlikely(!PageUptodate(page))) { + if (unlikely(!folio_test_uptodate(folio))) { ret = VM_FAULT_SIGBUS; goto out_nomap; } @@ -3894,14 +3901,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * check after taking the PT lock and making sure that nobody * concurrently faulted in this page and set PG_anon_exclusive. */ - BUG_ON(!PageAnon(page) && PageMappedToDisk(page)); - BUG_ON(PageAnon(page) && PageAnonExclusive(page)); + BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio)); + BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page)); /* * Check under PT lock (to protect against concurrent fork() sharing * the swap entry concurrently) for certainly exclusive pages. */ - if (!PageKsm(page)) { + if (!folio_test_ksm(folio)) { /* * Note that pte_swp_exclusive() == false for architectures * without __HAVE_ARCH_PTE_SWP_EXCLUSIVE. @@ -3913,7 +3920,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * swapcache -> certainly exclusive. */ exclusive = true; - } else if (exclusive && PageWriteback(page) && + } else if (exclusive && folio_test_writeback(folio) && data_race(si->flags & SWP_STABLE_WRITES)) { /* * This is tricky: not all swap backends support @@ -3956,7 +3963,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * exposing them to the swapcache or because the swap entry indicates * exclusivity. */ - if (!PageKsm(page) && (exclusive || page_count(page) == 1)) { + if (!folio_test_ksm(folio) && + (exclusive || folio_ref_count(folio) == 1)) { if (vmf->flags & FAULT_FLAG_WRITE) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; @@ -3976,16 +3984,17 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(page != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address); - lru_cache_add_inactive_or_unevictable(page, vma); + folio_add_lru_vma(folio, vma); } else { page_add_anon_rmap(page, vma, vmf->address, rmap_flags); } - VM_BUG_ON(!PageAnon(page) || (pte_write(pte) && !PageAnonExclusive(page))); + VM_BUG_ON(!folio_test_anon(folio) || + (pte_write(pte) && !PageAnonExclusive(page))); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); - unlock_page(page); + folio_unlock(folio); if (page != swapcache && swapcache) { /* * Hold the lock to avoid the swap entry to be reused @@ -4017,9 +4026,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) out_nomap: pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: - unlock_page(page); + folio_unlock(folio); out_release: - put_page(page); + folio_put(folio); if (page != swapcache && swapcache) { unlock_page(swapcache); put_page(swapcache); From be64d3fb8a685b487c81c273e37f995d0d002ef7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:11 +0100 Subject: [PATCH 033/352] mm: convert do_swap_page()'s swapcache variable to a folio The 'swapcache' variable is used to track whether the page is from the swapcache or not. It can do this equally well by being the folio of the page rather than the page itself, and this saves a number of calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 04f54abdf9d2d..1e114438f6064 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3724,8 +3724,8 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct folio *folio; - struct page *page = NULL, *swapcache; + struct folio *swapcache, *folio = NULL; + struct page *page; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; bool exclusive = false; @@ -3768,11 +3768,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out; page = lookup_swap_cache(entry, vma, vmf->address); - swapcache = page; if (page) folio = page_folio(page); + swapcache = folio; - if (!page) { + if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { /* skip swapcache */ @@ -3805,12 +3805,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } else { page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); - swapcache = page; if (page) folio = page_folio(page); + swapcache = folio; } - if (!page) { + if (!folio) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. @@ -3862,7 +3862,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page = ksm_might_need_to_copy(page, vma, vmf->address); if (unlikely(!page)) { ret = VM_FAULT_OOM; - page = swapcache; goto out_page; } folio = page_folio(page); @@ -3873,7 +3872,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * owner. Try removing the extra reference from the local LRU * pagevecs if required. */ - if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache && + if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache && !folio_test_ksm(folio) && !folio_test_lru(folio)) lru_add_drain(); } @@ -3914,7 +3913,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * without __HAVE_ARCH_PTE_SWP_EXCLUSIVE. */ exclusive = pte_swp_exclusive(vmf->orig_pte); - if (page != swapcache) { + if (folio != swapcache) { /* * We have a fresh page that is not exposed to the * swapcache -> certainly exclusive. @@ -3982,7 +3981,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->orig_pte = pte; /* ksm created a completely new copy */ - if (unlikely(page != swapcache && swapcache)) { + if (unlikely(folio != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address); folio_add_lru_vma(folio, vma); } else { @@ -3995,7 +3994,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); folio_unlock(folio); - if (page != swapcache && swapcache) { + if (folio != swapcache && swapcache) { /* * Hold the lock to avoid the swap entry to be reused * until we take the PT lock for the pte_same() check @@ -4004,8 +4003,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * so that the swap count won't change under a * parallel locked swapcache. */ - unlock_page(swapcache); - put_page(swapcache); + folio_unlock(swapcache); + folio_put(swapcache); } if (vmf->flags & FAULT_FLAG_WRITE) { @@ -4029,9 +4028,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) folio_unlock(folio); out_release: folio_put(folio); - if (page != swapcache && swapcache) { - unlock_page(swapcache); - put_page(swapcache); + if (folio != swapcache && swapcache) { + folio_unlock(swapcache); + folio_put(swapcache); } if (si) put_swap_device(si); From dc531e11e98b909a7b44cbea232564479a2e65e3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:12 +0100 Subject: [PATCH 034/352] memcg: convert mem_cgroup_swapin_charge_page() to mem_cgroup_swapin_charge_folio() All callers now have a folio, so pass it in here and remove an unnecessary call to page_folio(). Link: https://lkml.kernel.org/r/20220902194653.1739778-17-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- mm/memcontrol.c | 13 ++++++------- mm/memory.c | 2 +- mm/swap_state.c | 2 +- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9590a063b2fdd..12f63dbb6a8fd 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -688,7 +688,7 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, return __mem_cgroup_charge(folio, mm, gfp); } -int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, +int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); @@ -1254,7 +1254,7 @@ static inline int mem_cgroup_charge(struct folio *folio, return 0; } -static inline int mem_cgroup_swapin_charge_page(struct page *page, +static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e804056422db0..621b4472c4094 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6844,21 +6844,20 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) } /** - * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin - * @page: page to charge + * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. + * @folio: folio to charge. * @mm: mm context of the victim * @gfp: reclaim mode - * @entry: swap entry for which the page is allocated + * @entry: swap entry for which the folio is allocated * - * This function charges a page allocated for swapin. Please call this before - * adding the page to the swapcache. + * This function charges a folio allocated for swapin. Please call this before + * adding the folio to the swapcache. * * Returns 0 on success. Otherwise, an error code is returned. */ -int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, +int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { - struct folio *folio = page_folio(page); struct mem_cgroup *memcg; unsigned short id; int ret; diff --git a/mm/memory.c b/mm/memory.c index 1e114438f6064..b36b177e0ea91 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3783,7 +3783,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) __folio_set_locked(folio); __folio_set_swapbacked(folio); - if (mem_cgroup_swapin_charge_page(page, + if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, GFP_KERNEL, entry)) { ret = VM_FAULT_OOM; diff --git a/mm/swap_state.c b/mm/swap_state.c index ea354efd37356..a7e0438902dd1 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -480,7 +480,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, __folio_set_locked(folio); __folio_set_swapbacked(folio); - if (mem_cgroup_swapin_charge_page(&folio->page, NULL, gfp_mask, entry)) + if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry)) goto fail_unlock; /* May fail (-ENOMEM) if XArray node allocation failed. */ From d8b6bf94606a220dae89acbe4c3554c90a94d9c4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:13 +0100 Subject: [PATCH 035/352] shmem: convert shmem_mfill_atomic_pte() to use a folio Assert that this is a single-page folio as there are several assumptions in here that it's exactly PAGE_SIZE bytes large. Saves several calls to compound_head() and removes the last caller of shmem_alloc_page(). Link: https://lkml.kernel.org/r/20220902194653.1739778-18-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 45 +++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 56cabf9bb947b..8754e2b4800a1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2374,12 +2374,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, } #ifdef CONFIG_USERFAULTFD -static struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - return &shmem_alloc_folio(gfp, info, index)->page; -} - int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, @@ -2395,7 +2389,6 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); void *page_kaddr; struct folio *folio; - struct page *page; int ret; pgoff_t max_off; @@ -2414,53 +2407,53 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!*pagep) { ret = -ENOMEM; - page = shmem_alloc_page(gfp, info, pgoff); - if (!page) + folio = shmem_alloc_folio(gfp, info, pgoff); + if (!folio) goto out_unacct_blocks; if (!zeropage) { /* COPY */ - page_kaddr = kmap_atomic(page); + page_kaddr = kmap_local_folio(folio, 0); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, PAGE_SIZE); - kunmap_atomic(page_kaddr); + kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { - *pagep = page; + *pagep = &folio->page; ret = -ENOENT; /* don't free the page */ goto out_unacct_blocks; } - flush_dcache_page(page); + flush_dcache_folio(folio); } else { /* ZEROPAGE */ - clear_user_highpage(page, dst_addr); + clear_user_highpage(&folio->page, dst_addr); } } else { - page = *pagep; + folio = page_folio(*pagep); + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); *pagep = NULL; } - VM_BUG_ON(PageLocked(page)); - VM_BUG_ON(PageSwapBacked(page)); - __SetPageLocked(page); - __SetPageSwapBacked(page); - __SetPageUptodate(page); + VM_BUG_ON(folio_test_locked(folio)); + VM_BUG_ON(folio_test_swapbacked(folio)); + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + __folio_mark_uptodate(folio); ret = -EFAULT; max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(pgoff >= max_off)) goto out_release; - folio = page_folio(page); ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp & GFP_RECLAIM_MASK, dst_mm); if (ret) goto out_release; ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, - page, true, wp_copy); + &folio->page, true, wp_copy); if (ret) goto out_delete_from_cache; @@ -2470,13 +2463,13 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); - unlock_page(page); + folio_unlock(folio); return 0; out_delete_from_cache: - delete_from_page_cache(page); + filemap_remove_folio(folio); out_release: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out_unacct_blocks: shmem_inode_unacct_blocks(inode, 1); return ret; From aae636bae5a70398632c744d5acc26700b0db192 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:14 +0100 Subject: [PATCH 036/352] shmem: convert shmem_replace_page() to shmem_replace_folio() The caller has a folio, so convert the calling convention and rename the function. Link: https://lkml.kernel.org/r/20220902194653.1739778-19-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 8754e2b4800a1..2bb6f5cfdc111 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1608,7 +1608,7 @@ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) return folio_zonenum(folio) > gfp_zone(gfp); } -static int shmem_replace_page(struct page **pagep, gfp_t gfp, +static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct folio *old, *new; @@ -1617,7 +1617,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, pgoff_t swap_index; int error; - old = page_folio(*pagep); + old = *foliop; entry = folio_swap_entry(old); swap_index = swp_offset(entry); swap_mapping = swap_address_space(entry); @@ -1666,7 +1666,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, old = new; } else { folio_add_lru(new); - *pagep = &new->page; + *foliop = new; } folio_clear_swapcache(old); @@ -1772,8 +1772,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, arch_swap_restore(swap, folio); if (shmem_should_replace_folio(folio, gfp)) { - error = shmem_replace_page(&page, gfp, info, index); - folio = page_folio(page); + error = shmem_replace_folio(&folio, gfp, info, index); if (error) goto failed; } From c7df46fe092705956dd3e56e475e5e4aff53a8b1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:15 +0100 Subject: [PATCH 037/352] swap: add swap_cache_get_folio() Convert lookup_swap_cache() into swap_cache_get_folio() and add a lookup_swap_cache() wrapper around it. Link: https://lkml.kernel.org/r/20220902194653.1739778-20-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap.h | 2 ++ mm/swap_state.c | 32 +++++++++++++++++++++----------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index 29e38f3d82d03..2d6fe6c44ad90 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -39,6 +39,8 @@ void __delete_from_swap_cache(struct folio *folio, void delete_from_swap_cache(struct folio *folio); void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); +struct folio *swap_cache_get_folio(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr); struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); diff --git a/mm/swap_state.c b/mm/swap_state.c index a7e0438902dd1..b96bf4ec8b5b1 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -317,24 +317,24 @@ static inline bool swap_use_vma_readahead(void) } /* - * Lookup a swap entry in the swap cache. A found page will be returned + * Lookup a swap entry in the swap cache. A found folio will be returned * unlocked and with its refcount incremented - we rely on the kernel - * lock getting page table operations atomic even if we drop the page + * lock getting page table operations atomic even if we drop the folio * lock before returning. */ -struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, - unsigned long addr) +struct folio *swap_cache_get_folio(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr) { - struct page *page; + struct folio *folio; struct swap_info_struct *si; si = get_swap_device(entry); if (!si) return NULL; - page = find_get_page(swap_address_space(entry), swp_offset(entry)); + folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); put_swap_device(si); - if (page) { + if (folio) { bool vma_ra = swap_use_vma_readahead(); bool readahead; @@ -342,10 +342,10 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, * At the moment, we don't support PG_readahead for anon THP * so let's bail out rather than confusing the readahead stat. */ - if (unlikely(PageTransCompound(page))) - return page; + if (unlikely(folio_test_large(folio))) + return folio; - readahead = TestClearPageReadahead(page); + readahead = folio_test_clear_readahead(folio); if (vma && vma_ra) { unsigned long ra_val; int win, hits; @@ -366,7 +366,17 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, } } - return page; + return folio; +} + +struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, + unsigned long addr) +{ + struct folio *folio = swap_cache_get_folio(entry, vma, addr); + + if (!folio) + return NULL; + return folio_file_page(folio, swp_offset(entry)); } /** From be24b11d90e6bb12d924c0d144975c6e1e39f0c6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 3 Sep 2022 12:35:09 -0700 Subject: [PATCH 038/352] swap-add-swap_cache_get_folio-fix add CONFIG_SWAP=n stub for swap_cache_get_folio() Reported-by: kernel test robot Cc: "Matthew Wilcox (Oracle)" Signed-off-by: Andrew Morton --- mm/swap.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/swap.h b/mm/swap.h index 2d6fe6c44ad90..ccd8d9a9ad36d 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -101,6 +101,12 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc) return 0; } +static inline struct folio *swap_cache_get_folio(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr) +{ + return NULL; +} + static inline struct page *lookup_swap_cache(swp_entry_t swp, struct vm_area_struct *vma, unsigned long addr) From 907b7cd80ecccdc8394f1d871094c685fe9bfbb7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:16 +0100 Subject: [PATCH 039/352] shmem: eliminate struct page from shmem_swapin_folio() Convert shmem_swapin() to return a folio and use swap_cache_get_folio(), removing all uses of struct page in this function. Link: https://lkml.kernel.org/r/20220902194653.1739778-21-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 2bb6f5cfdc111..b685acd9f1495 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1486,7 +1486,7 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) mpol_cond_put(vma->vm_policy); } -static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, +static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; @@ -1499,7 +1499,9 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, page = swap_cluster_readahead(swap, gfp, &vmf); shmem_pseudo_vma_destroy(&pvma); - return page; + if (!page) + return NULL; + return page_folio(page); } /* @@ -1721,7 +1723,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL; - struct page *page; struct folio *folio = NULL; swp_entry_t swap; int error; @@ -1734,8 +1735,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, return -EIO; /* Look it up and read it in.. */ - page = lookup_swap_cache(swap, NULL, 0); - if (!page) { + folio = swap_cache_get_folio(swap, NULL, 0); + if (!folio) { /* Or update major stats only when swapin succeeds?? */ if (fault_type) { *fault_type |= VM_FAULT_MAJOR; @@ -1743,13 +1744,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, count_memcg_event_mm(charge_mm, PGMAJFAULT); } /* Here we actually start the io */ - page = shmem_swapin(swap, gfp, info, index); - if (!page) { + folio = shmem_swapin(swap, gfp, info, index); + if (!folio) { error = -ENOMEM; goto failed; } } - folio = page_folio(page); /* We have to do this with folio locked to prevent races */ folio_lock(folio); From bc2aa6da5207924358228a50375bf5b4806721d5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:17 +0100 Subject: [PATCH 040/352] shmem: convert shmem_getpage_gfp() to shmem_get_folio_gfp() Add a shmem_getpage_gfp() wrapper for compatibility with current users. Link: https://lkml.kernel.org/r/20220902194653.1739778-22-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 70 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index b685acd9f1495..89536091928f2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -139,17 +139,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type); -static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, - gfp_t gfp, struct vm_area_struct *vma, - struct vm_fault *vmf, vm_fault_t *fault_type); - -int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp) -{ - return shmem_getpage_gfp(inode, index, pagep, sgp, - mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); -} static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { @@ -1595,7 +1584,7 @@ static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode, /* * When a page is moved from swapcache to shmem filecache (either by the - * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of + * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of * shmem_unuse_inode()), it may have been read in earlier from swap, in * ignorance of the mapping it belongs to. If that mapping has special * constraints (like the gma500 GEM driver, which requires RAM below 4GB), @@ -1812,7 +1801,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } /* - * shmem_getpage_gfp - find page in cache, or get from swap, or allocate + * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. That's up to the * vm. If we swap it in we mark it dirty since we also free the swap @@ -1821,10 +1810,10 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * vma, vmf, and fault_type are only supplied by shmem_fault: * otherwise they are NULL. */ -static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, gfp_t gfp, - struct vm_area_struct *vma, struct vm_fault *vmf, - vm_fault_t *fault_type) +static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, + struct folio **foliop, enum sgp_type sgp, gfp_t gfp, + struct vm_area_struct *vma, struct vm_fault *vmf, + vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); @@ -1864,7 +1853,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, if (error == -EEXIST) goto repeat; - *pagep = &folio->page; + *foliop = folio; return error; } @@ -1874,7 +1863,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, folio_mark_accessed(folio); if (folio_test_uptodate(folio)) goto out; - /* fallocated page */ + /* fallocated folio */ if (sgp != SGP_READ) goto clear; folio_unlock(folio); @@ -1882,10 +1871,10 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, } /* - * SGP_READ: succeed on hole, with NULL page, letting caller zero. - * SGP_NOALLOC: fail on hole, with NULL page, letting caller fail. + * SGP_READ: succeed on hole, with NULL folio, letting caller zero. + * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail. */ - *pagep = NULL; + *foliop = NULL; if (sgp == SGP_READ) return 0; if (sgp == SGP_NOALLOC) @@ -1918,7 +1907,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, if (error != -ENOSPC) goto unlock; /* - * Try to reclaim some space by splitting a huge page + * Try to reclaim some space by splitting a large folio * beyond i_size on the filesystem. */ while (retry--) { @@ -1954,9 +1943,9 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, if (folio_test_pmd_mappable(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < - hindex + HPAGE_PMD_NR - 1) { + folio_next_index(folio) - 1) { /* - * Part of the huge page is beyond i_size: subject + * Part of the large folio is beyond i_size: subject * to shrink under memory pressure. */ spin_lock(&sbinfo->shrinklist_lock); @@ -1973,14 +1962,14 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, } /* - * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. + * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. */ if (sgp == SGP_FALLOC) sgp = SGP_WRITE; clear: /* - * Let SGP_WRITE caller clear ends if write does not fill page; - * but SGP_FALLOC on a page fallocated earlier must initialize + * Let SGP_WRITE caller clear ends if write does not fill folio; + * but SGP_FALLOC on a folio fallocated earlier must initialize * it now, lest undo on failure cancel our earlier guarantee. */ if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) { @@ -2006,7 +1995,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, goto unlock; } out: - *pagep = folio_page(folio, index - hindex); + *foliop = folio; return 0; /* @@ -2036,6 +2025,29 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, return error; } +static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp, + gfp_t gfp, struct vm_area_struct *vma, + struct vm_fault *vmf, vm_fault_t *fault_type) +{ + struct folio *folio = NULL; + int ret = shmem_get_folio_gfp(inode, index, &folio, sgp, gfp, vma, + vmf, fault_type); + + if (folio) + *pagep = folio_file_page(folio, index); + else + *pagep = NULL; + return ret; +} + +int shmem_getpage(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp) +{ + return shmem_getpage_gfp(inode, index, pagep, sgp, + mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); +} + /* * This is like autoremove_wake_function, but it removes the wait queue * entry unconditionally - even if something else had already woken the From fc1a10397799eb1c2f4422c20f794a6aedd5fb16 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:18 +0100 Subject: [PATCH 041/352] shmem: convert shmem_fault() to use shmem_get_folio_gfp() No particular advantage for this function, but necessary to remove shmem_getpage_gfp(). Link: https://lkml.kernel.org/r/20220902194653.1739778-23-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 89536091928f2..59184bf6731f2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2065,6 +2065,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); + struct folio *folio; int err; vm_fault_t ret = VM_FAULT_LOCKED; @@ -2127,10 +2128,11 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) spin_unlock(&inode->i_lock); } - err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE, + err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, gfp, vma, vmf, &ret); if (err) return vmf_error(err); + vmf->page = folio_file_page(folio, vmf->pgoff); return ret; } From 2d6cc54a7c2d5797e86a386351e7a6149fcd1a72 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 23 Sep 2022 17:14:38 -0700 Subject: [PATCH 042/352] shmem-convert-shmem_fault-to-use-shmem_get_folio_gfp-fix Link: https://lkml.kernel.org/r/7693a84-bdc2-27b5-2695-d0fe8566571f@google.com Cc: Axel Rasmussen Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Xu Cc: Tom Rix Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/shmem.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 59184bf6731f2..154432dc847b7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2065,7 +2065,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); - struct folio *folio; + struct folio *folio = NULL; int err; vm_fault_t ret = VM_FAULT_LOCKED; @@ -2132,7 +2132,8 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) gfp, vma, vmf, &ret); if (err) return vmf_error(err); - vmf->page = folio_file_page(folio, vmf->pgoff); + if (folio) + vmf->page = folio_file_page(folio, vmf->pgoff); return ret; } From 72199b81e53ff1611f5f39df5717ebf26175a7d9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:19 +0100 Subject: [PATCH 043/352] shmem: convert shmem_read_mapping_page_gfp() to use shmem_get_folio_gfp() Saves a couple of calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-24-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 154432dc847b7..c3e2a65a65fc2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4270,18 +4270,20 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, { #ifdef CONFIG_SHMEM struct inode *inode = mapping->host; + struct folio *folio; struct page *page; int error; BUG_ON(!shmem_mapping(mapping)); - error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, + error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE, gfp, NULL, NULL, NULL); if (error) return ERR_PTR(error); - unlock_page(page); + folio_unlock(folio); + page = folio_file_page(folio, index); if (PageHWPoison(page)) { - put_page(page); + folio_put(folio); return ERR_PTR(-EIO); } From 4de9c9391f5c4d0625a2f6e74c5fcd10eb2d350e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:20 +0100 Subject: [PATCH 044/352] shmem: add shmem_get_folio() With no remaining callers of shmem_getpage_gfp(), add shmem_get_folio() and reimplement shmem_getpage() as a call to shmem_get_folio(). Link: https://lkml.kernel.org/r/20220902194653.1739778-25-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 2 ++ mm/shmem.c | 23 ++++++++++------------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index ff0b990de83d4..f4bd50b08a915 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -113,6 +113,8 @@ enum sgp_type { extern int shmem_getpage(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp); +int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, + enum sgp_type sgp); static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) diff --git a/mm/shmem.c b/mm/shmem.c index c3e2a65a65fc2..32afc8039e660 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2025,14 +2025,18 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, return error; } -static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, - gfp_t gfp, struct vm_area_struct *vma, - struct vm_fault *vmf, vm_fault_t *fault_type) +int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, + enum sgp_type sgp) +{ + return shmem_get_folio_gfp(inode, index, foliop, sgp, + mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); +} + +int shmem_getpage(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp) { struct folio *folio = NULL; - int ret = shmem_get_folio_gfp(inode, index, &folio, sgp, gfp, vma, - vmf, fault_type); + int ret = shmem_get_folio(inode, index, &folio, sgp); if (folio) *pagep = folio_file_page(folio, index); @@ -2041,13 +2045,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, return ret; } -int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp) -{ - return shmem_getpage_gfp(inode, index, pagep, sgp, - mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); -} - /* * This is like autoremove_wake_function, but it removes the wait queue * entry unconditionally - even if something else had already woken the From 793ac49fea0f53ca33867fa914c3ce9170a7d0dc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:21 +0100 Subject: [PATCH 045/352] shmem: convert shmem_get_partial_folio() to use shmem_get_folio() Get rid of an unnecessary folio->page->folio conversion. Link: https://lkml.kernel.org/r/20220902194653.1739778-26-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 32afc8039e660..772a30593fcca 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -874,10 +874,9 @@ void shmem_unlock_mapping(struct address_space *mapping) static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) { struct folio *folio; - struct page *page; /* - * At first avoid shmem_getpage(,,,SGP_READ): that fails + * At first avoid shmem_get_folio(,,,SGP_READ): that fails * beyond i_size, and reports fallocated pages as holes. */ folio = __filemap_get_folio(inode->i_mapping, index, @@ -888,9 +887,9 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) * But read a page back from swap if any of it is within i_size * (although in some cases this is just a waste of time). */ - page = NULL; - shmem_getpage(inode, index, &page, SGP_READ); - return page ? page_folio(page) : NULL; + folio = NULL; + shmem_get_folio(inode, index, &folio, SGP_READ); + return folio; } /* From 91bc56929c6f36e08318efa607987caf7b47b29d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:22 +0100 Subject: [PATCH 046/352] shmem: convert shmem_write_begin() to use shmem_get_folio() Use a folio throughout this function, saving a couple of calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-27-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 772a30593fcca..c69b53602a1d8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2498,6 +2498,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; + struct folio *folio; int ret = 0; /* i_rwsem is held by caller */ @@ -2509,14 +2510,15 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - ret = shmem_getpage(inode, index, pagep, SGP_WRITE); + ret = shmem_get_folio(inode, index, &folio, SGP_WRITE); if (ret) return ret; + *pagep = folio_file_page(folio, index); if (PageHWPoison(*pagep)) { - unlock_page(*pagep); - put_page(*pagep); + folio_unlock(folio); + folio_put(folio); *pagep = NULL; return -EIO; } From 71af46a0d0011b52ba11a7a065616a6697150bed Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:23 +0100 Subject: [PATCH 047/352] shmem: convert shmem_file_read_iter() to use shmem_get_folio() Use a folio throughout, saving five calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-28-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index c69b53602a1d8..0f81193128470 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2577,6 +2577,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) offset = *ppos & ~PAGE_MASK; for (;;) { + struct folio *folio = NULL; struct page *page = NULL; pgoff_t end_index; unsigned long nr, ret; @@ -2591,17 +2592,18 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) break; } - error = shmem_getpage(inode, index, &page, SGP_READ); + error = shmem_get_folio(inode, index, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; break; } - if (page) { - unlock_page(page); + if (folio) { + folio_unlock(folio); + page = folio_file_page(folio, index); if (PageHWPoison(page)) { - put_page(page); + folio_put(folio); error = -EIO; break; } @@ -2617,14 +2619,14 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (index == end_index) { nr = i_size & ~PAGE_MASK; if (nr <= offset) { - if (page) - put_page(page); + if (folio) + folio_put(folio); break; } } nr -= offset; - if (page) { + if (folio) { /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing @@ -2636,13 +2638,13 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * Mark the page accessed if we read the beginning. */ if (!offset) - mark_page_accessed(page); + folio_mark_accessed(folio); /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... */ ret = copy_page_to_iter(page, offset, nr, to); - put_page(page); + folio_put(folio); } else if (user_backed_iter(to)) { /* From a920a5cb7d764d16824cd1058eb0df219dd36237 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:24 +0100 Subject: [PATCH 048/352] shmem: convert shmem_fallocate() to use a folio Call shmem_get_folio() and use the folio APIs instead of the page APIs. Saves several calls to compound_head() and removes assumptions about the size of a large folio. Link: https://lkml.kernel.org/r/20220902194653.1739778-29-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 0f81193128470..c2016a7cfc296 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2787,7 +2787,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, info->fallocend = end; for (index = start; index < end; ) { - struct page *page; + struct folio *folio; /* * Good, the fallocate(2) manpage permits EINTR: we may have @@ -2798,10 +2798,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; else - error = shmem_getpage(inode, index, &page, SGP_FALLOC); + error = shmem_get_folio(inode, index, &folio, + SGP_FALLOC); if (error) { info->fallocend = undo_fallocend; - /* Remove the !PageUptodate pages we added */ + /* Remove the !uptodate folios we added */ if (index > start) { shmem_undo_range(inode, (loff_t)start << PAGE_SHIFT, @@ -2810,37 +2811,34 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, goto undone; } - index++; /* * Here is a more important optimization than it appears: - * a second SGP_FALLOC on the same huge page will clear it, - * making it PageUptodate and un-undoable if we fail later. + * a second SGP_FALLOC on the same large folio will clear it, + * making it uptodate and un-undoable if we fail later. */ - if (PageTransCompound(page)) { - index = round_up(index, HPAGE_PMD_NR); - /* Beware 32-bit wraparound */ - if (!index) - index--; - } + index = folio_next_index(folio); + /* Beware 32-bit wraparound */ + if (!index) + index--; /* * Inform shmem_writepage() how far we have reached. * No need for lock or barrier: we have the page lock. */ - if (!PageUptodate(page)) + if (!folio_test_uptodate(folio)) shmem_falloc.nr_falloced += index - shmem_falloc.next; shmem_falloc.next = index; /* - * If !PageUptodate, leave it that way so that freeable pages + * If !uptodate, leave it that way so that freeable folios * can be recognized if we need to rollback on error later. - * But set_page_dirty so that memory pressure will swap rather - * than free the pages we are allocating (and SGP_CACHE pages + * But mark it dirty so that memory pressure will swap rather + * than free the folios we are allocating (and SGP_CACHE folios * might still be clean: we now need to mark those dirty too). */ - set_page_dirty(page); - unlock_page(page); - put_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); cond_resched(); } From 5b6d9b36c4e80a70ded2cdbaada92d7e1645b136 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:25 +0100 Subject: [PATCH 049/352] shmem: convert shmem_symlink() to use a folio While symlinks will always be < PAGE_SIZE, using the folio APIs gets rid of unnecessary calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-30-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index c2016a7cfc296..4948ceffcc9fd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3093,7 +3093,7 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, int error; int len; struct inode *inode; - struct page *page; + struct folio *folio; len = strlen(symname) + 1; if (len > PAGE_SIZE) @@ -3121,18 +3121,18 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, inode->i_op = &shmem_short_symlink_operations; } else { inode_nohighmem(inode); - error = shmem_getpage(inode, 0, &page, SGP_WRITE); + error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); if (error) { iput(inode); return error; } inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; - memcpy(page_address(page), symname, len); - SetPageUptodate(page); - set_page_dirty(page); - unlock_page(page); - put_page(page); + memcpy(folio_address(folio), symname, len); + folio_mark_uptodate(folio); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); } dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = current_time(dir); From dbadb244d2107b020804326e44a769da355887c7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:26 +0100 Subject: [PATCH 050/352] shmem: convert shmem_get_link() to use a folio Symlinks will never use a large folio, but using the folio API removes a lot of unnecessary folio->page->folio conversions. Link: https://lkml.kernel.org/r/20220902194653.1739778-31-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 4948ceffcc9fd..e6e934adeed7f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3143,40 +3143,41 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, static void shmem_put_link(void *arg) { - mark_page_accessed(arg); - put_page(arg); + folio_mark_accessed(arg); + folio_put(arg); } static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct page *page = NULL; + struct folio *folio = NULL; int error; + if (!dentry) { - page = find_get_page(inode->i_mapping, 0); - if (!page) + folio = filemap_get_folio(inode->i_mapping, 0); + if (!folio) return ERR_PTR(-ECHILD); - if (PageHWPoison(page) || - !PageUptodate(page)) { - put_page(page); + if (PageHWPoison(&folio->page) || + !folio_test_uptodate(folio)) { + folio_put(folio); return ERR_PTR(-ECHILD); } } else { - error = shmem_getpage(inode, 0, &page, SGP_READ); + error = shmem_get_folio(inode, 0, &folio, SGP_READ); if (error) return ERR_PTR(error); - if (!page) + if (!folio) return ERR_PTR(-ECHILD); - if (PageHWPoison(page)) { - unlock_page(page); - put_page(page); + if (PageHWPoison(&folio->page)) { + folio_unlock(folio); + folio_put(folio); return ERR_PTR(-ECHILD); } - unlock_page(page); + folio_unlock(folio); } - set_delayed_call(done, shmem_put_link, page); - return page_address(page); + set_delayed_call(done, shmem_put_link, folio); + return folio_address(folio); } #ifdef CONFIG_TMPFS_XATTR From 2832df241e3ba43563dd829da87eaffc3854089f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:27 +0100 Subject: [PATCH 051/352] khugepaged: call shmem_get_folio() shmem_getpage() is being removed, so call its replacement and find the precise page ourselves. Link: https://lkml.kernel.org/r/20220902194653.1739778-32-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/khugepaged.c | 7 +++++-- mm/shmem.c | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1e59fe7bfae36..57af2c841b410 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1647,13 +1647,16 @@ static int collapse_file(struct mm_struct *mm, struct file *file, } if (xa_is_value(page) || !PageUptodate(page)) { + struct folio *folio; + xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ - if (shmem_getpage(mapping->host, index, &page, - SGP_NOALLOC)) { + if (shmem_get_folio(mapping->host, index, + &folio, SGP_NOALLOC)) { result = SCAN_FAIL; goto xa_unlocked; } + page = folio_file_page(folio, index); } else if (trylock_page(page)) { get_page(page); xas_unlock_irq(&xas); diff --git a/mm/shmem.c b/mm/shmem.c index e6e934adeed7f..909149b25d98b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3158,7 +3158,7 @@ static const char *shmem_get_link(struct dentry *dentry, folio = filemap_get_folio(inode->i_mapping, 0); if (!folio) return ERR_PTR(-ECHILD); - if (PageHWPoison(&folio->page) || + if (PageHWPoison(folio_page(folio, 0)) || !folio_test_uptodate(folio)) { folio_put(folio); return ERR_PTR(-ECHILD); @@ -3169,7 +3169,7 @@ static const char *shmem_get_link(struct dentry *dentry, return ERR_PTR(error); if (!folio) return ERR_PTR(-ECHILD); - if (PageHWPoison(&folio->page)) { + if (PageHWPoison(folio_page(folio, 0))) { folio_unlock(folio); folio_put(folio); return ERR_PTR(-ECHILD); From 6139672747420016569d01c5e6c659a16f30737d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:28 +0100 Subject: [PATCH 052/352] userfaultfd: convert mcontinue_atomic_pte() to use a folio shmem_getpage() is being replaced by shmem_get_folio() so use a folio throughout this function. Saves several calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-33-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 7327b2573f7c2..9c035be2148bb 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -243,20 +243,22 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm, { struct inode *inode = file_inode(dst_vma->vm_file); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); + struct folio *folio; struct page *page; int ret; - ret = shmem_getpage(inode, pgoff, &page, SGP_NOALLOC); - /* Our caller expects us to return -EFAULT if we failed to find page. */ + ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC); + /* Our caller expects us to return -EFAULT if we failed to find folio */ if (ret == -ENOENT) ret = -EFAULT; if (ret) goto out; - if (!page) { + if (!folio) { ret = -EFAULT; goto out; } + page = folio_file_page(folio, pgoff); if (PageHWPoison(page)) { ret = -EIO; goto out_release; @@ -267,13 +269,13 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm, if (ret) goto out_release; - unlock_page(page); + folio_unlock(folio); ret = 0; out: return ret; out_release: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out; } From 7540c437516e09e7a54b25428fc91c5455f2a649 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:29 +0100 Subject: [PATCH 053/352] shmem: remove shmem_getpage() With all callers removed, remove this wrapper function. The flags are now mysteriously called SGP, but I think we can live with that. Link: https://lkml.kernel.org/r/20220902194653.1739778-34-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 4 +--- mm/shmem.c | 15 +-------------- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f4bd50b08a915..f24071e3c826e 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -102,7 +102,7 @@ extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); -/* Flag allocation requirements to shmem_getpage */ +/* Flag allocation requirements to shmem_get_folio */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ @@ -111,8 +111,6 @@ enum sgp_type { SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; -extern int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp); int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp); diff --git a/mm/shmem.c b/mm/shmem.c index 909149b25d98b..3d0b729fcc5ec 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -179,7 +179,7 @@ static inline int shmem_reacct_size(unsigned long flags, /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow large sparse files. - * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, + * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. */ static inline int shmem_acct_block(unsigned long flags, long pages) @@ -2031,19 +2031,6 @@ int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); } -int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp) -{ - struct folio *folio = NULL; - int ret = shmem_get_folio(inode, index, &folio, sgp); - - if (folio) - *pagep = folio_file_page(folio, index); - else - *pagep = NULL; - return ret; -} - /* * This is like autoremove_wake_function, but it removes the wait queue * entry unconditionally - even if something else had already woken the From d8c8104f091277bb48d6f5dd2e669dcf60abddb6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:30 +0100 Subject: [PATCH 054/352] swapfile: convert try_to_unuse() to use a folio Saves five calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-35-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index aafe739dc2a62..23cdbe8e47cfb 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2032,7 +2032,7 @@ static int try_to_unuse(unsigned int type) struct list_head *p; int retval = 0; struct swap_info_struct *si = swap_info[type]; - struct page *page; + struct folio *folio; swp_entry_t entry; unsigned int i; @@ -2082,21 +2082,21 @@ static int try_to_unuse(unsigned int type) (i = find_next_to_unuse(si, i)) != 0) { entry = swp_entry(type, i); - page = find_get_page(swap_address_space(entry), i); - if (!page) + folio = filemap_get_folio(swap_address_space(entry), i); + if (!folio) continue; /* - * It is conceivable that a racing task removed this page from - * swap cache just before we acquired the page lock. The page + * It is conceivable that a racing task removed this folio from + * swap cache just before we acquired the page lock. The folio * might even be back in swap cache on another swap area. But - * that is okay, try_to_free_swap() only removes stale pages. + * that is okay, folio_free_swap() only removes stale folios. */ - lock_page(page); - wait_on_page_writeback(page); - try_to_free_swap(page); - unlock_page(page); - put_page(page); + folio_lock(folio); + folio_wait_writeback(folio); + folio_free_swap(folio); + folio_unlock(folio); + folio_put(folio); } /* From 383973592e0f3d7582fab1f6785e6eacbeec4940 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:31 +0100 Subject: [PATCH 055/352] swapfile: convert __try_to_reclaim_swap() to use a folio Saves five calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-36-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 23cdbe8e47cfb..e3e1bd3d20b17 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -132,27 +132,27 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { swp_entry_t entry = swp_entry(si->type, offset); - struct page *page; + struct folio *folio; int ret = 0; - page = find_get_page(swap_address_space(entry), offset); - if (!page) + folio = filemap_get_folio(swap_address_space(entry), offset); + if (!folio) return 0; /* * When this function is called from scan_swap_map_slots() and it's - * called by vmscan.c at reclaiming pages. So, we hold a lock on a page, + * called by vmscan.c at reclaiming folios. So we hold a folio lock * here. We have to use trylock for avoiding deadlock. This is a special - * case and you should use try_to_free_swap() with explicit lock_page() + * case and you should use folio_free_swap() with explicit folio_lock() * in usual operations. */ - if (trylock_page(page)) { + if (folio_trylock(folio)) { if ((flags & TTRS_ANYWAY) || - ((flags & TTRS_UNMAPPED) && !page_mapped(page)) || - ((flags & TTRS_FULL) && mem_cgroup_swap_full(page))) - ret = try_to_free_swap(page); - unlock_page(page); + ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || + ((flags & TTRS_FULL) && mem_cgroup_swap_full(&folio->page))) + ret = folio_free_swap(folio); + folio_unlock(folio); } - put_page(page); + folio_put(folio); return ret; } From 695956c6567f528f9bbada76d30e0b0b7d850b87 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:32 +0100 Subject: [PATCH 056/352] swapfile: convert unuse_pte_range() to use a folio Delay fetching the precise page from the folio until we're in unuse_pte(). Saves many calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-37-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index e3e1bd3d20b17..3820b5ab64d94 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1758,8 +1758,9 @@ static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) * force COW, vm_page_prot omits write permission from any private vma. */ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, swp_entry_t entry, struct page *page) + unsigned long addr, swp_entry_t entry, struct folio *folio) { + struct page *page = folio_file_page(folio, swp_offset(entry)); struct page *swapcache; spinlock_t *ptl; pte_t *pte, new_pte; @@ -1831,17 +1832,18 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned int type) { - struct page *page; swp_entry_t entry; pte_t *pte; struct swap_info_struct *si; - unsigned long offset; int ret = 0; volatile unsigned char *swap_map; si = swap_info[type]; pte = pte_offset_map(pmd, addr); do { + struct folio *folio; + unsigned long offset; + if (!is_swap_pte(*pte)) continue; @@ -1852,8 +1854,9 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, offset = swp_offset(entry); pte_unmap(pte); swap_map = &si->swap_map[offset]; - page = lookup_swap_cache(entry, vma, addr); - if (!page) { + folio = swap_cache_get_folio(entry, vma, addr); + if (!folio) { + struct page *page; struct vm_fault vmf = { .vma = vma, .address = addr, @@ -1863,25 +1866,27 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); + if (page) + folio = page_folio(page); } - if (!page) { + if (!folio) { if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) goto try_next; return -ENOMEM; } - lock_page(page); - wait_on_page_writeback(page); - ret = unuse_pte(vma, pmd, addr, entry, page); + folio_lock(folio); + folio_wait_writeback(folio); + ret = unuse_pte(vma, pmd, addr, entry, folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out; } - try_to_free_swap(page); - unlock_page(page); - put_page(page); + folio_free_swap(folio); + folio_unlock(folio); + folio_put(folio); try_next: pte = pte_offset_map(pmd, addr); } while (pte++, addr += PAGE_SIZE, addr != end); From 51f17c8ec8c8ccc62e637b915c11c2ce9b3629d1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:33 +0100 Subject: [PATCH 057/352] mm: convert do_swap_page() to use swap_cache_get_folio() Saves a folio->page->folio conversion. Link: https://lkml.kernel.org/r/20220902194653.1739778-38-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index b36b177e0ea91..0018df3f0cc24 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3767,9 +3767,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(!si)) goto out; - page = lookup_swap_cache(entry, vma, vmf->address); - if (page) - folio = page_folio(page); + folio = swap_cache_get_folio(entry, vma, vmf->address); + if (folio) + page = folio_file_page(folio, swp_offset(entry)); swapcache = folio; if (!folio) { From f768ffd043bfa33b50025fadb3aaa889e6020cea Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:34 +0100 Subject: [PATCH 058/352] mm: remove lookup_swap_cache() All callers have now been converted to swap_cache_get_folio(), so we can remove this wrapper. Link: https://lkml.kernel.org/r/20220902194653.1739778-39-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- mm/swap.h | 10 ---------- mm/swap_state.c | 12 +----------- 3 files changed, 2 insertions(+), 22 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 621b4472c4094..9863fb5889729 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5569,7 +5569,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, return NULL; /* - * Because lookup_swap_cache() updates some statistics counter, + * Because swap_cache_get_folio() updates some statistics counter, * we call find_get_page() with swapper_space directly. */ page = find_get_page(swap_address_space(ent), swp_offset(ent)); diff --git a/mm/swap.h b/mm/swap.h index ccd8d9a9ad36d..cc08c459c6190 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -41,9 +41,6 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); -struct page *lookup_swap_cache(swp_entry_t entry, - struct vm_area_struct *vma, - unsigned long addr); struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index); struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, @@ -107,13 +104,6 @@ static inline struct folio *swap_cache_get_folio(swp_entry_t entry, return NULL; } -static inline struct page *lookup_swap_cache(swp_entry_t swp, - struct vm_area_struct *vma, - unsigned long addr) -{ - return NULL; -} - static inline struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) { diff --git a/mm/swap_state.c b/mm/swap_state.c index b96bf4ec8b5b1..4af135a7b53c4 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -369,16 +369,6 @@ struct folio *swap_cache_get_folio(swp_entry_t entry, return folio; } -struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, - unsigned long addr) -{ - struct folio *folio = swap_cache_get_folio(entry, vma, addr); - - if (!folio) - return NULL; - return folio_file_page(folio, swp_offset(entry)); -} - /** * find_get_incore_page - Find and get a page from the page or swap caches. * @mapping: The address_space to search. @@ -430,7 +420,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, int err; /* * First check the swap cache. Since this is normally - * called after lookup_swap_cache() failed, re-calling + * called after swap_cache_get_folio() failed, re-calling * that would confuse statistics. */ si = get_swap_device(entry); From 509f1281d786bf826d63975d10783ba881086d44 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:35 +0100 Subject: [PATCH 059/352] swap_state: convert free_swap_cache() to use a folio Saves several calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-40-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap_state.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 4af135a7b53c4..438d0676c5be2 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -272,16 +272,19 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, /* * If we are the only user, then try to free up the swap cache. * - * Its ok to check for PageSwapCache without the page lock + * Its ok to check the swapcache flag without the folio lock * here because we are going to recheck again inside - * try_to_free_swap() _with_ the lock. + * folio_free_swap() _with_ the lock. * - Marcelo */ void free_swap_cache(struct page *page) { - if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) { - try_to_free_swap(page); - unlock_page(page); + struct folio *folio = page_folio(page); + + if (folio_test_swapcache(folio) && !folio_mapped(folio) && + folio_trylock(folio)) { + folio_free_swap(folio); + folio_unlock(folio); } } From 6b0b0152b06759bdda582e5a678d02165eb435c8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:36 +0100 Subject: [PATCH 060/352] swap: convert swap_writepage() to use a folio Removes many calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-41-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page_io.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index fc6b3fb1f7c59..2af34dd8fa4db 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -180,29 +180,30 @@ int generic_swapfile_activate(struct swap_info_struct *sis, */ int swap_writepage(struct page *page, struct writeback_control *wbc) { + struct folio *folio = page_folio(page); int ret = 0; - if (try_to_free_swap(page)) { - unlock_page(page); + if (folio_free_swap(folio)) { + folio_unlock(folio); goto out; } /* * Arch code may have to preserve more data than just the page * contents, e.g. memory tags. */ - ret = arch_prepare_to_swap(page); + ret = arch_prepare_to_swap(&folio->page); if (ret) { - set_page_dirty(page); - unlock_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); goto out; } - if (frontswap_store(page) == 0) { - set_page_writeback(page); - unlock_page(page); - end_page_writeback(page); + if (frontswap_store(&folio->page) == 0) { + folio_start_writeback(folio); + folio_unlock(folio); + folio_end_writeback(folio); goto out; } - ret = __swap_writepage(page, wbc); + ret = __swap_writepage(&folio->page, wbc); out: return ret; } From 61a04cc444674c45b0455665461c3aa41a881e54 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:37 +0100 Subject: [PATCH 061/352] mm: convert do_wp_page() to use a folio Saves many calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-42-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 0018df3f0cc24..2f1397b7c77dc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3368,6 +3368,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; + struct folio *folio; VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE)); VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE)); @@ -3414,48 +3415,47 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ - if (PageAnon(vmf->page)) { - struct page *page = vmf->page; - + folio = page_folio(vmf->page); + if (folio_test_anon(folio)) { /* * If the page is exclusive to this process we must reuse the * page without further checks. */ - if (PageAnonExclusive(page)) + if (PageAnonExclusive(vmf->page)) goto reuse; /* - * We have to verify under page lock: these early checks are - * just an optimization to avoid locking the page and freeing + * We have to verify under folio lock: these early checks are + * just an optimization to avoid locking the folio and freeing * the swapcache if there is little hope that we can reuse. * - * PageKsm() doesn't necessarily raise the page refcount. + * KSM doesn't necessarily raise the folio refcount. */ - if (PageKsm(page) || page_count(page) > 3) + if (folio_test_ksm(folio) || folio_ref_count(folio) > 3) goto copy; - if (!PageLRU(page)) + if (!folio_test_lru(folio)) /* * Note: We cannot easily detect+handle references from - * remote LRU pagevecs or references to PageLRU() pages. + * remote LRU pagevecs or references to LRU folios. */ lru_add_drain(); - if (page_count(page) > 1 + PageSwapCache(page)) + if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio)) goto copy; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto copy; - if (PageSwapCache(page)) - try_to_free_swap(page); - if (PageKsm(page) || page_count(page) != 1) { - unlock_page(page); + if (folio_test_swapcache(folio)) + folio_free_swap(folio); + if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) { + folio_unlock(folio); goto copy; } /* - * Ok, we've got the only page reference from our mapping - * and the page is locked, it's dark out, and we're wearing + * Ok, we've got the only folio reference from our mapping + * and the folio is locked, it's dark out, and we're wearing * sunglasses. Hit it. */ - page_move_anon_rmap(page, vma); - unlock_page(page); + page_move_anon_rmap(vmf->page, vma); + folio_unlock(folio); reuse: if (unlikely(unshare)) { pte_unmap_unlock(vmf->pte, vmf->ptl); From 53a4d6dcd595ae2f95316c91d8a45e5301688521 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:38 +0100 Subject: [PATCH 062/352] huge_memory: convert do_huge_pmd_wp_page() to use a folio Removes many calls to compound_head(). Does not remove the assumption that a folio may not be larger than a PMD. Link: https://lkml.kernel.org/r/20220902194653.1739778-43-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/huge_memory.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 84bf1d5f6b7e8..1181e623bf5b7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1305,6 +1305,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; + struct folio *folio; struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t orig_pmd = vmf->orig_pmd; @@ -1326,46 +1327,48 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) } page = pmd_page(orig_pmd); + folio = page_folio(page); VM_BUG_ON_PAGE(!PageHead(page), page); /* Early check when only holding the PT lock. */ if (PageAnonExclusive(page)) goto reuse; - if (!trylock_page(page)) { - get_page(page); + if (!folio_trylock(folio)) { + folio_get(folio); spin_unlock(vmf->ptl); - lock_page(page); + folio_lock(folio); spin_lock(vmf->ptl); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { spin_unlock(vmf->ptl); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return 0; } - put_page(page); + folio_put(folio); } /* Recheck after temporarily dropping the PT lock. */ if (PageAnonExclusive(page)) { - unlock_page(page); + folio_unlock(folio); goto reuse; } /* - * See do_wp_page(): we can only reuse the page exclusively if there are - * no additional references. Note that we always drain the LRU - * pagevecs immediately after adding a THP. + * See do_wp_page(): we can only reuse the folio exclusively if + * there are no additional references. Note that we always drain + * the LRU pagevecs immediately after adding a THP. */ - if (page_count(page) > 1 + PageSwapCache(page) * thp_nr_pages(page)) + if (folio_ref_count(folio) > + 1 + folio_test_swapcache(folio) * folio_nr_pages(folio)) goto unlock_fallback; - if (PageSwapCache(page)) - try_to_free_swap(page); - if (page_count(page) == 1) { + if (folio_test_swapcache(folio)) + folio_free_swap(folio); + if (folio_ref_count(folio) == 1) { pmd_t entry; page_move_anon_rmap(page, vma); - unlock_page(page); + folio_unlock(folio); reuse: if (unlikely(unshare)) { spin_unlock(vmf->ptl); @@ -1380,7 +1383,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) } unlock_fallback: - unlock_page(page); + folio_unlock(folio); spin_unlock(vmf->ptl); fallback: __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); From 8ba254318d954bedd8aa8ede21217f38740c8618 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:39 +0100 Subject: [PATCH 063/352] madvise: convert madvise_free_pte_range() to use a folio Saves a lot of calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-44-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/madvise.c | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index a3fc4cd32ed36..2baa93ca23109 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -601,6 +601,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *orig_pte, *pte, ptent; + struct folio *folio; struct page *page; int nr_swap = 0; unsigned long next; @@ -645,56 +646,56 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, page = vm_normal_page(vma, addr, ptent); if (!page || is_zone_device_page(page)) continue; + folio = page_folio(page); /* - * If pmd isn't transhuge but the page is THP and + * If pmd isn't transhuge but the folio is large and * is owned by only this process, split it and * deactivate all pages. */ - if (PageTransCompound(page)) { - if (page_mapcount(page) != 1) + if (folio_test_large(folio)) { + if (folio_mapcount(folio) != 1) goto out; - get_page(page); - if (!trylock_page(page)) { - put_page(page); + folio_get(folio); + if (!folio_trylock(folio)) { + folio_put(folio); goto out; } pte_unmap_unlock(orig_pte, ptl); - if (split_huge_page(page)) { - unlock_page(page); - put_page(page); + if (split_folio(folio)) { + folio_unlock(folio); + folio_put(folio); orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); goto out; } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); pte--; addr -= PAGE_SIZE; continue; } - VM_BUG_ON_PAGE(PageTransCompound(page), page); - - if (PageSwapCache(page) || PageDirty(page)) { - if (!trylock_page(page)) + if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { + if (!folio_trylock(folio)) continue; /* - * If page is shared with others, we couldn't clear - * PG_dirty of the page. + * If folio is shared with others, we mustn't clear + * the folio's dirty flag. */ - if (page_mapcount(page) != 1) { - unlock_page(page); + if (folio_mapcount(folio) != 1) { + folio_unlock(folio); continue; } - if (PageSwapCache(page) && !try_to_free_swap(page)) { - unlock_page(page); + if (folio_test_swapcache(folio) && + !folio_free_swap(folio)) { + folio_unlock(folio); continue; } - ClearPageDirty(page); - unlock_page(page); + folio_clear_dirty(folio); + folio_unlock(folio); } if (pte_young(ptent) || pte_dirty(ptent)) { @@ -712,7 +713,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(mm, addr, pte, ptent); tlb_remove_tlb_entry(tlb, pte, addr); } - mark_page_lazyfree(page); + mark_page_lazyfree(&folio->page); } out: if (nr_swap) { From af0cde8eca96bfda1bbfcbdd2deb01a464f55523 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:40 +0100 Subject: [PATCH 064/352] uprobes: use folios more widely in __replace_page() Remove a few hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-45-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 401bc2d24ce06..70375c7c0c4b9 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -19,7 +19,7 @@ #include #include /* anon_vma_prepare */ #include /* set_pte_at_notify */ -#include /* try_to_free_swap */ +#include /* folio_free_swap */ #include /* user_enable_single_step */ #include /* notifier mechanism */ #include "../../mm/internal.h" /* munlock_vma_page */ @@ -154,8 +154,9 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { + struct folio *old_folio = page_folio(old_page); struct mm_struct *mm = vma->vm_mm; - DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0); + DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); int err; struct mmu_notifier_range range; @@ -169,8 +170,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, return err; } - /* For try_to_free_swap() below */ - lock_page(old_page); + /* For folio_free_swap() below */ + folio_lock(old_folio); mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; @@ -186,7 +187,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, /* no new page, just dec_mm_counter for old_page */ dec_mm_counter(mm, MM_ANONPAGES); - if (!PageAnon(old_page)) { + if (!folio_test_anon(old_folio)) { dec_mm_counter(mm, mm_counter_file(old_page)); inc_mm_counter(mm, MM_ANONPAGES); } @@ -198,15 +199,15 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mk_pte(new_page, vma->vm_page_prot)); page_remove_rmap(old_page, vma, false); - if (!page_mapped(old_page)) - try_to_free_swap(old_page); + if (!folio_mapped(old_folio)) + folio_free_swap(old_folio); page_vma_mapped_walk_done(&pvmw); - put_page(old_page); + folio_put(old_folio); err = 0; unlock: mmu_notifier_invalidate_range_end(&range); - unlock_page(old_page); + folio_unlock(old_folio); return err; } From ba3beec9a084b8b9185484f686f65720a07c5b2f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:41 +0100 Subject: [PATCH 065/352] ksm: use a folio in replace_page() Replace three calls to compound_head() with one. Link: https://lkml.kernel.org/r/20220902194653.1739778-46-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/ksm.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index c3edb5836a441..c19fcca9bc03d 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1110,6 +1110,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage, pte_t orig_pte) { struct mm_struct *mm = vma->vm_mm; + struct folio *folio; pmd_t *pmd; pmd_t pmde; pte_t *ptep; @@ -1178,10 +1179,11 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); + folio = page_folio(page); page_remove_rmap(page, vma, false); - if (!page_mapped(page)) - try_to_free_swap(page); - put_page(page); + if (!folio_mapped(folio)) + folio_free_swap(folio); + folio_put(folio); pte_unmap_unlock(ptep, ptl); err = 0; From 58d41f6c54c8573e6d4901399b5df3bebab0e73e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:42 +0100 Subject: [PATCH 066/352] mm: convert do_swap_page() to use folio_free_swap() Also convert should_try_to_free_swap() to use a folio. This removes a few calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-47-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 2f1397b7c77dc..b8e4dae18ac15 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3641,14 +3641,14 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) return 0; } -static inline bool should_try_to_free_swap(struct page *page, +static inline bool should_try_to_free_swap(struct folio *folio, struct vm_area_struct *vma, unsigned int fault_flags) { - if (!PageSwapCache(page)) + if (!folio_test_swapcache(folio)) return false; - if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) || - PageMlocked(page)) + if (mem_cgroup_swap_full(&folio->page) || (vma->vm_flags & VM_LOCKED) || + folio_test_mlocked(folio)) return true; /* * If we want to map a page that's in the swapcache writable, we @@ -3656,8 +3656,8 @@ static inline bool should_try_to_free_swap(struct page *page, * user. Try freeing the swapcache to get rid of the swapcache * reference only in case it's likely that we'll be the exlusive user. */ - return (fault_flags & FAULT_FLAG_WRITE) && !PageKsm(page) && - page_count(page) == 2; + return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) && + folio_ref_count(folio) == 2; } static vm_fault_t pte_marker_clear(struct vm_fault *vmf) @@ -3949,8 +3949,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * yet. */ swap_free(entry); - if (should_try_to_free_swap(page, vma, vmf->flags)) - try_to_free_swap(page); + if (should_try_to_free_swap(folio, vma, vmf->flags)) + folio_free_swap(folio); inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); From 33df0f947673fff293fbb1fe6ecf6e60ee3329e4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:43 +0100 Subject: [PATCH 067/352] memcg: convert mem_cgroup_swap_full() to take a folio All callers now have a folio, so convert the function to take a folio. Saves a couple of calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-48-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- mm/memcontrol.c | 6 +++--- mm/memory.c | 2 +- mm/swapfile.c | 2 +- mm/vmscan.c | 3 +-- 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 42cbef554de68..d8bd6401c3e7d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -692,7 +692,7 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p } extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); -extern bool mem_cgroup_swap_full(struct page *page); +extern bool mem_cgroup_swap_full(struct folio *folio); #else static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) { @@ -714,7 +714,7 @@ static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) return get_nr_swap_pages(); } -static inline bool mem_cgroup_swap_full(struct page *page) +static inline bool mem_cgroup_swap_full(struct folio *folio) { return vm_swap_full(); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9863fb5889729..632402001bca1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7406,18 +7406,18 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) return nr_swap_pages; } -bool mem_cgroup_swap_full(struct page *page) +bool mem_cgroup_swap_full(struct folio *folio) { struct mem_cgroup *memcg; - VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (vm_swap_full()) return true; if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false; - memcg = page_memcg(page); + memcg = folio_memcg(folio); if (!memcg) return false; diff --git a/mm/memory.c b/mm/memory.c index b8e4dae18ac15..2f1a6da7f1e65 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3647,7 +3647,7 @@ static inline bool should_try_to_free_swap(struct folio *folio, { if (!folio_test_swapcache(folio)) return false; - if (mem_cgroup_swap_full(&folio->page) || (vma->vm_flags & VM_LOCKED) || + if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) || folio_test_mlocked(folio)) return true; /* diff --git a/mm/swapfile.c b/mm/swapfile.c index 3820b5ab64d94..4efcfe34e45b9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -148,7 +148,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, if (folio_trylock(folio)) { if ((flags & TTRS_ANYWAY) || ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || - ((flags & TTRS_FULL) && mem_cgroup_swap_full(&folio->page))) + ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) ret = folio_free_swap(folio); folio_unlock(folio); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 1707e3bfcfe42..c5a4bff11da69 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2047,8 +2047,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ if (folio_test_swapcache(folio) && - (mem_cgroup_swap_full(&folio->page) || - folio_test_mlocked(folio))) + (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio))) folio_free_swap(folio); VM_BUG_ON_FOLIO(folio_test_active(folio), folio); if (!folio_test_mlocked(folio)) { From 658ecdf278dd3f26137c5ee9ca235cb7adc612da Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:44 +0100 Subject: [PATCH 068/352] mm: remove try_to_free_swap() All callers have now been converted to folio_free_swap() and we can remove this wrapper. Link: https://lkml.kernel.org/r/20220902194653.1739778-49-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 ------ mm/folio-compat.c | 7 ------- mm/memory.c | 2 +- 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index d8bd6401c3e7d..fc8d98660326f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -510,7 +510,6 @@ extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); -extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); @@ -595,11 +594,6 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline int try_to_free_swap(struct page *page) -{ - return 0; -} - static inline swp_entry_t folio_alloc_swap(struct folio *folio) { swp_entry_t entry; diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 06d47f00609b5..e1e23b4947d73 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -146,10 +146,3 @@ void putback_lru_page(struct page *page) { folio_putback_lru(page_folio(page)); } - -#ifdef CONFIG_SWAP -int try_to_free_swap(struct page *page) -{ - return folio_free_swap(page_folio(page)); -} -#endif diff --git a/mm/memory.c b/mm/memory.c index 2f1a6da7f1e65..6e568f190e7a8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3844,7 +3844,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (swapcache) { /* - * Make sure try_to_free_swap or swapoff did not release the + * Make sure folio_free_swap() or swapoff did not release the * swapcache from under us. The page pin, and pte_same test * below, are not enough to exclude that. Even if it is still * swapcache, we need to check that the page's swap has not From 533fdb4fe496843d4ab71fb63741a5deb1a36eba Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:45 +0100 Subject: [PATCH 069/352] rmap: convert page_move_anon_rmap() to use a folio Removes one call to compound_head() and a reference to page->mapping. Link: https://lkml.kernel.org/r/20220902194653.1739778-50-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/rmap.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 2ff17b9aabd9b..d44ff516a2089 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1099,22 +1099,20 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, */ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) { - struct anon_vma *anon_vma = vma->anon_vma; - struct page *subpage = page; - - page = compound_head(page); + void *anon_vma = vma->anon_vma; + struct folio *folio = page_folio(page); - VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_VMA(!anon_vma, vma); - anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + anon_vma += PAGE_MAPPING_ANON; /* * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written * simultaneously, so a concurrent reader (eg folio_referenced()'s * folio_test_anon()) will not see one without the other. */ - WRITE_ONCE(page->mapping, (struct address_space *) anon_vma); - SetPageAnonExclusive(subpage); + WRITE_ONCE(folio->mapping, anon_vma); + SetPageAnonExclusive(page); } /** From b7770eb5a12d9f5af3f658f40fe231adee606188 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:46 +0100 Subject: [PATCH 070/352] migrate: convert __unmap_and_move() to use folios Removes a lot of calls to compound_head(). Also remove a VM_BUG_ON that can never trigger as the PageAnon bit is the bottom bit of page->mapping. Link: https://lkml.kernel.org/r/20220902194653.1739778-51-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/migrate.c | 75 ++++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index eb594b0db8060..1ea149f14f849 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -993,17 +993,15 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, return rc; } -static int __unmap_and_move(struct page *page, struct page *newpage, +static int __unmap_and_move(struct folio *src, struct folio *dst, int force, enum migrate_mode mode) { - struct folio *folio = page_folio(page); - struct folio *dst = page_folio(newpage); int rc = -EAGAIN; bool page_was_mapped = false; struct anon_vma *anon_vma = NULL; - bool is_lru = !__PageMovable(page); + bool is_lru = !__PageMovable(&src->page); - if (!trylock_page(page)) { + if (!folio_trylock(src)) { if (!force || mode == MIGRATE_ASYNC) goto out; @@ -1023,10 +1021,10 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (current->flags & PF_MEMALLOC) goto out; - lock_page(page); + folio_lock(src); } - if (PageWriteback(page)) { + if (folio_test_writeback(src)) { /* * Only in the case of a full synchronous migration is it * necessary to wait for PageWriteback. In the async case, @@ -1043,12 +1041,12 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } if (!force) goto out_unlock; - wait_on_page_writeback(page); + folio_wait_writeback(src); } /* - * By try_to_migrate(), page->mapcount goes down to 0 here. In this case, - * we cannot notice that anon_vma is freed while we migrates a page. + * By try_to_migrate(), src->mapcount goes down to 0 here. In this case, + * we cannot notice that anon_vma is freed while we migrate a page. * This get_anon_vma() delays freeing anon_vma pointer until the end * of migration. File cache pages are no problem because of page_lock() * File Caches may use write_page() or lock_page() in migration, then, @@ -1060,22 +1058,22 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * because that implies that the anon page is no longer mapped * (and cannot be remapped so long as we hold the page lock). */ - if (PageAnon(page) && !PageKsm(page)) - anon_vma = page_get_anon_vma(page); + if (folio_test_anon(src) && !folio_test_ksm(src)) + anon_vma = page_get_anon_vma(&src->page); /* * Block others from accessing the new page when we get around to * establishing additional references. We are usually the only one - * holding a reference to newpage at this point. We used to have a BUG - * here if trylock_page(newpage) fails, but would like to allow for - * cases where there might be a race with the previous use of newpage. + * holding a reference to dst at this point. We used to have a BUG + * here if folio_trylock(dst) fails, but would like to allow for + * cases where there might be a race with the previous use of dst. * This is much like races on refcount of oldpage: just don't BUG(). */ - if (unlikely(!trylock_page(newpage))) + if (unlikely(!folio_trylock(dst))) goto out_unlock; if (unlikely(!is_lru)) { - rc = move_to_new_folio(dst, folio, mode); + rc = move_to_new_folio(dst, src, mode); goto out_unlock_both; } @@ -1083,7 +1081,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * Corner case handling: * 1. When a new swap-cache page is read into, it is added to the LRU * and treated as swapcache but it has no rmap yet. - * Calling try_to_unmap() against a page->mapping==NULL page will + * Calling try_to_unmap() against a src->mapping==NULL page will * trigger a BUG. So handle it here. * 2. An orphaned page (see truncate_cleanup_page) might have * fs-private metadata. The page can be picked up due to memory @@ -1091,57 +1089,56 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * invisible to the vm, so the page can not be migrated. So try to * free the metadata, so the page can be freed. */ - if (!page->mapping) { - VM_BUG_ON_PAGE(PageAnon(page), page); - if (page_has_private(page)) { - try_to_free_buffers(folio); + if (!src->mapping) { + if (folio_test_private(src)) { + try_to_free_buffers(src); goto out_unlock_both; } - } else if (page_mapped(page)) { + } else if (folio_mapped(src)) { /* Establish migration ptes */ - VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, - page); - try_to_migrate(folio, 0); + VM_BUG_ON_FOLIO(folio_test_anon(src) && + !folio_test_ksm(src) && !anon_vma, src); + try_to_migrate(src, 0); page_was_mapped = true; } - if (!page_mapped(page)) - rc = move_to_new_folio(dst, folio, mode); + if (!folio_mapped(src)) + rc = move_to_new_folio(dst, src, mode); /* - * When successful, push newpage to LRU immediately: so that if it + * When successful, push dst to LRU immediately: so that if it * turns out to be an mlocked page, remove_migration_ptes() will - * automatically build up the correct newpage->mlock_count for it. + * automatically build up the correct dst->mlock_count for it. * * We would like to do something similar for the old page, when * unsuccessful, and other cases when a page has been temporarily * isolated from the unevictable LRU: but this case is the easiest. */ if (rc == MIGRATEPAGE_SUCCESS) { - lru_cache_add(newpage); + folio_add_lru(dst); if (page_was_mapped) lru_add_drain(); } if (page_was_mapped) - remove_migration_ptes(folio, - rc == MIGRATEPAGE_SUCCESS ? dst : folio, false); + remove_migration_ptes(src, + rc == MIGRATEPAGE_SUCCESS ? dst : src, false); out_unlock_both: - unlock_page(newpage); + folio_unlock(dst); out_unlock: /* Drop an anon_vma reference if we took one */ if (anon_vma) put_anon_vma(anon_vma); - unlock_page(page); + folio_unlock(src); out: /* - * If migration is successful, decrease refcount of the newpage, + * If migration is successful, decrease refcount of dst, * which will not free the page because new page owner increased * refcounter. */ if (rc == MIGRATEPAGE_SUCCESS) - put_page(newpage); + folio_put(dst); return rc; } @@ -1157,6 +1154,7 @@ static int unmap_and_move(new_page_t get_new_page, enum migrate_reason reason, struct list_head *ret) { + struct folio *dst, *src = page_folio(page); int rc = MIGRATEPAGE_SUCCESS; struct page *newpage = NULL; @@ -1174,9 +1172,10 @@ static int unmap_and_move(new_page_t get_new_page, newpage = get_new_page(page, private); if (!newpage) return -ENOMEM; + dst = page_folio(newpage); newpage->private = 0; - rc = __unmap_and_move(page, newpage, force, mode); + rc = __unmap_and_move(src, dst, force, mode); if (rc == MIGRATEPAGE_SUCCESS) set_page_owner_migrate_reason(newpage, reason); From 0d05020f4d3eb411586c07c08a097b880e277a22 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:47 +0100 Subject: [PATCH 071/352] migrate: convert unmap_and_move_huge_page() to use folios Saves several calls to compound_head() and removes a couple of uses of page->lru. Link: https://lkml.kernel.org/r/20220902194653.1739778-52-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/migrate.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 1ea149f14f849..c1c2d9d9032b9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1263,7 +1263,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (!hugepage_migration_supported(page_hstate(hpage))) return -ENOSYS; - if (page_count(hpage) == 1) { + if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ putback_active_hugepage(hpage); return MIGRATEPAGE_SUCCESS; @@ -1274,7 +1274,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, return -ENOMEM; dst = page_folio(new_hpage); - if (!trylock_page(hpage)) { + if (!folio_trylock(src)) { if (!force) goto out; switch (mode) { @@ -1284,29 +1284,29 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, default: goto out; } - lock_page(hpage); + folio_lock(src); } /* * Check for pages which are in the process of being freed. Without - * page_mapping() set, hugetlbfs specific move page routine will not + * folio_mapping() set, hugetlbfs specific move page routine will not * be called and we could leak usage counts for subpools. */ - if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) { + if (hugetlb_page_subpool(hpage) && !folio_mapping(src)) { rc = -EBUSY; goto out_unlock; } - if (PageAnon(hpage)) - anon_vma = page_get_anon_vma(hpage); + if (folio_test_anon(src)) + anon_vma = page_get_anon_vma(&src->page); - if (unlikely(!trylock_page(new_hpage))) + if (unlikely(!folio_trylock(dst))) goto put_anon; - if (page_mapped(hpage)) { + if (folio_mapped(src)) { enum ttu_flags ttu = 0; - if (!PageAnon(hpage)) { + if (!folio_test_anon(src)) { /* * In shared mappings, try_to_unmap could potentially * call huge_pmd_unshare. Because of this, take @@ -1327,7 +1327,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, i_mmap_unlock_write(mapping); } - if (!page_mapped(hpage)) + if (!folio_mapped(src)) rc = move_to_new_folio(dst, src, mode); if (page_was_mapped) @@ -1335,7 +1335,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, rc == MIGRATEPAGE_SUCCESS ? dst : src, false); unlock_put_anon: - unlock_page(new_hpage); + folio_unlock(dst); put_anon: if (anon_vma) @@ -1347,12 +1347,12 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, } out_unlock: - unlock_page(hpage); + folio_unlock(src); out: if (rc == MIGRATEPAGE_SUCCESS) putback_active_hugepage(hpage); else if (rc != -EAGAIN) - list_move_tail(&hpage->lru, ret); + list_move_tail(&src->lru, ret); /* * If migration was not successful and there's a freeing callback, use From abe41ebe2faff83923edce7bb8bbd36dfc60814e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:48 +0100 Subject: [PATCH 072/352] huge_memory: convert split_huge_page_to_list() to use a folio Saves many calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-53-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/huge_memory.c | 49 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1181e623bf5b7..bb8266b099f54 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2622,27 +2622,26 @@ bool can_split_folio(struct folio *folio, int *pextra_pins) int split_huge_page_to_list(struct page *page, struct list_head *list) { struct folio *folio = page_folio(page); - struct page *head = &folio->page; - struct deferred_split *ds_queue = get_deferred_split_queue(head); - XA_STATE(xas, &head->mapping->i_pages, head->index); + struct deferred_split *ds_queue = get_deferred_split_queue(&folio->page); + XA_STATE(xas, &folio->mapping->i_pages, folio->index); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int extra_pins, ret; pgoff_t end; bool is_hzp; - VM_BUG_ON_PAGE(!PageLocked(head), head); - VM_BUG_ON_PAGE(!PageCompound(head), head); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); - is_hzp = is_huge_zero_page(head); - VM_WARN_ON_ONCE_PAGE(is_hzp, head); + is_hzp = is_huge_zero_page(&folio->page); + VM_WARN_ON_ONCE_FOLIO(is_hzp, folio); if (is_hzp) return -EBUSY; - if (PageWriteback(head)) + if (folio_test_writeback(folio)) return -EBUSY; - if (PageAnon(head)) { + if (folio_test_anon(folio)) { /* * The caller does not necessarily hold an mmap_lock that would * prevent the anon_vma disappearing so we first we take a @@ -2651,7 +2650,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * is taken to serialise against parallel split or collapse * operations. */ - anon_vma = page_get_anon_vma(head); + anon_vma = page_get_anon_vma(&folio->page); if (!anon_vma) { ret = -EBUSY; goto out; @@ -2662,7 +2661,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } else { gfp_t gfp; - mapping = head->mapping; + mapping = folio->mapping; /* Truncated ? */ if (!mapping) { @@ -2679,7 +2678,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out; } - xas_split_alloc(&xas, head, compound_order(head), gfp); + xas_split_alloc(&xas, folio, folio_order(folio), gfp); if (xas_error(&xas)) { ret = xas_error(&xas); goto out; @@ -2693,7 +2692,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, * which cannot be nested inside the page tree lock. So note * end now: i_size itself may be changed at any moment, but - * head page lock is good enough to serialize the trimming. + * folio lock is good enough to serialize the trimming. */ end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); if (shmem_mapping(mapping)) @@ -2709,38 +2708,38 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out_unlock; } - unmap_page(head); + unmap_page(&folio->page); /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); if (mapping) { /* - * Check if the head page is present in page cache. - * We assume all tail are present too, if head is there. + * Check if the folio is present in page cache. + * We assume all tail are present too, if folio is there. */ xas_lock(&xas); xas_reset(&xas); - if (xas_load(&xas) != head) + if (xas_load(&xas) != folio) goto fail; } /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); - if (page_ref_freeze(head, 1 + extra_pins)) { - if (!list_empty(page_deferred_list(head))) { + if (folio_ref_freeze(folio, 1 + extra_pins)) { + if (!list_empty(page_deferred_list(&folio->page))) { ds_queue->split_queue_len--; - list_del(page_deferred_list(head)); + list_del(page_deferred_list(&folio->page)); } spin_unlock(&ds_queue->split_queue_lock); if (mapping) { - int nr = thp_nr_pages(head); + int nr = folio_nr_pages(folio); - xas_split(&xas, head, thp_order(head)); - if (PageSwapBacked(head)) { - __mod_lruvec_page_state(head, NR_SHMEM_THPS, + xas_split(&xas, folio, folio_order(folio)); + if (folio_test_swapbacked(folio)) { + __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); } else { - __mod_lruvec_page_state(head, NR_FILE_THPS, + __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } From 5acddfb43b1be4cd930728e2b0a5d92c0a07409b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:49 +0100 Subject: [PATCH 073/352] huge_memory: convert unmap_page() to unmap_folio() Remove a folio->page->folio conversion. Link: https://lkml.kernel.org/r/20220902194653.1739778-54-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/huge_memory.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bb8266b099f54..22949ff6df131 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2355,13 +2355,12 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, } } -static void unmap_page(struct page *page) +static void unmap_folio(struct folio *folio) { - struct folio *folio = page_folio(page); enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC; - VM_BUG_ON_PAGE(!PageHead(page), page); + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); /* * Anon pages need migration entries to preserve them, but file @@ -2378,7 +2377,7 @@ static void remap_page(struct folio *folio, unsigned long nr) { int i = 0; - /* If unmap_page() uses try_to_migrate() on file, remove this check */ + /* If unmap_folio() uses try_to_migrate() on file, remove this check */ if (!folio_test_anon(folio)) return; for (;;) { @@ -2428,7 +2427,7 @@ static void __split_huge_page_tail(struct page *head, int tail, * for example lock_page() which set PG_waiters. * * Note that for mapped sub-pages of an anonymous THP, - * PG_anon_exclusive has been cleared in unmap_page() and is stored in + * PG_anon_exclusive has been cleared in unmap_folio() and is stored in * the migration entry instead from where remap_page() will restore it. * We can still have PG_anon_exclusive set on effectively unmapped and * unreferenced sub-pages of an anonymous THP: we can simply drop @@ -2700,7 +2699,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } /* - * Racy check if we can split the page, before unmap_page() will + * Racy check if we can split the page, before unmap_folio() will * split PMDs */ if (!can_split_folio(folio, &extra_pins)) { @@ -2708,7 +2707,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out_unlock; } - unmap_page(&folio->page); + unmap_folio(folio); /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); From 7dbb020c13ae6e6386ffcecb7abfbcaa55af1a88 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:50 +0100 Subject: [PATCH 074/352] mm: convert page_get_anon_vma() to folio_get_anon_vma() With all callers now passing in a folio, rename the function and convert all callers. Removes a couple of calls to compound_head() and a reference to page->mapping. Link: https://lkml.kernel.org/r/20220902194653.1739778-55-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 +- mm/huge_memory.c | 2 +- mm/migrate.c | 6 +++--- mm/rmap.c | 14 +++++++------- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 72b2bcc37f73b..3d56e3712bb2a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -163,7 +163,7 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, unlink_anon_vmas(next); } -struct anon_vma *page_get_anon_vma(struct page *page); +struct anon_vma *folio_get_anon_vma(struct folio *folio); /* RMAP flags, currently only relevant for some anon rmap operations. */ typedef int __bitwise rmap_t; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 22949ff6df131..36ef79b851958 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2649,7 +2649,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * is taken to serialise against parallel split or collapse * operations. */ - anon_vma = page_get_anon_vma(&folio->page); + anon_vma = folio_get_anon_vma(folio); if (!anon_vma) { ret = -EBUSY; goto out; diff --git a/mm/migrate.c b/mm/migrate.c index c1c2d9d9032b9..c228afba0963d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1052,14 +1052,14 @@ static int __unmap_and_move(struct folio *src, struct folio *dst, * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. * - * Only page_get_anon_vma() understands the subtleties of + * Only folio_get_anon_vma() understands the subtleties of * getting a hold on an anon_vma from outside one of its mms. * But if we cannot get anon_vma, then we won't need it anyway, * because that implies that the anon page is no longer mapped * (and cannot be remapped so long as we hold the page lock). */ if (folio_test_anon(src) && !folio_test_ksm(src)) - anon_vma = page_get_anon_vma(&src->page); + anon_vma = folio_get_anon_vma(src); /* * Block others from accessing the new page when we get around to @@ -1298,7 +1298,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, } if (folio_test_anon(src)) - anon_vma = page_get_anon_vma(&src->page); + anon_vma = folio_get_anon_vma(src); if (unlikely(!folio_trylock(dst))) goto put_anon; diff --git a/mm/rmap.c b/mm/rmap.c index d44ff516a2089..86511e633fcda 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -486,16 +486,16 @@ void __init anon_vma_init(void) * if there is a mapcount, we can dereference the anon_vma after observing * those. */ -struct anon_vma *page_get_anon_vma(struct page *page) +struct anon_vma *folio_get_anon_vma(struct folio *folio) { struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; rcu_read_lock(); - anon_mapping = (unsigned long)READ_ONCE(page->mapping); + anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) goto out; - if (!page_mapped(page)) + if (!folio_mapped(folio)) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); @@ -505,13 +505,13 @@ struct anon_vma *page_get_anon_vma(struct page *page) } /* - * If this page is still mapped, then its anon_vma cannot have been + * If this folio is still mapped, then its anon_vma cannot have been * freed. But if it has been unmapped, we have no security against the * anon_vma structure being freed and reused (for another anon_vma: * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero() * above cannot corrupt). */ - if (!page_mapped(page)) { + if (!folio_mapped(folio)) { rcu_read_unlock(); put_anon_vma(anon_vma); return NULL; @@ -523,11 +523,11 @@ struct anon_vma *page_get_anon_vma(struct page *page) } /* - * Similar to page_get_anon_vma() except it locks the anon_vma. + * Similar to folio_get_anon_vma() except it locks the anon_vma. * * Its a little more complex as it tries to keep the fast path to a single * atomic op -- the trylock. If we fail the trylock, we fall back to getting a - * reference like with page_get_anon_vma() and then block on the mutex + * reference like with folio_get_anon_vma() and then block on the mutex * on !rwc->try_lock case. */ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, From 35ec2c883b8826236dadfc40a36e4d3f64eaf035 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:51 +0100 Subject: [PATCH 075/352] rmap: remove page_unlock_anon_vma_read() This was simply an alias for anon_vma_unlock_read() since 2011. Link: https://lkml.kernel.org/r/20220902194653.1739778-56-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/rmap.h | 5 ----- mm/memory-failure.c | 2 +- mm/rmap.c | 5 ----- 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3d56e3712bb2a..ca3e4ba6c58c4 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -458,13 +458,8 @@ struct rmap_walk_control { void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc); void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc); - -/* - * Called by memory-failure.c to kill processes. - */ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, struct rmap_walk_control *rwc); -void page_unlock_anon_vma_read(struct anon_vma *anon_vma); #else /* !CONFIG_MMU */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e554f9f583ca9..145bb561ddb3a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -529,7 +529,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, } } read_unlock(&tasklist_lock); - page_unlock_anon_vma_read(av); + anon_vma_unlock_read(av); } /* diff --git a/mm/rmap.c b/mm/rmap.c index 86511e633fcda..0b9264e58d256 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -599,11 +599,6 @@ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, return anon_vma; } -void page_unlock_anon_vma_read(struct anon_vma *anon_vma) -{ - anon_vma_unlock_read(anon_vma); -} - #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* * Flush TLB entries for recently unmapped pages from remote CPUs. It is From 47612ee39570132c7ad6ff400d160937c869d5b9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:52 +0100 Subject: [PATCH 076/352] uprobes: use new_folio in __replace_page() Saves several calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-57-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 70375c7c0c4b9..e0a9b945e7bc0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -155,6 +155,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { struct folio *old_folio = page_folio(old_page); + struct folio *new_folio; struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); int err; @@ -164,8 +165,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, addr + PAGE_SIZE); if (new_page) { - err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm, - GFP_KERNEL); + new_folio = page_folio(new_page); + err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL); if (err) return err; } @@ -180,9 +181,9 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, VM_BUG_ON_PAGE(addr != pvmw.address, old_page); if (new_page) { - get_page(new_page); + folio_get(new_folio); page_add_new_anon_rmap(new_page, vma, addr); - lru_cache_add_inactive_or_unevictable(new_page, vma); + folio_add_lru_vma(new_folio, vma); } else /* no new page, just dec_mm_counter for old_page */ dec_mm_counter(mm, MM_ANONPAGES); From 7303f99ccf0af1ce7ea329b50a0cda7e30081f74 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:53 +0100 Subject: [PATCH 077/352] mm: convert lock_page_or_retry() to folio_lock_or_retry() Remove a call to compound_head() in each of the two callers. Link: https://lkml.kernel.org/r/20220902194653.1739778-58-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 9 +++------ mm/memory.c | 10 +++++----- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09de43e36a64b..32846b6306dbd 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -989,19 +989,16 @@ static inline int lock_page_killable(struct page *page) } /* - * lock_page_or_retry - Lock the page, unless this would block and the + * folio_lock_or_retry - Lock the folio, unless this would block and the * caller indicated that it can handle a retry. * * Return value and mmap_lock implications depend on flags; see * __folio_lock_or_retry(). */ -static inline bool lock_page_or_retry(struct page *page, struct mm_struct *mm, - unsigned int flags) +static inline bool folio_lock_or_retry(struct folio *folio, + struct mm_struct *mm, unsigned int flags) { - struct folio *folio; might_sleep(); - - folio = page_folio(page); return folio_trylock(folio) || __folio_lock_or_retry(folio, mm, flags); } diff --git a/mm/memory.c b/mm/memory.c index 6e568f190e7a8..d671ad367d677 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3618,11 +3618,11 @@ EXPORT_SYMBOL(unmap_mapping_range); */ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct vm_area_struct *vma = vmf->vma; struct mmu_notifier_range range; - if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) + if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) return VM_FAULT_RETRY; mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma, vma->vm_mm, vmf->address & PAGE_MASK, @@ -3632,10 +3632,10 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(pte_same(*vmf->pte, vmf->orig_pte))) - restore_exclusive_pte(vma, page, vmf->address, vmf->pte); + restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); - unlock_page(page); + folio_unlock(folio); mmu_notifier_invalidate_range_end(&range); return 0; @@ -3835,7 +3835,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_release; } - locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); + locked = folio_lock_or_retry(folio, vma->vm_mm, vmf->flags); if (!locked) { ret |= VM_FAULT_RETRY; From 533179abec7cb9fb2d34e416fecacb9bdaf55a54 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Mon, 5 Sep 2022 10:09:18 +0800 Subject: [PATCH 078/352] mm/hugetlb.c: remove unnecessary initialization of local `err' Link: https://lkml.kernel.org/r/20220905020918.3552-1-zeming@nfschina.com Signed-off-by: Li zeming Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9c73b202d260a..6abe7a8d84851 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3718,7 +3718,7 @@ static ssize_t demote_store(struct kobject *kobj, unsigned long nr_available; nodemask_t nodes_allowed, *n_mask; struct hstate *h; - int err = 0; + int err; int nid; err = kstrtoul(buf, 10, &nr_demote); From a3914d8028579f1aef7d77657d4753e0789a0027 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Sun, 4 Sep 2022 22:36:06 +0800 Subject: [PATCH 079/352] mm/damon/sysfs: simplify the judgement whether kdamonds are busy It is unnecessary to get the number of the running kdamond to judge whether kdamonds are busy. Here we can use the damon_sysfs_kdamond_running() helper and return -EBUSY directly when finding a running kdamond. Meanwhile, merging with the judgement that a kdamond has current sysfs command callback request to make the code more clear. Link: https://lkml.kernel.org/r/1662302166-13216-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index bdef9682d0a00..941c3238ed753 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2657,23 +2657,18 @@ static void damon_sysfs_kdamonds_rm_dirs(struct damon_sysfs_kdamonds *kdamonds) kdamonds->kdamonds_arr = NULL; } -static int damon_sysfs_nr_running_ctxs(struct damon_sysfs_kdamond **kdamonds, +static bool damon_sysfs_kdamonds_busy(struct damon_sysfs_kdamond **kdamonds, int nr_kdamonds) { - int nr_running_ctxs = 0; int i; for (i = 0; i < nr_kdamonds; i++) { - struct damon_ctx *ctx = kdamonds[i]->damon_ctx; - - if (!ctx) - continue; - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) - nr_running_ctxs++; - mutex_unlock(&ctx->kdamond_lock); + if (damon_sysfs_kdamond_running(kdamonds[i]) || + damon_sysfs_cmd_request.kdamond == kdamonds[i]) + return true; } - return nr_running_ctxs; + + return false; } static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds, @@ -2682,15 +2677,9 @@ static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds, struct damon_sysfs_kdamond **kdamonds_arr, *kdamond; int err, i; - if (damon_sysfs_nr_running_ctxs(kdamonds->kdamonds_arr, kdamonds->nr)) + if (damon_sysfs_kdamonds_busy(kdamonds->kdamonds_arr, kdamonds->nr)) return -EBUSY; - for (i = 0; i < kdamonds->nr; i++) { - if (damon_sysfs_cmd_request.kdamond == - kdamonds->kdamonds_arr[i]) - return -EBUSY; - } - damon_sysfs_kdamonds_rm_dirs(kdamonds); if (!nr_kdamonds) return 0; From 9be568f5414474e063b6bff90a0f55e452f48161 Mon Sep 17 00:00:00 2001 From: Matthias Goergens Date: Mon, 5 Sep 2022 11:19:04 +0800 Subject: [PATCH 080/352] hugetlb_encode.h: fix undefined behaviour (34 << 26) Left-shifting past the size of your datatype is undefined behaviour in C. The literal 34 gets the type `int`, and that one is not big enough to be left shifted by 26 bits. An `unsigned` is long enough (on any machine that has at least 32 bits for their ints.) For uniformity, we mark all the literals as unsigned. But it's only really needed for HUGETLB_FLAG_ENCODE_16GB. Thanks to Randy Dunlap for an initial review and suggestion. Link: https://lkml.kernel.org/r/20220905031904.150925-1-matthias.goergens@gmail.com Signed-off-by: Matthias Goergens Acked-by: Randy Dunlap Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/uapi/asm-generic/hugetlb_encode.h | 26 +++++++++++----------- tools/include/asm-generic/hugetlb_encode.h | 26 +++++++++++----------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/include/uapi/asm-generic/hugetlb_encode.h b/include/uapi/asm-generic/hugetlb_encode.h index 4f3d5aaa11f53..de687009bfe53 100644 --- a/include/uapi/asm-generic/hugetlb_encode.h +++ b/include/uapi/asm-generic/hugetlb_encode.h @@ -20,18 +20,18 @@ #define HUGETLB_FLAG_ENCODE_SHIFT 26 #define HUGETLB_FLAG_ENCODE_MASK 0x3f -#define HUGETLB_FLAG_ENCODE_16KB (14 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_64KB (16 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_512KB (19 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_1MB (20 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_2MB (21 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_8MB (23 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_16MB (24 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_32MB (25 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_256MB (28 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_512MB (29 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_1GB (30 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_2GB (31 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_16GB (34 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16KB (14U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_64KB (16U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_512KB (19U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_1MB (20U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_2MB (21U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_8MB (23U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16MB (24U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_32MB (25U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_256MB (28U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_512MB (29U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_1GB (30U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_2GB (31U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16GB (34U << HUGETLB_FLAG_ENCODE_SHIFT) #endif /* _ASM_GENERIC_HUGETLB_ENCODE_H_ */ diff --git a/tools/include/asm-generic/hugetlb_encode.h b/tools/include/asm-generic/hugetlb_encode.h index 4f3d5aaa11f53..de687009bfe53 100644 --- a/tools/include/asm-generic/hugetlb_encode.h +++ b/tools/include/asm-generic/hugetlb_encode.h @@ -20,18 +20,18 @@ #define HUGETLB_FLAG_ENCODE_SHIFT 26 #define HUGETLB_FLAG_ENCODE_MASK 0x3f -#define HUGETLB_FLAG_ENCODE_16KB (14 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_64KB (16 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_512KB (19 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_1MB (20 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_2MB (21 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_8MB (23 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_16MB (24 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_32MB (25 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_256MB (28 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_512MB (29 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_1GB (30 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_2GB (31 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_16GB (34 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16KB (14U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_64KB (16U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_512KB (19U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_1MB (20U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_2MB (21U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_8MB (23U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16MB (24U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_32MB (25U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_256MB (28U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_512MB (29U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_1GB (30U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_2GB (31U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16GB (34U << HUGETLB_FLAG_ENCODE_SHIFT) #endif /* _ASM_GENERIC_HUGETLB_ENCODE_H_ */ From 895e35279db29aa549739ea9e868b06aa08a7a5e Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 5 Sep 2022 14:45:57 -0700 Subject: [PATCH 081/352] filemap: convert filemap_range_has_writeback() to use folios Removes 3 calls to compound_head(). Link: https://lkml.kernel.org/r/20220905214557.868606-1-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/filemap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 68bd70fe71d59..aab125d423b8f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -632,22 +632,23 @@ bool filemap_range_has_writeback(struct address_space *mapping, { XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; - struct page *page; + struct folio *folio; if (end_byte < start_byte) return false; rcu_read_lock(); - xas_for_each(&xas, page, max) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, folio, max) { + if (xas_retry(&xas, folio)) continue; - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; - if (PageDirty(page) || PageLocked(page) || PageWriteback(page)) + if (folio_test_dirty(folio) || folio_test_locked(folio) || + folio_test_writeback(folio)) break; } rcu_read_unlock(); - return page != NULL; + return folio != NULL; } EXPORT_SYMBOL_GPL(filemap_range_has_writeback); From 603071cf36f671da108f05bd251c45de8b7f4a93 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:16 +0200 Subject: [PATCH 082/352] kasan: check KASAN_NO_FREE_META in __kasan_metadata_size Patch series "kasan: switch tag-based modes to stack ring from per-object metadata", v3. This series makes the tag-based KASAN modes use a ring buffer for storing stack depot handles for alloc/free stack traces for slab objects instead of per-object metadata. This ring buffer is referred to as the stack ring. On each alloc/free of a slab object, the tagged address of the object and the current stack trace are recorded in the stack ring. On each bug report, if the accessed address belongs to a slab object, the stack ring is scanned for matching entries. The newest entries are used to print the alloc/free stack traces in the report: one entry for alloc and one for free. The advantages of this approach over storing stack trace handles in per-object metadata with the tag-based KASAN modes: - Allows to find relevant stack traces for use-after-free bugs without using quarantine for freed memory. (Currently, if the object was reallocated multiple times, the report contains the latest alloc/free stack traces, not necessarily the ones relevant to the buggy allocation.) - Allows to better identify and mark use-after-free bugs, effectively making the CONFIG_KASAN_TAGS_IDENTIFY functionality always-on. - Has fixed memory overhead. The disadvantage: - If the affected object was allocated/freed long before the bug happened and the stack trace events were purged from the stack ring, the report will have no stack traces. Discussion ========== The proposed implementation of the stack ring uses a single ring buffer for the whole kernel. This might lead to contention due to atomic accesses to the ring buffer index on multicore systems. At this point, it is unknown whether the performance impact from this contention would be significant compared to the slowdown introduced by collecting stack traces due to the planned changes to the latter part, see the section below. For now, the proposed implementation is deemed to be good enough, but this might need to be revisited once the stack collection becomes faster. A considered alternative is to keep a separate ring buffer for each CPU and then iterate over all of them when printing a bug report. This approach requires somehow figuring out which of the stack rings has the freshest stack traces for an object if multiple stack rings have them. Further plans ============= This series is a part of an effort to make KASAN stack trace collection suitable for production. This requires stack trace collection to be fast and memory-bounded. The planned steps are: 1. Speed up stack trace collection (potentially, by using SCS; patches on-hold until steps #2 and #3 are completed). 2. Keep stack trace handles in the stack ring (this series). 3. Add a memory-bounded mode to stack depot or provide an alternative memory-bounded stack storage. 4. Potentially, implement stack trace collection sampling to minimize the performance impact. This patch (of 34): __kasan_metadata_size() calculates the size of the redzone for objects in a slab cache. When accounting for presence of kasan_free_meta in the redzone, this function only compares free_meta_offset with 0. But free_meta_offset could also be equal to KASAN_NO_FREE_META, which indicates that kasan_free_meta is not present at all. Add a comparison with KASAN_NO_FREE_META into __kasan_metadata_size(). Link: https://lkml.kernel.org/r/cover.1662411799.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/c7b316d30d90e5947eb8280f4dc78856a49298cf.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 69f583855c8be..f6a6c7d0d8b8f 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -224,8 +224,9 @@ size_t __kasan_metadata_size(struct kmem_cache *cache) return 0; return (cache->kasan_info.alloc_meta_offset ? sizeof(struct kasan_alloc_meta) : 0) + - (cache->kasan_info.free_meta_offset ? - sizeof(struct kasan_free_meta) : 0); + ((cache->kasan_info.free_meta_offset && + cache->kasan_info.free_meta_offset != KASAN_NO_FREE_META) ? + sizeof(struct kasan_free_meta) : 0); } struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, From 82a1a4a86ae35be3914a94bed2b93b8b4b6140ca Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:17 +0200 Subject: [PATCH 083/352] kasan: rename kasan_set_*_info to kasan_save_*_info Rename set_alloc_info() and kasan_set_free_info() to save_alloc_info() and kasan_save_free_info(). The new names make more sense. Link: https://lkml.kernel.org/r/9f04777a15cb9d96bf00331da98e021d732fe1c9.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 8 ++++---- mm/kasan/generic.c | 2 +- mm/kasan/kasan.h | 2 +- mm/kasan/tags.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index f6a6c7d0d8b8f..90b6cadd2dac0 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -365,7 +365,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, return false; if (kasan_stack_collection_enabled()) - kasan_set_free_info(cache, object, tag); + kasan_save_free_info(cache, object, tag); return kasan_quarantine_put(cache, object); } @@ -424,7 +424,7 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) } } -static void set_alloc_info(struct kmem_cache *cache, void *object, +static void save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags, bool is_kmalloc) { struct kasan_alloc_meta *alloc_meta; @@ -468,7 +468,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, /* Save alloc info (if possible) for non-kmalloc() allocations. */ if (kasan_stack_collection_enabled()) - set_alloc_info(cache, (void *)object, flags, false); + save_alloc_info(cache, (void *)object, flags, false); return tagged_object; } @@ -514,7 +514,7 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache, * This also rewrites the alloc info when called from kasan_krealloc(). */ if (kasan_stack_collection_enabled()) - set_alloc_info(cache, (void *)object, flags, true); + save_alloc_info(cache, (void *)object, flags, true); /* Keep the tag that was set by kasan_slab_alloc(). */ return (void *)object; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 437fcc7e77cf2..03a3770cfeaec 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -358,7 +358,7 @@ void kasan_record_aux_stack_noalloc(void *addr) return __kasan_record_aux_stack(addr, false); } -void kasan_set_free_info(struct kmem_cache *cache, +void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag) { struct kasan_free_meta *free_meta; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 01c03e45acd42..bf16a74dc0276 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -285,7 +285,7 @@ struct slab *kasan_addr_to_slab(const void *addr); depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); -void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag); +void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag); struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, void *object, u8 tag); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 8f48b9502a177..b453a353bc862 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -17,7 +17,7 @@ #include "kasan.h" -void kasan_set_free_info(struct kmem_cache *cache, +void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag) { struct kasan_alloc_meta *alloc_meta; From c4e716a9f560e53bd7e08d3f8c60e48543b8e04e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:18 +0200 Subject: [PATCH 084/352] kasan: move is_kmalloc check out of save_alloc_info Move kasan_info.is_kmalloc check out of save_alloc_info(). This is a preparatory change that simplifies the following patches in this series. Link: https://lkml.kernel.org/r/df89f1915b788f9a10319905af6d0202a3b30c30.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 90b6cadd2dac0..6a75237ed308d 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -424,15 +424,10 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) } } -static void save_alloc_info(struct kmem_cache *cache, void *object, - gfp_t flags, bool is_kmalloc) +static void save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) { struct kasan_alloc_meta *alloc_meta; - /* Don't save alloc info for kmalloc caches in kasan_slab_alloc(). */ - if (cache->kasan_info.is_kmalloc && !is_kmalloc) - return; - alloc_meta = kasan_get_alloc_meta(cache, object); if (alloc_meta) kasan_set_track(&alloc_meta->alloc_track, flags); @@ -467,8 +462,8 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, kasan_unpoison(tagged_object, cache->object_size, init); /* Save alloc info (if possible) for non-kmalloc() allocations. */ - if (kasan_stack_collection_enabled()) - save_alloc_info(cache, (void *)object, flags, false); + if (kasan_stack_collection_enabled() && !cache->kasan_info.is_kmalloc) + save_alloc_info(cache, (void *)object, flags); return tagged_object; } @@ -513,8 +508,8 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache, * Save alloc info (if possible) for kmalloc() allocations. * This also rewrites the alloc info when called from kasan_krealloc(). */ - if (kasan_stack_collection_enabled()) - save_alloc_info(cache, (void *)object, flags, true); + if (kasan_stack_collection_enabled() && cache->kasan_info.is_kmalloc) + save_alloc_info(cache, (void *)object, flags); /* Keep the tag that was set by kasan_slab_alloc(). */ return (void *)object; From fb50786241ab2a84c498d7fd110b85ffe5cf2b0c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:19 +0200 Subject: [PATCH 085/352] kasan: split save_alloc_info implementations Provide standalone implementations of save_alloc_info() for the Generic and tag-based modes. For now, the implementations are the same, but they will diverge later in the series. Link: https://lkml.kernel.org/r/77f1a078489c1e859aedb5403f772e5e1f7410a0.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 13 ++----------- mm/kasan/generic.c | 9 +++++++++ mm/kasan/kasan.h | 1 + mm/kasan/tags.c | 9 +++++++++ 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 6a75237ed308d..93e64e1b44131 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -424,15 +424,6 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) } } -static void save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) -{ - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) - kasan_set_track(&alloc_meta->alloc_track, flags); -} - void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags, bool init) { @@ -463,7 +454,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, /* Save alloc info (if possible) for non-kmalloc() allocations. */ if (kasan_stack_collection_enabled() && !cache->kasan_info.is_kmalloc) - save_alloc_info(cache, (void *)object, flags); + kasan_save_alloc_info(cache, (void *)object, flags); return tagged_object; } @@ -509,7 +500,7 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache, * This also rewrites the alloc info when called from kasan_krealloc(). */ if (kasan_stack_collection_enabled() && cache->kasan_info.is_kmalloc) - save_alloc_info(cache, (void *)object, flags); + kasan_save_alloc_info(cache, (void *)object, flags); /* Keep the tag that was set by kasan_slab_alloc(). */ return (void *)object; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 03a3770cfeaec..98c451a3b01f8 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -358,6 +358,15 @@ void kasan_record_aux_stack_noalloc(void *addr) return __kasan_record_aux_stack(addr, false); } +void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + kasan_set_track(&alloc_meta->alloc_track, flags); +} + void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag) { diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index bf16a74dc0276..d401fb770f67f 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -285,6 +285,7 @@ struct slab *kasan_addr_to_slab(const void *addr); depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); +void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag); struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, void *object, u8 tag); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index b453a353bc862..1ba3c8399f72c 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -17,6 +17,15 @@ #include "kasan.h" +void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + kasan_set_track(&alloc_meta->alloc_track, flags); +} + void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag) { From f40ab4a2bbdbc803a2b976c570f6abea1111e31d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:20 +0200 Subject: [PATCH 086/352] kasan: drop CONFIG_KASAN_TAGS_IDENTIFY Drop CONFIG_KASAN_TAGS_IDENTIFY and related code to simplify making changes to the reporting code. The dropped functionality will be restored in the following patches in this series. Link: https://lkml.kernel.org/r/4c66ba98eb237e9ed9312c19d423bbcf4ecf88f8.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- lib/Kconfig.kasan | 8 -------- mm/kasan/kasan.h | 12 +----------- mm/kasan/report_tags.c | 28 ---------------------------- mm/kasan/tags.c | 21 ++------------------- 4 files changed, 3 insertions(+), 66 deletions(-) diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index f0973da583e04..ca09b1cf8ee9d 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -167,14 +167,6 @@ config KASAN_STACK as well, as it adds inline-style instrumentation that is run unconditionally. -config KASAN_TAGS_IDENTIFY - bool "Memory corruption type identification" - depends on KASAN_SW_TAGS || KASAN_HW_TAGS - help - Enables best-effort identification of the bug types (use-after-free - or out-of-bounds) at the cost of increased memory consumption. - Only applicable for the tag-based KASAN modes. - config KASAN_VMALLOC bool "Check accesses to vmalloc allocations" depends on HAVE_ARCH_KASAN_VMALLOC diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index d401fb770f67f..15c718782c1fa 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -169,23 +169,13 @@ struct kasan_track { depot_stack_handle_t stack; }; -#if defined(CONFIG_KASAN_TAGS_IDENTIFY) && defined(CONFIG_KASAN_SW_TAGS) -#define KASAN_NR_FREE_STACKS 5 -#else -#define KASAN_NR_FREE_STACKS 1 -#endif - struct kasan_alloc_meta { struct kasan_track alloc_track; /* Generic mode stores free track in kasan_free_meta. */ #ifdef CONFIG_KASAN_GENERIC depot_stack_handle_t aux_stack[2]; #else - struct kasan_track free_track[KASAN_NR_FREE_STACKS]; -#endif -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - u8 free_pointer_tag[KASAN_NR_FREE_STACKS]; - u8 free_track_idx; + struct kasan_track free_track; #endif }; diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index e25d2166e813d..35cf3cae4aa45 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -5,37 +5,9 @@ */ #include "kasan.h" -#include "../slab.h" const char *kasan_get_bug_type(struct kasan_report_info *info) { -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - struct kasan_alloc_meta *alloc_meta; - struct kmem_cache *cache; - struct slab *slab; - const void *addr; - void *object; - u8 tag; - int i; - - tag = get_tag(info->access_addr); - addr = kasan_reset_tag(info->access_addr); - slab = kasan_addr_to_slab(addr); - if (slab) { - cache = slab->slab_cache; - object = nearest_obj(cache, slab, (void *)addr); - alloc_meta = kasan_get_alloc_meta(cache, object); - - if (alloc_meta) { - for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { - if (alloc_meta->free_pointer_tag[i] == tag) - return "use-after-free"; - } - } - return "out-of-bounds"; - } -#endif - /* * If access_size is a negative number, then it has reason to be * defined as out-of-bounds bug type. diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 1ba3c8399f72c..e0e5de8ce834d 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -30,39 +30,22 @@ void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag) { struct kasan_alloc_meta *alloc_meta; - u8 idx = 0; alloc_meta = kasan_get_alloc_meta(cache, object); if (!alloc_meta) return; -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - idx = alloc_meta->free_track_idx; - alloc_meta->free_pointer_tag[idx] = tag; - alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS; -#endif - - kasan_set_track(&alloc_meta->free_track[idx], GFP_NOWAIT); + kasan_set_track(&alloc_meta->free_track, GFP_NOWAIT); } struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, void *object, u8 tag) { struct kasan_alloc_meta *alloc_meta; - int i = 0; alloc_meta = kasan_get_alloc_meta(cache, object); if (!alloc_meta) return NULL; -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { - if (alloc_meta->free_pointer_tag[i] == tag) - break; - } - if (i == KASAN_NR_FREE_STACKS) - i = alloc_meta->free_track_idx; -#endif - - return &alloc_meta->free_track[i]; + return &alloc_meta->free_track; } From 64b4bdbdd86fcd01298eb3f6e4ffed1c84d493b6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:21 +0200 Subject: [PATCH 087/352] kasan: introduce kasan_print_aux_stacks Add a kasan_print_aux_stacks() helper that prints the auxiliary stack traces for the Generic mode. This change hides references to alloc_meta from the common reporting code. This is desired as only the Generic mode will be using per-object metadata after this series. Link: https://lkml.kernel.org/r/67c7a9ea6615533762b1f8ccc267cd7f9bafb749.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 6 ++++++ mm/kasan/report.c | 15 +-------------- mm/kasan/report_generic.c | 20 ++++++++++++++++++++ 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 15c718782c1fa..30ff341b6d35e 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -266,6 +266,12 @@ void kasan_print_address_stack_frame(const void *addr); static inline void kasan_print_address_stack_frame(const void *addr) { } #endif +#ifdef CONFIG_KASAN_GENERIC +void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object); +#else +static inline void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) { } +#endif + bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index fe3f606b3a986..cd9f5c7fc6db1 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -270,20 +270,7 @@ static void describe_object_stacks(struct kmem_cache *cache, void *object, pr_err("\n"); } -#ifdef CONFIG_KASAN_GENERIC - if (!alloc_meta) - return; - if (alloc_meta->aux_stack[0]) { - pr_err("Last potentially related work creation:\n"); - stack_depot_print(alloc_meta->aux_stack[0]); - pr_err("\n"); - } - if (alloc_meta->aux_stack[1]) { - pr_err("Second to last potentially related work creation:\n"); - stack_depot_print(alloc_meta->aux_stack[1]); - pr_err("\n"); - } -#endif + kasan_print_aux_stacks(cache, object); } static void describe_object(struct kmem_cache *cache, void *object, diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 6689fb9a919b1..348dc207d4623 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -132,6 +132,26 @@ void kasan_metadata_fetch_row(char *buffer, void *row) memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); } +void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return; + + if (alloc_meta->aux_stack[0]) { + pr_err("Last potentially related work creation:\n"); + stack_depot_print(alloc_meta->aux_stack[0]); + pr_err("\n"); + } + if (alloc_meta->aux_stack[1]) { + pr_err("Second to last potentially related work creation:\n"); + stack_depot_print(alloc_meta->aux_stack[1]); + pr_err("\n"); + } +} + #ifdef CONFIG_KASAN_STACK static bool __must_check tokenize_frame_descr(const char **frame_descr, char *token, size_t max_tok_len, From 9c822ce1f6b4acd9fe9711ff81bfba8ae917527c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:22 +0200 Subject: [PATCH 088/352] kasan: introduce kasan_get_alloc_track Add a kasan_get_alloc_track() helper that fetches alloc_track for a slab object and use this helper in the common reporting code. For now, the implementations of this helper are the same for the Generic and tag-based modes, but they will diverge later in the series. This change hides references to alloc_meta from the common reporting code. This is desired as only the Generic mode will be using per-object metadata after this series. Link: https://lkml.kernel.org/r/0c365a35f4a833fff46f9d42c3212b32f7166556.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/generic.c | 14 +++++++++++++- mm/kasan/kasan.h | 4 +++- mm/kasan/report.c | 8 ++++---- mm/kasan/tags.c | 14 +++++++++++++- 4 files changed, 33 insertions(+), 7 deletions(-) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 98c451a3b01f8..f212b9ae57b59 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -381,8 +381,20 @@ void kasan_save_free_info(struct kmem_cache *cache, *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREETRACK; } +struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, + void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return NULL; + + return &alloc_meta->alloc_track; +} + struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) + void *object, u8 tag) { if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREETRACK) return NULL; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 30ff341b6d35e..b65a51349c51b 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -283,8 +283,10 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag); +struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, + void *object); struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag); + void *object, u8 tag); #if defined(CONFIG_KASAN_GENERIC) && \ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index cd9f5c7fc6db1..5d225d7d9c4c7 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -255,12 +255,12 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, static void describe_object_stacks(struct kmem_cache *cache, void *object, const void *addr, u8 tag) { - struct kasan_alloc_meta *alloc_meta; + struct kasan_track *alloc_track; struct kasan_track *free_track; - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) { - print_track(&alloc_meta->alloc_track, "Allocated"); + alloc_track = kasan_get_alloc_track(cache, object); + if (alloc_track) { + print_track(alloc_track, "Allocated"); pr_err("\n"); } diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index e0e5de8ce834d..7b1fc8e7c99c9 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -38,8 +38,20 @@ void kasan_save_free_info(struct kmem_cache *cache, kasan_set_track(&alloc_meta->free_track, GFP_NOWAIT); } +struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, + void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return NULL; + + return &alloc_meta->alloc_track; +} + struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) + void *object, u8 tag) { struct kasan_alloc_meta *alloc_meta; From 00b613edbc02bfbd3cfa4ecd91756a173d331d06 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:23 +0200 Subject: [PATCH 089/352] kasan: introduce kasan_init_object_meta Add a kasan_init_object_meta() helper that initializes metadata for a slab object and use it in the common code. For now, the implementations of this helper are the same for the Generic and tag-based modes, but they will diverge later in the series. This change hides references to alloc_meta from the common code. This is desired as only the Generic mode will be using per-object metadata after this series. Link: https://lkml.kernel.org/r/47c12938fc7f8105e7aaa592527c0e9d3c81fc37.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 10 +++------- mm/kasan/generic.c | 9 +++++++++ mm/kasan/kasan.h | 2 ++ mm/kasan/tags.c | 9 +++++++++ 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 93e64e1b44131..18107675a7fe0 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -313,13 +313,9 @@ static inline u8 assign_tag(struct kmem_cache *cache, void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, const void *object) { - struct kasan_alloc_meta *alloc_meta; - - if (kasan_stack_collection_enabled()) { - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) - __memset(alloc_meta, 0, sizeof(*alloc_meta)); - } + /* Initialize per-object metadata if it is present. */ + if (kasan_stack_collection_enabled()) + kasan_init_object_meta(cache, object); /* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */ object = set_tag(object, assign_tag(cache, object, true)); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index f212b9ae57b59..5462ddbc21e68 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -328,6 +328,15 @@ DEFINE_ASAN_SET_SHADOW(f3); DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); +void kasan_init_object_meta(struct kmem_cache *cache, const void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + __memset(alloc_meta, 0, sizeof(*alloc_meta)); +} + static void __kasan_record_aux_stack(void *addr, bool can_alloc) { struct slab *slab = kasan_addr_to_slab(addr); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index b65a51349c51b..2c8c3cce7bc64 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -279,6 +279,8 @@ void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report struct page *kasan_addr_to_page(const void *addr); struct slab *kasan_addr_to_slab(const void *addr); +void kasan_init_object_meta(struct kmem_cache *cache, const void *object); + depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 7b1fc8e7c99c9..2e200969a4b84 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -17,6 +17,15 @@ #include "kasan.h" +void kasan_init_object_meta(struct kmem_cache *cache, const void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + __memset(alloc_meta, 0, sizeof(*alloc_meta)); +} + void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) { struct kasan_alloc_meta *alloc_meta; From 69c1abafc285f4f72d26070b7c190b559050b9ac Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:24 +0200 Subject: [PATCH 090/352] kasan: clear metadata functions for tag-based modes Remove implementations of the metadata-related functions for the tag-based modes. The following patches in the series will provide alternative implementations. As of this patch, the tag-based modes no longer collect alloc and free stack traces. This functionality will be restored later in the series. Link: https://lkml.kernel.org/r/470fbe5d15e8015092e76e395de354be18ccceab.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/tags.c | 33 ++------------------------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 2e200969a4b84..f11c89505c778 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -19,54 +19,25 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) - __memset(alloc_meta, 0, sizeof(*alloc_meta)); } void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) { - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) - kasan_set_track(&alloc_meta->alloc_track, flags); } void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag) { - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return; - - kasan_set_track(&alloc_meta->free_track, GFP_NOWAIT); } struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, void *object) { - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return NULL; - - return &alloc_meta->alloc_track; + return NULL; } struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, void *object, u8 tag) { - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return NULL; - - return &alloc_meta->free_track; + return NULL; } From 5853f50d5a4ddff4cc9930059fbe432f8c2464b6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:25 +0200 Subject: [PATCH 091/352] kasan: move kasan_get_*_meta to generic.c Move the implementations of kasan_get_alloc/free_meta() to generic.c, as the common KASAN code does not use these functions anymore. Also drop kasan_reset_tag() from the implementation, as the Generic mode does not tag pointers. Link: https://lkml.kernel.org/r/ffcfc0ad654d78a2ef4ca054c943ddb4e5ca477b.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 19 ------------------- mm/kasan/generic.c | 17 +++++++++++++++++ mm/kasan/kasan.h | 14 +++++++------- 3 files changed, 24 insertions(+), 26 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 18107675a7fe0..19ddc0ed0e7bd 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -229,25 +229,6 @@ size_t __kasan_metadata_size(struct kmem_cache *cache) sizeof(struct kasan_free_meta) : 0); } -struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, - const void *object) -{ - if (!cache->kasan_info.alloc_meta_offset) - return NULL; - return kasan_reset_tag(object) + cache->kasan_info.alloc_meta_offset; -} - -#ifdef CONFIG_KASAN_GENERIC -struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, - const void *object) -{ - BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); - if (cache->kasan_info.free_meta_offset == KASAN_NO_FREE_META) - return NULL; - return kasan_reset_tag(object) + cache->kasan_info.free_meta_offset; -} -#endif - void __kasan_poison_slab(struct slab *slab) { struct page *page = slab_page(slab); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 5462ddbc21e68..fa654cb96a0dc 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -328,6 +328,23 @@ DEFINE_ASAN_SET_SHADOW(f3); DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); +struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, + const void *object) +{ + if (!cache->kasan_info.alloc_meta_offset) + return NULL; + return (void *)object + cache->kasan_info.alloc_meta_offset; +} + +struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, + const void *object) +{ + BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); + if (cache->kasan_info.free_meta_offset == KASAN_NO_FREE_META) + return NULL; + return (void *)object + cache->kasan_info.free_meta_offset; +} + void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { struct kasan_alloc_meta *alloc_meta; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 2c8c3cce7bc64..fdd577f3eb9d7 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -209,13 +209,6 @@ struct kunit_kasan_status { }; #endif -struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, - const void *object); -#ifdef CONFIG_KASAN_GENERIC -struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, - const void *object); -#endif - #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) static inline const void *kasan_shadow_to_mem(const void *shadow_addr) @@ -281,6 +274,13 @@ struct slab *kasan_addr_to_slab(const void *addr); void kasan_init_object_meta(struct kmem_cache *cache, const void *object); +#ifdef CONFIG_KASAN_GENERIC +struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, + const void *object); +struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, + const void *object); +#endif + depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); From ff9e5a6d9a8113c3027879ab5424811e0b73b82a Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:26 +0200 Subject: [PATCH 092/352] kasan: introduce kasan_requires_meta Add a kasan_requires_meta() helper that indicates whether the enabled KASAN mode requires per-object metadata and use this helper in the common code. Also hide kasan_init_object_meta() under CONFIG_KASAN_GENERIC ifdef check, as Generic is the only mode that uses per-object metadata. To allow for a potential future change that makes Generic KASAN support the kasan.stacktrace command-line parameter, let kasan_requires_meta() return kasan_stack_collection_enabled() instead of simply returning true. Link: https://lkml.kernel.org/r/cf837e9996246aaaeebf704ccf8ec26a34fcf64f.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 13 +++++-------- mm/kasan/kasan.h | 33 +++++++++++++++++++++++++++++---- mm/kasan/tags.c | 4 ---- 3 files changed, 34 insertions(+), 16 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 19ddc0ed0e7bd..d0300954d76be 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -88,13 +88,10 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) } #endif /* CONFIG_KASAN_STACK */ -/* - * Only allow cache merging when stack collection is disabled and no metadata - * is present. - */ +/* Only allow cache merging when no per-object metadata is present. */ slab_flags_t __kasan_never_merge(void) { - if (kasan_stack_collection_enabled()) + if (kasan_requires_meta()) return SLAB_KASAN; return 0; } @@ -152,7 +149,7 @@ void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, */ *flags |= SLAB_KASAN; - if (!kasan_stack_collection_enabled()) + if (!kasan_requires_meta()) return; ok_size = *size; @@ -220,7 +217,7 @@ void __kasan_cache_create_kmalloc(struct kmem_cache *cache) size_t __kasan_metadata_size(struct kmem_cache *cache) { - if (!kasan_stack_collection_enabled()) + if (!kasan_requires_meta()) return 0; return (cache->kasan_info.alloc_meta_offset ? sizeof(struct kasan_alloc_meta) : 0) + @@ -295,7 +292,7 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, const void *object) { /* Initialize per-object metadata if it is present. */ - if (kasan_stack_collection_enabled()) + if (kasan_requires_meta()) kasan_init_object_meta(cache, object); /* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */ diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index fdd577f3eb9d7..1736abd661b6a 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -43,7 +43,7 @@ static inline bool kasan_sync_fault_possible(void) return kasan_mode == KASAN_MODE_SYNC || kasan_mode == KASAN_MODE_ASYMM; } -#else +#else /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_stack_collection_enabled(void) { @@ -60,7 +60,31 @@ static inline bool kasan_sync_fault_possible(void) return true; } -#endif +#endif /* CONFIG_KASAN_HW_TAGS */ + +#ifdef CONFIG_KASAN_GENERIC + +/* Generic KASAN uses per-object metadata to store stack traces. */ +static inline bool kasan_requires_meta(void) +{ + /* + * Technically, Generic KASAN always collects stack traces right now. + * However, let's use kasan_stack_collection_enabled() in case the + * kasan.stacktrace command-line argument is changed to affect + * Generic KASAN. + */ + return kasan_stack_collection_enabled(); +} + +#else /* CONFIG_KASAN_GENERIC */ + +/* Tag-based KASAN modes do not use per-object metadata. */ +static inline bool kasan_requires_meta(void) +{ + return false; +} + +#endif /* CONFIG_KASAN_GENERIC */ #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #define KASAN_GRANULE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) @@ -272,13 +296,14 @@ void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report struct page *kasan_addr_to_page(const void *addr); struct slab *kasan_addr_to_slab(const void *addr); -void kasan_init_object_meta(struct kmem_cache *cache, const void *object); - #ifdef CONFIG_KASAN_GENERIC +void kasan_init_object_meta(struct kmem_cache *cache, const void *object); struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, const void *object); struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, const void *object); +#else +static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { } #endif depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index f11c89505c778..4f24669085e92 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -17,10 +17,6 @@ #include "kasan.h" -void kasan_init_object_meta(struct kmem_cache *cache, const void *object) -{ -} - void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) { } From d19630423ebd5efac69b2eb74f4ac9a57630acd1 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:27 +0200 Subject: [PATCH 093/352] kasan: introduce kasan_init_cache_meta Add a kasan_init_cache_meta() helper that initializes metadata-related cache parameters and use this helper in the common KASAN code. Put the implementation of this new helper into generic.c, as only the Generic mode uses per-object metadata. Link: https://lkml.kernel.org/r/a6d7ea01876eb36472c9879f7b23f1b24766276e.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 80 ++-------------------------------------------- mm/kasan/generic.c | 79 +++++++++++++++++++++++++++++++++++++++++++++ mm/kasan/kasan.h | 2 ++ 3 files changed, 83 insertions(+), 78 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index d0300954d76be..b6a74fe5e740d 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -118,28 +118,9 @@ void __kasan_poison_pages(struct page *page, unsigned int order, bool init) KASAN_PAGE_FREE, init); } -/* - * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. - * For larger allocations larger redzones are used. - */ -static inline unsigned int optimal_redzone(unsigned int object_size) -{ - return - object_size <= 64 - 16 ? 16 : - object_size <= 128 - 32 ? 32 : - object_size <= 512 - 64 ? 64 : - object_size <= 4096 - 128 ? 128 : - object_size <= (1 << 14) - 256 ? 256 : - object_size <= (1 << 15) - 512 ? 512 : - object_size <= (1 << 16) - 1024 ? 1024 : 2048; -} - void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) { - unsigned int ok_size; - unsigned int optimal_size; - /* * SLAB_KASAN is used to mark caches as ones that are sanitized by * KASAN. Currently this flag is used in two places: @@ -149,65 +130,8 @@ void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, */ *flags |= SLAB_KASAN; - if (!kasan_requires_meta()) - return; - - ok_size = *size; - - /* Add alloc meta into redzone. */ - cache->kasan_info.alloc_meta_offset = *size; - *size += sizeof(struct kasan_alloc_meta); - - /* - * If alloc meta doesn't fit, don't add it. - * This can only happen with SLAB, as it has KMALLOC_MAX_SIZE equal - * to KMALLOC_MAX_CACHE_SIZE and doesn't fall back to page_alloc for - * larger sizes. - */ - if (*size > KMALLOC_MAX_SIZE) { - cache->kasan_info.alloc_meta_offset = 0; - *size = ok_size; - /* Continue, since free meta might still fit. */ - } - - /* Only the generic mode uses free meta or flexible redzones. */ - if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { - cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; - return; - } - - /* - * Add free meta into redzone when it's not possible to store - * it in the object. This is the case when: - * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can - * be touched after it was freed, or - * 2. Object has a constructor, which means it's expected to - * retain its content until the next allocation, or - * 3. Object is too small. - * Otherwise cache->kasan_info.free_meta_offset = 0 is implied. - */ - if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor || - cache->object_size < sizeof(struct kasan_free_meta)) { - ok_size = *size; - - cache->kasan_info.free_meta_offset = *size; - *size += sizeof(struct kasan_free_meta); - - /* If free meta doesn't fit, don't add it. */ - if (*size > KMALLOC_MAX_SIZE) { - cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; - *size = ok_size; - } - } - - /* Calculate size with optimal redzone. */ - optimal_size = cache->object_size + optimal_redzone(cache->object_size); - /* Limit it with KMALLOC_MAX_SIZE (relevant for SLAB only). */ - if (optimal_size > KMALLOC_MAX_SIZE) - optimal_size = KMALLOC_MAX_SIZE; - /* Use optimal size if the size with added metas is not large enough. */ - if (*size < optimal_size) - *size = optimal_size; + if (kasan_requires_meta()) + kasan_init_cache_meta(cache, size); } void __kasan_cache_create_kmalloc(struct kmem_cache *cache) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index fa654cb96a0dc..73aea784040a2 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -328,6 +328,85 @@ DEFINE_ASAN_SET_SHADOW(f3); DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); +/* + * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. + * For larger allocations larger redzones are used. + */ +static inline unsigned int optimal_redzone(unsigned int object_size) +{ + return + object_size <= 64 - 16 ? 16 : + object_size <= 128 - 32 ? 32 : + object_size <= 512 - 64 ? 64 : + object_size <= 4096 - 128 ? 128 : + object_size <= (1 << 14) - 256 ? 256 : + object_size <= (1 << 15) - 512 ? 512 : + object_size <= (1 << 16) - 1024 ? 1024 : 2048; +} + +void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size) +{ + unsigned int ok_size; + unsigned int optimal_size; + + ok_size = *size; + + /* Add alloc meta into redzone. */ + cache->kasan_info.alloc_meta_offset = *size; + *size += sizeof(struct kasan_alloc_meta); + + /* + * If alloc meta doesn't fit, don't add it. + * This can only happen with SLAB, as it has KMALLOC_MAX_SIZE equal + * to KMALLOC_MAX_CACHE_SIZE and doesn't fall back to page_alloc for + * larger sizes. + */ + if (*size > KMALLOC_MAX_SIZE) { + cache->kasan_info.alloc_meta_offset = 0; + *size = ok_size; + /* Continue, since free meta might still fit. */ + } + + /* Only the generic mode uses free meta or flexible redzones. */ + if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { + cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; + return; + } + + /* + * Add free meta into redzone when it's not possible to store + * it in the object. This is the case when: + * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can + * be touched after it was freed, or + * 2. Object has a constructor, which means it's expected to + * retain its content until the next allocation, or + * 3. Object is too small. + * Otherwise cache->kasan_info.free_meta_offset = 0 is implied. + */ + if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor || + cache->object_size < sizeof(struct kasan_free_meta)) { + ok_size = *size; + + cache->kasan_info.free_meta_offset = *size; + *size += sizeof(struct kasan_free_meta); + + /* If free meta doesn't fit, don't add it. */ + if (*size > KMALLOC_MAX_SIZE) { + cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; + *size = ok_size; + } + } + + /* Calculate size with optimal redzone. */ + optimal_size = cache->object_size + optimal_redzone(cache->object_size); + /* Limit it with KMALLOC_MAX_SIZE (relevant for SLAB only). */ + if (optimal_size > KMALLOC_MAX_SIZE) + optimal_size = KMALLOC_MAX_SIZE; + /* Use optimal size if the size with added metas is not large enough. */ + if (*size < optimal_size) + *size = optimal_size; +} + struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, const void *object) { diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 1736abd661b6a..6da35370ba37f 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -297,12 +297,14 @@ struct page *kasan_addr_to_page(const void *addr); struct slab *kasan_addr_to_slab(const void *addr); #ifdef CONFIG_KASAN_GENERIC +void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size); void kasan_init_object_meta(struct kmem_cache *cache, const void *object); struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, const void *object); struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, const void *object); #else +static inline void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size) { } static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { } #endif From e5fe7da7d9ddf6f06a8d9a6b57d011dede7b5664 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:28 +0200 Subject: [PATCH 094/352] kasan: drop CONFIG_KASAN_GENERIC check from kasan_init_cache_meta As kasan_init_cache_meta() is only defined for the Generic mode, it does not require the CONFIG_KASAN_GENERIC check. Link: https://lkml.kernel.org/r/211f8f2b213aa91e9148ca63342990b491c4917a.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/generic.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 73aea784040a2..5125fad76f70a 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -367,12 +367,6 @@ void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size) /* Continue, since free meta might still fit. */ } - /* Only the generic mode uses free meta or flexible redzones. */ - if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { - cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; - return; - } - /* * Add free meta into redzone when it's not possible to store * it in the object. This is the case when: From 98244a61581c4d9643ce0bb209298cf53f446eb3 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:29 +0200 Subject: [PATCH 095/352] kasan: only define kasan_metadata_size for Generic mode KASAN provides a helper for calculating the size of per-object metadata stored in the redzone. As now only the Generic mode uses per-object metadata, only define kasan_metadata_size() for this mode. Link: https://lkml.kernel.org/r/8f81d4938b80446bc72538a08217009f328a3e23.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 17 ++++++++--------- mm/kasan/common.c | 11 ----------- mm/kasan/generic.c | 11 +++++++++++ 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index b092277bf48d6..027df75995731 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -150,14 +150,6 @@ static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) __kasan_cache_create_kmalloc(cache); } -size_t __kasan_metadata_size(struct kmem_cache *cache); -static __always_inline size_t kasan_metadata_size(struct kmem_cache *cache) -{ - if (kasan_enabled()) - return __kasan_metadata_size(cache); - return 0; -} - void __kasan_poison_slab(struct slab *slab); static __always_inline void kasan_poison_slab(struct slab *slab) { @@ -282,7 +274,6 @@ static inline void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) {} static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} -static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) {} @@ -333,6 +324,8 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} #ifdef CONFIG_KASAN_GENERIC +size_t kasan_metadata_size(struct kmem_cache *cache); + void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_record_aux_stack(void *ptr); @@ -340,6 +333,12 @@ void kasan_record_aux_stack_noalloc(void *ptr); #else /* CONFIG_KASAN_GENERIC */ +/* Tag-based KASAN modes do not use per-object metadata. */ +static inline size_t kasan_metadata_size(struct kmem_cache *cache) +{ + return 0; +} + static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_record_aux_stack(void *ptr) {} diff --git a/mm/kasan/common.c b/mm/kasan/common.c index b6a74fe5e740d..7c79c560315d3 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -139,17 +139,6 @@ void __kasan_cache_create_kmalloc(struct kmem_cache *cache) cache->kasan_info.is_kmalloc = true; } -size_t __kasan_metadata_size(struct kmem_cache *cache) -{ - if (!kasan_requires_meta()) - return 0; - return (cache->kasan_info.alloc_meta_offset ? - sizeof(struct kasan_alloc_meta) : 0) + - ((cache->kasan_info.free_meta_offset && - cache->kasan_info.free_meta_offset != KASAN_NO_FREE_META) ? - sizeof(struct kasan_free_meta) : 0); -} - void __kasan_poison_slab(struct slab *slab) { struct page *page = slab_page(slab); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 5125fad76f70a..806ab92032c3b 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -427,6 +427,17 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object) __memset(alloc_meta, 0, sizeof(*alloc_meta)); } +size_t kasan_metadata_size(struct kmem_cache *cache) +{ + if (!kasan_requires_meta()) + return 0; + return (cache->kasan_info.alloc_meta_offset ? + sizeof(struct kasan_alloc_meta) : 0) + + ((cache->kasan_info.free_meta_offset && + cache->kasan_info.free_meta_offset != KASAN_NO_FREE_META) ? + sizeof(struct kasan_free_meta) : 0); +} + static void __kasan_record_aux_stack(void *addr, bool can_alloc) { struct slab *slab = kasan_addr_to_slab(addr); From cf5a47103296c0fc4683a80ef15a3f0a71f03d09 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:30 +0200 Subject: [PATCH 096/352] kasan: only define kasan_never_merge for Generic mode KASAN prevents merging of slab caches whose objects have per-object metadata stored in redzones. As now only the Generic mode uses per-object metadata, define kasan_never_merge() only for this mode. Link: https://lkml.kernel.org/r/81ed01f29ff3443580b7e2fe362a8b47b1e8006d.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 18 ++++++------------ mm/kasan/common.c | 8 -------- mm/kasan/generic.c | 8 ++++++++ 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 027df75995731..9743d4b3a9185 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -103,14 +103,6 @@ struct kasan_cache { bool is_kmalloc; }; -slab_flags_t __kasan_never_merge(void); -static __always_inline slab_flags_t kasan_never_merge(void) -{ - if (kasan_enabled()) - return __kasan_never_merge(); - return 0; -} - void __kasan_unpoison_range(const void *addr, size_t size); static __always_inline void kasan_unpoison_range(const void *addr, size_t size) { @@ -261,10 +253,6 @@ static __always_inline bool kasan_check_byte(const void *addr) #else /* CONFIG_KASAN */ -static inline slab_flags_t kasan_never_merge(void) -{ - return 0; -} static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) {} @@ -325,6 +313,7 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} #ifdef CONFIG_KASAN_GENERIC size_t kasan_metadata_size(struct kmem_cache *cache); +slab_flags_t kasan_never_merge(void); void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); @@ -338,6 +327,11 @@ static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } +/* And thus nothing prevents cache merging. */ +static inline slab_flags_t kasan_never_merge(void) +{ + return 0; +} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 7c79c560315d3..c2690e9380303 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -88,14 +88,6 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) } #endif /* CONFIG_KASAN_STACK */ -/* Only allow cache merging when no per-object metadata is present. */ -slab_flags_t __kasan_never_merge(void) -{ - if (kasan_requires_meta()) - return SLAB_KASAN; - return 0; -} - void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { u8 tag; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 806ab92032c3b..25333bf3c99f2 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -328,6 +328,14 @@ DEFINE_ASAN_SET_SHADOW(f3); DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); +/* Only allow cache merging when no per-object metadata is present. */ +slab_flags_t kasan_never_merge(void) +{ + if (!kasan_requires_meta()) + return 0; + return SLAB_KASAN; +} + /* * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. * For larger allocations larger redzones are used. From 0d11dd03ab9fdc1202406754d763311e4ed5c80a Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:31 +0200 Subject: [PATCH 097/352] kasan: only define metadata offsets for Generic mode Hide the definitions of alloc_meta_offset and free_meta_offset under an ifdef CONFIG_KASAN_GENERIC check, as these fields are now only used when the Generic mode is enabled. Link: https://lkml.kernel.org/r/d4bafa0534facafd1a23c465a94261e64f366493.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 9743d4b3a9185..a212c2e3f32de 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -98,8 +98,10 @@ static inline bool kasan_has_integrated_init(void) #ifdef CONFIG_KASAN struct kasan_cache { +#ifdef CONFIG_KASAN_GENERIC int alloc_meta_offset; int free_meta_offset; +#endif bool is_kmalloc; }; From 9c149b1b3ce795d1a30eda007c3f3f72069027b0 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:32 +0200 Subject: [PATCH 098/352] kasan: only define metadata structs for Generic mode Hide the definitions of kasan_alloc_meta and kasan_free_meta under an ifdef CONFIG_KASAN_GENERIC check, as these structures are now only used when the Generic mode is enabled. Link: https://lkml.kernel.org/r/8d2aabff8c227c444a3f62edf87d5630beb77640.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 6da35370ba37f..cae60e4d88426 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -193,14 +193,12 @@ struct kasan_track { depot_stack_handle_t stack; }; +#ifdef CONFIG_KASAN_GENERIC + struct kasan_alloc_meta { struct kasan_track alloc_track; - /* Generic mode stores free track in kasan_free_meta. */ -#ifdef CONFIG_KASAN_GENERIC + /* Free track is stored in kasan_free_meta. */ depot_stack_handle_t aux_stack[2]; -#else - struct kasan_track free_track; -#endif }; struct qlist_node { @@ -219,12 +217,12 @@ struct qlist_node { * After that, slab allocator stores the freelist pointer in the object. */ struct kasan_free_meta { -#ifdef CONFIG_KASAN_GENERIC struct qlist_node quarantine_link; struct kasan_track free_track; -#endif }; +#endif /* CONFIG_KASAN_GENERIC */ + #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) /* Used in KUnit-compatible KASAN tests. */ struct kunit_kasan_status { From 43aa19dba543b1f366ca2af383643e2767e65a72 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:33 +0200 Subject: [PATCH 099/352] kasan: only define kasan_cache_create for Generic mode Right now, kasan_cache_create() assigns SLAB_KASAN for all KASAN modes and then sets up metadata-related cache parameters for the Generic mode. SLAB_KASAN is used in two places: 1. In slab_ksize() to account for per-object metadata when calculating the size of the accessible memory within the object. 2. In slab_common.c via kasan_never_merge() to prevent merging of caches with per-object metadata. Both cases are only relevant when per-object metadata is present, which is only the case with the Generic mode. Thus, assign SLAB_KASAN and define kasan_cache_create() only for the Generic mode. Also update the SLAB_KASAN-related comment. Link: https://lkml.kernel.org/r/61faa2aa1906e2d02c97d00ddf99ce8911dda095.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 18 ++++++------------ include/linux/slab.h | 2 +- mm/kasan/common.c | 16 ---------------- mm/kasan/generic.c | 17 ++++++++++++++++- 4 files changed, 23 insertions(+), 30 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index a212c2e3f32de..d811b3d7d2a15 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -128,15 +128,6 @@ static __always_inline void kasan_unpoison_pages(struct page *page, __kasan_unpoison_pages(page, order, init); } -void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, - slab_flags_t *flags); -static __always_inline void kasan_cache_create(struct kmem_cache *cache, - unsigned int *size, slab_flags_t *flags) -{ - if (kasan_enabled()) - __kasan_cache_create(cache, size, flags); -} - void __kasan_cache_create_kmalloc(struct kmem_cache *cache); static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) { @@ -260,9 +251,6 @@ static inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) {} static inline void kasan_unpoison_pages(struct page *page, unsigned int order, bool init) {} -static inline void kasan_cache_create(struct kmem_cache *cache, - unsigned int *size, - slab_flags_t *flags) {} static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, @@ -316,6 +304,8 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} size_t kasan_metadata_size(struct kmem_cache *cache); slab_flags_t kasan_never_merge(void); +void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, + slab_flags_t *flags); void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); @@ -334,6 +324,10 @@ static inline slab_flags_t kasan_never_merge(void) { return 0; } +/* And no cache-related metadata initialization is required. */ +static inline void kasan_cache_create(struct kmem_cache *cache, + unsigned int *size, + slab_flags_t *flags) {} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} diff --git a/include/linux/slab.h b/include/linux/slab.h index 352e3f082acce..617a39f7db466 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -106,7 +106,7 @@ # define SLAB_ACCOUNT 0 #endif -#ifdef CONFIG_KASAN +#ifdef CONFIG_KASAN_GENERIC #define SLAB_KASAN ((slab_flags_t __force)0x08000000U) #else #define SLAB_KASAN 0 diff --git a/mm/kasan/common.c b/mm/kasan/common.c index c2690e9380303..8efa631909514 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -110,22 +110,6 @@ void __kasan_poison_pages(struct page *page, unsigned int order, bool init) KASAN_PAGE_FREE, init); } -void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, - slab_flags_t *flags) -{ - /* - * SLAB_KASAN is used to mark caches as ones that are sanitized by - * KASAN. Currently this flag is used in two places: - * 1. In slab_ksize() when calculating the size of the accessible - * memory within the object. - * 2. In slab_common.c to prevent merging of sanitized caches. - */ - *flags |= SLAB_KASAN; - - if (kasan_requires_meta()) - kasan_init_cache_meta(cache, size); -} - void __kasan_cache_create_kmalloc(struct kmem_cache *cache) { cache->kasan_info.is_kmalloc = true; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 25333bf3c99f2..f6bef347de870 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -352,11 +352,26 @@ static inline unsigned int optimal_redzone(unsigned int object_size) object_size <= (1 << 16) - 1024 ? 1024 : 2048; } -void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size) +void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, + slab_flags_t *flags) { unsigned int ok_size; unsigned int optimal_size; + if (!kasan_requires_meta()) + return; + + /* + * SLAB_KASAN is used to mark caches that are sanitized by KASAN + * and that thus have per-object metadata. + * Currently this flag is used in two places: + * 1. In slab_ksize() to account for per-object metadata when + * calculating the size of the accessible memory within the object. + * 2. In slab_common.c via kasan_never_merge() to prevent merging of + * caches with per-object metadata. + */ + *flags |= SLAB_KASAN; + ok_size = *size; /* Add alloc meta into redzone. */ From e9f3ee4c762c1d30dc228831137421c96bc4fe40 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:34 +0200 Subject: [PATCH 100/352] kasan: pass tagged pointers to kasan_save_alloc/free_info Pass tagged pointers to kasan_save_alloc/free_info(). This is a preparatory patch to simplify other changes in the series. Link: https://lkml.kernel.org/r/d5bc48cfcf0dca8269dc3ed863047e4d4d2030f1.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 6 ++---- mm/kasan/generic.c | 3 +-- mm/kasan/kasan.h | 2 +- mm/kasan/tags.c | 3 +-- 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 8efa631909514..f8e16a2421978 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -193,13 +193,11 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip, bool quarantine, bool init) { - u8 tag; void *tagged_object; if (!kasan_arch_is_ready()) return false; - tag = get_tag(object); tagged_object = object; object = kasan_reset_tag(object); @@ -228,7 +226,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, return false; if (kasan_stack_collection_enabled()) - kasan_save_free_info(cache, object, tag); + kasan_save_free_info(cache, tagged_object); return kasan_quarantine_put(cache, object); } @@ -317,7 +315,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, /* Save alloc info (if possible) for non-kmalloc() allocations. */ if (kasan_stack_collection_enabled() && !cache->kasan_info.is_kmalloc) - kasan_save_alloc_info(cache, (void *)object, flags); + kasan_save_alloc_info(cache, tagged_object, flags); return tagged_object; } diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index f6bef347de870..aff39af3c532a 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -500,8 +500,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) kasan_set_track(&alloc_meta->alloc_track, flags); } -void kasan_save_free_info(struct kmem_cache *cache, - void *object, u8 tag) +void kasan_save_free_info(struct kmem_cache *cache, void *object) { struct kasan_free_meta *free_meta; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index cae60e4d88426..cca49ab029f1c 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -309,7 +309,7 @@ static inline void kasan_init_object_meta(struct kmem_cache *cache, const void * depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); -void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag); +void kasan_save_free_info(struct kmem_cache *cache, void *object); struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, void *object); struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 4f24669085e92..fd11d10a4ffc6 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -21,8 +21,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) { } -void kasan_save_free_info(struct kmem_cache *cache, - void *object, u8 tag) +void kasan_save_free_info(struct kmem_cache *cache, void *object) { } From 30df7d114a4495a692d1e3816c41923b2d2b7e32 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:35 +0200 Subject: [PATCH 101/352] kasan: move kasan_get_alloc/free_track definitions Move the definitions of kasan_get_alloc/free_track() to report_*.c, as they belong with other the reporting code. Link: https://lkml.kernel.org/r/0cb15423956889b3905a0174b58782633bbbd72e.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/generic.c | 21 --------------------- mm/kasan/report_generic.c | 21 +++++++++++++++++++++ mm/kasan/report_tags.c | 12 ++++++++++++ mm/kasan/tags.c | 12 ------------ 4 files changed, 33 insertions(+), 33 deletions(-) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index aff39af3c532a..d8b5590f9484b 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -512,24 +512,3 @@ void kasan_save_free_info(struct kmem_cache *cache, void *object) /* The object was freed and has free track set. */ *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREETRACK; } - -struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, - void *object) -{ - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return NULL; - - return &alloc_meta->alloc_track; -} - -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) -{ - if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREETRACK) - return NULL; - /* Free meta must be present with KASAN_SLAB_FREETRACK. */ - return &kasan_get_free_meta(cache, object)->free_track; -} diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 348dc207d4623..74d21786ef091 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -127,6 +127,27 @@ const char *kasan_get_bug_type(struct kasan_report_info *info) return get_wild_bug_type(info); } +struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, + void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return NULL; + + return &alloc_meta->alloc_track; +} + +struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, + void *object, u8 tag) +{ + if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREETRACK) + return NULL; + /* Free meta must be present with KASAN_SLAB_FREETRACK. */ + return &kasan_get_free_meta(cache, object)->free_track; +} + void kasan_metadata_fetch_row(char *buffer, void *row) { memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 35cf3cae4aa45..79b6497d8a81b 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -21,3 +21,15 @@ const char *kasan_get_bug_type(struct kasan_report_info *info) return "invalid-access"; } + +struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, + void *object) +{ + return NULL; +} + +struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, + void *object, u8 tag) +{ + return NULL; +} diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index fd11d10a4ffc6..39a0481e5228c 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -24,15 +24,3 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) void kasan_save_free_info(struct kmem_cache *cache, void *object) { } - -struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, - void *object) -{ - return NULL; -} - -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) -{ - return NULL; -} From 4e881e30ff03ec565fb6d232352f262eb4b9e6d2 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:36 +0200 Subject: [PATCH 102/352] kasan: cosmetic changes in report.c Do a few non-functional style fixes for the code in report.c. Link: https://lkml.kernel.org/r/b728eae71f3ea505a885449724de21cf3f476a7b.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/report.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 5d225d7d9c4c7..83f420a28c0b2 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -200,25 +200,22 @@ static void print_error_description(struct kasan_report_info *info) static void print_track(struct kasan_track *track, const char *prefix) { pr_err("%s by task %u:\n", prefix, track->pid); - if (track->stack) { + if (track->stack) stack_depot_print(track->stack); - } else { + else pr_err("(stack is not available)\n"); - } } struct page *kasan_addr_to_page(const void *addr) { - if ((addr >= (void *)PAGE_OFFSET) && - (addr < high_memory)) + if ((addr >= (void *)PAGE_OFFSET) && (addr < high_memory)) return virt_to_head_page(addr); return NULL; } struct slab *kasan_addr_to_slab(const void *addr) { - if ((addr >= (void *)PAGE_OFFSET) && - (addr < high_memory)) + if ((addr >= (void *)PAGE_OFFSET) && (addr < high_memory)) return virt_to_slab(addr); return NULL; } From f8265f46d56b1a1da7f5de84143d8d1d0a8a1b17 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:37 +0200 Subject: [PATCH 103/352] kasan: use virt_addr_valid in kasan_addr_to_page/slab Instead of open-coding the validity checks for addr in kasan_addr_to_page/slab(), use the virt_addr_valid() helper. Link: https://lkml.kernel.org/r/c22a4850d74d7430f8a6c08216fd55c2860a2b9e.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/report.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 83f420a28c0b2..570f9419b90cc 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -208,14 +208,14 @@ static void print_track(struct kasan_track *track, const char *prefix) struct page *kasan_addr_to_page(const void *addr) { - if ((addr >= (void *)PAGE_OFFSET) && (addr < high_memory)) + if (virt_addr_valid(addr)) return virt_to_head_page(addr); return NULL; } struct slab *kasan_addr_to_slab(const void *addr) { - if ((addr >= (void *)PAGE_OFFSET) && (addr < high_memory)) + if (virt_addr_valid(addr)) return virt_to_slab(addr); return NULL; } From 9d1b324f848e39c0808c952f4cd381d1ca7574c8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:38 +0200 Subject: [PATCH 104/352] kasan: use kasan_addr_to_slab in print_address_description Use the kasan_addr_to_slab() helper in print_address_description() instead of separately invoking PageSlab() and page_slab(). Link: https://lkml.kernel.org/r/8b744fbf8c3c7fc5d34329ec70b60ee5c8dba66c.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 7 +++++++ mm/kasan/report.c | 11 ++--------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index f8e16a2421978..50f4338b477f2 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -30,6 +30,13 @@ #include "kasan.h" #include "../slab.h" +struct slab *kasan_addr_to_slab(const void *addr) +{ + if (virt_addr_valid(addr)) + return virt_to_slab(addr); + return NULL; +} + depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc) { unsigned long entries[KASAN_STACK_DEPTH]; diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 570f9419b90cc..cd31b3b89ca15 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -213,13 +213,6 @@ struct page *kasan_addr_to_page(const void *addr) return NULL; } -struct slab *kasan_addr_to_slab(const void *addr) -{ - if (virt_addr_valid(addr)) - return virt_to_slab(addr); - return NULL; -} - static void describe_object_addr(struct kmem_cache *cache, void *object, const void *addr) { @@ -297,12 +290,12 @@ static inline bool init_task_stack_addr(const void *addr) static void print_address_description(void *addr, u8 tag) { struct page *page = kasan_addr_to_page(addr); + struct slab *slab = kasan_addr_to_slab(addr); dump_stack_lvl(KERN_ERR); pr_err("\n"); - if (page && PageSlab(page)) { - struct slab *slab = page_slab(page); + if (slab) { struct kmem_cache *cache = slab->slab_cache; void *object = nearest_obj(cache, slab, addr); From 63127df047ba94f6735decbdb8e95e5ec848b9ee Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:39 +0200 Subject: [PATCH 105/352] kasan: make kasan_addr_to_page static As kasan_addr_to_page() is only used in report.c, rename it to addr_to_page() and make it static. Link: https://lkml.kernel.org/r/66c1267200fe0c16e2ac8847a9315fda041918cb.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 1 - mm/kasan/report.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index cca49ab029f1c..4fddfdb08abf1 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -291,7 +291,6 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type); -struct page *kasan_addr_to_page(const void *addr); struct slab *kasan_addr_to_slab(const void *addr); #ifdef CONFIG_KASAN_GENERIC diff --git a/mm/kasan/report.c b/mm/kasan/report.c index cd31b3b89ca15..ac526c10ebff7 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -206,7 +206,7 @@ static void print_track(struct kasan_track *track, const char *prefix) pr_err("(stack is not available)\n"); } -struct page *kasan_addr_to_page(const void *addr) +static inline struct page *addr_to_page(const void *addr) { if (virt_addr_valid(addr)) return virt_to_head_page(addr); @@ -289,7 +289,7 @@ static inline bool init_task_stack_addr(const void *addr) static void print_address_description(void *addr, u8 tag) { - struct page *page = kasan_addr_to_page(addr); + struct page *page = addr_to_page(addr); struct slab *slab = kasan_addr_to_slab(addr); dump_stack_lvl(KERN_ERR); From 5c074e20ce6c1626666fd65480d5a6a832b01364 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:40 +0200 Subject: [PATCH 106/352] kasan: simplify print_report To simplify reading the implementation of print_report(), remove the tagged_addr variable and rename untagged_addr to addr. Link: https://lkml.kernel.org/r/f64f5f1093b3c06896bf0f850c5d9e661313fcb2.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/report.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ac526c10ebff7..dc38ada86f85d 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -397,17 +397,16 @@ static void print_memory_metadata(const void *addr) static void print_report(struct kasan_report_info *info) { - void *tagged_addr = info->access_addr; - void *untagged_addr = kasan_reset_tag(tagged_addr); - u8 tag = get_tag(tagged_addr); + void *addr = kasan_reset_tag(info->access_addr); + u8 tag = get_tag(info->access_addr); print_error_description(info); - if (addr_has_metadata(untagged_addr)) + if (addr_has_metadata(addr)) kasan_print_tags(tag, info->first_bad_addr); pr_err("\n"); - if (addr_has_metadata(untagged_addr)) { - print_address_description(untagged_addr, tag); + if (addr_has_metadata(addr)) { + print_address_description(addr, tag); print_memory_metadata(info->first_bad_addr); } else { dump_stack_lvl(KERN_ERR); From 45eca80d7f419336a3cceef33cda2ab7c7e33ac9 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:41 +0200 Subject: [PATCH 107/352] kasan: introduce complete_report_info Introduce a complete_report_info() function that fills in the first_bad_addr field of kasan_report_info instead of doing it in kasan_report_*(). This function will be extended in the next patch. Link: https://lkml.kernel.org/r/8eb1a9bd01f5d31eab4524da54a101b8720b469e.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 5 ++++- mm/kasan/report.c | 17 +++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 4fddfdb08abf1..7e07115873d3b 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -153,12 +153,15 @@ enum kasan_report_type { }; struct kasan_report_info { + /* Filled in by kasan_report_*(). */ enum kasan_report_type type; void *access_addr; - void *first_bad_addr; size_t access_size; bool is_write; unsigned long ip; + + /* Filled in by the common reporting code. */ + void *first_bad_addr; }; /* Do not change the struct layout: compiler ABI. */ diff --git a/mm/kasan/report.c b/mm/kasan/report.c index dc38ada86f85d..0c2e7a58095d9 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -413,6 +413,17 @@ static void print_report(struct kasan_report_info *info) } } +static void complete_report_info(struct kasan_report_info *info) +{ + void *addr = kasan_reset_tag(info->access_addr); + + if (info->type == KASAN_REPORT_ACCESS) + info->first_bad_addr = kasan_find_first_bad_addr( + info->access_addr, info->access_size); + else + info->first_bad_addr = addr; +} + void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_type type) { unsigned long flags; @@ -430,11 +441,12 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty info.type = type; info.access_addr = ptr; - info.first_bad_addr = kasan_reset_tag(ptr); info.access_size = 0; info.is_write = false; info.ip = ip; + complete_report_info(&info); + print_report(&info); end_report(&flags, ptr); @@ -463,11 +475,12 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, info.type = KASAN_REPORT_ACCESS; info.access_addr = ptr; - info.first_bad_addr = kasan_find_first_bad_addr(ptr, size); info.access_size = size; info.is_write = is_write; info.ip = ip; + complete_report_info(&info); + print_report(&info); end_report(&irq_flags, ptr); From 49eeb5769e4b7a1322d40e1309703e2e51924260 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:42 +0200 Subject: [PATCH 108/352] kasan: fill in cache and object in complete_report_info Add cache and object fields to kasan_report_info and fill them in in complete_report_info() instead of fetching them in the middle of the report printing code. This allows the reporting code to get access to the object information before starting printing the report. One of the following patches uses this information to determine the bug type with the tag-based modes. Link: https://lkml.kernel.org/r/23264572cb2cbb8f0efbb51509b6757eb3cc1fc9.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 2 ++ mm/kasan/report.c | 21 +++++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 7e07115873d3b..b8fa1e50f3d48 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -162,6 +162,8 @@ struct kasan_report_info { /* Filled in by the common reporting code. */ void *first_bad_addr; + struct kmem_cache *cache; + void *object; }; /* Do not change the struct layout: compiler ABI. */ diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 0c2e7a58095d9..763de8e68887f 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -287,19 +287,16 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } -static void print_address_description(void *addr, u8 tag) +static void print_address_description(void *addr, u8 tag, + struct kasan_report_info *info) { struct page *page = addr_to_page(addr); - struct slab *slab = kasan_addr_to_slab(addr); dump_stack_lvl(KERN_ERR); pr_err("\n"); - if (slab) { - struct kmem_cache *cache = slab->slab_cache; - void *object = nearest_obj(cache, slab, addr); - - describe_object(cache, object, addr, tag); + if (info->cache && info->object) { + describe_object(info->cache, info->object, addr, tag); pr_err("\n"); } @@ -406,7 +403,7 @@ static void print_report(struct kasan_report_info *info) pr_err("\n"); if (addr_has_metadata(addr)) { - print_address_description(addr, tag); + print_address_description(addr, tag, info); print_memory_metadata(info->first_bad_addr); } else { dump_stack_lvl(KERN_ERR); @@ -416,12 +413,20 @@ static void print_report(struct kasan_report_info *info) static void complete_report_info(struct kasan_report_info *info) { void *addr = kasan_reset_tag(info->access_addr); + struct slab *slab; if (info->type == KASAN_REPORT_ACCESS) info->first_bad_addr = kasan_find_first_bad_addr( info->access_addr, info->access_size); else info->first_bad_addr = addr; + + slab = kasan_addr_to_slab(addr); + if (slab) { + info->cache = slab->slab_cache; + info->object = nearest_obj(info->cache, slab, addr); + } else + info->cache = info->object = NULL; } void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_type type) From c1851344512d9550148daf8ad8ad91f0adf4198c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:43 +0200 Subject: [PATCH 109/352] kasan: rework function arguments in report.c Pass a pointer to kasan_report_info to describe_object() and describe_object_stacks(), instead of passing the structure's fields. The untagged pointer and the tag are still passed as separate arguments to some of the functions to avoid duplicating the untagging logic. This is preparatory change for the next patch. Link: https://lkml.kernel.org/r/2e0cdb91524ab528a3c2b12b6d8bcb69512fc4af.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/report.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 763de8e68887f..ec018f8499920 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -213,8 +213,8 @@ static inline struct page *addr_to_page(const void *addr) return NULL; } -static void describe_object_addr(struct kmem_cache *cache, void *object, - const void *addr) +static void describe_object_addr(const void *addr, struct kmem_cache *cache, + void *object) { unsigned long access_addr = (unsigned long)addr; unsigned long object_addr = (unsigned long)object; @@ -242,33 +242,32 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, (void *)(object_addr + cache->object_size)); } -static void describe_object_stacks(struct kmem_cache *cache, void *object, - const void *addr, u8 tag) +static void describe_object_stacks(u8 tag, struct kasan_report_info *info) { struct kasan_track *alloc_track; struct kasan_track *free_track; - alloc_track = kasan_get_alloc_track(cache, object); + alloc_track = kasan_get_alloc_track(info->cache, info->object); if (alloc_track) { print_track(alloc_track, "Allocated"); pr_err("\n"); } - free_track = kasan_get_free_track(cache, object, tag); + free_track = kasan_get_free_track(info->cache, info->object, tag); if (free_track) { print_track(free_track, "Freed"); pr_err("\n"); } - kasan_print_aux_stacks(cache, object); + kasan_print_aux_stacks(info->cache, info->object); } -static void describe_object(struct kmem_cache *cache, void *object, - const void *addr, u8 tag) +static void describe_object(const void *addr, u8 tag, + struct kasan_report_info *info) { if (kasan_stack_collection_enabled()) - describe_object_stacks(cache, object, addr, tag); - describe_object_addr(cache, object, addr); + describe_object_stacks(tag, info); + describe_object_addr(addr, info->cache, info->object); } static inline bool kernel_or_module_addr(const void *addr) @@ -296,7 +295,7 @@ static void print_address_description(void *addr, u8 tag, pr_err("\n"); if (info->cache && info->object) { - describe_object(info->cache, info->object, addr, tag); + describe_object(addr, tag, info); pr_err("\n"); } From f11575b30108911d20c3570bd91b2e6f991a4ed6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:44 +0200 Subject: [PATCH 110/352] kasan: introduce kasan_complete_mode_report_info Add bug_type and alloc/free_track fields to kasan_report_info and add a kasan_complete_mode_report_info() function that fills in these fields. This function is implemented differently for different KASAN mode. Change the reporting code to use the filled in fields instead of invoking kasan_get_bug_type() and kasan_get_alloc/free_track(). For the Generic mode, kasan_complete_mode_report_info() invokes these functions instead. For the tag-based modes, only the bug_type field is filled in; alloc/free_track are handled in the next patch. Using a single function that fills in these fields is required for the tag-based modes, as the values for all three fields are determined in a single procedure implemented in the following patch. Link: https://lkml.kernel.org/r/8432b861054fa8d0cee79a8877dedeaf3b677ca8.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 33 +++++++++++++++++---------------- mm/kasan/report.c | 30 ++++++++++++++---------------- mm/kasan/report_generic.c | 32 +++++++++++++++++--------------- mm/kasan/report_tags.c | 13 +++---------- 4 files changed, 51 insertions(+), 57 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index b8fa1e50f3d48..7df107dc400ac 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -146,6 +146,13 @@ static inline bool kasan_requires_meta(void) #define META_MEM_BYTES_PER_ROW (META_BYTES_PER_ROW * KASAN_GRANULE_SIZE) #define META_ROWS_AROUND_ADDR 2 +#define KASAN_STACK_DEPTH 64 + +struct kasan_track { + u32 pid; + depot_stack_handle_t stack; +}; + enum kasan_report_type { KASAN_REPORT_ACCESS, KASAN_REPORT_INVALID_FREE, @@ -164,6 +171,11 @@ struct kasan_report_info { void *first_bad_addr; struct kmem_cache *cache; void *object; + + /* Filled in by the mode-specific reporting code. */ + const char *bug_type; + struct kasan_track alloc_track; + struct kasan_track free_track; }; /* Do not change the struct layout: compiler ABI. */ @@ -189,14 +201,7 @@ struct kasan_global { #endif }; -/* Structures for keeping alloc and free tracks. */ - -#define KASAN_STACK_DEPTH 64 - -struct kasan_track { - u32 pid; - depot_stack_handle_t stack; -}; +/* Structures for keeping alloc and free meta. */ #ifdef CONFIG_KASAN_GENERIC @@ -270,16 +275,16 @@ static inline bool addr_has_metadata(const void *addr) #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ +void *kasan_find_first_bad_addr(void *addr, size_t size); +void kasan_complete_mode_report_info(struct kasan_report_info *info); +void kasan_metadata_fetch_row(char *buffer, void *row); + #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) void kasan_print_tags(u8 addr_tag, const void *addr); #else static inline void kasan_print_tags(u8 addr_tag, const void *addr) { } #endif -void *kasan_find_first_bad_addr(void *addr, size_t size); -const char *kasan_get_bug_type(struct kasan_report_info *info); -void kasan_metadata_fetch_row(char *buffer, void *row); - #if defined(CONFIG_KASAN_STACK) void kasan_print_address_stack_frame(const void *addr); #else @@ -314,10 +319,6 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); void kasan_save_free_info(struct kmem_cache *cache, void *object); -struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, - void *object); -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag); #if defined(CONFIG_KASAN_GENERIC) && \ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ec018f8499920..39e8e5a80b829 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -185,8 +185,7 @@ static void print_error_description(struct kasan_report_info *info) return; } - pr_err("BUG: KASAN: %s in %pS\n", - kasan_get_bug_type(info), (void *)info->ip); + pr_err("BUG: KASAN: %s in %pS\n", info->bug_type, (void *)info->ip); if (info->access_size) pr_err("%s of size %zu at addr %px by task %s/%d\n", info->is_write ? "Write" : "Read", info->access_size, @@ -242,31 +241,25 @@ static void describe_object_addr(const void *addr, struct kmem_cache *cache, (void *)(object_addr + cache->object_size)); } -static void describe_object_stacks(u8 tag, struct kasan_report_info *info) +static void describe_object_stacks(struct kasan_report_info *info) { - struct kasan_track *alloc_track; - struct kasan_track *free_track; - - alloc_track = kasan_get_alloc_track(info->cache, info->object); - if (alloc_track) { - print_track(alloc_track, "Allocated"); + if (info->alloc_track.stack) { + print_track(&info->alloc_track, "Allocated"); pr_err("\n"); } - free_track = kasan_get_free_track(info->cache, info->object, tag); - if (free_track) { - print_track(free_track, "Freed"); + if (info->free_track.stack) { + print_track(&info->free_track, "Freed"); pr_err("\n"); } kasan_print_aux_stacks(info->cache, info->object); } -static void describe_object(const void *addr, u8 tag, - struct kasan_report_info *info) +static void describe_object(const void *addr, struct kasan_report_info *info) { if (kasan_stack_collection_enabled()) - describe_object_stacks(tag, info); + describe_object_stacks(info); describe_object_addr(addr, info->cache, info->object); } @@ -295,7 +288,7 @@ static void print_address_description(void *addr, u8 tag, pr_err("\n"); if (info->cache && info->object) { - describe_object(addr, tag, info); + describe_object(addr, info); pr_err("\n"); } @@ -426,6 +419,9 @@ static void complete_report_info(struct kasan_report_info *info) info->object = nearest_obj(info->cache, slab, addr); } else info->cache = info->object = NULL; + + /* Fill in mode-specific report info fields. */ + kasan_complete_mode_report_info(info); } void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_type type) @@ -443,6 +439,7 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty start_report(&flags, true); + memset(&info, 0, sizeof(info)); info.type = type; info.access_addr = ptr; info.access_size = 0; @@ -477,6 +474,7 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, start_report(&irq_flags, true); + memset(&info, 0, sizeof(info)); info.type = KASAN_REPORT_ACCESS; info.access_addr = ptr; info.access_size = size; diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 74d21786ef091..087c1d8c81456 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -109,7 +109,7 @@ static const char *get_wild_bug_type(struct kasan_report_info *info) return bug_type; } -const char *kasan_get_bug_type(struct kasan_report_info *info) +static const char *get_bug_type(struct kasan_report_info *info) { /* * If access_size is a negative number, then it has reason to be @@ -127,25 +127,27 @@ const char *kasan_get_bug_type(struct kasan_report_info *info) return get_wild_bug_type(info); } -struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, - void *object) +void kasan_complete_mode_report_info(struct kasan_report_info *info) { struct kasan_alloc_meta *alloc_meta; + struct kasan_free_meta *free_meta; - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return NULL; + info->bug_type = get_bug_type(info); - return &alloc_meta->alloc_track; -} + if (!info->cache || !info->object) + return; -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) -{ - if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREETRACK) - return NULL; - /* Free meta must be present with KASAN_SLAB_FREETRACK. */ - return &kasan_get_free_meta(cache, object)->free_track; + alloc_meta = kasan_get_alloc_meta(info->cache, info->object); + if (alloc_meta) + memcpy(&info->alloc_track, &alloc_meta->alloc_track, + sizeof(info->alloc_track)); + + if (*(u8 *)kasan_mem_to_shadow(info->object) == KASAN_SLAB_FREETRACK) { + /* Free meta must be present with KASAN_SLAB_FREETRACK. */ + free_meta = kasan_get_free_meta(info->cache, info->object); + memcpy(&info->free_track, &free_meta->free_track, + sizeof(info->free_track)); + } } void kasan_metadata_fetch_row(char *buffer, void *row) diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 79b6497d8a81b..5cbac2cdb177b 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -6,7 +6,7 @@ #include "kasan.h" -const char *kasan_get_bug_type(struct kasan_report_info *info) +static const char *get_bug_type(struct kasan_report_info *info) { /* * If access_size is a negative number, then it has reason to be @@ -22,14 +22,7 @@ const char *kasan_get_bug_type(struct kasan_report_info *info) return "invalid-access"; } -struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, - void *object) +void kasan_complete_mode_report_info(struct kasan_report_info *info) { - return NULL; -} - -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) -{ - return NULL; + info->bug_type = get_bug_type(info); } From d797195c90527c52d2d4f98336231a7b9dcb91b6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:45 +0200 Subject: [PATCH 111/352] kasan: implement stack ring for tag-based modes Implement storing stack depot handles for alloc/free stack traces for slab objects for the tag-based KASAN modes in a ring buffer. This ring buffer is referred to as the stack ring. On each alloc/free of a slab object, the tagged address of the object and the current stack trace are recorded in the stack ring. On each bug report, if the accessed address belongs to a slab object, the stack ring is scanned for matching entries. The newest entries are used to print the alloc/free stack traces in the report: one entry for alloc and one for free. The number of entries in the stack ring is fixed in this patch, but one of the following patches adds a command-line argument to control it. Link: https://lkml.kernel.org/r/692de14b6b6a1bc817fd55e4ad92fc1f83c1ab59.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 21 +++++++++++++ mm/kasan/report_tags.c | 71 ++++++++++++++++++++++++++++++++++++++++++ mm/kasan/tags.c | 50 +++++++++++++++++++++++++++++ 3 files changed, 142 insertions(+) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 7df107dc400ac..cfff81139d67e 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -2,6 +2,7 @@ #ifndef __MM_KASAN_KASAN_H #define __MM_KASAN_KASAN_H +#include #include #include #include @@ -233,6 +234,26 @@ struct kasan_free_meta { #endif /* CONFIG_KASAN_GENERIC */ +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + +struct kasan_stack_ring_entry { + void *ptr; + size_t size; + u32 pid; + depot_stack_handle_t stack; + bool is_free; +}; + +#define KASAN_STACK_RING_SIZE (32 << 10) + +struct kasan_stack_ring { + rwlock_t lock; + atomic64_t pos; + struct kasan_stack_ring_entry entries[KASAN_STACK_RING_SIZE]; +}; + +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) /* Used in KUnit-compatible KASAN tests. */ struct kunit_kasan_status { diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 5cbac2cdb177b..1b78136542bb6 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -4,8 +4,12 @@ * Copyright (c) 2020 Google, Inc. */ +#include + #include "kasan.h" +extern struct kasan_stack_ring stack_ring; + static const char *get_bug_type(struct kasan_report_info *info) { /* @@ -24,5 +28,72 @@ static const char *get_bug_type(struct kasan_report_info *info) void kasan_complete_mode_report_info(struct kasan_report_info *info) { + unsigned long flags; + u64 pos; + struct kasan_stack_ring_entry *entry; + void *ptr; + u32 pid; + depot_stack_handle_t stack; + bool is_free; + bool alloc_found = false, free_found = false; + info->bug_type = get_bug_type(info); + + if (!info->cache || !info->object) + return; + } + + write_lock_irqsave(&stack_ring.lock, flags); + + pos = atomic64_read(&stack_ring.pos); + + /* + * The loop below tries to find stack ring entries relevant to the + * buggy object. This is a best-effort process. + * + * First, another object with the same tag can be allocated in place of + * the buggy object. Also, since the number of entries is limited, the + * entries relevant to the buggy object can be overwritten. + */ + + for (u64 i = pos - 1; i != pos - 1 - KASAN_STACK_RING_SIZE; i--) { + if (alloc_found && free_found) + break; + + entry = &stack_ring.entries[i % KASAN_STACK_RING_SIZE]; + + /* Paired with smp_store_release() in save_stack_info(). */ + ptr = (void *)smp_load_acquire(&entry->ptr); + + if (kasan_reset_tag(ptr) != info->object || + get_tag(ptr) != get_tag(info->access_addr)) + continue; + + pid = READ_ONCE(entry->pid); + stack = READ_ONCE(entry->stack); + is_free = READ_ONCE(entry->is_free); + + if (is_free) { + /* + * Second free of the same object. + * Give up on trying to find the alloc entry. + */ + if (free_found) + break; + + info->free_track.pid = pid; + info->free_track.stack = stack; + free_found = true; + } else { + /* Second alloc of the same object. Give up. */ + if (alloc_found) + break; + + info->alloc_track.pid = pid; + info->alloc_track.stack = stack; + alloc_found = true; + } + } + + write_unlock_irqrestore(&stack_ring.lock, flags); } diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 39a0481e5228c..07828021c1f55 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -6,6 +6,7 @@ * Copyright (c) 2020 Google, Inc. */ +#include #include #include #include @@ -16,11 +17,60 @@ #include #include "kasan.h" +#include "../slab.h" + +/* Non-zero, as initial pointer values are 0. */ +#define STACK_RING_BUSY_PTR ((void *)1) + +struct kasan_stack_ring stack_ring; + +static void save_stack_info(struct kmem_cache *cache, void *object, + gfp_t gfp_flags, bool is_free) +{ + unsigned long flags; + depot_stack_handle_t stack; + u64 pos; + struct kasan_stack_ring_entry *entry; + void *old_ptr; + + stack = kasan_save_stack(gfp_flags, true); + + /* + * Prevent save_stack_info() from modifying stack ring + * when kasan_complete_mode_report_info() is walking it. + */ + read_lock_irqsave(&stack_ring.lock, flags); + +next: + pos = atomic64_fetch_add(1, &stack_ring.pos); + entry = &stack_ring.entries[pos % KASAN_STACK_RING_SIZE]; + + /* Detect stack ring entry slots that are being written to. */ + old_ptr = READ_ONCE(entry->ptr); + if (old_ptr == STACK_RING_BUSY_PTR) + goto next; /* Busy slot. */ + if (!try_cmpxchg(&entry->ptr, &old_ptr, STACK_RING_BUSY_PTR)) + goto next; /* Busy slot. */ + + WRITE_ONCE(entry->size, cache->object_size); + WRITE_ONCE(entry->pid, current->pid); + WRITE_ONCE(entry->stack, stack); + WRITE_ONCE(entry->is_free, is_free); + + /* + * Paired with smp_load_acquire() in kasan_complete_mode_report_info(). + */ + smp_store_release(&entry->ptr, (s64)object); + + read_unlock_irqrestore(&stack_ring.lock, flags); +} void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) { + save_stack_info(cache, object, flags, false); } void kasan_save_free_info(struct kmem_cache *cache, void *object) { + save_stack_info(cache, object, GFP_NOWAIT, true); } From e527d3c28443fdcdded901f0f5600110db6b5cdd Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 20 Sep 2022 20:58:07 +0200 Subject: [PATCH 112/352] kasan: initialize read-write lock in stack ring Use __RW_LOCK_UNLOCKED to initialize stack_ring.lock. Link: https://lkml.kernel.org/r/576182d194e27531e8090bad809e4136953895f4.1663700262.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reported-by: Yu Zhao Reviewed-by: Marco Elver Tested-by: Yu Zhao Signed-off-by: Andrew Morton --- mm/kasan/tags.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 07828021c1f55..a0524e037f499 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -22,7 +22,9 @@ /* Non-zero, as initial pointer values are 0. */ #define STACK_RING_BUSY_PTR ((void *)1) -struct kasan_stack_ring stack_ring; +struct kasan_stack_ring stack_ring = { + .lock = __RW_LOCK_UNLOCKED(stack_ring.lock) +}; static void save_stack_info(struct kmem_cache *cache, void *object, gfp_t gfp_flags, bool is_free) From 4c6bf4b3f90da6b94200873817a270e590819b9c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:46 +0200 Subject: [PATCH 113/352] kasan: support kasan.stacktrace for SW_TAGS Add support for the kasan.stacktrace command-line argument for Software Tag-Based KASAN. The following patch adds a command-line argument for selecting the stack ring size, and, as the stack ring is supported by both the Software and the Hardware Tag-Based KASAN modes, it is natural that both of them have support for kasan.stacktrace too. Link: https://lkml.kernel.org/r/3b43059103faa7f8796017847b7d674b658f11b5.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- Documentation/dev-tools/kasan.rst | 15 ++++++----- mm/kasan/hw_tags.c | 39 +--------------------------- mm/kasan/kasan.h | 36 +++++++++++++++++--------- mm/kasan/sw_tags.c | 5 +++- mm/kasan/tags.c | 43 +++++++++++++++++++++++++++++++ 5 files changed, 81 insertions(+), 57 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 1772fd457fed9..7bd38c1810185 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -111,9 +111,15 @@ parameter can be used to control panic and reporting behaviour: report or also panic the kernel (default: ``report``). The panic happens even if ``kasan_multi_shot`` is enabled. -Hardware Tag-Based KASAN mode (see the section about various modes below) is -intended for use in production as a security mitigation. Therefore, it supports -additional boot parameters that allow disabling KASAN or controlling features: +Software and Hardware Tag-Based KASAN modes (see the section about various +modes below) support disabling stack trace collection: + +- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack + traces collection (default: ``on``). + +Hardware Tag-Based KASAN mode is intended for use in production as a security +mitigation. Therefore, it supports additional boot parameters that allow +disabling KASAN altogether or controlling its features: - ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``). @@ -132,9 +138,6 @@ additional boot parameters that allow disabling KASAN or controlling features: - ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc allocations (default: ``on``). -- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack - traces collection (default: ``on``). - Error reports ~~~~~~~~~~~~~ diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 9ad8eff71b28d..b22c4f461cb0b 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -38,16 +38,9 @@ enum kasan_arg_vmalloc { KASAN_ARG_VMALLOC_ON, }; -enum kasan_arg_stacktrace { - KASAN_ARG_STACKTRACE_DEFAULT, - KASAN_ARG_STACKTRACE_OFF, - KASAN_ARG_STACKTRACE_ON, -}; - static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata; -static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; /* * Whether KASAN is enabled at all. @@ -66,9 +59,6 @@ EXPORT_SYMBOL_GPL(kasan_mode); /* Whether to enable vmalloc tagging. */ DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc); -/* Whether to collect alloc/free stack traces. */ -DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace); - /* kasan=off/on */ static int __init early_kasan_flag(char *arg) { @@ -122,23 +112,6 @@ static int __init early_kasan_flag_vmalloc(char *arg) } early_param("kasan.vmalloc", early_kasan_flag_vmalloc); -/* kasan.stacktrace=off/on */ -static int __init early_kasan_flag_stacktrace(char *arg) -{ - if (!arg) - return -EINVAL; - - if (!strcmp(arg, "off")) - kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_OFF; - else if (!strcmp(arg, "on")) - kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_ON; - else - return -EINVAL; - - return 0; -} -early_param("kasan.stacktrace", early_kasan_flag_stacktrace); - static inline const char *kasan_mode_info(void) { if (kasan_mode == KASAN_MODE_ASYNC) @@ -213,17 +186,7 @@ void __init kasan_init_hw_tags(void) break; } - switch (kasan_arg_stacktrace) { - case KASAN_ARG_STACKTRACE_DEFAULT: - /* Default is specified by kasan_flag_stacktrace definition. */ - break; - case KASAN_ARG_STACKTRACE_OFF: - static_branch_disable(&kasan_flag_stacktrace); - break; - case KASAN_ARG_STACKTRACE_ON: - static_branch_enable(&kasan_flag_stacktrace); - break; - } + kasan_init_tags(); /* KASAN is now initialized, enable it. */ static_branch_enable(&kasan_flag_enabled); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index cfff81139d67e..447baf1a7a2e4 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -8,13 +8,31 @@ #include #include -#ifdef CONFIG_KASAN_HW_TAGS +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) #include + +DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace); + +static inline bool kasan_stack_collection_enabled(void) +{ + return static_branch_unlikely(&kasan_flag_stacktrace); +} + +#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + +static inline bool kasan_stack_collection_enabled(void) +{ + return true; +} + +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + +#ifdef CONFIG_KASAN_HW_TAGS + #include "../slab.h" DECLARE_STATIC_KEY_TRUE(kasan_flag_vmalloc); -DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace); enum kasan_mode { KASAN_MODE_SYNC, @@ -29,11 +47,6 @@ static inline bool kasan_vmalloc_enabled(void) return static_branch_likely(&kasan_flag_vmalloc); } -static inline bool kasan_stack_collection_enabled(void) -{ - return static_branch_unlikely(&kasan_flag_stacktrace); -} - static inline bool kasan_async_fault_possible(void) { return kasan_mode == KASAN_MODE_ASYNC || kasan_mode == KASAN_MODE_ASYMM; @@ -46,11 +59,6 @@ static inline bool kasan_sync_fault_possible(void) #else /* CONFIG_KASAN_HW_TAGS */ -static inline bool kasan_stack_collection_enabled(void) -{ - return true; -} - static inline bool kasan_async_fault_possible(void) { return false; @@ -410,6 +418,10 @@ static inline void kasan_enable_tagging(void) { } #endif /* CONFIG_KASAN_HW_TAGS */ +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) +void __init kasan_init_tags(void); +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + #if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_force_async_fault(void); diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 77f13f391b577..a3afaf2ad1b11 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -42,7 +42,10 @@ void __init kasan_init_sw_tags(void) for_each_possible_cpu(cpu) per_cpu(prng_state, cpu) = (u32)get_cycles(); - pr_info("KernelAddressSanitizer initialized (sw-tags)\n"); + kasan_init_tags(); + + pr_info("KernelAddressSanitizer initialized (sw-tags, stacktrace=%s)\n", + kasan_stack_collection_enabled() ? "on" : "off"); } /* diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index a0524e037f499..dd929ab166fb4 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -19,6 +19,17 @@ #include "kasan.h" #include "../slab.h" +enum kasan_arg_stacktrace { + KASAN_ARG_STACKTRACE_DEFAULT, + KASAN_ARG_STACKTRACE_OFF, + KASAN_ARG_STACKTRACE_ON, +}; + +static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; + +/* Whether to collect alloc/free stack traces. */ +DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace); + /* Non-zero, as initial pointer values are 0. */ #define STACK_RING_BUSY_PTR ((void *)1) @@ -26,6 +37,38 @@ struct kasan_stack_ring stack_ring = { .lock = __RW_LOCK_UNLOCKED(stack_ring.lock) }; +/* kasan.stacktrace=off/on */ +static int __init early_kasan_flag_stacktrace(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "off")) + kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_OFF; + else if (!strcmp(arg, "on")) + kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_ON; + else + return -EINVAL; + + return 0; +} +early_param("kasan.stacktrace", early_kasan_flag_stacktrace); + +void __init kasan_init_tags(void) +{ + switch (kasan_arg_stacktrace) { + case KASAN_ARG_STACKTRACE_DEFAULT: + /* Default is specified by kasan_flag_stacktrace definition. */ + break; + case KASAN_ARG_STACKTRACE_OFF: + static_branch_disable(&kasan_flag_stacktrace); + break; + case KASAN_ARG_STACKTRACE_ON: + static_branch_enable(&kasan_flag_stacktrace); + break; + } +} + static void save_stack_info(struct kmem_cache *cache, void *object, gfp_t gfp_flags, bool is_free) { From 3fa0d70a8236e5d4ded48b99ece8c10038bc061f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:47 +0200 Subject: [PATCH 114/352] kasan: dynamically allocate stack ring entries Instead of using a large static array, allocate the stack ring dynamically via memblock_alloc(). The size of the stack ring is controlled by a new kasan.stack_ring_size command-line parameter. When kasan.stack_ring_size is not provided, the default value of 32 << 10 is used. When the stack trace collection is disabled via kasan.stacktrace=off, the stack ring is not allocated. Link: https://lkml.kernel.org/r/03b82ab60db53427e9818e0b0c1971baa10c3cbc.1662411800.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- Documentation/dev-tools/kasan.rst | 4 +++- mm/kasan/kasan.h | 5 ++--- mm/kasan/report_tags.c | 4 ++-- mm/kasan/tags.c | 25 ++++++++++++++++++++++++- 4 files changed, 31 insertions(+), 7 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 7bd38c1810185..5c93ab9150494 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -112,10 +112,12 @@ parameter can be used to control panic and reporting behaviour: if ``kasan_multi_shot`` is enabled. Software and Hardware Tag-Based KASAN modes (see the section about various -modes below) support disabling stack trace collection: +modes below) support altering stack trace collection behavior: - ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack traces collection (default: ``on``). +- ``kasan.stack_ring_size=`` specifies the number of entries + in the stack ring (default: ``32768``). Hardware Tag-Based KASAN mode is intended for use in production as a security mitigation. Therefore, it supports additional boot parameters that allow diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 447baf1a7a2e4..abbcc1b0eec50 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -252,12 +252,11 @@ struct kasan_stack_ring_entry { bool is_free; }; -#define KASAN_STACK_RING_SIZE (32 << 10) - struct kasan_stack_ring { rwlock_t lock; + size_t size; atomic64_t pos; - struct kasan_stack_ring_entry entries[KASAN_STACK_RING_SIZE]; + struct kasan_stack_ring_entry *entries; }; #endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 1b78136542bb6..57f7355377f14 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -56,11 +56,11 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) * entries relevant to the buggy object can be overwritten. */ - for (u64 i = pos - 1; i != pos - 1 - KASAN_STACK_RING_SIZE; i--) { + for (u64 i = pos - 1; i != pos - 1 - stack_ring.size; i--) { if (alloc_found && free_found) break; - entry = &stack_ring.entries[i % KASAN_STACK_RING_SIZE]; + entry = &stack_ring.entries[i % stack_ring.size]; /* Paired with smp_store_release() in save_stack_info(). */ ptr = (void *)smp_load_acquire(&entry->ptr); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index dd929ab166fb4..67a222586846e 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -19,6 +20,8 @@ #include "kasan.h" #include "../slab.h" +#define KASAN_STACK_RING_SIZE_DEFAULT (32 << 10) + enum kasan_arg_stacktrace { KASAN_ARG_STACKTRACE_DEFAULT, KASAN_ARG_STACKTRACE_OFF, @@ -54,6 +57,16 @@ static int __init early_kasan_flag_stacktrace(char *arg) } early_param("kasan.stacktrace", early_kasan_flag_stacktrace); +/* kasan.stack_ring_size= */ +static int __init early_kasan_flag_stack_ring_size(char *arg) +{ + if (!arg) + return -EINVAL; + + return kstrtoul(arg, 0, &stack_ring.size); +} +early_param("kasan.stack_ring_size", early_kasan_flag_stack_ring_size); + void __init kasan_init_tags(void) { switch (kasan_arg_stacktrace) { @@ -67,6 +80,16 @@ void __init kasan_init_tags(void) static_branch_enable(&kasan_flag_stacktrace); break; } + + if (kasan_stack_collection_enabled()) { + if (!stack_ring.size) + stack_ring.size = KASAN_STACK_RING_SIZE_DEFAULT; + stack_ring.entries = memblock_alloc( + sizeof(stack_ring.entries[0]) * stack_ring.size, + SMP_CACHE_BYTES); + if (WARN_ON(!stack_ring.entries)) + static_branch_disable(&kasan_flag_stacktrace); + } } static void save_stack_info(struct kmem_cache *cache, void *object, @@ -88,7 +111,7 @@ static void save_stack_info(struct kmem_cache *cache, void *object, next: pos = atomic64_fetch_add(1, &stack_ring.pos); - entry = &stack_ring.entries[pos % KASAN_STACK_RING_SIZE]; + entry = &stack_ring.entries[pos % stack_ring.size]; /* Detect stack ring entry slots that are being written to. */ old_ptr = READ_ONCE(entry->ptr); From 34cf17da8793e1a41ae486e41d22a3850a98f74b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:48 +0200 Subject: [PATCH 115/352] kasan: better identify bug types for tag-based modes Identify the bug type for the tag-based modes based on the stack trace entries found in the stack ring. If a free entry is found first (meaning that it was added last), mark the bug as use-after-free. If an alloc entry is found first, mark the bug as slab-out-of-bounds. Otherwise, assign the common bug type. This change returns the functionalify of the previously dropped CONFIG_KASAN_TAGS_IDENTIFY. Link: https://lkml.kernel.org/r/13ce7fa07d9d995caedd1439dfae4d51401842f2.1662411800.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/report_tags.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 57f7355377f14..d3510424d29be 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -10,7 +10,7 @@ extern struct kasan_stack_ring stack_ring; -static const char *get_bug_type(struct kasan_report_info *info) +static const char *get_common_bug_type(struct kasan_report_info *info) { /* * If access_size is a negative number, then it has reason to be @@ -37,9 +37,8 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) bool is_free; bool alloc_found = false, free_found = false; - info->bug_type = get_bug_type(info); - - if (!info->cache || !info->object) + if (!info->cache || !info->object) { + info->bug_type = get_common_bug_type(info); return; } @@ -84,6 +83,13 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) info->free_track.pid = pid; info->free_track.stack = stack; free_found = true; + + /* + * If a free entry is found first, the bug is likely + * a use-after-free. + */ + if (!info->bug_type) + info->bug_type = "use-after-free"; } else { /* Second alloc of the same object. Give up. */ if (alloc_found) @@ -92,8 +98,19 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) info->alloc_track.pid = pid; info->alloc_track.stack = stack; alloc_found = true; + + /* + * If an alloc entry is found first, the bug is likely + * an out-of-bounds. + */ + if (!info->bug_type) + info->bug_type = "slab-out-of-bounds"; } } write_unlock_irqrestore(&stack_ring.lock, flags); + + /* Assign the common bug type if no entries were found. */ + if (!info->bug_type) + info->bug_type = get_common_bug_type(info); } From 343339d48e4da5f85714af0f3f53c294c210bd07 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:49 +0200 Subject: [PATCH 116/352] kasan: add another use-after-free test Add a new use-after-free test that checks that KASAN detects use-after-free when another object was allocated in the same slot. This test is mainly relevant for the tag-based modes, which do not use quarantine. Once [1] is resolved, this test can be extended to check that the stack traces in the report point to the proper kmalloc/kfree calls. [1] https://bugzilla.kernel.org/show_bug.cgi?id=212203 Link: https://lkml.kernel.org/r/0659cfa15809dd38faa02bc0a59d0b5dbbd81211.1662411800.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- lib/test_kasan.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 58c1b01ccfe20..505f77ffad279 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -612,6 +612,29 @@ static void kmalloc_uaf2(struct kunit *test) kfree(ptr2); } +/* + * Check that KASAN detects use-after-free when another object was allocated in + * the same slot. Relevant for the tag-based modes, which do not use quarantine. + */ +static void kmalloc_uaf3(struct kunit *test) +{ + char *ptr1, *ptr2; + size_t size = 100; + + /* This test is specifically crafted for tag-based modes. */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + ptr1 = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + kfree(ptr1); + + ptr2 = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + kfree(ptr2); + + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[8]); +} + static void kfree_via_page(struct kunit *test) { char *ptr; @@ -1382,6 +1405,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kmalloc_uaf), KUNIT_CASE(kmalloc_uaf_memset), KUNIT_CASE(kmalloc_uaf2), + KUNIT_CASE(kmalloc_uaf3), KUNIT_CASE(kfree_via_page), KUNIT_CASE(kfree_via_phys), KUNIT_CASE(kmem_cache_oob), From e1a0d2c8d42aac75220d5fdef9a438f11ae2e13b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 6 Sep 2022 00:18:36 +0200 Subject: [PATCH 117/352] kasan: move tests to mm/kasan/ Move KASAN tests to mm/kasan/ to keep the test code alongside the implementation. Link: https://lkml.kernel.org/r/676398f0aeecd47d2f8e3369ea0e95563f641a36.1662416260.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- MAINTAINERS | 1 - lib/Makefile | 5 ----- mm/kasan/Makefile | 8 ++++++++ lib/test_kasan.c => mm/kasan/kasan_test.c | 2 +- lib/test_kasan_module.c => mm/kasan/kasan_test_module.c | 2 +- 5 files changed, 10 insertions(+), 8 deletions(-) rename lib/test_kasan.c => mm/kasan/kasan_test.c (99%) rename lib/test_kasan_module.c => mm/kasan/kasan_test_module.c (99%) diff --git a/MAINTAINERS b/MAINTAINERS index 17915d9a94bd0..61345a9a9d055 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10937,7 +10937,6 @@ F: arch/*/include/asm/*kasan.h F: arch/*/mm/kasan_init* F: include/linux/kasan*.h F: lib/Kconfig.kasan -F: lib/test_kasan*.c F: mm/kasan/ F: scripts/Makefile.kasan diff --git a/lib/Makefile b/lib/Makefile index 6dc0d6f8e57d7..d7d94102991b3 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -65,11 +65,6 @@ obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o obj-$(CONFIG_TEST_SIPHASH) += test_siphash.o obj-$(CONFIG_HASH_KUNIT_TEST) += test_hash.o obj-$(CONFIG_TEST_IDA) += test_ida.o -obj-$(CONFIG_KASAN_KUNIT_TEST) += test_kasan.o -CFLAGS_test_kasan.o += -fno-builtin -CFLAGS_test_kasan.o += $(call cc-disable-warning, vla) -obj-$(CONFIG_KASAN_MODULE_TEST) += test_kasan_module.o -CFLAGS_test_kasan_module.o += -fno-builtin obj-$(CONFIG_TEST_UBSAN) += test_ubsan.o CFLAGS_test_ubsan.o += $(call cc-disable-warning, vla) UBSAN_SANITIZE_test_ubsan.o := y diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 1f84df9c302e7..d4837bff3b60f 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -35,7 +35,15 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) -fno-builtin $(call cc-disable-warning, vla) + +CFLAGS_kasan_test.o := $(CFLAGS_KASAN_TEST) +CFLAGS_kasan_test_module.o := $(CFLAGS_KASAN_TEST) + obj-y := common.o report.o obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o tags.o report_tags.o obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o tags.o report_tags.o + +obj-$(CONFIG_KASAN_KUNIT_TEST) += kasan_test.o +obj-$(CONFIG_KASAN_MODULE_TEST) += kasan_test_module.o diff --git a/lib/test_kasan.c b/mm/kasan/kasan_test.c similarity index 99% rename from lib/test_kasan.c rename to mm/kasan/kasan_test.c index 505f77ffad279..f25692def7813 100644 --- a/lib/test_kasan.c +++ b/mm/kasan/kasan_test.c @@ -25,7 +25,7 @@ #include -#include "../mm/kasan/kasan.h" +#include "kasan.h" #define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE) diff --git a/lib/test_kasan_module.c b/mm/kasan/kasan_test_module.c similarity index 99% rename from lib/test_kasan_module.c rename to mm/kasan/kasan_test_module.c index b112cbc835e90..e4ca82dc2c16d 100644 --- a/lib/test_kasan_module.c +++ b/mm/kasan/kasan_test_module.c @@ -13,7 +13,7 @@ #include #include -#include "../mm/kasan/kasan.h" +#include "kasan.h" static noinline void __init copy_user_test(void) { From 54a7df604619f98da58e0469e67a4828950c8cb6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Sun, 11 Sep 2022 01:25:30 +0200 Subject: [PATCH 118/352] kasan: better invalid/double-free report header Update the report header for invalid- and double-free bugs to contain the address being freed: BUG: KASAN: invalid-free in kfree+0x280/0x2a8 Free of addr ffff00000beac001 by task kunit_try_catch/99 Link: https://lkml.kernel.org/r/fce40f8dbd160972fe01a1ff39d0c426c310e4b7.1662852281.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- mm/kasan/report.c | 23 ++++++++++++++++------- mm/kasan/report_generic.c | 3 ++- mm/kasan/report_tags.c | 2 +- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 39e8e5a80b829..df3602062bfd6 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -175,17 +175,14 @@ static void end_report(unsigned long *flags, void *addr) static void print_error_description(struct kasan_report_info *info) { - if (info->type == KASAN_REPORT_INVALID_FREE) { - pr_err("BUG: KASAN: invalid-free in %pS\n", (void *)info->ip); - return; - } + pr_err("BUG: KASAN: %s in %pS\n", info->bug_type, (void *)info->ip); - if (info->type == KASAN_REPORT_DOUBLE_FREE) { - pr_err("BUG: KASAN: double-free in %pS\n", (void *)info->ip); + if (info->type != KASAN_REPORT_ACCESS) { + pr_err("Free of addr %px by task %s/%d\n", + info->access_addr, current->comm, task_pid_nr(current)); return; } - pr_err("BUG: KASAN: %s in %pS\n", info->bug_type, (void *)info->ip); if (info->access_size) pr_err("%s of size %zu at addr %px by task %s/%d\n", info->is_write ? "Write" : "Read", info->access_size, @@ -420,6 +417,18 @@ static void complete_report_info(struct kasan_report_info *info) } else info->cache = info->object = NULL; + switch (info->type) { + case KASAN_REPORT_INVALID_FREE: + info->bug_type = "invalid-free"; + break; + case KASAN_REPORT_DOUBLE_FREE: + info->bug_type = "double-free"; + break; + default: + /* bug_type filled in by kasan_complete_mode_report_info. */ + break; + } + /* Fill in mode-specific report info fields. */ kasan_complete_mode_report_info(info); } diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 087c1d8c81456..043c94b046054 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -132,7 +132,8 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) struct kasan_alloc_meta *alloc_meta; struct kasan_free_meta *free_meta; - info->bug_type = get_bug_type(info); + if (!info->bug_type) + info->bug_type = get_bug_type(info); if (!info->cache || !info->object) return; diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index d3510424d29be..ecede06ef374a 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -37,7 +37,7 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) bool is_free; bool alloc_found = false, free_found = false; - if (!info->cache || !info->object) { + if ((!info->cache || !info->object) && !info->bug_type) { info->bug_type = get_common_bug_type(info); return; } From 1d4f98fc1cd841c807609c5bf23ed111241e497f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Penttil=C3=A4?= Date: Fri, 26 Aug 2022 08:06:31 +0300 Subject: [PATCH 119/352] mm/hmm/test: use char dev with struct device to get device node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HMM selftests use an in-kernel pseudo device to emulate device memory. The pseudo device registers a major device range for two or four pseudo device instances. User space has a script that reads /proc/devices in order to find the assigned major number, and sends that to mknod(1), once for each node. Change this to properly use cdev and struct device APIs. Delete the /proc/devices parsing from the user-space test script, now that it is unnecessary. Also, delete an unused field in struct dmirror_device: devmem. Link: https://lkml.kernel.org/r/20220826050631.25771-1-mpenttil@redhat.com Signed-off-by: Mika Penttilä Reviewed-by: John Hubbard Reviewed-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Cc: Alistair Popple Cc: Ralph Campbell Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/test_hmm.c | 13 ++++++++++--- tools/testing/selftests/vm/test_hmm.sh | 10 ---------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index e3965cafd27cf..6a33f6b1b4651 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -107,8 +107,8 @@ struct dmirror_chunk { */ struct dmirror_device { struct cdev cdevice; - struct hmm_devmem *devmem; unsigned int zone_device_type; + struct device device; unsigned int devmem_capacity; unsigned int devmem_count; @@ -1390,7 +1390,14 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id) cdev_init(&mdevice->cdevice, &dmirror_fops); mdevice->cdevice.owner = THIS_MODULE; - ret = cdev_add(&mdevice->cdevice, dev, 1); + device_initialize(&mdevice->device); + mdevice->device.devt = dev; + + ret = dev_set_name(&mdevice->device, "hmm_dmirror%u", id); + if (ret) + return ret; + + ret = cdev_device_add(&mdevice->cdevice, &mdevice->device); if (ret) return ret; @@ -1416,7 +1423,7 @@ static void dmirror_device_remove(struct dmirror_device *mdevice) kfree(mdevice->devmem_chunks); } - cdev_del(&mdevice->cdevice); + cdev_device_del(&mdevice->cdevice, &mdevice->device); } static int __init hmm_dmirror_init(void) diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/vm/test_hmm.sh index 539c9371e592a..46e19b5d648d6 100755 --- a/tools/testing/selftests/vm/test_hmm.sh +++ b/tools/testing/selftests/vm/test_hmm.sh @@ -52,21 +52,11 @@ load_driver() usage fi fi - if [ $? == 0 ]; then - major=$(awk "\$2==\"HMM_DMIRROR\" {print \$1}" /proc/devices) - mknod /dev/hmm_dmirror0 c $major 0 - mknod /dev/hmm_dmirror1 c $major 1 - if [ $# -eq 2 ]; then - mknod /dev/hmm_dmirror2 c $major 2 - mknod /dev/hmm_dmirror3 c $major 3 - fi - fi } unload_driver() { modprobe -r $DRIVER > /dev/null 2>&1 - rm -f /dev/hmm_dmirror? } run_smoke() From 7cce5c15eccb6da0f599052a03d503aaf5c73d05 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 6 Sep 2022 23:18:47 +0800 Subject: [PATCH 120/352] mm/damon/core: iterate the regions list from current point in damon_set_regions() We iterate the whole regions list every time to get the first/last regions intersecting with the specific range in damon_set_regions(), in order to add new region or resize existing regions to fit in the specific range. Actually, it is unnecessary to iterate the new added regions and the front regions that have been checked. Just iterate the regions list from the current point using list_for_each_entry_from() every time to improve performance. The kunit tests passed: [PASSED] damon_test_apply_three_regions1 [PASSED] damon_test_apply_three_regions2 [PASSED] damon_test_apply_three_regions3 [PASSED] damon_test_apply_three_regions4 Link: https://lkml.kernel.org/r/1662477527-13003-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 8 ++++++++ mm/damon/core.c | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 7b1f4a4882308..d54acec048d6f 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -463,9 +463,17 @@ static inline struct damon_region *damon_last_region(struct damon_target *t) return list_last_entry(&t->regions_list, struct damon_region, list); } +static inline struct damon_region *damon_first_region(struct damon_target *t) +{ + return list_first_entry(&t->regions_list, struct damon_region, list); +} + #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) +#define damon_for_each_region_from(r, t) \ + list_for_each_entry_from(r, &t->regions_list, list) + #define damon_for_each_region_safe(r, next, t) \ list_for_each_entry_safe(r, next, &t->regions_list, list) diff --git a/mm/damon/core.c b/mm/damon/core.c index 9964b9d007686..5e00c04ceef04 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -195,6 +195,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, damon_destroy_region(r, t); } + r = damon_first_region(t); /* Add new regions or resize existing regions to fit in the ranges */ for (i = 0; i < nr_ranges; i++) { struct damon_region *first = NULL, *last, *newr; @@ -202,7 +203,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, range = &ranges[i]; /* Get the first/last regions intersecting with the range */ - damon_for_each_region(r, t) { + damon_for_each_region_from(r, t) { if (damon_intersect(r, range)) { if (!first) first = r; From 0b7bbd3c88c7835f9b00c626a815f9e646d6f90a Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Wed, 7 Sep 2022 16:41:16 +0800 Subject: [PATCH 121/352] mm/damon: simplify damon_ctx check in damon_sysfs_before_terminate In damon_sysfs_before_terminate(), it needs to check whether ctx->ops.id supports 'DAMON_OPS_VADDR' or 'DAMON_OPS_FVADDR', there we can use damon_target_has_pid() instead. Link: https://lkml.kernel.org/r/20220907084116.62053-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 941c3238ed753..91c678e7f6897 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2309,7 +2309,7 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; - if (ctx->ops.id != DAMON_OPS_VADDR && ctx->ops.id != DAMON_OPS_FVADDR) + if (!damon_target_has_pid(ctx)) return; mutex_lock(&ctx->kdamond_lock); From 61bc87bf665e6b5fcfd6a1bdf5c243b7e4696ab3 Mon Sep 17 00:00:00 2001 From: Zhenhua Huang Date: Wed, 7 Sep 2022 16:01:13 +0800 Subject: [PATCH 122/352] mm/page_owner.c: remove redundant drain_all_pages Remove an expensive and unnecessary operation as PCP pages are safely skipped when reading page owner.PCP pages can be skipped because PAGE_EXT_OWNER_ALLOCATED is cleared. With draining PCP pages, these pages are moved to buddy list so they can be identified as buddy pages and skipped quickly. Although it improved efficiency of PFN walker, the drain is guaranteed expensive that is unlikely to be offset by a slight increase in efficiency when skipping free pages. PAGE_EXT_OWNER_ALLOCATED is cleared in the page owner reset path below: free_unref_page -> free_unref_page_prepare -> free_pcp_prepare -> free_pages_prepare which do page owner reset -> free_unref_page_commit which add pages into pcp list Link: https://lkml.kernel.org/r/1662704326-15899-1-git-send-email-quic_zhenhuah@quicinc.com Link: https://lkml.kernel.org/r/1662633204-10044-1-git-send-email-quic_zhenhuah@quicinc.com Link: https://lkml.kernel.org/r/1662537673-9392-1-git-send-email-quic_zhenhuah@quicinc.com Signed-off-by: Zhenhua Huang Acked-by: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_owner.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 90023f938c19f..54f3e039fb483 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -524,8 +524,6 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) pfn++; - drain_all_pages(NULL); - /* Find an allocated page */ for (; pfn < max_pfn; pfn++) { /* From 30e7dcad2095bbe3465c1eb507bbd0b83fa87df3 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Sep 2022 14:08:42 +0800 Subject: [PATCH 123/352] mm: reuse pageblock_start/end_pfn() macro Move pageblock_start_pfn/pageblock_end_pfn() into pageblock-flags.h, then they could be used somewhere else, not only in compaction, also use ALIGN_DOWN() instead of round_down() to be pair with ALIGN(), which should be same for pageblock usage. Link: https://lkml.kernel.org/r/20220907060844.126891-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pageblock-flags.h | 2 ++ mm/compaction.c | 2 -- mm/memblock.c | 2 +- mm/page_alloc.c | 13 ++++++------- mm/page_isolation.c | 11 +++++------ mm/page_owner.c | 4 ++-- 6 files changed, 16 insertions(+), 18 deletions(-) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 83c7248053a1e..a09b7fe6bbf8e 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -53,6 +53,8 @@ extern unsigned int pageblock_order; #endif /* CONFIG_HUGETLB_PAGE */ #define pageblock_nr_pages (1UL << pageblock_order) +#define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) +#define pageblock_end_pfn(pfn) ALIGN((pfn) + 1, pageblock_nr_pages) /* Forward declaration */ struct page; diff --git a/mm/compaction.c b/mm/compaction.c index f72907c7cfefd..65bef5f78897c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -52,8 +52,6 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order)) #define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order)) -#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order) -#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order) /* * Page order with-respect-to which proactive compaction diff --git a/mm/memblock.c b/mm/memblock.c index b5d3026979fcc..46fe7575f03c6 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2000,7 +2000,7 @@ static void __init free_unused_memmap(void) * presume that there are no holes in the memory map inside * a pageblock */ - start = round_down(start, pageblock_nr_pages); + start = pageblock_start_pfn(start); /* * If we had a previous bank, and there is a space diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e65a9ca06f3b4..624016ad59675 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -544,7 +544,7 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) #ifdef CONFIG_SPARSEMEM pfn &= (PAGES_PER_SECTION-1); #else - pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); + pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn); #endif /* CONFIG_SPARSEMEM */ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; } @@ -1857,7 +1857,7 @@ void set_zone_contiguous(struct zone *zone) unsigned long block_start_pfn = zone->zone_start_pfn; unsigned long block_end_pfn; - block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(block_start_pfn); for (; block_start_pfn < zone_end_pfn(zone); block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { @@ -2653,8 +2653,8 @@ int move_freepages_block(struct zone *zone, struct page *page, *num_movable = 0; pfn = page_to_pfn(page); - start_pfn = pfn & ~(pageblock_nr_pages - 1); - end_pfn = start_pfn + pageblock_nr_pages - 1; + start_pfn = pageblock_start_pfn(pfn); + end_pfn = pageblock_end_pfn(pfn) - 1; /* Do not cross zone boundaries */ if (!zone_spans_pfn(zone, start_pfn)) @@ -6926,9 +6926,8 @@ static void __init init_unavailable_range(unsigned long spfn, u64 pgcnt = 0; for (pfn = spfn; pfn < epfn; pfn++) { - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { - pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) - + pageblock_nr_pages - 1; + if (!pfn_valid(pageblock_start_pfn(pfn))) { + pfn = pageblock_end_pfn(pfn) - 1; continue; } __init_single_page(pfn_to_page(pfn), pfn, zone, node); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index eb3a68ca92ad9..5819cb9c62f37 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -37,8 +37,8 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e struct zone *zone = page_zone(page); unsigned long pfn; - VM_BUG_ON(ALIGN_DOWN(start_pfn, pageblock_nr_pages) != - ALIGN_DOWN(end_pfn - 1, pageblock_nr_pages)); + VM_BUG_ON(pageblock_start_pfn(start_pfn) != + pageblock_start_pfn(end_pfn - 1)); if (is_migrate_cma_page(page)) { /* @@ -172,7 +172,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ * to avoid redundant checks. */ check_unmovable_start = max(page_to_pfn(page), start_pfn); - check_unmovable_end = min(ALIGN(page_to_pfn(page) + 1, pageblock_nr_pages), + check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)), end_pfn); unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, @@ -532,7 +532,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned long pfn; struct page *page; /* isolation is done at page block granularity */ - unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages); + unsigned long isolate_start = pageblock_start_pfn(start_pfn); unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); int ret; bool skip_isolation = false; @@ -579,10 +579,9 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, { unsigned long pfn; struct page *page; - unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages); + unsigned long isolate_start = pageblock_start_pfn(start_pfn); unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); - for (pfn = isolate_start; pfn < isolate_end; pfn += pageblock_nr_pages) { diff --git a/mm/page_owner.c b/mm/page_owner.c index 54f3e039fb483..2d27f532df4c1 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -297,7 +297,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, continue; } - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(pfn); block_end_pfn = min(block_end_pfn, end_pfn); pageblock_mt = get_pageblock_migratetype(page); @@ -635,7 +635,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) continue; } - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(pfn); block_end_pfn = min(block_end_pfn, end_pfn); for (; pfn < block_end_pfn; pfn++) { From a85fe8710e4433d6ce788f29d774b43023abce98 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Sep 2022 14:08:43 +0800 Subject: [PATCH 124/352] mm: add pageblock_align() macro Add pageblock_align() macro and use it to simplify code. Link: https://lkml.kernel.org/r/20220907060844.126891-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pageblock-flags.h | 1 + mm/memblock.c | 4 ++-- mm/page_isolation.c | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index a09b7fe6bbf8e..293c76630fa8b 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -53,6 +53,7 @@ extern unsigned int pageblock_order; #endif /* CONFIG_HUGETLB_PAGE */ #define pageblock_nr_pages (1UL << pageblock_order) +#define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages) #define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) #define pageblock_end_pfn(pfn) ALIGN((pfn) + 1, pageblock_nr_pages) diff --git a/mm/memblock.c b/mm/memblock.c index 46fe7575f03c6..511d4783dcf1d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2014,12 +2014,12 @@ static void __init free_unused_memmap(void) * presume that there are no holes in the memory map inside * a pageblock */ - prev_end = ALIGN(end, pageblock_nr_pages); + prev_end = pageblock_align(end); } #ifdef CONFIG_SPARSEMEM if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION)) { - prev_end = ALIGN(end, pageblock_nr_pages); + prev_end = pageblock_align(end); free_memmap(prev_end, ALIGN(prev_end, PAGES_PER_SECTION)); } #endif diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 5819cb9c62f37..fa82faa07dafb 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -533,7 +533,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, struct page *page; /* isolation is done at page block granularity */ unsigned long isolate_start = pageblock_start_pfn(start_pfn); - unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); + unsigned long isolate_end = pageblock_align(end_pfn); int ret; bool skip_isolation = false; @@ -580,7 +580,7 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned long pfn; struct page *page; unsigned long isolate_start = pageblock_start_pfn(start_pfn); - unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); + unsigned long isolate_end = pageblock_align(end_pfn); for (pfn = isolate_start; pfn < isolate_end; From f743f03fa07531502276aab817ffb63b607667b1 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Sep 2022 14:08:44 +0800 Subject: [PATCH 125/352] mm: add pageblock_aligned() macro Add pageblock_aligned() and use it to simplify code. Link: https://lkml.kernel.org/r/20220907060844.126891-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Mike Rapoport Cc: David Hildenbrand Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pageblock-flags.h | 1 + mm/compaction.c | 8 ++++---- mm/memory_hotplug.c | 6 ++---- mm/page_alloc.c | 17 +++++++---------- mm/page_isolation.c | 2 +- 5 files changed, 15 insertions(+), 19 deletions(-) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 293c76630fa8b..5f1ae07d724b8 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -54,6 +54,7 @@ extern unsigned int pageblock_order; #define pageblock_nr_pages (1UL << pageblock_order) #define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages) +#define pageblock_aligned(pfn) IS_ALIGNED((pfn), pageblock_nr_pages) #define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) #define pageblock_end_pfn(pfn) ALIGN((pfn) + 1, pageblock_nr_pages) diff --git a/mm/compaction.c b/mm/compaction.c index 65bef5f78897c..c4e4453187a2c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -402,7 +402,7 @@ static bool test_and_set_skip(struct compact_control *cc, struct page *page, if (cc->ignore_skip_hint) return false; - if (!IS_ALIGNED(pfn, pageblock_nr_pages)) + if (!pageblock_aligned(pfn)) return false; skip = get_pageblock_skip(page); @@ -884,7 +884,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * COMPACT_CLUSTER_MAX at a time so the second call must * not falsely conclude that the block should be skipped. */ - if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) { + if (!valid_page && pageblock_aligned(low_pfn)) { if (!isolation_suitable(cc, page)) { low_pfn = end_pfn; page = NULL; @@ -1936,7 +1936,7 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) * before making it "skip" so other compaction instances do * not scan the same block. */ - if (IS_ALIGNED(low_pfn, pageblock_nr_pages) && + if (pageblock_aligned(low_pfn) && !fast_find_block && !isolation_suitable(cc, page)) continue; @@ -2122,7 +2122,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) * migration source is unmovable/reclaimable but it's not worth * special casing. */ - if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) + if (!pageblock_aligned(cc->migrate_pfn)) return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9ae1f98548b10..fd40f7e9f1763 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1085,8 +1085,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, * of the physical memory space for vmemmaps. That space is pageblock * aligned. */ - if (WARN_ON_ONCE(!nr_pages || - !IS_ALIGNED(pfn, pageblock_nr_pages) || + if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(pfn) || !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; @@ -1806,8 +1805,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, * of the physical memory space for vmemmaps. That space is pageblock * aligned. */ - if (WARN_ON_ONCE(!nr_pages || - !IS_ALIGNED(start_pfn, pageblock_nr_pages) || + if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(start_pfn) || !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 624016ad59675..b41e393c1e8d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1892,15 +1892,14 @@ static void __init deferred_free_range(unsigned long pfn, page = pfn_to_page(pfn); /* Free a large naturally-aligned chunk if possible */ - if (nr_pages == pageblock_nr_pages && - (pfn & (pageblock_nr_pages - 1)) == 0) { + if (nr_pages == pageblock_nr_pages && pageblock_aligned(pfn)) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); __free_pages_core(page, pageblock_order); return; } for (i = 0; i < nr_pages; i++, page++, pfn++) { - if ((pfn & (pageblock_nr_pages - 1)) == 0) + if (pageblock_aligned(pfn)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); __free_pages_core(page, 0); } @@ -1928,7 +1927,7 @@ static inline void __init pgdat_init_report_one_done(void) */ static inline bool __init deferred_pfn_valid(unsigned long pfn) { - if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) + if (pageblock_aligned(pfn) && !pfn_valid(pfn)) return false; return true; } @@ -1940,14 +1939,13 @@ static inline bool __init deferred_pfn_valid(unsigned long pfn) static void __init deferred_free_pages(unsigned long pfn, unsigned long end_pfn) { - unsigned long nr_pgmask = pageblock_nr_pages - 1; unsigned long nr_free = 0; for (; pfn < end_pfn; pfn++) { if (!deferred_pfn_valid(pfn)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 0; - } else if (!(pfn & nr_pgmask)) { + } else if (pageblock_aligned(pfn)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 1; } else { @@ -1967,7 +1965,6 @@ static unsigned long __init deferred_init_pages(struct zone *zone, unsigned long pfn, unsigned long end_pfn) { - unsigned long nr_pgmask = pageblock_nr_pages - 1; int nid = zone_to_nid(zone); unsigned long nr_pages = 0; int zid = zone_idx(zone); @@ -1977,7 +1974,7 @@ static unsigned long __init deferred_init_pages(struct zone *zone, if (!deferred_pfn_valid(pfn)) { page = NULL; continue; - } else if (!page || !(pfn & nr_pgmask)) { + } else if (!page || pageblock_aligned(pfn)) { page = pfn_to_page(pfn); } else { page++; @@ -6751,7 +6748,7 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone * such that unmovable allocations won't be scattered all * over the place during system boot. */ - if (IS_ALIGNED(pfn, pageblock_nr_pages)) { + if (pageblock_aligned(pfn)) { set_pageblock_migratetype(page, migratetype); cond_resched(); } @@ -6794,7 +6791,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, * Please note that MEMINIT_HOTPLUG path doesn't clear memmap * because this is done early in section_activate() */ - if (IS_ALIGNED(pfn, pageblock_nr_pages)) { + if (pageblock_aligned(pfn)) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index fa82faa07dafb..04141a9bea704 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -312,7 +312,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, struct zone *zone; int ret; - VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages)); + VM_BUG_ON(!pageblock_aligned(boundary_pfn)); if (isolate_before) isolate_pageblock = boundary_pfn - pageblock_nr_pages; From 59accac96363dab40f45c724e6d633a6bd717e78 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Sep 2022 16:26:43 +0800 Subject: [PATCH 126/352] memblock tests: add new pageblock related macro Add new pageblock_start_pfn() and pageblock_align() macro which are needed by memblock tests. Link: https://lkml.kernel.org/r/20220907082643.186979-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Mike Rapoport Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/memblock/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/memblock/linux/mmzone.h b/tools/testing/memblock/linux/mmzone.h index 7c2eb5c9bb54d..e65f89b12f1cd 100644 --- a/tools/testing/memblock/linux/mmzone.h +++ b/tools/testing/memblock/linux/mmzone.h @@ -22,6 +22,8 @@ enum zone_type { #define pageblock_order (MAX_ORDER - 1) #define pageblock_nr_pages BIT(pageblock_order) +#define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages) +#define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) struct zone { atomic_long_t managed_pages; From 2cd1af58910c7de0eee8369516c3f7e9449cee60 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 7 Sep 2022 04:35:35 +0000 Subject: [PATCH 127/352] memcg: extract memcg_vmstats from struct mem_cgroup Patch series "memcg: reduce memory overhead of memory cgroups". Currently a lot of memory is wasted to maintain the vmevents for memory cgroups as we have multiple arrays of size NR_VM_EVENT_ITEMS which can be as large as 110. However memcg code uses small portion of those entries. This patch series eliminate this overhead by removing the unneeded vmevent entries from memory cgroup data structures. This patch (of 3): This is a preparatory patch to reduce the memory overhead of memory cgroup. The struct memcg_vmstats is the largest object embedded into the struct mem_cgroup. This patch extracts struct memcg_vmstats from struct mem_cgroup to ease the following patches in reducing the size of struct memcg_vmstats. Link: https://lkml.kernel.org/r/20220907043537.3457014-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20220907043537.3457014-2-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 37 +++---------------------- mm/memcontrol.c | 57 ++++++++++++++++++++++++++++++++------ 2 files changed, 52 insertions(+), 42 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 12f63dbb6a8fd..3fd50420d0ace 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -80,29 +80,8 @@ enum mem_cgroup_events_target { MEM_CGROUP_NTARGETS, }; -struct memcg_vmstats_percpu { - /* Local (CPU and cgroup) page state & events */ - long state[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; - - /* Delta calculation for lockless upward propagation */ - long state_prev[MEMCG_NR_STAT]; - unsigned long events_prev[NR_VM_EVENT_ITEMS]; - - /* Cgroup1: threshold notifications & softlimit tree updates */ - unsigned long nr_page_events; - unsigned long targets[MEM_CGROUP_NTARGETS]; -}; - -struct memcg_vmstats { - /* Aggregated (CPU and subtree) page state & events */ - long state[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; - - /* Pending child counts during tree propagation */ - long state_pending[MEMCG_NR_STAT]; - unsigned long events_pending[NR_VM_EVENT_ITEMS]; -}; +struct memcg_vmstats_percpu; +struct memcg_vmstats; struct mem_cgroup_reclaim_iter { struct mem_cgroup *position; @@ -298,7 +277,7 @@ struct mem_cgroup { CACHELINE_PADDING(_pad1_); /* memory.stat */ - struct memcg_vmstats vmstats; + struct memcg_vmstats *vmstats; /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; @@ -1001,15 +980,7 @@ static inline void mod_memcg_page_state(struct page *page, rcu_read_unlock(); } -static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) -{ - long x = READ_ONCE(memcg->vmstats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; -} +unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 632402001bca1..0a44a733bb03c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -669,6 +669,40 @@ static void flush_memcg_stats_dwork(struct work_struct *w) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } +struct memcg_vmstats_percpu { + /* Local (CPU and cgroup) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_VM_EVENT_ITEMS]; + + /* Delta calculation for lockless upward propagation */ + long state_prev[MEMCG_NR_STAT]; + unsigned long events_prev[NR_VM_EVENT_ITEMS]; + + /* Cgroup1: threshold notifications & softlimit tree updates */ + unsigned long nr_page_events; + unsigned long targets[MEM_CGROUP_NTARGETS]; +}; + +struct memcg_vmstats { + /* Aggregated (CPU and subtree) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_VM_EVENT_ITEMS]; + + /* Pending child counts during tree propagation */ + long state_pending[MEMCG_NR_STAT]; + unsigned long events_pending[NR_VM_EVENT_ITEMS]; +}; + +unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = READ_ONCE(memcg->vmstats->state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + /** * __mod_memcg_state - update cgroup memory statistics * @memcg: the memory cgroup @@ -827,7 +861,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, static unsigned long memcg_events(struct mem_cgroup *memcg, int event) { - return READ_ONCE(memcg->vmstats.events[event]); + return READ_ONCE(memcg->vmstats->events[event]); } static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) @@ -5170,6 +5204,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); + kfree(memcg->vmstats); free_percpu(memcg->vmstats_percpu); kfree(memcg); } @@ -5199,6 +5234,10 @@ static struct mem_cgroup *mem_cgroup_alloc(void) goto fail; } + memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL); + if (!memcg->vmstats) + goto fail; + memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, GFP_KERNEL_ACCOUNT); if (!memcg->vmstats_percpu) @@ -5418,9 +5457,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) * below us. We're in a per-cpu loop here and this is * a global counter, so the first cycle will get them. */ - delta = memcg->vmstats.state_pending[i]; + delta = memcg->vmstats->state_pending[i]; if (delta) - memcg->vmstats.state_pending[i] = 0; + memcg->vmstats->state_pending[i] = 0; /* Add CPU changes on this level since the last flush */ v = READ_ONCE(statc->state[i]); @@ -5433,15 +5472,15 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) continue; /* Aggregate counts on this level and propagate upwards */ - memcg->vmstats.state[i] += delta; + memcg->vmstats->state[i] += delta; if (parent) - parent->vmstats.state_pending[i] += delta; + parent->vmstats->state_pending[i] += delta; } for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { - delta = memcg->vmstats.events_pending[i]; + delta = memcg->vmstats->events_pending[i]; if (delta) - memcg->vmstats.events_pending[i] = 0; + memcg->vmstats->events_pending[i] = 0; v = READ_ONCE(statc->events[i]); if (v != statc->events_prev[i]) { @@ -5452,9 +5491,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) if (!delta) continue; - memcg->vmstats.events[i] += delta; + memcg->vmstats->events[i] += delta; if (parent) - parent->vmstats.events_pending[i] += delta; + parent->vmstats->events_pending[i] += delta; } for_each_node_state(nid, N_MEMORY) { From 0c7103d929cf4b544f955afb68991d9e89a77e84 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 7 Sep 2022 04:35:36 +0000 Subject: [PATCH 128/352] memcg: rearrange code This is a preparatory patch for easing the review of the follow up patch which will reduce the memory overhead of memory cgroups. Link: https://lkml.kernel.org/r/20220907043537.3457014-3-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0a44a733bb03c..78fd7cfb4f929 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -669,6 +669,29 @@ static void flush_memcg_stats_dwork(struct work_struct *w) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } +/* Subset of vm_event_item to report for memcg event stats */ +static const unsigned int memcg_vm_event_stat[] = { + PGSCAN_KSWAPD, + PGSCAN_DIRECT, + PGSTEAL_KSWAPD, + PGSTEAL_DIRECT, + PGFAULT, + PGMAJFAULT, + PGREFILL, + PGACTIVATE, + PGDEACTIVATE, + PGLAZYFREE, + PGLAZYFREED, +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) + ZSWPIN, + ZSWPOUT, +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + THP_FAULT_ALLOC, + THP_COLLAPSE_ALLOC, +#endif +}; + struct memcg_vmstats_percpu { /* Local (CPU and cgroup) page state & events */ long state[MEMCG_NR_STAT]; @@ -1501,29 +1524,6 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, return memcg_page_state(memcg, item) * memcg_page_state_unit(item); } -/* Subset of vm_event_item to report for memcg event stats */ -static const unsigned int memcg_vm_event_stat[] = { - PGSCAN_KSWAPD, - PGSCAN_DIRECT, - PGSTEAL_KSWAPD, - PGSTEAL_DIRECT, - PGFAULT, - PGMAJFAULT, - PGREFILL, - PGACTIVATE, - PGDEACTIVATE, - PGLAZYFREE, - PGLAZYFREED, -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) - ZSWPIN, - ZSWPOUT, -#endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - THP_FAULT_ALLOC, - THP_COLLAPSE_ALLOC, -#endif -}; - static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) { struct seq_buf s; From ffc9cfbb7f723a454a848661437e97de3b7e1d74 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 7 Sep 2022 04:35:37 +0000 Subject: [PATCH 129/352] memcg: reduce size of memcg vmstats structures The struct memcg_vmstats and struct memcg_vmstats_percpu contains two arrays each for events of size NR_VM_EVENT_ITEMS which can be as large as 110. However the memcg v1 only uses 4 of those while memcg v2 uses 15. The union of both is 17. On a 64 bit system, we are wasting approximately ((110 - 17) * 8 * 2) * (nr_cpus + 1) bytes which is significant on large machines. This patch reduces the size of the given structures by adding one indirection and only stores array of events which are actually used by the memcg code. With this patch, the size of memcg_vmstats has reduced from 2544 bytes to 1056 bytes while the size of memcg_vmstats_percpu has reduced from 2568 bytes to 1080 bytes. Link: https://lkml.kernel.org/r/20220907043537.3457014-4-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 52 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 78fd7cfb4f929..986d5a570f167 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -671,6 +671,8 @@ static void flush_memcg_stats_dwork(struct work_struct *w) /* Subset of vm_event_item to report for memcg event stats */ static const unsigned int memcg_vm_event_stat[] = { + PGPGIN, + PGPGOUT, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSTEAL_KSWAPD, @@ -692,14 +694,30 @@ static const unsigned int memcg_vm_event_stat[] = { #endif }; +#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat) +static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; + +static void init_memcg_events(void) +{ + int i; + + for (i = 0; i < NR_MEMCG_EVENTS; ++i) + mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1; +} + +static inline int memcg_events_index(enum vm_event_item idx) +{ + return mem_cgroup_events_index[idx] - 1; +} + struct memcg_vmstats_percpu { /* Local (CPU and cgroup) page state & events */ long state[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; + unsigned long events[NR_MEMCG_EVENTS]; /* Delta calculation for lockless upward propagation */ long state_prev[MEMCG_NR_STAT]; - unsigned long events_prev[NR_VM_EVENT_ITEMS]; + unsigned long events_prev[NR_MEMCG_EVENTS]; /* Cgroup1: threshold notifications & softlimit tree updates */ unsigned long nr_page_events; @@ -709,11 +727,11 @@ struct memcg_vmstats_percpu { struct memcg_vmstats { /* Aggregated (CPU and subtree) page state & events */ long state[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; + unsigned long events[NR_MEMCG_EVENTS]; /* Pending child counts during tree propagation */ long state_pending[MEMCG_NR_STAT]; - unsigned long events_pending[NR_VM_EVENT_ITEMS]; + unsigned long events_pending[NR_MEMCG_EVENTS]; }; unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) @@ -873,24 +891,34 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { - if (mem_cgroup_disabled()) + int index = memcg_events_index(idx); + + if (mem_cgroup_disabled() || index < 0) return; memcg_stats_lock(); - __this_cpu_add(memcg->vmstats_percpu->events[idx], count); + __this_cpu_add(memcg->vmstats_percpu->events[index], count); memcg_rstat_updated(memcg, count); memcg_stats_unlock(); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) { - return READ_ONCE(memcg->vmstats->events[event]); + int index = memcg_events_index(event); + + if (index < 0) + return 0; + return READ_ONCE(memcg->vmstats->events[index]); } static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) { long x = 0; int cpu; + int index = memcg_events_index(event); + + if (index < 0) + return 0; for_each_possible_cpu(cpu) x += per_cpu(memcg->vmstats_percpu->events[event], cpu); @@ -1564,10 +1592,15 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) memcg_events(memcg, PGSTEAL_KSWAPD) + memcg_events(memcg, PGSTEAL_DIRECT)); - for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) + for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) { + if (memcg_vm_event_stat[i] == PGPGIN || + memcg_vm_event_stat[i] == PGPGOUT) + continue; + seq_buf_printf(&s, "%s %lu\n", vm_event_name(memcg_vm_event_stat[i]), memcg_events(memcg, memcg_vm_event_stat[i])); + } /* The above should easily fit into one page */ WARN_ON_ONCE(seq_buf_has_overflowed(&s)); @@ -5309,6 +5342,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { + init_memcg_events(); page_counter_init(&memcg->memory, NULL); page_counter_init(&memcg->swap, NULL); page_counter_init(&memcg->kmem, NULL); @@ -5477,7 +5511,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) parent->vmstats->state_pending[i] += delta; } - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { + for (i = 0; i < NR_MEMCG_EVENTS; i++) { delta = memcg->vmstats->events_pending[i]; if (delta) memcg->vmstats->events_pending[i] = 0; From db08c2e207a79912eb0fe357164801822ea8290a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 8 Sep 2022 15:35:53 -0700 Subject: [PATCH 130/352] memcg-reduce-size-of-memcg-vmstats-structures-fix fix memcg_events_local() array index, per Shakeel Link: https://lkml.kernel.org/r/CALvZod70Mvxr+Nzb6k0yiU2RFYjTD=0NFhKK-Eyp+5ejd1PSFw@mail.gmail.com Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 986d5a570f167..1f204a2620543 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -921,7 +921,7 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) return 0; for_each_possible_cpu(cpu) - x += per_cpu(memcg->vmstats_percpu->events[event], cpu); + x += per_cpu(memcg->vmstats_percpu->events[index], cpu); return x; } From 8458f72e75ca0a0bf180286585c287e1c63e391f Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 6 Sep 2022 17:35:30 +0800 Subject: [PATCH 131/352] mm/hwpoison: add __init/__exit annotations to module init/exit funcs Add missing __init/__exit annotations to module init/exit funcs. Link: https://lkml.kernel.org/r/20220906093530.243262-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Reviewed-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/hwpoison-inject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 65e242b5a4327..d0548e382b6ba 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -63,13 +63,13 @@ static int hwpoison_unpoison(void *data, u64 val) DEFINE_DEBUGFS_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); DEFINE_DEBUGFS_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); -static void pfn_inject_exit(void) +static void __exit pfn_inject_exit(void) { hwpoison_filter_enable = 0; debugfs_remove_recursive(hwpoison_dir); } -static int pfn_inject_init(void) +static int __init pfn_inject_init(void) { hwpoison_dir = debugfs_create_dir("hwpoison", NULL); From c2835a012d68d3c2ab3e2cb3e3844d0f46975bc5 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 6 Sep 2022 15:53:12 +0800 Subject: [PATCH 132/352] mm/rodata_test: use PAGE_ALIGNED() helper Use PAGE_ALIGNED() helper instead of open-coding operation, no functional changes here. Link: https://lkml.kernel.org/r/20220906075312.166595-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Signed-off-by: Andrew Morton --- mm/rodata_test.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 2613371945b7e..6d783436951f3 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c @@ -9,13 +9,13 @@ #include #include +#include #include static const int rodata_test_data = 0xC3; void rodata_test(void) { - unsigned long start, end; int zero = 0; /* test 1: read the value */ @@ -39,13 +39,11 @@ void rodata_test(void) } /* test 4: check if the rodata section is PAGE_SIZE aligned */ - start = (unsigned long)__start_rodata; - end = (unsigned long)__end_rodata; - if (start & (PAGE_SIZE - 1)) { + if (!PAGE_ALIGNED(__start_rodata)) { pr_err("start of .rodata is not page size aligned\n"); return; } - if (end & (PAGE_SIZE - 1)) { + if (!PAGE_ALIGNED(__end_rodata)) { pr_err("end of .rodata is not page size aligned\n"); return; } From 91ef45d4cfeacd6a5b87781ff6bcce3c9addf56c Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Thu, 8 Sep 2022 19:14:43 +0000 Subject: [PATCH 133/352] mm/damon: introduce struct damos_access_pattern damon_new_scheme() has too many parameters, so introduce struct damos_access_pattern to simplify it. In additon, we can't use a bpf trace kprobe that has more than 5 parameters. Link: https://lkml.kernel.org/r/20220908191443.129534-1-sj@kernel.org Signed-off-by: Yajun Deng Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 37 ++++++++++++++++++---------------- mm/damon/core.c | 31 ++++++++++++++--------------- mm/damon/dbgfs.c | 27 +++++++++++++++---------- mm/damon/lru_sort.c | 46 ++++++++++++++++++++++++++----------------- mm/damon/reclaim.c | 23 +++++++++++++--------- mm/damon/sysfs.c | 17 +++++++++++----- 6 files changed, 106 insertions(+), 75 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index d54acec048d6f..90f20675da22a 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -216,13 +216,26 @@ struct damos_stat { }; /** - * struct damos - Represents a Data Access Monitoring-based Operation Scheme. + * struct damos_access_pattern - Target access pattern of the given scheme. * @min_sz_region: Minimum size of target regions. * @max_sz_region: Maximum size of target regions. * @min_nr_accesses: Minimum ``->nr_accesses`` of target regions. * @max_nr_accesses: Maximum ``->nr_accesses`` of target regions. * @min_age_region: Minimum age of target regions. * @max_age_region: Maximum age of target regions. + */ +struct damos_access_pattern { + unsigned long min_sz_region; + unsigned long max_sz_region; + unsigned int min_nr_accesses; + unsigned int max_nr_accesses; + unsigned int min_age_region; + unsigned int max_age_region; +}; + +/** + * struct damos - Represents a Data Access Monitoring-based Operation Scheme. + * @pattern: Access pattern of target regions. * @action: &damo_action to be applied to the target regions. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. @@ -230,10 +243,8 @@ struct damos_stat { * @list: List head for siblings. * * For each aggregation interval, DAMON finds regions which fit in the - * condition (&min_sz_region, &max_sz_region, &min_nr_accesses, - * &max_nr_accesses, &min_age_region, &max_age_region) and applies &action to - * those. To avoid consuming too much CPU time or IO resources for the - * &action, "a is used. + * &pattern and applies &action to those. To avoid consuming too much + * CPU time or IO resources for the &action, "a is used. * * To do the work only when needed, schemes can be activated for specific * system situations using &wmarks. If all schemes that registered to the @@ -248,12 +259,7 @@ struct damos_stat { * &action is applied. */ struct damos { - unsigned long min_sz_region; - unsigned long max_sz_region; - unsigned int min_nr_accesses; - unsigned int max_nr_accesses; - unsigned int min_age_region; - unsigned int max_age_region; + struct damos_access_pattern pattern; enum damos_action action; struct damos_quota quota; struct damos_watermarks wmarks; @@ -509,12 +515,9 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges); -struct damos *damon_new_scheme( - unsigned long min_sz_region, unsigned long max_sz_region, - unsigned int min_nr_accesses, unsigned int max_nr_accesses, - unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action, struct damos_quota *quota, - struct damos_watermarks *wmarks); +struct damos *damon_new_scheme(struct damos_access_pattern *pattern, + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks); void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); diff --git a/mm/damon/core.c b/mm/damon/core.c index 5e00c04ceef04..bae41990f4227 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -231,24 +231,21 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, return 0; } -struct damos *damon_new_scheme( - unsigned long min_sz_region, unsigned long max_sz_region, - unsigned int min_nr_accesses, unsigned int max_nr_accesses, - unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action, struct damos_quota *quota, - struct damos_watermarks *wmarks) +struct damos *damon_new_scheme(struct damos_access_pattern *pattern, + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks) { struct damos *scheme; scheme = kmalloc(sizeof(*scheme), GFP_KERNEL); if (!scheme) return NULL; - scheme->min_sz_region = min_sz_region; - scheme->max_sz_region = max_sz_region; - scheme->min_nr_accesses = min_nr_accesses; - scheme->max_nr_accesses = max_nr_accesses; - scheme->min_age_region = min_age_region; - scheme->max_age_region = max_age_region; + scheme->pattern.min_sz_region = pattern->min_sz_region; + scheme->pattern.max_sz_region = pattern->max_sz_region; + scheme->pattern.min_nr_accesses = pattern->min_nr_accesses; + scheme->pattern.max_nr_accesses = pattern->max_nr_accesses; + scheme->pattern.min_age_region = pattern->min_age_region; + scheme->pattern.max_age_region = pattern->max_age_region; scheme->action = action; scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); @@ -667,10 +664,12 @@ static bool __damos_valid_target(struct damon_region *r, struct damos *s) unsigned long sz; sz = r->ar.end - r->ar.start; - return s->min_sz_region <= sz && sz <= s->max_sz_region && - s->min_nr_accesses <= r->nr_accesses && - r->nr_accesses <= s->max_nr_accesses && - s->min_age_region <= r->age && r->age <= s->max_age_region; + return s->pattern.min_sz_region <= sz && + sz <= s->pattern.max_sz_region && + s->pattern.min_nr_accesses <= r->nr_accesses && + r->nr_accesses <= s->pattern.max_nr_accesses && + s->pattern.min_age_region <= r->age && + r->age <= s->pattern.max_age_region; } static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 652a94deafe35..1422037cedd2b 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -131,9 +131,12 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) damon_for_each_scheme(s, c) { rc = scnprintf(&buf[written], len - written, "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", - s->min_sz_region, s->max_sz_region, - s->min_nr_accesses, s->max_nr_accesses, - s->min_age_region, s->max_age_region, + s->pattern.min_sz_region, + s->pattern.max_sz_region, + s->pattern.min_nr_accesses, + s->pattern.max_nr_accesses, + s->pattern.min_age_region, + s->pattern.max_age_region, damos_action_to_dbgfs_scheme_action(s->action), s->quota.ms, s->quota.sz, s->quota.reset_interval, @@ -221,8 +224,6 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, struct damos *scheme, **schemes; const int max_nr_schemes = 256; int pos = 0, parsed, ret; - unsigned long min_sz, max_sz; - unsigned int min_nr_a, max_nr_a, min_age, max_age; unsigned int action_input; enum damos_action action; @@ -233,13 +234,18 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, *nr_schemes = 0; while (pos < len && *nr_schemes < max_nr_schemes) { + struct damos_access_pattern pattern = {}; struct damos_quota quota = {}; struct damos_watermarks wmarks; ret = sscanf(&str[pos], "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n", - &min_sz, &max_sz, &min_nr_a, &max_nr_a, - &min_age, &max_age, &action_input, "a.ms, + &pattern.min_sz_region, &pattern.max_sz_region, + &pattern.min_nr_accesses, + &pattern.max_nr_accesses, + &pattern.min_age_region, + &pattern.max_age_region, + &action_input, "a.ms, "a.sz, "a.reset_interval, "a.weight_sz, "a.weight_nr_accesses, "a.weight_age, &wmarks.metric, @@ -251,7 +257,9 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, if ((int)action < 0) goto fail; - if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age) + if (pattern.min_sz_region > pattern.max_sz_region || + pattern.min_nr_accesses > pattern.max_nr_accesses || + pattern.min_age_region > pattern.max_age_region) goto fail; if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low || @@ -259,8 +267,7 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, goto fail; pos += parsed; - scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a, - min_age, max_age, action, "a, &wmarks); + scheme = damon_new_scheme(&pattern, action, "a, &wmarks); if (!scheme) goto fail; diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 9de6f00a71c5d..0184ed4828b7e 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -293,6 +293,17 @@ static bool get_monitoring_region(unsigned long *start, unsigned long *end) /* Create a DAMON-based operation scheme for hot memory regions */ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) { + struct damos_access_pattern pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* and accessed for more than the threshold */ + .min_nr_accesses = hot_thres, + .max_nr_accesses = UINT_MAX, + /* no matter its age */ + .min_age_region = 0, + .max_age_region = UINT_MAX, + }; struct damos_watermarks wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = wmarks_interval, @@ -313,26 +324,31 @@ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) .weight_nr_accesses = 1, .weight_age = 0, }; - struct damos *scheme = damon_new_scheme( - /* Find regions having PAGE_SIZE or larger size */ - PAGE_SIZE, ULONG_MAX, - /* and accessed for more than the threshold */ - hot_thres, UINT_MAX, - /* no matter its age */ - 0, UINT_MAX, + + return damon_new_scheme( + &pattern, /* prioritize those on LRU lists, as soon as found */ DAMOS_LRU_PRIO, /* under the quota. */ "a, /* (De)activate this according to the watermarks. */ &wmarks); - - return scheme; } /* Create a DAMON-based operation scheme for cold memory regions */ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) { + struct damos_access_pattern pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* and not accessed at all */ + .min_nr_accesses = 0, + .max_nr_accesses = 0, + /* for min_age or more micro-seconds */ + .min_age_region = cold_thres, + .max_age_region = UINT_MAX, + }; struct damos_watermarks wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = wmarks_interval, @@ -354,21 +370,15 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) .weight_nr_accesses = 0, .weight_age = 1, }; - struct damos *scheme = damon_new_scheme( - /* Find regions having PAGE_SIZE or larger size */ - PAGE_SIZE, ULONG_MAX, - /* and not accessed at all */ - 0, 0, - /* for cold_thres or more micro-seconds, and */ - cold_thres, UINT_MAX, + + return damon_new_scheme( + &pattern, /* mark those as not accessed, as soon as found */ DAMOS_LRU_DEPRIO, /* under the quota. */ "a, /* (De)activate this according to the watermarks. */ &wmarks); - - return scheme; } static int damon_lru_sort_apply_parameters(void) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index a7faf51b4bd4a..5aeca0b9e88ec 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -264,6 +264,17 @@ static bool get_monitoring_region(unsigned long *start, unsigned long *end) static struct damos *damon_reclaim_new_scheme(void) { + struct damos_access_pattern pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* and not accessed at all */ + .min_nr_accesses = 0, + .max_nr_accesses = 0, + /* for min_age or more micro-seconds */ + .min_age_region = min_age / aggr_interval, + .max_age_region = UINT_MAX, + }; struct damos_watermarks wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = wmarks_interval, @@ -284,21 +295,15 @@ static struct damos *damon_reclaim_new_scheme(void) .weight_nr_accesses = 0, .weight_age = 1 }; - struct damos *scheme = damon_new_scheme( - /* Find regions having PAGE_SIZE or larger size */ - PAGE_SIZE, ULONG_MAX, - /* and not accessed at all */ - 0, 0, - /* for min_age or more micro-seconds, and */ - min_age / aggr_interval, UINT_MAX, + + return damon_new_scheme( + &pattern, /* page out those, as soon as found */ DAMOS_PAGEOUT, /* under the quota. */ "a, /* (De)activate this according to the watermarks. */ &wmarks); - - return scheme; } static int damon_reclaim_apply_parameters(void) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 91c678e7f6897..9fbde9910a568 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2259,11 +2259,20 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, static struct damos *damon_sysfs_mk_scheme( struct damon_sysfs_scheme *sysfs_scheme) { - struct damon_sysfs_access_pattern *pattern = + struct damon_sysfs_access_pattern *access_pattern = sysfs_scheme->access_pattern; struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + + struct damos_access_pattern pattern = { + .min_sz_region = access_pattern->sz->min, + .max_sz_region = access_pattern->sz->max, + .min_nr_accesses = access_pattern->nr_accesses->min, + .max_nr_accesses = access_pattern->nr_accesses->max, + .min_age_region = access_pattern->age->min, + .max_age_region = access_pattern->age->max, + }; struct damos_quota quota = { .ms = sysfs_quotas->ms, .sz = sysfs_quotas->sz, @@ -2280,10 +2289,8 @@ static struct damos *damon_sysfs_mk_scheme( .low = sysfs_wmarks->low, }; - return damon_new_scheme(pattern->sz->min, pattern->sz->max, - pattern->nr_accesses->min, pattern->nr_accesses->max, - pattern->age->min, pattern->age->max, - sysfs_scheme->action, "a, &wmarks); + return damon_new_scheme(&pattern, sysfs_scheme->action, "a, + &wmarks); } static int damon_sysfs_set_schemes(struct damon_ctx *ctx, From dce45bc5d6f8fb8e84a44a2f9315a1e5bd76346e Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 8 Sep 2022 11:13:17 +0800 Subject: [PATCH 134/352] mm/damon/vaddr: add a comment for 'default' case in damon_va_apply_scheme() The switch case 'DAMOS_STAT' and switch case 'default' have same return value in damon_va_apply_scheme(), and the 'default' case is for DAMOS actions that not supported by 'vaddr'. It might make sense to add a comment here. Link: https://lkml.kernel.org/r/1662606797-23534-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 349b44d699e2a..ef8983a3cf0f3 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -658,6 +658,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, case DAMOS_STAT: return 0; default: + /* + * DAMOS actions that not yet supported by 'vaddr'. + */ return 0; } From 9b0477a1807b7f34b97f353ab342bfaf3d374776 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 8 Sep 2022 15:31:21 -0700 Subject: [PATCH 135/352] mm-damon-vaddr-add-a-comment-for-default-case-in-damon_va_apply_scheme-fix fx comment grammar Cc: Kaixu Xia Cc: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index ef8983a3cf0f3..c2c08c1b316bd 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -659,7 +659,7 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, return 0; default: /* - * DAMOS actions that not yet supported by 'vaddr'. + * DAMOS actions that are not yet supported by 'vaddr'. */ return 0; } From fdf6aca02eb967aef564b2ddbfb52420cd9f5f5b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 9 Sep 2022 09:00:31 -0400 Subject: [PATCH 136/352] tmpfs: add support for an i_version counter NFSv4 mandates a change attribute to avoid problems with timestamp granularity, which Linux implements using the i_version counter. This is particularly important when the underlying filesystem is fast. Give tmpfs an i_version counter. Since it doesn't have to be persistent, we can just turn on SB_I_VERSION and sprinkle some inode_inc_iversion calls in the right places. Also, while there is no formal spec for xattrs, most implementations update the ctime on setxattr. Fix shmem_xattr_handler_set to update the ctime and bump the i_version appropriately. Link: https://lkml.kernel.org/r/20220909130031.15477-1-jlayton@kernel.org Signed-off-by: Jeff Layton Cc: Chuck Lever Cc: Alexander Viro Cc: Hugh Dickins Signed-off-by: Andrew Morton --- fs/posix_acl.c | 3 +++ mm/shmem.c | 31 ++++++++++++++++++++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 5af33800743e4..efb88a5e59f9e 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -24,6 +24,7 @@ #include #include #include +#include static struct posix_acl **acl_by_type(struct inode *inode, int type) { @@ -1073,6 +1074,8 @@ int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode, } inode->i_ctime = current_time(inode); + if (IS_I_VERSION(inode)) + inode_inc_iversion(inode); set_cached_acl(inode, type, acl); return 0; } diff --git a/mm/shmem.c b/mm/shmem.c index 3d0b729fcc5ec..275899bacbeaf 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "swap.h" static struct vfsmount *shm_mnt; @@ -1030,6 +1031,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { shmem_undo_range(inode, lstart, lend, false); inode->i_ctime = inode->i_mtime = current_time(inode); + inode_inc_iversion(inode); } EXPORT_SYMBOL_GPL(shmem_truncate_range); @@ -1074,6 +1076,8 @@ static int shmem_setattr(struct user_namespace *mnt_userns, struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); int error; + bool update_mtime = false; + bool update_ctime = true; error = setattr_prepare(&init_user_ns, dentry, attr); if (error) @@ -1094,7 +1098,9 @@ static int shmem_setattr(struct user_namespace *mnt_userns, if (error) return error; i_size_write(inode, newsize); - inode->i_ctime = inode->i_mtime = current_time(inode); + update_mtime = true; + } else { + update_ctime = false; } if (newsize <= oldsize) { loff_t holebegin = round_up(newsize, PAGE_SIZE); @@ -1114,6 +1120,12 @@ static int shmem_setattr(struct user_namespace *mnt_userns, setattr_copy(&init_user_ns, inode, attr); if (attr->ia_valid & ATTR_MODE) error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + if (!error && update_ctime) { + inode->i_ctime = current_time(inode); + if (update_mtime) + inode->i_mtime = inode->i_ctime; + inode_inc_iversion(inode); + } return error; } @@ -2890,6 +2902,7 @@ shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir, error = 0; dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = current_time(dir); + inode_inc_iversion(dir); d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ } @@ -2965,6 +2978,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + inode_inc_iversion(dir); inc_nlink(inode); ihold(inode); /* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ @@ -2982,6 +2996,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) dir->i_size -= BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + inode_inc_iversion(dir); drop_nlink(inode); dput(dentry); /* Undo the count from "create" - this does all the work */ return 0; @@ -3071,6 +3086,8 @@ static int shmem_rename2(struct user_namespace *mnt_userns, old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = new_dir->i_mtime = inode->i_ctime = current_time(old_dir); + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); return 0; } @@ -3123,6 +3140,7 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, } dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = current_time(dir); + inode_inc_iversion(dir); d_instantiate(dentry, inode); dget(dentry); return 0; @@ -3194,6 +3212,7 @@ static int shmem_fileattr_set(struct user_namespace *mnt_userns, shmem_set_inode_flags(inode, info->fsflags); inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); return 0; } @@ -3257,9 +3276,15 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler, size_t size, int flags) { struct shmem_inode_info *info = SHMEM_I(inode); + int err; name = xattr_full_name(handler, name); - return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); + err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); + if (!err) { + inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); + } + return err; } static const struct xattr_handler shmem_security_xattr_handler = { @@ -3722,7 +3747,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_flags |= SB_NOUSER; } sb->s_export_op = &shmem_export_ops; - sb->s_flags |= SB_NOSEC; + sb->s_flags |= SB_NOSEC | SB_I_VERSION; #else sb->s_flags |= SB_NOUSER; #endif From fc7ce5316df4e184dd46cfa92e3224bd6f586206 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:28:55 +0000 Subject: [PATCH 137/352] selftest/damon: add a test for duplicate context dirs creation Patch series "mm/damon: minor fixes and cleanups". This patchset contains minor fixes and cleanups for DAMON including - selftest for a bug we found before (Patch 1), - fix of region holes in vaddr corner case and a kunit test for it (Patches 2 and 3), and - documents/Kconfig updates for title wordsmithing (Patch 4) and more aggressive DAMON debugfs interface deprecation announcement (Patches 5-7). This patch (of 7): Commit d26f60703606 ("mm/damon/dbgfs: avoid duplicate context directory creation") fixes a bug which could result in memory leak and DAMON disablement. This commit adds a selftest for verifying the fix and avoid regression. Link: https://lkml.kernel.org/r/20220909202901.57977-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220909202901.57977-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 1 + .../debugfs_duplicate_context_creation.sh | 27 +++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 0470c5f3e6906..a1fa2eff8192f 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -6,6 +6,7 @@ TEST_GEN_FILES += huge_count_read_write TEST_FILES = _chk_dependency.sh _debugfs_common.sh TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh +TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += sysfs.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh new file mode 100644 index 0000000000000..4a76e37ef16b1 --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source _debugfs_common.sh + +# Test duplicated context creation +# ================================ + +if ! echo foo > "$DBGFS/mk_contexts" +then + echo "context creation failed" + exit 1 +fi + +if echo foo > "$DBGFS/mk_contexts" +then + echo "duplicate context creation success" + exit 1 +fi + +if ! echo foo > "$DBGFS/rm_contexts" +then + echo "context deletion failed" + exit 1 +fi + +exit 0 From 6bae576fd296dee35620aee2563394f5190495b5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:28:56 +0000 Subject: [PATCH 138/352] mm/damon/core: avoid holes in newly set monitoring target ranges When there are two or more non-contiguous regions intersecting with given new ranges, 'damon_set_regions()' does not fill the holes. This commit makes the function to fill the holes with newly created regions. Link: https://lkml.kernel.org/r/20220909202901.57977-3-sj@kernel.org Fixes: 3f49584b262c ("mm/damon: implement primitives for the virtual memory address spaces") Signed-off-by: SeongJae Park Reported-by: Yun Levi Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/core.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index bae41990f4227..6f6c9c9aca9d4 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -168,6 +168,27 @@ static bool damon_intersect(struct damon_region *r, return !(r->ar.end <= re->start || re->end <= r->ar.start); } +/* + * Fill holes in regions with new regions. + */ +static void damon_fill_regions_holes(struct damon_region *first, + struct damon_region *last, struct damon_target *t) +{ + struct damon_region *r = first; + + damon_for_each_region_from(r, t) { + struct damon_region *next, *newr; + + if (r == last) + break; + next = damon_next_region(r); + if (r->ar.end != next->ar.start) { + newr = damon_new_region(r->ar.end, next->ar.start); + damon_insert_region(newr, r, next, t); + } + } +} + /* * damon_set_regions() - Set regions of a target for given address ranges. * @t: the given target. @@ -226,6 +247,9 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, first->ar.start = ALIGN_DOWN(range->start, DAMON_MIN_REGION); last->ar.end = ALIGN(range->end, DAMON_MIN_REGION); + + /* fill possible holes in the range */ + damon_fill_regions_holes(first, last, t); } } return 0; From ad2e1053653a9e13755068e80bccf4aa61c26803 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 21:54:20 +0000 Subject: [PATCH 139/352] mm/damon/core: handle error from 'damon_fill_regions_holes()' Commit 91fc6af21c61 ("mm/damon/core: avoid holes in newly set monitoring target ranges") in mm-unstable tree introduces 'damon_fill_regions_holes()', which does not check failures of 'damon_new_region()' call, so NULL dereferencing is available. This commit fixes the issue by checking failure of the function and returning an error code. Link: https://lkml.kernel.org/r/20220913215420.57761-1-sj@kernel.org Fixes: 91fc6af21c61 ("mm/damon/core: avoid holes in newly set monitoring target ranges") in mm-unstable Signed-off-by: SeongJae Park Reported-by: Coverity Static Analyzer CID 1524904 Signed-off-by: Andrew Morton --- mm/damon/core.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 6f6c9c9aca9d4..5ad31d2feae40 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -171,7 +171,7 @@ static bool damon_intersect(struct damon_region *r, /* * Fill holes in regions with new regions. */ -static void damon_fill_regions_holes(struct damon_region *first, +static int damon_fill_regions_holes(struct damon_region *first, struct damon_region *last, struct damon_target *t) { struct damon_region *r = first; @@ -184,9 +184,12 @@ static void damon_fill_regions_holes(struct damon_region *first, next = damon_next_region(r); if (r->ar.end != next->ar.start) { newr = damon_new_region(r->ar.end, next->ar.start); + if (!newr) + return -ENOMEM; damon_insert_region(newr, r, next, t); } } + return 0; } /* @@ -205,6 +208,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, { struct damon_region *r, *next; unsigned int i; + int err; /* Remove regions which are not in the new ranges */ damon_for_each_region_safe(r, next, t) { @@ -249,7 +253,9 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, last->ar.end = ALIGN(range->end, DAMON_MIN_REGION); /* fill possible holes in the range */ - damon_fill_regions_holes(first, last, t); + err = damon_fill_regions_holes(first, last, t); + if (err) + return err; } } return 0; From 6226471af7d4999e80775065fbd69ca91bcdceca Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:28:57 +0000 Subject: [PATCH 140/352] mm/damon/core-test: test damon_set_regions Preceding commit fixes a bug in 'damon_set_regions()', which allows holes in the new monitoring target ranges. This commit adds a kunit test case for the problem to avoid any regression. Link: https://lkml.kernel.org/r/20220909202901.57977-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 45db79d28fdc3..3db9b73687562 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -267,6 +267,28 @@ static void damon_test_ops_registration(struct kunit *test) KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); } +static void damon_test_set_regions(struct kunit *test) +{ + struct damon_target *t = damon_new_target(); + struct damon_region *r1 = damon_new_region(4, 16); + struct damon_region *r2 = damon_new_region(24, 32); + struct damon_addr_range range = {.start = 8, .end = 28}; + unsigned long expects[] = {8, 16, 16, 24, 24, 28}; + int expect_idx = 0; + struct damon_region *r; + + damon_add_region(r1, t); + damon_add_region(r2, t); + damon_set_regions(t, &range, 1); + + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3); + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, r->ar.start, expects[expect_idx++]); + KUNIT_EXPECT_EQ(test, r->ar.end, expects[expect_idx++]); + } + damon_destroy_target(t); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -276,6 +298,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_merge_regions_of), KUNIT_CASE(damon_test_split_regions_of), KUNIT_CASE(damon_test_ops_registration), + KUNIT_CASE(damon_test_set_regions), {}, }; From 97cbbe52cd51ea5e91a4f3dfb1cad02c0de57e33 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:28:58 +0000 Subject: [PATCH 141/352] Docs/admin-guide/mm/damon: rename the title of the document The title of the DAMON document for admin-guide, 'Monitoring Data Accesses', could confuse readers in some ways. First of all, DAMON is not the only single way for data access monitoring. And the document is for not only the data access monitoring but also data access pattern based memory management optimizations (DAMOS). This commit updates the title to 'DAMON: Data Access MONitor', which more explicitly explains what the document describes. Link: https://lkml.kernel.org/r/20220909202901.57977-5-sj@kernel.org Fixes: c4ba6014aec3 ("Documentation: add documents for DAMON") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/index.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst index 05500042f7776..33d37bb2fb4e5 100644 --- a/Documentation/admin-guide/mm/damon/index.rst +++ b/Documentation/admin-guide/mm/damon/index.rst @@ -1,8 +1,8 @@ .. SPDX-License-Identifier: GPL-2.0 -======================== -Monitoring Data Accesses -======================== +========================== +DAMON: Data Access MONitor +========================== :doc:`DAMON ` allows light-weight data access monitoring. Using DAMON, users can analyze the memory access patterns of their systems and From 997c5b46107005d98c30bb745aaa780ec5ab3b72 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:28:59 +0000 Subject: [PATCH 142/352] mm/damon/Kconfig: notify debugfs deprecation plan Commit b18402726bd1 ("Docs/admin-guide/mm/damon/usage: document DAMON sysfs interface") announced the DAMON debugfs interface deprecation plan, but it is not so aggressively announced. As the deprecation time is coming, this commit makes the announce more easy to be found by adding the note to the config menu of DAMON debugfs interface. Link: https://lkml.kernel.org/r/20220909202901.57977-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- mm/damon/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 66265e3a9c659..7821fcb3f2586 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -68,6 +68,9 @@ config DAMON_DBGFS If unsure, say N. + This will be removed after >5.15.y LTS kernel is released, so users + should move to the sysfs interface (DAMON_SYSFS). + config DAMON_DBGFS_KUNIT_TEST bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS depends on DAMON_DBGFS && KUNIT=y From ba3a83d247f12eb9b7af790f0c33aef89b5fe778 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:29:00 +0000 Subject: [PATCH 143/352] Docs/admin-guide/mm/damon/start: mention the dependency as sysfs instead of debugfs 'Getting Started' document of DAMON says DAMON user-space tool, damo[1], is using DAMON debugfs interface, and therefore it needs to ensure debugfs is mounted. However, the latest version of the tool is using DAMON sysfs interface. Moreover, DAMON debugfs interface is going to be deprecated as announced by commit b18402726bd1 ("Docs/admin-guide/mm/damon/usage: document DAMON sysfs interface"). This commit therefore update the document to tell readers about DAMON sysfs interface dependency instead and never mention about debugfs interface, which will be deprecated. [1] https://github.com/awslabs/damo Link: https://lkml.kernel.org/r/20220909202901.57977-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/start.rst | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst index 4d5ca2c46288a..9f88afc734da4 100644 --- a/Documentation/admin-guide/mm/damon/start.rst +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -29,16 +29,9 @@ called DAMON Operator (DAMO). It is available at https://github.com/awslabs/damo. The examples below assume that ``damo`` is on your ``$PATH``. It's not mandatory, though. -Because DAMO is using the debugfs interface (refer to :doc:`usage` for the -detail) of DAMON, you should ensure debugfs is mounted. Mount it manually as -below:: - - # mount -t debugfs none /sys/kernel/debug/ - -or append the following line to your ``/etc/fstab`` file so that your system -can automatically mount debugfs upon booting:: - - debugfs /sys/kernel/debug debugfs defaults 0 0 +Because DAMO is using the sysfs interface (refer to :doc:`usage` for the +detail) of DAMON, you should ensure :doc:`sysfs ` is +mounted. Recording Data Access Patterns From 76ce51c233440b7dcfebd2da750d5c694b813e59 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:29:01 +0000 Subject: [PATCH 144/352] Docs/admin-guide/mm/damon/usage: note DAMON debugfs interface deprecation plan Commit b18402726bd1 ("Docs/admin-guide/mm/damon/usage: document DAMON sysfs interface") announced the DAMON debugfs interface deprecation plan, but it is not so aggressively announced. As the deprecation time is coming, this commit makes the announce more easy to be found by adding the note at the beginning of the DAMON debugfs interface usage document. Link: https://lkml.kernel.org/r/20220909202901.57977-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index d52f572a90298..c050b882ddc1c 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -393,6 +393,11 @@ the files as above. Above is only for an example. debugfs Interface ================= +.. note:: + + DAMON debugfs interface will be removed after next LTS kernel is released, so + users should move to the :ref:`sysfs interface `. + DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``, ``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and ``rm_contexts`` under its debugfs directory, ``/damon/``. From c4a6866a45e7185d263bde878b7c66b80e3e10be Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 9 Sep 2022 16:39:47 +0800 Subject: [PATCH 145/352] mm/shuffle: convert module_param_call to module_param_cb module_param_call is now completely consistent with module_param_cb, so there is no need to keep two macros. Convert module_param_call to module_param_cb since former is obsolete and latter is more kernel-ish. Link: https://lkml.kernel.org/r/20220909083947.3595610-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: David Hildenbrand Cc: Dan Williams Cc: Kefeng Wang Cc: Liu Shixin Cc: Paul Russel Signed-off-by: Andrew Morton --- mm/shuffle.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/mm/shuffle.c b/mm/shuffle.c index c13c33b247e87..fb1393b8b3a9d 100644 --- a/mm/shuffle.c +++ b/mm/shuffle.c @@ -12,23 +12,22 @@ DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key); static bool shuffle_param; -static int shuffle_show(char *buffer, const struct kernel_param *kp) -{ - return sprintf(buffer, "%c\n", shuffle_param ? 'Y' : 'N'); -} -static __meminit int shuffle_store(const char *val, +static __meminit int shuffle_param_set(const char *val, const struct kernel_param *kp) { - int rc = param_set_bool(val, kp); - - if (rc < 0) - return rc; - if (shuffle_param) + if (param_set_bool(val, kp)) + return -EINVAL; + if (*(bool *)kp->arg) static_branch_enable(&page_alloc_shuffle_key); return 0; } -module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400); + +static const struct kernel_param_ops shuffle_param_ops = { + .set = shuffle_param_set, + .get = param_get_bool, +}; +module_param_cb(shuffle, &shuffle_param_ops, &shuffle_param, 0400); /* * For two pages to be swapped in the shuffle, they must be free (on a From 569f860c07b063cab475cbbf0fdbd561a9c29416 Mon Sep 17 00:00:00 2001 From: Alexey Romanov Date: Fri, 9 Sep 2022 11:37:22 +0300 Subject: [PATCH 146/352] zsmalloc: use correct types in _first_obj_offset functions Since commit ffedd09fa9b0 ("zsmalloc: Stop using slab fields in struct page") we are using page->page_type (unsigned int) field instead of page->units (int) as first object offset in a subpage of zspage. So get_first_obj_offset() and set_first_obj_offset() functions should work with unsigned int type. Link: https://lkml.kernel.org/r/20220909083722.85024-1-avromanov@sberdevices.ru Fixes: ffedd09fa9b0 ("zsmalloc: Stop using slab fields in struct page") Signed-off-by: Alexey Romanov Reviewed-by: Sergey Senozhatsky Cc: Alexey Romanov Cc: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 12eb11e709393..525758713a553 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -472,12 +472,12 @@ static inline struct page *get_first_page(struct zspage *zspage) return first_page; } -static inline int get_first_obj_offset(struct page *page) +static inline unsigned int get_first_obj_offset(struct page *page) { return page->page_type; } -static inline void set_first_obj_offset(struct page *page, int offset) +static inline void set_first_obj_offset(struct page *page, unsigned int offset) { page->page_type = offset; } @@ -1592,7 +1592,7 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, static unsigned long find_alloced_obj(struct size_class *class, struct page *page, int *obj_idx) { - int offset = 0; + unsigned int offset; int index = *obj_idx; unsigned long handle = 0; void *addr = kmap_atomic(page); @@ -1846,7 +1846,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, struct zspage *zspage; struct page *dummy; void *s_addr, *d_addr, *addr; - int offset; + unsigned int offset; unsigned long handle; unsigned long old_obj, new_obj; unsigned int obj_idx; From fe8f9e8ec4e76e678920fcb8cd3f950e0e8bf88f Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 9 Sep 2022 16:31:40 +0800 Subject: [PATCH 147/352] mm: kfence: convert to DEFINE_SEQ_ATTRIBUTE Use DEFINE_SEQ_ATTRIBUTE helper macro to simplify the code. Link: https://lkml.kernel.org/r/20220909083140.3592919-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: Marco Elver Tested-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/kfence/core.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 8c08ae2101d7a..26de62a516652 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -719,24 +719,13 @@ static int show_object(struct seq_file *seq, void *v) return 0; } -static const struct seq_operations object_seqops = { +static const struct seq_operations objects_sops = { .start = start_object, .next = next_object, .stop = stop_object, .show = show_object, }; - -static int open_objects(struct inode *inode, struct file *file) -{ - return seq_open(file, &object_seqops); -} - -static const struct file_operations objects_fops = { - .open = open_objects, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(objects); static int __init kfence_debugfs_init(void) { From 59f83e505f78ff678a374a58bc28801f70a6e744 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 9 Sep 2022 21:36:06 +0000 Subject: [PATCH 148/352] mm/damon: remove duplicate get_monitoring_region() definitions In lru_sort.c and reclaim.c, they are all defining get_monitoring_region() function, there is no need to define it separately. As 'get_monitoring_region()' is not a 'static' function anymore, we try to use a prefix to distinguish with other functions, so there rename it to 'damon_find_biggest_system_ram'. Link: https://lkml.kernel.org/r/20220909213606.136221-1-sj@kernel.org Signed-off-by: Xin Hao Signed-off-by: SeongJae Park Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ mm/damon/core.c | 40 ++++++++++++++++++++++++++++++++++++++++ mm/damon/lru_sort.c | 37 ++----------------------------------- mm/damon/reclaim.c | 37 ++----------------------------------- 4 files changed, 46 insertions(+), 70 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 90f20675da22a..016b6c9c03d62 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -549,6 +549,8 @@ static inline bool damon_target_has_pid(const struct damon_ctx *ctx) int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); +bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end); + #endif /* CONFIG_DAMON */ #endif /* _DAMON_H */ diff --git a/mm/damon/core.c b/mm/damon/core.c index 5ad31d2feae40..2437c61b0bc0b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1245,4 +1245,44 @@ static int kdamond_fn(void *data) return 0; } +/* + * struct damon_system_ram_region - System RAM resource address region of + * [@start, @end). + * @start: Start address of the region (inclusive). + * @end: End address of the region (exclusive). + */ +struct damon_system_ram_region { + unsigned long start; + unsigned long end; +}; + +static int walk_system_ram(struct resource *res, void *arg) +{ + struct damon_system_ram_region *a = arg; + + if (a->end - a->start < resource_size(res)) { + a->start = res->start; + a->end = res->end; + } + return 0; +} + +/* + * Find biggest 'System RAM' resource and store its start and end address in + * @start and @end, respectively. If no System RAM is found, returns false. + */ +bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end) + +{ + struct damon_system_ram_region arg = {}; + + walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); + if (arg.end <= arg.start) + return false; + + *start = arg.start; + *end = arg.end; + return true; +} + #include "core-test.h" diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 0184ed4828b7e..8415e18fcf0ef 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -257,39 +257,6 @@ module_param(nr_cold_quota_exceeds, ulong, 0400); static struct damon_ctx *ctx; static struct damon_target *target; -struct damon_lru_sort_ram_walk_arg { - unsigned long start; - unsigned long end; -}; - -static int walk_system_ram(struct resource *res, void *arg) -{ - struct damon_lru_sort_ram_walk_arg *a = arg; - - if (a->end - a->start < resource_size(res)) { - a->start = res->start; - a->end = res->end; - } - return 0; -} - -/* - * Find biggest 'System RAM' resource and store its start and end address in - * @start and @end, respectively. If no System RAM is found, returns false. - */ -static bool get_monitoring_region(unsigned long *start, unsigned long *end) -{ - struct damon_lru_sort_ram_walk_arg arg = {}; - - walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); - if (arg.end <= arg.start) - return false; - - *start = arg.start; - *end = arg.end; - return true; -} - /* Create a DAMON-based operation scheme for hot memory regions */ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) { @@ -414,8 +381,8 @@ static int damon_lru_sort_apply_parameters(void) if (monitor_region_start > monitor_region_end) return -EINVAL; if (!monitor_region_start && !monitor_region_end && - !get_monitoring_region(&monitor_region_start, - &monitor_region_end)) + !damon_find_biggest_system_ram(&monitor_region_start, + &monitor_region_end)) return -EINVAL; addr_range.start = monitor_region_start; addr_range.end = monitor_region_end; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 5aeca0b9e88ec..fe7bc0c55ecb3 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -229,39 +229,6 @@ module_param(nr_quota_exceeds, ulong, 0400); static struct damon_ctx *ctx; static struct damon_target *target; -struct damon_reclaim_ram_walk_arg { - unsigned long start; - unsigned long end; -}; - -static int walk_system_ram(struct resource *res, void *arg) -{ - struct damon_reclaim_ram_walk_arg *a = arg; - - if (a->end - a->start < resource_size(res)) { - a->start = res->start; - a->end = res->end; - } - return 0; -} - -/* - * Find biggest 'System RAM' resource and store its start and end address in - * @start and @end, respectively. If no System RAM is found, returns false. - */ -static bool get_monitoring_region(unsigned long *start, unsigned long *end) -{ - struct damon_reclaim_ram_walk_arg arg = {}; - - walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); - if (arg.end <= arg.start) - return false; - - *start = arg.start; - *end = arg.end; - return true; -} - static struct damos *damon_reclaim_new_scheme(void) { struct damos_access_pattern pattern = { @@ -328,8 +295,8 @@ static int damon_reclaim_apply_parameters(void) if (monitor_region_start > monitor_region_end) return -EINVAL; if (!monitor_region_start && !monitor_region_end && - !get_monitoring_region(&monitor_region_start, - &monitor_region_end)) + !damon_find_biggest_system_ram(&monitor_region_start, + &monitor_region_end)) return -EINVAL; addr_range.start = monitor_region_start; addr_range.end = monitor_region_end; From b38608deae889d02273b09f014eb2a2c8a70746d Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Fri, 9 Sep 2022 07:31:09 +0000 Subject: [PATCH 149/352] mm: use nth_page instead of mem_map_offset mem_map_next To handle the discontiguous case, mem_map_next() has a parameter named `offset`. As a function caller, one would be confused why "get next entry" needs a parameter named "offset". The other drawback of mem_map_next() is that the callers must take care of the map between parameter "iter" and "offset", otherwise we may get an hole or duplication during iteration. So we use nth_page instead of mem_map_next. And replace mem_map_offset with nth_page() per Matthew's comments. Link: https://lkml.kernel.org/r/1662708669-9395-1-git-send-email-lic121@chinatelecom.cn Signed-off-by: Cheng Li Fixes: 69d177c2fc70 ("hugetlbfs: handle pages higher order than MAX_ORDER") Reviewed-by: Matthew Wilcox (Oracle) Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 29 +++++++++++++++++------------ mm/internal.h | 28 ---------------------------- mm/memory.c | 21 ++++++++++----------- 3 files changed, 27 insertions(+), 51 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6abe7a8d84851..7f9c591e28f5f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1306,12 +1306,13 @@ static void __destroy_compound_gigantic_page(struct page *page, { int i; int nr_pages = 1 << order; - struct page *p = page + 1; + struct page *p; atomic_set(compound_mapcount_ptr(page), 0); atomic_set(compound_pincount_ptr(page), 0); - for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + for (i = 1; i < nr_pages; i++) { + p = nth_page(page, i); p->mapping = NULL; clear_compound_head(p); if (!demote) @@ -1532,7 +1533,7 @@ static void add_hugetlb_page(struct hstate *h, struct page *page, static void __update_and_free_page(struct hstate *h, struct page *page) { int i; - struct page *subpage = page; + struct page *subpage; if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; @@ -1563,8 +1564,8 @@ static void __update_and_free_page(struct hstate *h, struct page *page) if (unlikely(PageHWPoison(page))) hugetlb_clear_page_hwpoison(page); - for (i = 0; i < pages_per_huge_page(h); - i++, subpage = mem_map_next(subpage, page, i)) { + for (i = 0; i < pages_per_huge_page(h); i++) { + subpage = nth_page(page, i); subpage->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_private | @@ -1771,13 +1772,15 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, { int i, j; int nr_pages = 1 << order; - struct page *p = page + 1; + struct page *p; /* we rely on prep_new_huge_page to set the destructor */ set_compound_order(page, order); __ClearPageReserved(page); __SetPageHead(page); - for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + for (i = 1; i < nr_pages; i++) { + p = nth_page(page, i); + /* * For gigantic hugepages allocated through bootmem at * boot, it's safer to be consistent with the not-gigantic @@ -1824,14 +1827,16 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, out_error: /* undo tail page modifications made above */ - p = page + 1; - for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) { + for (j = 1; j < i; j++) { + p = nth_page(page, j); clear_compound_head(p); set_page_refcounted(p); } /* need to clear PG_reserved on remaining tail pages */ - for (; j < nr_pages; j++, p = mem_map_next(p, page, j)) + for (; j < nr_pages; j++) { + p = nth_page(page, j); __ClearPageReserved(p); + } set_compound_order(page, 0); #ifdef CONFIG_64BIT page[1].compound_nr = 0; @@ -6128,7 +6133,7 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, for (nr = 0; nr < refs; nr++) { if (likely(pages)) - pages[nr] = mem_map_offset(page, nr); + pages[nr] = nth_page(page, nr); if (vmas) vmas[nr] = vma; } @@ -6292,7 +6297,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT); if (pages || vmas) - record_subpages_vmas(mem_map_offset(page, pfn_offset), + record_subpages_vmas(nth_page(page, pfn_offset), vma, refs, likely(pages) ? pages + i : NULL, vmas ? vmas + i : NULL); diff --git a/mm/internal.h b/mm/internal.h index 971e09ddf3518..e5b2d40991816 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -638,34 +638,6 @@ static inline void vunmap_range_noflush(unsigned long start, unsigned long end) } #endif /* !CONFIG_MMU */ -/* - * Return the mem_map entry representing the 'offset' subpage within - * the maximally aligned gigantic page 'base'. Handle any discontiguity - * in the mem_map at MAX_ORDER_NR_PAGES boundaries. - */ -static inline struct page *mem_map_offset(struct page *base, int offset) -{ - if (unlikely(offset >= MAX_ORDER_NR_PAGES)) - return nth_page(base, offset); - return base + offset; -} - -/* - * Iterator over all subpages within the maximally aligned gigantic - * page 'base'. Handle any discontiguity in the mem_map. - */ -static inline struct page *mem_map_next(struct page *iter, - struct page *base, int offset) -{ - if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) { - unsigned long pfn = page_to_pfn(base) + offset; - if (!pfn_valid(pfn)) - return NULL; - return pfn_to_page(pfn); - } - return iter + 1; -} - /* Memory initialisation debug and verification */ enum mminit_level { MMINIT_WARNING, diff --git a/mm/memory.c b/mm/memory.c index d671ad367d677..c01c12500169d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5690,11 +5690,11 @@ static void clear_gigantic_page(struct page *page, unsigned int pages_per_huge_page) { int i; - struct page *p = page; + struct page *p; might_sleep(); - for (i = 0; i < pages_per_huge_page; - i++, p = mem_map_next(p, page, i)) { + for (i = 0; i < pages_per_huge_page; i++) { + p = nth_page(page, i); cond_resched(); clear_user_highpage(p, addr + i * PAGE_SIZE); } @@ -5730,13 +5730,12 @@ static void copy_user_gigantic_page(struct page *dst, struct page *src, struct page *dst_base = dst; struct page *src_base = src; - for (i = 0; i < pages_per_huge_page; ) { + for (i = 0; i < pages_per_huge_page; i++) { + dst = nth_page(dst_base, i); + src = nth_page(src_base, i); + cond_resched(); copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); - - i++; - dst = mem_map_next(dst, dst_base, i); - src = mem_map_next(src, src_base, i); } } @@ -5783,10 +5782,10 @@ long copy_huge_page_from_user(struct page *dst_page, void *page_kaddr; unsigned long i, rc = 0; unsigned long ret_val = pages_per_huge_page * PAGE_SIZE; - struct page *subpage = dst_page; + struct page *subpage; - for (i = 0; i < pages_per_huge_page; - i++, subpage = mem_map_next(subpage, dst_page, i)) { + for (i = 0; i < pages_per_huge_page; i++) { + subpage = nth_page(dst_page, i); if (allow_pagefault) page_kaddr = kmap(subpage); else From 754e1b0e3388f6373b66e10f3115ca1a872c9b90 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 9 Sep 2022 10:57:11 +0800 Subject: [PATCH 150/352] writeback: remove unused macro DIRTY_FULL_SCOPE It's introduced but never used. Remove it. Link: https://lkml.kernel.org/r/20220909025711.32012-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Jan Kara Acked-by: Jens Axboe Cc: Bart Van Assche Cc: David Howells Cc: Matthew Wilcox Cc: NeilBrown Cc: Vlastimil Babka Cc: zhanglianjie Signed-off-by: Andrew Morton --- include/linux/writeback.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3f045f6d6c4f0..06f9291b6fd51 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -17,20 +17,12 @@ struct bio; DECLARE_PER_CPU(int, dirty_throttle_leaks); /* - * The 1/4 region under the global dirty thresh is for smooth dirty throttling: - * - * (thresh - thresh/DIRTY_FULL_SCOPE, thresh) - * - * Further beyond, all dirtier tasks will enter a loop waiting (possibly long - * time) for the dirty pages to drop, unless written enough pages. - * * The global dirty threshold is normally equal to the global dirty limit, * except when the system suddenly allocates a lot of anonymous memory and * knocks down the global dirty threshold quickly, in which case the global * dirty limit will follow down slowly to prevent livelocking all dirtier tasks. */ #define DIRTY_SCOPE 8 -#define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2) struct backing_dev_info; From c41447b216a4c355ebf7584a1d11c833f1ddf1e2 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 9 Sep 2022 10:16:53 +0800 Subject: [PATCH 151/352] mm/huge_memory: prevent THP_ZERO_PAGE_ALLOC increased twice A user who reads THP_ZERO_PAGE_ALLOC may be more concerned about the huge zero pages that are really allocated for thp. It is misleading to increase THP_ZERO_PAGE_ALLOC twice if two threads call get_huge_zero_page concurrently. Don't increase the value if the huge page is not really used. Update Documentation/admin-guide/mm/transhuge.rst to suit. Link: https://lkml.kernel.org/r/20220909021653.3371879-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Cc: Alexander Potapenko Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Kefeng Wang Cc: "Kirill A. Shutemov" Cc: Mike Kravetz Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 7 +++---- mm/huge_memory.c | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index c9c37f16eef88..8e3418ec4503e 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -366,10 +366,9 @@ thp_split_pmd page table entry. thp_zero_page_alloc - is incremented every time a huge zero page is - successfully allocated. It includes allocations which where - dropped due race with other allocation. Note, it doesn't count - every map of the huge zero page, only its allocation. + is incremented every time a huge zero page used for thp is + successfully allocated. Note, it doesn't count every map of + the huge zero page, only its allocation. thp_zero_page_alloc_failed is incremented if kernel fails to allocate diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 36ef79b851958..4938defe4e732 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -163,7 +163,6 @@ static bool get_huge_zero_page(void) count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); return false; } - count_vm_event(THP_ZERO_PAGE_ALLOC); preempt_disable(); if (cmpxchg(&huge_zero_page, NULL, zero_page)) { preempt_enable(); @@ -175,6 +174,7 @@ static bool get_huge_zero_page(void) /* We take additional reference here. It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); preempt_enable(); + count_vm_event(THP_ZERO_PAGE_ALLOC); return true; } From 9419f69f05fb0f18a801c6dc2bb408ae9c0017bb Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Thu, 8 Sep 2022 16:19:32 +0800 Subject: [PATCH 152/352] mm/damon/sysfs: change few functions execute order There's no need to run container_of() as early as we do. The compiler figures this out, but the resulting code is more readable. Link: https://lkml.kernel.org/r/20220908081932.77370-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 9fbde9910a568..50cc6cdcb5d67 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1031,8 +1031,7 @@ static ssize_t nr_schemes_show(struct kobject *kobj, static ssize_t nr_schemes_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_schemes *schemes = container_of(kobj, - struct damon_sysfs_schemes, kobj); + struct damon_sysfs_schemes *schemes; int nr, err = kstrtoint(buf, 0, &nr); if (err) @@ -1040,6 +1039,8 @@ static ssize_t nr_schemes_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + schemes = container_of(kobj, struct damon_sysfs_schemes, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_schemes_add_dirs(schemes, nr); @@ -1237,8 +1238,7 @@ static ssize_t nr_regions_show(struct kobject *kobj, static ssize_t nr_regions_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_regions *regions = container_of(kobj, - struct damon_sysfs_regions, kobj); + struct damon_sysfs_regions *regions; int nr, err = kstrtoint(buf, 0, &nr); if (err) @@ -1246,6 +1246,8 @@ static ssize_t nr_regions_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + regions = container_of(kobj, struct damon_sysfs_regions, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_regions_add_dirs(regions, nr); @@ -1440,8 +1442,7 @@ static ssize_t nr_targets_show(struct kobject *kobj, static ssize_t nr_targets_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_targets *targets = container_of(kobj, - struct damon_sysfs_targets, kobj); + struct damon_sysfs_targets *targets; int nr, err = kstrtoint(buf, 0, &nr); if (err) @@ -1449,6 +1450,8 @@ static ssize_t nr_targets_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + targets = container_of(kobj, struct damon_sysfs_targets, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_targets_add_dirs(targets, nr); @@ -1962,8 +1965,7 @@ static ssize_t nr_contexts_show(struct kobject *kobj, static ssize_t nr_contexts_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_contexts *contexts = container_of(kobj, - struct damon_sysfs_contexts, kobj); + struct damon_sysfs_contexts *contexts; int nr, err; err = kstrtoint(buf, 0, &nr); @@ -1973,6 +1975,7 @@ static ssize_t nr_contexts_store(struct kobject *kobj, if (nr < 0 || 1 < nr) return -EINVAL; + contexts = container_of(kobj, struct damon_sysfs_contexts, kobj); if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_contexts_add_dirs(contexts, nr); @@ -2737,8 +2740,7 @@ static ssize_t nr_kdamonds_show(struct kobject *kobj, static ssize_t nr_kdamonds_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_kdamonds *kdamonds = container_of(kobj, - struct damon_sysfs_kdamonds, kobj); + struct damon_sysfs_kdamonds *kdamonds; int nr, err; err = kstrtoint(buf, 0, &nr); @@ -2747,6 +2749,8 @@ static ssize_t nr_kdamonds_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + kdamonds = container_of(kobj, struct damon_sysfs_kdamonds, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_kdamonds_add_dirs(kdamonds, nr); From d19cf14c30c822cd0bb72aa02a3c12d1a08a828f Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Mon, 12 Sep 2022 23:11:53 +0800 Subject: [PATCH 153/352] mm/damon/sysfs: use the wrapper directly to check if the kdamond is running We can use the 'damon_sysfs_kdamond_running()' wrapper directly to check if the kdamond is running in 'damon_sysfs_turn_damon_on()'. Link: https://lkml.kernel.org/r/1662995513-24489-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 50cc6cdcb5d67..364699309c9b2 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2465,8 +2465,7 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) struct damon_ctx *ctx; int err; - if (kdamond->damon_ctx && - damon_sysfs_ctx_running(kdamond->damon_ctx)) + if (damon_sysfs_kdamond_running(kdamond)) return -EBUSY; if (damon_sysfs_cmd_request.kdamond == kdamond) return -EBUSY; From efa045dec783d4f1159c2448c5f4937e4dd2a82e Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Mon, 12 Sep 2022 22:39:03 +0800 Subject: [PATCH 154/352] mm/damon: improve damon_new_region strategy Kdamond is implemented as a periodical split-merge pattern, which will create and destroy regions possibly at high frequency (hundreds or even thousands of per sec), depending on the number of regions and aggregation period. In that case, kmalloc and kfree could bring speed and space overheads, which can be improved by using a private kmem cache. Link: https://lkml.kernel.org/r/TYCP286MB2323DA1894FA55BB9CF90978CA449@TYCP286MB2323.JPNP286.PROD.OUTLOOK.COM Signed-off-by: Dawei Li Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 2437c61b0bc0b..d7fad24880413 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -29,6 +29,8 @@ static bool running_exclusive_ctxs; static DEFINE_MUTEX(damon_ops_lock); static struct damon_operations damon_registered_ops[NR_DAMON_OPS]; +static struct kmem_cache *damon_region_cache __ro_after_init; + /* Should be called under damon_ops_lock with id smaller than NR_DAMON_OPS */ static bool __damon_is_registered_ops(enum damon_ops_id id) { @@ -119,7 +121,7 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) { struct damon_region *region; - region = kmalloc(sizeof(*region), GFP_KERNEL); + region = kmem_cache_alloc(damon_region_cache, GFP_KERNEL); if (!region) return NULL; @@ -148,7 +150,7 @@ static void damon_del_region(struct damon_region *r, struct damon_target *t) static void damon_free_region(struct damon_region *r) { - kfree(r); + kmem_cache_free(damon_region_cache, r); } void damon_destroy_region(struct damon_region *r, struct damon_target *t) @@ -1285,4 +1287,18 @@ bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end) return true; } +static int __init damon_init(void) +{ + damon_region_cache = kmem_cache_create("damon_region_cache", sizeof(struct damon_region), + 0, 0, NULL); + if (unlikely(!damon_region_cache)) { + pr_err("creating damon_region_cache fails\n"); + return -ENOMEM; + } + + return 0; +} + +subsys_initcall(damon_init); + #include "core-test.h" From 5a647e82366f62ee7bdc2895e0f94f211532c497 Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Wed, 14 Sep 2022 00:21:58 +0800 Subject: [PATCH 155/352] mm/damon: creating kmem cache for damon regions by KMEM_CACHE() Damon regions are dynamic objects which can be created and destroyed frequently, a dedicated slab cache is created by KMEM_CACHE(), as suggested by akpm. Link: https://lkml.kernel.org/r/Message-ID: Signed-off-by: Dawei Li Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index d7fad24880413..c9ec2de845b32 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1289,8 +1289,7 @@ bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end) static int __init damon_init(void) { - damon_region_cache = kmem_cache_create("damon_region_cache", sizeof(struct damon_region), - 0, 0, NULL); + damon_region_cache = KMEM_CACHE(damon_region, 0); if (unlikely(!damon_region_cache)) { pr_err("creating damon_region_cache fails\n"); return -ENOMEM; From 9348cd60925a21449bf1d20a85a7b4c54dc2e235 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 13 Sep 2022 00:27:44 +0900 Subject: [PATCH 156/352] zram: do not waste zram_table_entry flags bits zram_table_entry::flags stores object size in the lower bits and zram pageflags in the upper bits. However, for some reason, we use 24 lower bits, while maximum zram object size is PAGE_SIZE, which requires PAGE_SHIFT bits (up to 16 on arm64). This wastes 24 - PAGE_SHIFT bits that we can use for additional zram pageflags instead. Also add a BUILD_BUG_ON() to alert us should we run out of bits in zram_table_entry::flags. Link: https://lkml.kernel.org/r/20220912152744.527438-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Reviewed-by: Brian Geffon Acked-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 2 ++ drivers/block/zram/zram_drv.h | 15 +++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 607f4634c27da..eb021db21ddfb 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2131,6 +2131,8 @@ static int __init zram_init(void) { int ret; + BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > BITS_PER_LONG); + ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare", zcomp_cpu_up_prepare, zcomp_cpu_dead); if (ret < 0) diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 80c3b43b4828f..a2bda53020fdd 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -30,16 +30,15 @@ /* - * The lower ZRAM_FLAG_SHIFT bits of table.flags is for - * object size (excluding header), the higher bits is for - * zram_pageflags. + * ZRAM is mainly used for memory efficiency so we want to keep memory + * footprint small and thus squeeze size and zram pageflags into a flags + * member. The lower ZRAM_FLAG_SHIFT bits is for object size (excluding + * header), which cannot be larger than PAGE_SIZE (requiring PAGE_SHIFT + * bits), the higher bits are for zram_pageflags. * - * zram is mainly used for memory efficiency so we want to keep memory - * footprint small so we can squeeze size and flags into a field. - * The lower ZRAM_FLAG_SHIFT bits is for object size (excluding header), - * the higher bits is for zram_pageflags. + * We use BUILD_BUG_ON() to make sure that zram pageflags don't overflow. */ -#define ZRAM_FLAG_SHIFT 24 +#define ZRAM_FLAG_SHIFT (PAGE_SHIFT + 1) /* Flags for zram pages (table[page_no].flags) */ enum zram_pageflags { From e0d5eea3f207cf83bc0d65cb1c1fc601258daab9 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 14 Sep 2022 14:20:33 +0900 Subject: [PATCH 157/352] zram: keep comments within 80-columns limit Several trivial fixups (that I should have spotted during review). Link: https://lkml.kernel.org/r/20220914052033.838050-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index eb021db21ddfb..43eeef2b9fbe2 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -329,8 +329,8 @@ static ssize_t idle_store(struct device *dev, if (!sysfs_streq(buf, "all")) { /* - * If it did not parse as 'all' try to treat it as an integer when - * we have memory tracking enabled. + * If it did not parse as 'all' try to treat it as an integer + * when we have memory tracking enabled. */ u64 age_sec; @@ -345,7 +345,10 @@ static ssize_t idle_store(struct device *dev, if (!init_done(zram)) goto out_unlock; - /* A cutoff_time of 0 marks everything as idle, this is the "all" behavior */ + /* + * A cutoff_time of 0 marks everything as idle, this is the + * "all" behavior. + */ mark_idle(zram, cutoff_time); rv = len; @@ -1416,11 +1419,11 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, if (comp_len != PAGE_SIZE) goto compress_again; /* - * If the page is not compressible, you need to acquire the lock and - * execute the code below. The zcomp_stream_get() call is needed to - * disable the cpu hotplug and grab the zstrm buffer back. - * It is necessary that the dereferencing of the zstrm variable below - * occurs correctly. + * If the page is not compressible, you need to acquire the + * lock and execute the code below. The zcomp_stream_get() + * call is needed to disable the cpu hotplug and grab the + * zstrm buffer back. It is necessary that the dereferencing + * of the zstrm variable below occurs correctly. */ zstrm = zcomp_stream_get(zram->comp); } From 8f8f5a7f1a72f732c0d7117205056a1f2fc5f880 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Sun, 11 Sep 2022 08:59:17 +0800 Subject: [PATCH 158/352] mm/damon: simplify scheme create in damon_lru_sort_apply_parameters In damon_lru_sort_apply_parameters(), we can use damon_set_schemes() to replace the way of creating the first 'scheme' in original code, this makes the code look cleaner. Link: https://lkml.kernel.org/r/20220911005917.835-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 8415e18fcf0ef..307ba71adcfa9 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -350,7 +350,7 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) static int damon_lru_sort_apply_parameters(void) { - struct damos *scheme, *next_scheme; + struct damos *scheme; struct damon_addr_range addr_range; unsigned int hot_thres, cold_thres; int err = 0; @@ -360,17 +360,15 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; - /* free previously set schemes */ - damon_for_each_scheme_safe(scheme, next_scheme, ctx) - damon_destroy_scheme(scheme); - /* aggr_interval / sample_interval is the maximum nr_accesses */ hot_thres = aggr_interval / sample_interval * hot_thres_access_freq / 1000; scheme = damon_lru_sort_new_hot_scheme(hot_thres); if (!scheme) return -ENOMEM; - damon_add_scheme(ctx, scheme); + err = damon_set_schemes(ctx, &scheme, 1); + if (err) + return err; cold_thres = cold_min_age / aggr_interval; scheme = damon_lru_sort_new_cold_scheme(cold_thres); From 0bfc31e04508d0dd738dca8e692752703b274e36 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:28 +0000 Subject: [PATCH 159/352] mm/damon/paddr: make supported DAMOS actions of paddr clear Patch series "mm/damon: cleanup code". DAMON code was not so clean from the beginning, but it has been too much nowadays, especially due to the duplicates in DAMON_RECLAIM and DAMON_LRU_SORT. This patchset cleans some of the mess. This patch (of 22): The 'switch-case' statement in 'damon_va_apply_scheme()' function provides a 'case' for every supported DAMOS action while all not-yet-supported DAMOS actions fall through the 'default' case, and comment it so that people can easily know which actions are supported. Its counterpart in 'paddr', 'damon_pa_apply_scheme()', however, doesn't. This commit makes the 'paddr' side function follows the pattern of 'vaddr' for better readability and consistency. Link: https://lkml.kernel.org/r/20220913174449.50645-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220913174449.50645-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 6b0d9e6aa6770..219127cb49e2e 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -275,7 +275,10 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, return damon_pa_mark_accessed(r); case DAMOS_LRU_DEPRIO: return damon_pa_deactivate_pages(r); + case DAMOS_STAT: + break; default: + /* DAMOS actions that not yet supported by 'paddr'. */ break; } return 0; From 57dfab401c86b6bc67572765b2ae576b730090b9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:29 +0000 Subject: [PATCH 160/352] mm/damon/paddr: deduplicate damon_pa_{mark_accessed,deactivate_pages}() The bodies of damon_pa_{mark_accessed,deactivate_pages}() contains duplicates. This commit factors out the common part to a separate function and removes the duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 219127cb49e2e..1ada62db68b13 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -232,7 +232,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r) return applied * PAGE_SIZE; } -static unsigned long damon_pa_mark_accessed(struct damon_region *r) +static inline unsigned long damon_pa_mark_accessed_or_deactivate( + struct damon_region *r, bool mark_accessed) { unsigned long addr, applied = 0; @@ -241,27 +242,24 @@ static unsigned long damon_pa_mark_accessed(struct damon_region *r) if (!page) continue; - mark_page_accessed(page); + if (mark_accessed) + mark_page_accessed(page); + else + deactivate_page(page); put_page(page); applied++; } return applied * PAGE_SIZE; } -static unsigned long damon_pa_deactivate_pages(struct damon_region *r) +static unsigned long damon_pa_mark_accessed(struct damon_region *r) { - unsigned long addr, applied = 0; - - for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { - struct page *page = damon_get_page(PHYS_PFN(addr)); + return damon_pa_mark_accessed_or_deactivate(r, true); +} - if (!page) - continue; - deactivate_page(page); - put_page(page); - applied++; - } - return applied * PAGE_SIZE; +static unsigned long damon_pa_deactivate_pages(struct damon_region *r) +{ + return damon_pa_mark_accessed_or_deactivate(r, false); } static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, From b2e03bfb2e9161c66e87dc94c6afe079c701b71d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:30 +0000 Subject: [PATCH 161/352] mm/damon/core: copy struct-to-struct instead of field-to-field in damon_new_scheme() The function for new 'struct damos' creation, 'damon_new_scheme()', copies each field of the struct one by one, though it could simply copied via struct to struct. This commit replaces the unnecessarily verbose field-to-field copies with struct-to-struct copies to make code simple and short. Link: https://lkml.kernel.org/r/20220913174449.50645-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index c9ec2de845b32..a564f83e9efe7 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -272,22 +272,13 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, scheme = kmalloc(sizeof(*scheme), GFP_KERNEL); if (!scheme) return NULL; - scheme->pattern.min_sz_region = pattern->min_sz_region; - scheme->pattern.max_sz_region = pattern->max_sz_region; - scheme->pattern.min_nr_accesses = pattern->min_nr_accesses; - scheme->pattern.max_nr_accesses = pattern->max_nr_accesses; - scheme->pattern.min_age_region = pattern->min_age_region; - scheme->pattern.max_age_region = pattern->max_age_region; + scheme->pattern = *pattern; scheme->action = action; scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); - scheme->quota.ms = quota->ms; - scheme->quota.sz = quota->sz; - scheme->quota.reset_interval = quota->reset_interval; - scheme->quota.weight_sz = quota->weight_sz; - scheme->quota.weight_nr_accesses = quota->weight_nr_accesses; - scheme->quota.weight_age = quota->weight_age; + scheme->quota = *quota; + /* caller might not zero-initialized the private fileds */ scheme->quota.total_charged_sz = 0; scheme->quota.total_charged_ns = 0; scheme->quota.esz = 0; @@ -296,11 +287,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, scheme->quota.charge_target_from = NULL; scheme->quota.charge_addr_from = 0; - scheme->wmarks.metric = wmarks->metric; - scheme->wmarks.interval = wmarks->interval; - scheme->wmarks.high = wmarks->high; - scheme->wmarks.mid = wmarks->mid; - scheme->wmarks.low = wmarks->low; + scheme->wmarks = *wmarks; scheme->wmarks.activated = true; return scheme; From e2a8c0f9cc1c48c49083018ea7fd5f0068a600d4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:31 +0000 Subject: [PATCH 162/352] mm/damon/core: factor out 'damos_quota' private fileds initialization The 'struct damos' creation function, 'damon_new_scheme()', does initialization of private fileds of 'struct damos_quota' in it. As its verbose and makes the function unnecessarily long, this commit factors it out to separate function. Link: https://lkml.kernel.org/r/20220913174449.50645-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index a564f83e9efe7..6d9f4c2dee35c 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -263,6 +263,19 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, return 0; } +/* initialize private fields of damos_quota and return the pointer */ +static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota) +{ + quota->total_charged_sz = 0; + quota->total_charged_ns = 0; + quota->esz = 0; + quota->charged_sz = 0; + quota->charged_from = 0; + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + return quota; +} + struct damos *damon_new_scheme(struct damos_access_pattern *pattern, enum damos_action action, struct damos_quota *quota, struct damos_watermarks *wmarks) @@ -277,15 +290,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); - scheme->quota = *quota; - /* caller might not zero-initialized the private fileds */ - scheme->quota.total_charged_sz = 0; - scheme->quota.total_charged_ns = 0; - scheme->quota.esz = 0; - scheme->quota.charged_sz = 0; - scheme->quota.charged_from = 0; - scheme->quota.charge_target_from = NULL; - scheme->quota.charge_addr_from = 0; + scheme->quota = *(damos_quota_init_priv(quota)); scheme->wmarks = *wmarks; scheme->wmarks.activated = true; From e4dd5c9976629e44e2398674b20fe591698dcf3b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:32 +0000 Subject: [PATCH 163/352] mm/damon/core: use a dedicated struct for monitoring attributes DAMON monitoring attributes are directly defined as fields of 'struct damon_ctx'. This makes 'struct damon_ctx' a little long and complicated. This commit defines and uses a struct, 'struct damon_attrs', which is dedicated for only the monitoring attributes to make the purpose of the five values clearer and simplify 'struct damon_ctx'. Link: https://lkml.kernel.org/r/20220913174449.50645-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 30 ++++++++++++++++++++---------- mm/damon/core.c | 34 +++++++++++++++++----------------- mm/damon/dbgfs.c | 6 +++--- mm/damon/ops-common.c | 4 ++-- mm/damon/vaddr.c | 4 ++-- 5 files changed, 44 insertions(+), 34 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 016b6c9c03d62..2ceee8b07726b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -389,13 +389,15 @@ struct damon_callback { }; /** - * struct damon_ctx - Represents a context for each monitoring. This is the - * main interface that allows users to set the attributes and get the results - * of the monitoring. + * struct damon_attrs - Monitoring attributes for accuracy/overhead control. * * @sample_interval: The time between access samplings. * @aggr_interval: The time between monitor results aggregations. * @ops_update_interval: The time between monitoring operations updates. + * @min_nr_regions: The minimum number of adaptive monitoring + * regions. + * @max_nr_regions: The maximum number of adaptive monitoring + * regions. * * For each @sample_interval, DAMON checks whether each region is accessed or * not. It aggregates and keeps the access information (number of accesses to @@ -405,7 +407,21 @@ struct damon_callback { * @ops_update_interval. All time intervals are in micro-seconds. * Please refer to &struct damon_operations and &struct damon_callback for more * detail. + */ +struct damon_attrs { + unsigned long sample_interval; + unsigned long aggr_interval; + unsigned long ops_update_interval; + unsigned long min_nr_regions; + unsigned long max_nr_regions; +}; + +/** + * struct damon_ctx - Represents a context for each monitoring. This is the + * main interface that allows users to set the attributes and get the results + * of the monitoring. * + * @attrs: Monitoring attributes for accuracy/overhead control. * @kdamond: Kernel thread who does the monitoring. * @kdamond_lock: Mutex for the synchronizations with @kdamond. * @@ -427,15 +443,11 @@ struct damon_callback { * @ops: Set of monitoring operations for given use cases. * @callback: Set of callbacks for monitoring events notifications. * - * @min_nr_regions: The minimum number of adaptive monitoring regions. - * @max_nr_regions: The maximum number of adaptive monitoring regions. * @adaptive_targets: Head of monitoring targets (&damon_target) list. * @schemes: Head of schemes (&damos) list. */ struct damon_ctx { - unsigned long sample_interval; - unsigned long aggr_interval; - unsigned long ops_update_interval; + struct damon_attrs attrs; /* private: internal use only */ struct timespec64 last_aggregation; @@ -448,8 +460,6 @@ struct damon_ctx { struct damon_operations ops; struct damon_callback callback; - unsigned long min_nr_regions; - unsigned long max_nr_regions; struct list_head adaptive_targets; struct list_head schemes; }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 6d9f4c2dee35c..bbd4c2d991dda 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -382,17 +382,17 @@ struct damon_ctx *damon_new_ctx(void) if (!ctx) return NULL; - ctx->sample_interval = 5 * 1000; - ctx->aggr_interval = 100 * 1000; - ctx->ops_update_interval = 60 * 1000 * 1000; + ctx->attrs.sample_interval = 5 * 1000; + ctx->attrs.aggr_interval = 100 * 1000; + ctx->attrs.ops_update_interval = 60 * 1000 * 1000; ktime_get_coarse_ts64(&ctx->last_aggregation); ctx->last_ops_update = ctx->last_aggregation; mutex_init(&ctx->kdamond_lock); - ctx->min_nr_regions = 10; - ctx->max_nr_regions = 1000; + ctx->attrs.min_nr_regions = 10; + ctx->attrs.max_nr_regions = 1000; INIT_LIST_HEAD(&ctx->adaptive_targets); INIT_LIST_HEAD(&ctx->schemes); @@ -448,11 +448,11 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, if (min_nr_reg > max_nr_reg) return -EINVAL; - ctx->sample_interval = sample_int; - ctx->aggr_interval = aggr_int; - ctx->ops_update_interval = ops_upd_int; - ctx->min_nr_regions = min_nr_reg; - ctx->max_nr_regions = max_nr_reg; + ctx->attrs.sample_interval = sample_int; + ctx->attrs.aggr_interval = aggr_int; + ctx->attrs.ops_update_interval = ops_upd_int; + ctx->attrs.min_nr_regions = min_nr_reg; + ctx->attrs.max_nr_regions = max_nr_reg; return 0; } @@ -507,8 +507,8 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) sz += r->ar.end - r->ar.start; } - if (ctx->min_nr_regions) - sz /= ctx->min_nr_regions; + if (ctx->attrs.min_nr_regions) + sz /= ctx->attrs.min_nr_regions; if (sz < DAMON_MIN_REGION) sz = DAMON_MIN_REGION; @@ -657,7 +657,7 @@ static bool damon_check_reset_time_interval(struct timespec64 *baseline, static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) { return damon_check_reset_time_interval(&ctx->last_aggregation, - ctx->aggr_interval); + ctx->attrs.aggr_interval); } /* @@ -1016,12 +1016,12 @@ static void kdamond_split_regions(struct damon_ctx *ctx) damon_for_each_target(t, ctx) nr_regions += damon_nr_regions(t); - if (nr_regions > ctx->max_nr_regions / 2) + if (nr_regions > ctx->attrs.max_nr_regions / 2) return; /* Maybe the middle of the region has different access frequency */ if (last_nr_regions == nr_regions && - nr_regions < ctx->max_nr_regions / 3) + nr_regions < ctx->attrs.max_nr_regions / 3) nr_subregions = 3; damon_for_each_target(t, ctx) @@ -1039,7 +1039,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx) static bool kdamond_need_update_operations(struct damon_ctx *ctx) { return damon_check_reset_time_interval(&ctx->last_ops_update, - ctx->ops_update_interval); + ctx->attrs.ops_update_interval); } /* @@ -1188,7 +1188,7 @@ static int kdamond_fn(void *data) continue; } - kdamond_usleep(ctx->sample_interval); + kdamond_usleep(ctx->attrs.sample_interval); if (ctx->ops.check_accesses) max_nr_accesses = ctx->ops.check_accesses(ctx); diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 1422037cedd2b..74e7542af6d34 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -55,9 +55,9 @@ static ssize_t dbgfs_attrs_read(struct file *file, mutex_lock(&ctx->kdamond_lock); ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n", - ctx->sample_interval, ctx->aggr_interval, - ctx->ops_update_interval, ctx->min_nr_regions, - ctx->max_nr_regions); + ctx->attrs.sample_interval, ctx->attrs.aggr_interval, + ctx->attrs.ops_update_interval, + ctx->attrs.min_nr_regions, ctx->attrs.max_nr_regions); mutex_unlock(&ctx->kdamond_lock); return simple_read_from_buffer(buf, count, ppos, kbuf, ret); diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index f599838b5f648..9310df72e1c54 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -99,10 +99,10 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r, unsigned int age_weight = s->quota.weight_age; int hotness; - max_nr_accesses = c->aggr_interval / c->sample_interval; + max_nr_accesses = c->attrs.aggr_interval / c->attrs.sample_interval; freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; - age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000; + age_in_sec = (unsigned long)r->age * c->attrs.aggr_interval / 1000000; for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; age_in_log++, age_in_sec >>= 1) ; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index c2c08c1b316bd..0eae47bd9ccbb 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -251,8 +251,8 @@ static void __damon_va_init_regions(struct damon_ctx *ctx, for (i = 0; i < 3; i++) sz += regions[i].end - regions[i].start; - if (ctx->min_nr_regions) - sz /= ctx->min_nr_regions; + if (ctx->attrs.min_nr_regions) + sz /= ctx->attrs.min_nr_regions; if (sz < DAMON_MIN_REGION) sz = DAMON_MIN_REGION; From e102813bfd18d7ea5986913d6b5c94a6674f271a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:33 +0000 Subject: [PATCH 164/352] mm/damon/core: reduce parameters for damon_set_attrs() Number of parameters for 'damon_set_attrs()' is six. As it could be confusing and verbose, this commit reduces the number by receiving single pointer to a 'struct damon_attrs'. Link: https://lkml.kernel.org/r/20220913174449.50645-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 +--- mm/damon/core.c | 21 +++++---------------- mm/damon/dbgfs.c | 9 ++++++--- mm/damon/lru_sort.c | 10 ++++++++-- mm/damon/reclaim.c | 10 ++++++++-- mm/damon/sysfs.c | 12 ++++++++---- 6 files changed, 36 insertions(+), 30 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 2ceee8b07726b..c5dc0c77c7722 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -540,9 +540,7 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); -int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long ops_upd_int, - unsigned long min_nr_reg, unsigned long max_nr_reg); +int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs); int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); diff --git a/mm/damon/core.c b/mm/damon/core.c index bbd4c2d991dda..29635a82cb691 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -428,32 +428,21 @@ void damon_destroy_ctx(struct damon_ctx *ctx) /** * damon_set_attrs() - Set attributes for the monitoring. * @ctx: monitoring context - * @sample_int: time interval between samplings - * @aggr_int: time interval between aggregations - * @ops_upd_int: time interval between monitoring operations updates - * @min_nr_reg: minimal number of regions - * @max_nr_reg: maximum number of regions + * @attrs: monitoring attributes * * This function should not be called while the kdamond is running. * Every time interval is in micro-seconds. * * Return: 0 on success, negative error code otherwise. */ -int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long ops_upd_int, - unsigned long min_nr_reg, unsigned long max_nr_reg) +int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) { - if (min_nr_reg < 3) + if (attrs->min_nr_regions < 3) return -EINVAL; - if (min_nr_reg > max_nr_reg) + if (attrs->min_nr_regions > attrs->max_nr_regions) return -EINVAL; - ctx->attrs.sample_interval = sample_int; - ctx->attrs.aggr_interval = aggr_int; - ctx->attrs.ops_update_interval = ops_upd_int; - ctx->attrs.min_nr_regions = min_nr_reg; - ctx->attrs.max_nr_regions = max_nr_reg; - + ctx->attrs = *attrs; return 0; } diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 74e7542af6d34..c00eba4448d85 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -67,7 +67,7 @@ static ssize_t dbgfs_attrs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct damon_ctx *ctx = file->private_data; - unsigned long s, a, r, minr, maxr; + struct damon_attrs attrs; char *kbuf; ssize_t ret; @@ -76,7 +76,10 @@ static ssize_t dbgfs_attrs_write(struct file *file, return PTR_ERR(kbuf); if (sscanf(kbuf, "%lu %lu %lu %lu %lu", - &s, &a, &r, &minr, &maxr) != 5) { + &attrs.sample_interval, &attrs.aggr_interval, + &attrs.ops_update_interval, + &attrs.min_nr_regions, + &attrs.max_nr_regions) != 5) { ret = -EINVAL; goto out; } @@ -87,7 +90,7 @@ static ssize_t dbgfs_attrs_write(struct file *file, goto unlock_out; } - ret = damon_set_attrs(ctx, s, a, r, minr, maxr); + ret = damon_set_attrs(ctx, &attrs); if (!ret) ret = count; unlock_out: diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 307ba71adcfa9..6d5f83965276f 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -350,13 +350,19 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) static int damon_lru_sort_apply_parameters(void) { + struct damon_attrs attrs = { + .sample_interval = sample_interval, + .aggr_interval = aggr_interval, + .ops_update_interval = 0, + .min_nr_regions = min_nr_regions, + .max_nr_regions = max_nr_regions, + }; struct damos *scheme; struct damon_addr_range addr_range; unsigned int hot_thres, cold_thres; int err = 0; - err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0, - min_nr_regions, max_nr_regions); + err = damon_set_attrs(ctx, &attrs); if (err) return err; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index fe7bc0c55ecb3..bc841efbab45e 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -275,12 +275,18 @@ static struct damos *damon_reclaim_new_scheme(void) static int damon_reclaim_apply_parameters(void) { + struct damon_attrs attrs = { + .sample_interval = sample_interval, + .aggr_interval = aggr_interval, + .ops_update_interval = 0, + .min_nr_regions = min_nr_regions, + .max_nr_regions = max_nr_regions, + }; struct damos *scheme; struct damon_addr_range addr_range; int err = 0; - err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0, - min_nr_regions, max_nr_regions); + err = damon_set_attrs(ctx, &attrs); if (err) return err; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 364699309c9b2..89f9fa3afd1f8 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2130,10 +2130,14 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, struct damon_sysfs_intervals *sys_intervals = sys_attrs->intervals; struct damon_sysfs_ul_range *sys_nr_regions = sys_attrs->nr_regions_range; - - return damon_set_attrs(ctx, sys_intervals->sample_us, - sys_intervals->aggr_us, sys_intervals->update_us, - sys_nr_regions->min, sys_nr_regions->max); + struct damon_attrs attrs = { + .sample_interval = sys_intervals->sample_us, + .aggr_interval = sys_intervals->aggr_us, + .ops_update_interval = sys_intervals->update_us, + .min_nr_regions = sys_nr_regions->min, + .max_nr_regions = sys_nr_regions->max, + }; + return damon_set_attrs(ctx, &attrs); } static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) From 227172dfaf615a4d9df701d3075b2ff8398e8969 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:34 +0000 Subject: [PATCH 165/352] mm/damon/reclaim: use 'struct damon_attrs' for storing parameters for it DAMON_RECLAIM receives monitoring attributes by parameters one by one to separate variables, and then combine those into 'struct damon_attrs'. This commit makes the module directly stores the parameter values to a static 'struct damon_attrs' variable and use it to simplify the code. Link: https://lkml.kernel.org/r/20220913174449.50645-8-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index bc841efbab45e..d35a00d8dde2d 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -129,14 +129,22 @@ module_param(wmarks_mid, ulong, 0600); static unsigned long wmarks_low __read_mostly = 200; module_param(wmarks_low, ulong, 0600); +static struct damon_attrs damon_reclaim_mon_attrs = { + .sample_interval = 5000, + .aggr_interval = 100000, + .ops_update_interval = 0, + .min_nr_regions = 10, + .max_nr_regions = 1000, +}; + /* * Sampling interval for the monitoring in microseconds. * * The sampling interval of DAMON for the cold memory monitoring. Please refer * to the DAMON documentation for more detail. 5 ms by default. */ -static unsigned long sample_interval __read_mostly = 5000; -module_param(sample_interval, ulong, 0600); +module_param_named(sample_interval, damon_reclaim_mon_attrs.sample_interval, + ulong, 0600); /* * Aggregation interval for the monitoring in microseconds. @@ -144,8 +152,8 @@ module_param(sample_interval, ulong, 0600); * The aggregation interval of DAMON for the cold memory monitoring. Please * refer to the DAMON documentation for more detail. 100 ms by default. */ -static unsigned long aggr_interval __read_mostly = 100000; -module_param(aggr_interval, ulong, 0600); +module_param_named(aggr_interval, damon_reclaim_mon_attrs.aggr_interval, ulong, + 0600); /* * Minimum number of monitoring regions. @@ -155,8 +163,8 @@ module_param(aggr_interval, ulong, 0600); * But, setting this too high could result in increased monitoring overhead. * Please refer to the DAMON documentation for more detail. 10 by default. */ -static unsigned long min_nr_regions __read_mostly = 10; -module_param(min_nr_regions, ulong, 0600); +module_param_named(min_nr_regions, damon_reclaim_mon_attrs.min_nr_regions, + ulong, 0600); /* * Maximum number of monitoring regions. @@ -166,8 +174,8 @@ module_param(min_nr_regions, ulong, 0600); * However, setting this too low could result in bad monitoring quality. * Please refer to the DAMON documentation for more detail. 1000 by default. */ -static unsigned long max_nr_regions __read_mostly = 1000; -module_param(max_nr_regions, ulong, 0600); +module_param_named(max_nr_regions, damon_reclaim_mon_attrs.max_nr_regions, + ulong, 0600); /* * Start of the target memory region in physical address. @@ -239,7 +247,8 @@ static struct damos *damon_reclaim_new_scheme(void) .min_nr_accesses = 0, .max_nr_accesses = 0, /* for min_age or more micro-seconds */ - .min_age_region = min_age / aggr_interval, + .min_age_region = min_age / + damon_reclaim_mon_attrs.aggr_interval, .max_age_region = UINT_MAX, }; struct damos_watermarks wmarks = { @@ -275,18 +284,11 @@ static struct damos *damon_reclaim_new_scheme(void) static int damon_reclaim_apply_parameters(void) { - struct damon_attrs attrs = { - .sample_interval = sample_interval, - .aggr_interval = aggr_interval, - .ops_update_interval = 0, - .min_nr_regions = min_nr_regions, - .max_nr_regions = max_nr_regions, - }; struct damos *scheme; struct damon_addr_range addr_range; int err = 0; - err = damon_set_attrs(ctx, &attrs); + err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs); if (err) return err; From ca054df314f653a1e8a2523c9d0dff449ad6ac5d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:35 +0000 Subject: [PATCH 166/352] mm/damon/lru_sort: use 'struct damon_attrs' for storing parameters for it DAMON_LRU_SORT receives monitoring attributes by parameters one by one to separate variables, and then combines those into 'struct damon_attrs'. This commit makes the module directly stores the parameter values to a static 'struct damon_attrs' variable and use it to simplify the code. Link: https://lkml.kernel.org/r/20220913174449.50645-9-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 6d5f83965276f..ade985b836527 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -127,14 +127,22 @@ module_param(wmarks_mid, ulong, 0600); static unsigned long wmarks_low __read_mostly = 50; module_param(wmarks_low, ulong, 0600); +static struct damon_attrs damon_lru_sort_mon_attrs = { + .sample_interval = 5000, + .aggr_interval = 100000, + .ops_update_interval = 0, + .min_nr_regions = 10, + .max_nr_regions = 1000, +}; + /* * Sampling interval for the monitoring in microseconds. * * The sampling interval of DAMON for the hot/cold memory monitoring. Please * refer to the DAMON documentation for more detail. 5 ms by default. */ -static unsigned long sample_interval __read_mostly = 5000; -module_param(sample_interval, ulong, 0600); +module_param_named(sample_interval, damon_lru_sort_mon_attrs.sample_interval, + ulong, 0600); /* * Aggregation interval for the monitoring in microseconds. @@ -142,8 +150,8 @@ module_param(sample_interval, ulong, 0600); * The aggregation interval of DAMON for the hot/cold memory monitoring. * Please refer to the DAMON documentation for more detail. 100 ms by default. */ -static unsigned long aggr_interval __read_mostly = 100000; -module_param(aggr_interval, ulong, 0600); +module_param_named(aggr_interval, damon_lru_sort_mon_attrs.aggr_interval, ulong, + 0600); /* * Minimum number of monitoring regions. @@ -153,8 +161,8 @@ module_param(aggr_interval, ulong, 0600); * But, setting this too high could result in increased monitoring overhead. * Please refer to the DAMON documentation for more detail. 10 by default. */ -static unsigned long min_nr_regions __read_mostly = 10; -module_param(min_nr_regions, ulong, 0600); +module_param_named(min_nr_regions, damon_lru_sort_mon_attrs.min_nr_regions, + ulong, 0600); /* * Maximum number of monitoring regions. @@ -164,8 +172,8 @@ module_param(min_nr_regions, ulong, 0600); * However, setting this too low could result in bad monitoring quality. * Please refer to the DAMON documentation for more detail. 1000 by default. */ -static unsigned long max_nr_regions __read_mostly = 1000; -module_param(max_nr_regions, ulong, 0600); +module_param_named(max_nr_regions, damon_lru_sort_mon_attrs.max_nr_regions, + ulong, 0600); /* * Start of the target memory region in physical address. @@ -350,25 +358,19 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) static int damon_lru_sort_apply_parameters(void) { - struct damon_attrs attrs = { - .sample_interval = sample_interval, - .aggr_interval = aggr_interval, - .ops_update_interval = 0, - .min_nr_regions = min_nr_regions, - .max_nr_regions = max_nr_regions, - }; struct damos *scheme; struct damon_addr_range addr_range; unsigned int hot_thres, cold_thres; int err = 0; - err = damon_set_attrs(ctx, &attrs); + err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs); if (err) return err; /* aggr_interval / sample_interval is the maximum nr_accesses */ - hot_thres = aggr_interval / sample_interval * hot_thres_access_freq / - 1000; + hot_thres = damon_lru_sort_mon_attrs.aggr_interval / + damon_lru_sort_mon_attrs.sample_interval * + hot_thres_access_freq / 1000; scheme = damon_lru_sort_new_hot_scheme(hot_thres); if (!scheme) return -ENOMEM; @@ -376,7 +378,7 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; - cold_thres = cold_min_age / aggr_interval; + cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval; scheme = damon_lru_sort_new_cold_scheme(cold_thres); if (!scheme) return -ENOMEM; From d50d1e5fdcda49e811e8691fac5333ea3c5b3b4a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:36 +0000 Subject: [PATCH 167/352] mm/damon: implement a monitoring attributes module parameters generator macro DAMON_RECLAIM and DAMON_LRU_SORT have module parameters for monitoring attributes that having same names. This commot implements a macro for generating such module parameters so that we can reuse later. Link: https://lkml.kernel.org/r/20220913174449.50645-10-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/modules-common.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 mm/damon/modules-common.h diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h new file mode 100644 index 0000000000000..0abd0636bc649 --- /dev/null +++ b/mm/damon/modules-common.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common Primitives for DAMON Modules + * + * Author: SeongJae Park + */ + +#include + +#define DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(attrs) \ + module_param_named(sample_interval, attrs.sample_interval, \ + ulong, 0600); \ + module_param_named(aggr_interval, attrs.aggr_interval, ulong, \ + 0600); \ + module_param_named(min_nr_regions, attrs.min_nr_regions, ulong, \ + 0600); \ + module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \ + 0600); From c3b4863564cd41c312562e945c86ee0c34a45346 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:37 +0000 Subject: [PATCH 168/352] mm/damon/lru_sort: use monitoring attributes parameters generaotr macro This commit makes DAMON_LRU_SORT to generate the module parameters for DAMON monitoring attributes using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-11-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 47 +++++---------------------------------------- 1 file changed, 5 insertions(+), 42 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index ade985b836527..e95626acee6f9 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -13,6 +13,8 @@ #include #include +#include "modules-common.h" + #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif @@ -128,52 +130,13 @@ static unsigned long wmarks_low __read_mostly = 50; module_param(wmarks_low, ulong, 0600); static struct damon_attrs damon_lru_sort_mon_attrs = { - .sample_interval = 5000, - .aggr_interval = 100000, + .sample_interval = 5000, /* 5 ms */ + .aggr_interval = 100000, /* 100 ms */ .ops_update_interval = 0, .min_nr_regions = 10, .max_nr_regions = 1000, }; - -/* - * Sampling interval for the monitoring in microseconds. - * - * The sampling interval of DAMON for the hot/cold memory monitoring. Please - * refer to the DAMON documentation for more detail. 5 ms by default. - */ -module_param_named(sample_interval, damon_lru_sort_mon_attrs.sample_interval, - ulong, 0600); - -/* - * Aggregation interval for the monitoring in microseconds. - * - * The aggregation interval of DAMON for the hot/cold memory monitoring. - * Please refer to the DAMON documentation for more detail. 100 ms by default. - */ -module_param_named(aggr_interval, damon_lru_sort_mon_attrs.aggr_interval, ulong, - 0600); - -/* - * Minimum number of monitoring regions. - * - * The minimal number of monitoring regions of DAMON for the hot/cold memory - * monitoring. This can be used to set lower-bound of the monitoring quality. - * But, setting this too high could result in increased monitoring overhead. - * Please refer to the DAMON documentation for more detail. 10 by default. - */ -module_param_named(min_nr_regions, damon_lru_sort_mon_attrs.min_nr_regions, - ulong, 0600); - -/* - * Maximum number of monitoring regions. - * - * The maximum number of monitoring regions of DAMON for the hot/cold memory - * monitoring. This can be used to set upper-bound of the monitoring overhead. - * However, setting this too low could result in bad monitoring quality. - * Please refer to the DAMON documentation for more detail. 1000 by default. - */ -module_param_named(max_nr_regions, damon_lru_sort_mon_attrs.max_nr_regions, - ulong, 0600); +DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_lru_sort_mon_attrs); /* * Start of the target memory region in physical address. From e1cb0e8cb15dc5bba61539c655f44177f090cce7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:38 +0000 Subject: [PATCH 169/352] mm/damon/reclaim: use monitoring attributes parameters generator macro This commit makes DAMON_RECLAIM to generate the module parameters for DAMON monitoring attributes using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-12-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 47 +++++----------------------------------------- 1 file changed, 5 insertions(+), 42 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index d35a00d8dde2d..48326bef20f51 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -13,6 +13,8 @@ #include #include +#include "modules-common.h" + #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif @@ -130,52 +132,13 @@ static unsigned long wmarks_low __read_mostly = 200; module_param(wmarks_low, ulong, 0600); static struct damon_attrs damon_reclaim_mon_attrs = { - .sample_interval = 5000, - .aggr_interval = 100000, + .sample_interval = 5000, /* 5 ms */ + .aggr_interval = 100000, /* 100 ms */ .ops_update_interval = 0, .min_nr_regions = 10, .max_nr_regions = 1000, }; - -/* - * Sampling interval for the monitoring in microseconds. - * - * The sampling interval of DAMON for the cold memory monitoring. Please refer - * to the DAMON documentation for more detail. 5 ms by default. - */ -module_param_named(sample_interval, damon_reclaim_mon_attrs.sample_interval, - ulong, 0600); - -/* - * Aggregation interval for the monitoring in microseconds. - * - * The aggregation interval of DAMON for the cold memory monitoring. Please - * refer to the DAMON documentation for more detail. 100 ms by default. - */ -module_param_named(aggr_interval, damon_reclaim_mon_attrs.aggr_interval, ulong, - 0600); - -/* - * Minimum number of monitoring regions. - * - * The minimal number of monitoring regions of DAMON for the cold memory - * monitoring. This can be used to set lower-bound of the monitoring quality. - * But, setting this too high could result in increased monitoring overhead. - * Please refer to the DAMON documentation for more detail. 10 by default. - */ -module_param_named(min_nr_regions, damon_reclaim_mon_attrs.min_nr_regions, - ulong, 0600); - -/* - * Maximum number of monitoring regions. - * - * The maximum number of monitoring regions of DAMON for the cold memory - * monitoring. This can be used to set upper-bound of the monitoring overhead. - * However, setting this too low could result in bad monitoring quality. - * Please refer to the DAMON documentation for more detail. 1000 by default. - */ -module_param_named(max_nr_regions, damon_reclaim_mon_attrs.max_nr_regions, - ulong, 0600); +DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_reclaim_mon_attrs); /* * Start of the target memory region in physical address. From c4afd2a819ac9ae251aa4a113fb8ab3e4e715427 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:39 +0000 Subject: [PATCH 170/352] mm/damon/modules-common: implement a watermarks module parameters generator macro DAMON_RECLAIM and DAMON_LRU_SORT have module parameters for watermarks that having same names. This commit implements a macro for generating such module parameters so that we can reuse later. Link: https://lkml.kernel.org/r/20220913174449.50645-13-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/modules-common.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 0abd0636bc649..1370590a37d18 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -16,3 +16,10 @@ 0600); \ module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \ 0600); + +#define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \ + module_param_named(wmarks_interval, wmarks->interval, ulong, \ + 0600); \ + module_param_named(wmarks_high, wmarks.high, ulong, 0600); \ + module_param_named(wmarks_mid, wmarks.mid, ulong, 0600); \ + module_param_named(wmarks_low, wmarks.lowulong, 0600); From 4484879039b23068fb9df01a6ca2abf2a92fc19c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:40 +0000 Subject: [PATCH 171/352] mm/damon/lru_sort: use watermarks parameters generator macro This commit makes DAMON_LRU_SORT to generate the module parameters for DAMOS watermarks using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-14-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 64 ++++++--------------------------------- mm/damon/modules-common.h | 4 +-- 2 files changed, 12 insertions(+), 56 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index e95626acee6f9..20760b39b50a4 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -90,44 +90,14 @@ module_param(quota_ms, ulong, 0600); static unsigned long quota_reset_interval_ms __read_mostly = 1000; module_param(quota_reset_interval_ms, ulong, 0600); -/* - * The watermarks check time interval in microseconds. - * - * Minimal time to wait before checking the watermarks, when DAMON_LRU_SORT is - * enabled but inactive due to its watermarks rule. 5 seconds by default. - */ -static unsigned long wmarks_interval __read_mostly = 5000000; -module_param(wmarks_interval, ulong, 0600); - -/* - * Free memory rate (per thousand) for the high watermark. - * - * If free memory of the system in bytes per thousand bytes is higher than - * this, DAMON_LRU_SORT becomes inactive, so it does nothing but periodically - * checks the watermarks. 200 (20%) by default. - */ -static unsigned long wmarks_high __read_mostly = 200; -module_param(wmarks_high, ulong, 0600); - -/* - * Free memory rate (per thousand) for the middle watermark. - * - * If free memory of the system in bytes per thousand bytes is between this and - * the low watermark, DAMON_LRU_SORT becomes active, so starts the monitoring - * and the LRU-lists sorting. 150 (15%) by default. - */ -static unsigned long wmarks_mid __read_mostly = 150; -module_param(wmarks_mid, ulong, 0600); - -/* - * Free memory rate (per thousand) for the low watermark. - * - * If free memory of the system in bytes per thousand bytes is lower than this, - * DAMON_LRU_SORT becomes inactive, so it does nothing but periodically checks - * the watermarks. 50 (5%) by default. - */ -static unsigned long wmarks_low __read_mostly = 50; -module_param(wmarks_low, ulong, 0600); +struct damos_watermarks damon_lru_sort_wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = 5000000, /* 5 seconds */ + .high = 200, /* 20 percent */ + .mid = 150, /* 15 percent */ + .low = 50, /* 5 percent */ +}; +DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_lru_sort_wmarks); static struct damon_attrs damon_lru_sort_mon_attrs = { .sample_interval = 5000, /* 5 ms */ @@ -242,13 +212,6 @@ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) .min_age_region = 0, .max_age_region = UINT_MAX, }; - struct damos_watermarks wmarks = { - .metric = DAMOS_WMARK_FREE_MEM_RATE, - .interval = wmarks_interval, - .high = wmarks_high, - .mid = wmarks_mid, - .low = wmarks_low, - }; struct damos_quota quota = { /* * Do not try LRU-lists sorting of hot pages for more than half @@ -270,7 +233,7 @@ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) /* under the quota. */ "a, /* (De)activate this according to the watermarks. */ - &wmarks); + &damon_lru_sort_wmarks); } /* Create a DAMON-based operation scheme for cold memory regions */ @@ -287,13 +250,6 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) .min_age_region = cold_thres, .max_age_region = UINT_MAX, }; - struct damos_watermarks wmarks = { - .metric = DAMOS_WMARK_FREE_MEM_RATE, - .interval = wmarks_interval, - .high = wmarks_high, - .mid = wmarks_mid, - .low = wmarks_low, - }; struct damos_quota quota = { /* * Do not try LRU-lists sorting of cold pages for more than @@ -316,7 +272,7 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) /* under the quota. */ "a, /* (De)activate this according to the watermarks. */ - &wmarks); + &damon_lru_sort_wmarks); } static int damon_lru_sort_apply_parameters(void) diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 1370590a37d18..4c2ce84869d58 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -18,8 +18,8 @@ 0600); #define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \ - module_param_named(wmarks_interval, wmarks->interval, ulong, \ + module_param_named(wmarks_interval, wmarks.interval, ulong, \ 0600); \ module_param_named(wmarks_high, wmarks.high, ulong, 0600); \ module_param_named(wmarks_mid, wmarks.mid, ulong, 0600); \ - module_param_named(wmarks_low, wmarks.lowulong, 0600); + module_param_named(wmarks_low, wmarks.low, ulong, 0600); From c8ba9ccbdc00eed2d21febd41033ef3cd15a602d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:41 +0000 Subject: [PATCH 172/352] mm/damon/reclaim: use watermarks parameters generator macro This commit makes DAMON_RECLAIM to generate the module parameters for DAMOS watermarks using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-15-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 56 ++++++++-------------------------------------- 1 file changed, 9 insertions(+), 47 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 48326bef20f51..7f845f617dc56 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -91,45 +91,14 @@ module_param(quota_sz, ulong, 0600); static unsigned long quota_reset_interval_ms __read_mostly = 1000; module_param(quota_reset_interval_ms, ulong, 0600); -/* - * The watermarks check time interval in microseconds. - * - * Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is - * enabled but inactive due to its watermarks rule. 5 seconds by default. - */ -static unsigned long wmarks_interval __read_mostly = 5000000; -module_param(wmarks_interval, ulong, 0600); - -/* - * Free memory rate (per thousand) for the high watermark. - * - * If free memory of the system in bytes per thousand bytes is higher than - * this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically - * checks the watermarks. 500 (50%) by default. - */ -static unsigned long wmarks_high __read_mostly = 500; -module_param(wmarks_high, ulong, 0600); - -/* - * Free memory rate (per thousand) for the middle watermark. - * - * If free memory of the system in bytes per thousand bytes is between this and - * the low watermark, DAMON_RECLAIM becomes active, so starts the monitoring - * and the reclaiming. 400 (40%) by default. - */ -static unsigned long wmarks_mid __read_mostly = 400; -module_param(wmarks_mid, ulong, 0600); - -/* - * Free memory rate (per thousand) for the low watermark. - * - * If free memory of the system in bytes per thousand bytes is lower than this, - * DAMON_RECLAIM becomes inactive, so it does nothing but periodically checks - * the watermarks. In the case, the system falls back to the LRU-based page - * granularity reclamation logic. 200 (20%) by default. - */ -static unsigned long wmarks_low __read_mostly = 200; -module_param(wmarks_low, ulong, 0600); +struct damos_watermarks damon_reclaim_wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = 5000000, /* 5 seconds */ + .high = 500, /* 50 percent */ + .mid = 400, /* 40 percent */ + .low = 200, /* 20 percent */ +}; +DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_reclaim_wmarks); static struct damon_attrs damon_reclaim_mon_attrs = { .sample_interval = 5000, /* 5 ms */ @@ -214,13 +183,6 @@ static struct damos *damon_reclaim_new_scheme(void) damon_reclaim_mon_attrs.aggr_interval, .max_age_region = UINT_MAX, }; - struct damos_watermarks wmarks = { - .metric = DAMOS_WMARK_FREE_MEM_RATE, - .interval = wmarks_interval, - .high = wmarks_high, - .mid = wmarks_mid, - .low = wmarks_low, - }; struct damos_quota quota = { /* * Do not try reclamation for more than quota_ms milliseconds @@ -242,7 +204,7 @@ static struct damos *damon_reclaim_new_scheme(void) /* under the quota. */ "a, /* (De)activate this according to the watermarks. */ - &wmarks); + &damon_reclaim_wmarks); } static int damon_reclaim_apply_parameters(void) From c4dbc0f6a8046eed4af57adc690633484747596a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:42 +0000 Subject: [PATCH 173/352] mm/damon/modules-common: implement a stats parameters generator macro DAMON_RECLAIM and DAMON_LRU_SORT have module parameters for DAMOS statistics that having same names. This commit implements a macro for generating such module parameters so that we can reuse later. Link: https://lkml.kernel.org/r/20220913174449.50645-16-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/modules-common.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 4c2ce84869d58..ed973e0770ae9 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -23,3 +23,15 @@ module_param_named(wmarks_high, wmarks.high, ulong, 0600); \ module_param_named(wmarks_mid, wmarks.mid, ulong, 0600); \ module_param_named(wmarks_low, wmarks.low, ulong, 0600); + +#define DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(stat, try_name, \ + succ_name, qt_exceed_name) \ + module_param_named(nr_##try_name, stat.nr_tried, ulong, 0400); \ + module_param_named(bytes_##try_name, stat.sz_tried, ulong, \ + 0400); \ + module_param_named(nr_##succ_name, stat.nr_applied, ulong, \ + 0400); \ + module_param_named(bytes_##succ_name, stat.sz_applied, ulong, \ + 0400); \ + module_param_named(qt_exceed_name, stat.qt_exceeds, ulong, \ + 0400); From 00a24c188e942864cfe6ac0ee6fb602f89ca6f4e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:43 +0000 Subject: [PATCH 174/352] mm/damon/reclaim: use stat parameters generator This commit makes DAMON_RECLAIM to generate the module parameters for DAMOS statistics using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-17-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 41 +++++------------------------------------ 1 file changed, 5 insertions(+), 36 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 7f845f617dc56..1ef8353ac15af 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -136,35 +136,9 @@ module_param(monitor_region_end, ulong, 0600); static int kdamond_pid __read_mostly = -1; module_param(kdamond_pid, int, 0400); -/* - * Number of memory regions that tried to be reclaimed. - */ -static unsigned long nr_reclaim_tried_regions __read_mostly; -module_param(nr_reclaim_tried_regions, ulong, 0400); - -/* - * Total bytes of memory regions that tried to be reclaimed. - */ -static unsigned long bytes_reclaim_tried_regions __read_mostly; -module_param(bytes_reclaim_tried_regions, ulong, 0400); - -/* - * Number of memory regions that successfully be reclaimed. - */ -static unsigned long nr_reclaimed_regions __read_mostly; -module_param(nr_reclaimed_regions, ulong, 0400); - -/* - * Total bytes of memory regions that successfully be reclaimed. - */ -static unsigned long bytes_reclaimed_regions __read_mostly; -module_param(bytes_reclaimed_regions, ulong, 0400); - -/* - * Number of times that the time/space quota limits have exceeded - */ -static unsigned long nr_quota_exceeds __read_mostly; -module_param(nr_quota_exceeds, ulong, 0400); +static struct damos_stat damon_reclaim_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_reclaim_stat, + reclaim_tried_regions, reclaimed_regions, quota_exceeds); static struct damon_ctx *ctx; static struct damon_target *target; @@ -318,13 +292,8 @@ static int damon_reclaim_after_aggregation(struct damon_ctx *c) struct damos *s; /* update the stats parameter */ - damon_for_each_scheme(s, c) { - nr_reclaim_tried_regions = s->stat.nr_tried; - bytes_reclaim_tried_regions = s->stat.sz_tried; - nr_reclaimed_regions = s->stat.nr_applied; - bytes_reclaimed_regions = s->stat.sz_applied; - nr_quota_exceeds = s->stat.qt_exceeds; - } + damon_for_each_scheme(s, c) + damon_reclaim_stat = s->stat; return damon_reclaim_handle_commit_inputs(); } From 40e420747da0bc7fe1d62dda15edb6154b1ccc9c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:44 +0000 Subject: [PATCH 175/352] mm/damon/lru_sort: use stat generator This commit makes DAMON_LRU_SORT to generate the module parameters for DAMOS statistics using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-18-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 83 +++++++-------------------------------------- 1 file changed, 12 insertions(+), 71 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 20760b39b50a4..13a752aed2720 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -135,65 +135,15 @@ module_param(monitor_region_end, ulong, 0600); static int kdamond_pid __read_mostly = -1; module_param(kdamond_pid, int, 0400); -/* - * Number of hot memory regions that tried to be LRU-sorted. - */ -static unsigned long nr_lru_sort_tried_hot_regions __read_mostly; -module_param(nr_lru_sort_tried_hot_regions, ulong, 0400); - -/* - * Total bytes of hot memory regions that tried to be LRU-sorted. - */ -static unsigned long bytes_lru_sort_tried_hot_regions __read_mostly; -module_param(bytes_lru_sort_tried_hot_regions, ulong, 0400); - -/* - * Number of hot memory regions that successfully be LRU-sorted. - */ -static unsigned long nr_lru_sorted_hot_regions __read_mostly; -module_param(nr_lru_sorted_hot_regions, ulong, 0400); - -/* - * Total bytes of hot memory regions that successfully be LRU-sorted. - */ -static unsigned long bytes_lru_sorted_hot_regions __read_mostly; -module_param(bytes_lru_sorted_hot_regions, ulong, 0400); - -/* - * Number of times that the time quota limit for hot regions have exceeded - */ -static unsigned long nr_hot_quota_exceeds __read_mostly; -module_param(nr_hot_quota_exceeds, ulong, 0400); +static struct damos_stat damon_lru_sort_hot_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_hot_stat, + lru_sort_tried_hot_regions, lru_sorted_hot_regions, + hot_quota_exceeds); -/* - * Number of cold memory regions that tried to be LRU-sorted. - */ -static unsigned long nr_lru_sort_tried_cold_regions __read_mostly; -module_param(nr_lru_sort_tried_cold_regions, ulong, 0400); - -/* - * Total bytes of cold memory regions that tried to be LRU-sorted. - */ -static unsigned long bytes_lru_sort_tried_cold_regions __read_mostly; -module_param(bytes_lru_sort_tried_cold_regions, ulong, 0400); - -/* - * Number of cold memory regions that successfully be LRU-sorted. - */ -static unsigned long nr_lru_sorted_cold_regions __read_mostly; -module_param(nr_lru_sorted_cold_regions, ulong, 0400); - -/* - * Total bytes of cold memory regions that successfully be LRU-sorted. - */ -static unsigned long bytes_lru_sorted_cold_regions __read_mostly; -module_param(bytes_lru_sorted_cold_regions, ulong, 0400); - -/* - * Number of times that the time quota limit for cold regions have exceeded - */ -static unsigned long nr_cold_quota_exceeds __read_mostly; -module_param(nr_cold_quota_exceeds, ulong, 0400); +static struct damos_stat damon_lru_sort_cold_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_cold_stat, + lru_sort_tried_cold_regions, lru_sorted_cold_regions, + cold_quota_exceeds); static struct damon_ctx *ctx; static struct damon_target *target; @@ -397,19 +347,10 @@ static int damon_lru_sort_after_aggregation(struct damon_ctx *c) /* update the stats parameter */ damon_for_each_scheme(s, c) { - if (s->action == DAMOS_LRU_PRIO) { - nr_lru_sort_tried_hot_regions = s->stat.nr_tried; - bytes_lru_sort_tried_hot_regions = s->stat.sz_tried; - nr_lru_sorted_hot_regions = s->stat.nr_applied; - bytes_lru_sorted_hot_regions = s->stat.sz_applied; - nr_hot_quota_exceeds = s->stat.qt_exceeds; - } else if (s->action == DAMOS_LRU_DEPRIO) { - nr_lru_sort_tried_cold_regions = s->stat.nr_tried; - bytes_lru_sort_tried_cold_regions = s->stat.sz_tried; - nr_lru_sorted_cold_regions = s->stat.nr_applied; - bytes_lru_sorted_cold_regions = s->stat.sz_applied; - nr_cold_quota_exceeds = s->stat.qt_exceeds; - } + if (s->action == DAMOS_LRU_PRIO) + damon_lru_sort_hot_stat = s->stat; + else if (s->action == DAMOS_LRU_DEPRIO) + damon_lru_sort_cold_stat = s->stat; } return damon_lru_sort_handle_commit_inputs(); From b2cf85a3b32903e16f6a269be67542df2e72db9f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:45 +0000 Subject: [PATCH 176/352] mm/damon/modules-common: implement a damos quota params generator DAMON_RECLAIM and DAMON_LRU_SORT have module parameters for DAMOS quotas that having same names. This commit implements a macro for generating such module parameters so that we can reuse later. Link: https://lkml.kernel.org/r/20220913174449.50645-19-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/modules-common.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index ed973e0770ae9..3e99810b46899 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -17,6 +17,12 @@ module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \ 0600); +#define DEFINE_DAMON_MODULES_DAMOS_QUOTAS(quota) \ + module_param_named(quota_ms, quota.ms, ulong, 0600); \ + module_param_named(quota_sz, quota.sz, ulong, 0600); \ + module_param_named(quota_reset_interval_ms, \ + quota.reset_interval, ulong, 0600); + #define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \ module_param_named(wmarks_interval, wmarks.interval, ulong, \ 0600); \ @@ -33,5 +39,5 @@ 0400); \ module_param_named(bytes_##succ_name, stat.sz_applied, ulong, \ 0400); \ - module_param_named(qt_exceed_name, stat.qt_exceeds, ulong, \ + module_param_named(nr_##qt_exceed_name, stat.qt_exceeds, ulong, \ 0400); From 0f96b855e5dd2fdd022a1bf415ad1e1d8f4b89f1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:46 +0000 Subject: [PATCH 177/352] mm/damon/modules-common: implement damos time quota params generator DAMON_LRU_SORT have module parameters for DAMOS time quota only but size quota. This commit implements a macro for generating the module parameters so that we can reuse later. Link: https://lkml.kernel.org/r/20220913174449.50645-20-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/modules-common.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 3e99810b46899..5a4921851d326 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -17,12 +17,15 @@ module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \ 0600); -#define DEFINE_DAMON_MODULES_DAMOS_QUOTAS(quota) \ +#define DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(quota) \ module_param_named(quota_ms, quota.ms, ulong, 0600); \ - module_param_named(quota_sz, quota.sz, ulong, 0600); \ module_param_named(quota_reset_interval_ms, \ quota.reset_interval, ulong, 0600); +#define DEFINE_DAMON_MODULES_DAMOS_QUOTAS(quota) \ + DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(quota) \ + module_param_named(quota_sz, quota.sz, ulong, 0600); + #define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \ module_param_named(wmarks_interval, wmarks.interval, ulong, \ 0600); \ From 5152dc6bc718b15260b94e124b6310469e016355 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:47 +0000 Subject: [PATCH 178/352] mm/damon/reclaim: use the quota params generator macro This commit makes DAMON_RECLAIM to generate the module parameters for DAMOS quotas using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-21-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 64 +++++++++------------------------------------- 1 file changed, 12 insertions(+), 52 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 1ef8353ac15af..1acf808e16242 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -52,44 +52,17 @@ module_param(commit_inputs, bool, 0600); static unsigned long min_age __read_mostly = 120000000; module_param(min_age, ulong, 0600); -/* - * Limit of time for trying the reclamation in milliseconds. - * - * DAMON_RECLAIM tries to use only up to this time within a time window - * (quota_reset_interval_ms) for trying reclamation of cold pages. This can be - * used for limiting CPU consumption of DAMON_RECLAIM. If the value is zero, - * the limit is disabled. - * - * 10 ms by default. - */ -static unsigned long quota_ms __read_mostly = 10; -module_param(quota_ms, ulong, 0600); - -/* - * Limit of size of memory for the reclamation in bytes. - * - * DAMON_RECLAIM charges amount of memory which it tried to reclaim within a - * time window (quota_reset_interval_ms) and makes no more than this limit is - * tried. This can be used for limiting consumption of CPU and IO. If this - * value is zero, the limit is disabled. - * - * 128 MiB by default. - */ -static unsigned long quota_sz __read_mostly = 128 * 1024 * 1024; -module_param(quota_sz, ulong, 0600); - -/* - * The time/size quota charge reset interval in milliseconds. - * - * The charge reset interval for the quota of time (quota_ms) and size - * (quota_sz). That is, DAMON_RECLAIM does not try reclamation for more than - * quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms - * milliseconds. - * - * 1 second by default. - */ -static unsigned long quota_reset_interval_ms __read_mostly = 1000; -module_param(quota_reset_interval_ms, ulong, 0600); +static struct damos_quota damon_reclaim_quota = { + /* use up to 10 ms time, reclaim up to 128 MiB per 1 sec by default */ + .ms = 10, + .sz = 128 * 1024 * 1024, + .reset_interval = 1000, + /* Within the quota, page out older regions first. */ + .weight_sz = 0, + .weight_nr_accesses = 0, + .weight_age = 1 +}; +DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_reclaim_quota); struct damos_watermarks damon_reclaim_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, @@ -157,26 +130,13 @@ static struct damos *damon_reclaim_new_scheme(void) damon_reclaim_mon_attrs.aggr_interval, .max_age_region = UINT_MAX, }; - struct damos_quota quota = { - /* - * Do not try reclamation for more than quota_ms milliseconds - * or quota_sz bytes within quota_reset_interval_ms. - */ - .ms = quota_ms, - .sz = quota_sz, - .reset_interval = quota_reset_interval_ms, - /* Within the quota, page out older regions first. */ - .weight_sz = 0, - .weight_nr_accesses = 0, - .weight_age = 1 - }; return damon_new_scheme( &pattern, /* page out those, as soon as found */ DAMOS_PAGEOUT, /* under the quota. */ - "a, + &damon_reclaim_quota, /* (De)activate this according to the watermarks. */ &damon_reclaim_wmarks); } From 15e8e72f59fd4c0c1a15f753c30cca35f42b80b6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:48 +0000 Subject: [PATCH 179/352] mm/damon/lru_sort: use quotas param generator This commit makes DAMON_LRU_SORT to generate the module parameters for DAMOS watermarks using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-22-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 70 ++++++++++++--------------------------------- 1 file changed, 19 insertions(+), 51 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 13a752aed2720..8d9c3d1fd6bef 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -65,30 +65,17 @@ module_param(hot_thres_access_freq, ulong, 0600); static unsigned long cold_min_age __read_mostly = 120000000; module_param(cold_min_age, ulong, 0600); -/* - * Limit of time for trying the LRU lists sorting in milliseconds. - * - * DAMON_LRU_SORT tries to use only up to this time within a time window - * (quota_reset_interval_ms) for trying LRU lists sorting. This can be used - * for limiting CPU consumption of DAMON_LRU_SORT. If the value is zero, the - * limit is disabled. - * - * 10 ms by default. - */ -static unsigned long quota_ms __read_mostly = 10; -module_param(quota_ms, ulong, 0600); - -/* - * The time quota charge reset interval in milliseconds. - * - * The charge reset interval for the quota of time (quota_ms). That is, - * DAMON_LRU_SORT does not try LRU-lists sorting for more than quota_ms - * milliseconds or quota_sz bytes within quota_reset_interval_ms milliseconds. - * - * 1 second by default. - */ -static unsigned long quota_reset_interval_ms __read_mostly = 1000; -module_param(quota_reset_interval_ms, ulong, 0600); +static struct damos_quota damon_lru_sort_quota = { + /* Use up to 10 ms per 1 sec, by default */ + .ms = 10, + .sz = 0, + .reset_interval = 1000, + /* Within the quota, mark hotter regions accessed first. */ + .weight_sz = 0, + .weight_nr_accesses = 1, + .weight_age = 0, +}; +DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(damon_lru_sort_quota); struct damos_watermarks damon_lru_sort_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, @@ -162,19 +149,10 @@ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) .min_age_region = 0, .max_age_region = UINT_MAX, }; - struct damos_quota quota = { - /* - * Do not try LRU-lists sorting of hot pages for more than half - * of quota_ms milliseconds within quota_reset_interval_ms. - */ - .ms = quota_ms / 2, - .sz = 0, - .reset_interval = quota_reset_interval_ms, - /* Within the quota, mark hotter regions accessed first. */ - .weight_sz = 0, - .weight_nr_accesses = 1, - .weight_age = 0, - }; + struct damos_quota quota = damon_lru_sort_quota; + + /* Use half of total quota for hot pages sorting */ + quota.ms = quota.ms / 2; return damon_new_scheme( &pattern, @@ -200,20 +178,10 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) .min_age_region = cold_thres, .max_age_region = UINT_MAX, }; - struct damos_quota quota = { - /* - * Do not try LRU-lists sorting of cold pages for more than - * half of quota_ms milliseconds within - * quota_reset_interval_ms. - */ - .ms = quota_ms / 2, - .sz = 0, - .reset_interval = quota_reset_interval_ms, - /* Within the quota, mark colder regions not accessed first. */ - .weight_sz = 0, - .weight_nr_accesses = 0, - .weight_age = 1, - }; + struct damos_quota quota = damon_lru_sort_quota; + + /* Use half of total quota for cold pages sorting */ + quota.ms = quota.ms / 2; return damon_new_scheme( &pattern, From 7fabb02bcf62f938d66ca145c3dd733def971eb4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:49 +0000 Subject: [PATCH 180/352] mm/damon/lru_sort: deduplicate hot/cold schemes generators damon_lru_sort_new_{hot,cold}_scheme() have quite a lot of duplicates. This commit factors out the duplicate to a separate function and use it for reducing the duplicate. Link: https://lkml.kernel.org/r/20220913174449.50645-23-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 8d9c3d1fd6bef..07a0908963fd0 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -135,6 +135,25 @@ DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_cold_stat, static struct damon_ctx *ctx; static struct damon_target *target; +static struct damos *damon_lru_sort_new_scheme( + struct damos_access_pattern *pattern, enum damos_action action) +{ + struct damos_quota quota = damon_lru_sort_quota; + + /* Use half of total quota for hot/cold pages sorting */ + quota.ms = quota.ms / 2; + + return damon_new_scheme( + /* find the pattern, and */ + pattern, + /* (de)prioritize on LRU-lists */ + action, + /* under the quota. */ + "a, + /* (De)activate this according to the watermarks. */ + &damon_lru_sort_wmarks); +} + /* Create a DAMON-based operation scheme for hot memory regions */ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) { @@ -149,19 +168,8 @@ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) .min_age_region = 0, .max_age_region = UINT_MAX, }; - struct damos_quota quota = damon_lru_sort_quota; - - /* Use half of total quota for hot pages sorting */ - quota.ms = quota.ms / 2; - return damon_new_scheme( - &pattern, - /* prioritize those on LRU lists, as soon as found */ - DAMOS_LRU_PRIO, - /* under the quota. */ - "a, - /* (De)activate this according to the watermarks. */ - &damon_lru_sort_wmarks); + return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_PRIO); } /* Create a DAMON-based operation scheme for cold memory regions */ @@ -178,19 +186,8 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) .min_age_region = cold_thres, .max_age_region = UINT_MAX, }; - struct damos_quota quota = damon_lru_sort_quota; - /* Use half of total quota for cold pages sorting */ - quota.ms = quota.ms / 2; - - return damon_new_scheme( - &pattern, - /* mark those as not accessed, as soon as found */ - DAMOS_LRU_DEPRIO, - /* under the quota. */ - "a, - /* (De)activate this according to the watermarks. */ - &damon_lru_sort_wmarks); + return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO); } static int damon_lru_sort_apply_parameters(void) From aeaccdc8070e863a70eccab03afab4e34c685805 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 13 Sep 2022 17:11:24 +0800 Subject: [PATCH 181/352] mm/damon: simplify the parameter passing for 'prepare_access_checks' Patch series "mm/damon: code simplifications and cleanups". This patchset contains some code simplifications and cleanups for DAMON. This patch (of 4): The parameter 'struct damon_ctx *ctx' isn't used in the functions __damon_{p,v}a_prepare_access_check(), so we can remove it and simplify the parameter passing. Link: https://lkml.kernel.org/r/1663060287-30201-1-git-send-email-kaixuxia@tencent.com Link: https://lkml.kernel.org/r/1663060287-30201-2-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 5 ++--- mm/damon/vaddr.c | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 1ada62db68b13..dfeebffe82f44 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -63,8 +63,7 @@ static void damon_pa_mkold(unsigned long paddr) folio_put(folio); } -static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, - struct damon_region *r) +static void __damon_pa_prepare_access_check(struct damon_region *r) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); @@ -78,7 +77,7 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) - __damon_pa_prepare_access_check(ctx, r); + __damon_pa_prepare_access_check(r); } } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 0eae47bd9ccbb..3f84584f99826 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -397,8 +397,8 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) * Functions for the access checking of the regions */ -static void __damon_va_prepare_access_check(struct damon_ctx *ctx, - struct mm_struct *mm, struct damon_region *r) +static void __damon_va_prepare_access_check(struct mm_struct *mm, + struct damon_region *r) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); @@ -416,7 +416,7 @@ static void damon_va_prepare_access_checks(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) - __damon_va_prepare_access_check(ctx, mm, r); + __damon_va_prepare_access_check(mm, r); mmput(mm); } } From 8ccd48a7838acd569b96f81d6219c855257d1528 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 13 Sep 2022 17:11:25 +0800 Subject: [PATCH 182/352] mm/damon/sysfs: simplify the variable 'pid' assignment operation We can initialize the variable 'pid' with '-1' in pid_show() to simplify the variable assignment operation and make the code more readable. Link: https://lkml.kernel.org/r/1663060287-30201-3-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 89f9fa3afd1f8..d947e27be74bc 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2592,19 +2592,16 @@ static ssize_t pid_show(struct kobject *kobj, struct damon_sysfs_kdamond *kdamond = container_of(kobj, struct damon_sysfs_kdamond, kobj); struct damon_ctx *ctx; - int pid; + int pid = -1; if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; ctx = kdamond->damon_ctx; - if (!ctx) { - pid = -1; + if (!ctx) goto out; - } + mutex_lock(&ctx->kdamond_lock); - if (!ctx->kdamond) - pid = -1; - else + if (ctx->kdamond) pid = ctx->kdamond->pid; mutex_unlock(&ctx->kdamond_lock); out: From cc55ca8fdc7a7b63efb92a5cae39a4bd722fcda2 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 13 Sep 2022 17:11:26 +0800 Subject: [PATCH 183/352] mm/damon/core: simplify the kdamond stop mechanism by removing 'done' When the 'kdamond_wait_activation()' function or 'after_sampling()' or 'after_aggregation()' DAMON callbacks return an error, it is unnecessary to use bool 'done' to check if kdamond should be finished. This commit simplifies the kdamond stop mechanism by removing 'done' and break the while loop directly in the cases. Link: https://lkml.kernel.org/r/1663060287-30201-4-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 29635a82cb691..a843673c11cfc 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1152,30 +1152,25 @@ static int kdamond_fn(void *data) struct damon_region *r, *next; unsigned int max_nr_accesses = 0; unsigned long sz_limit = 0; - bool done = false; pr_debug("kdamond (%d) starts\n", current->pid); if (ctx->ops.init) ctx->ops.init(ctx); if (ctx->callback.before_start && ctx->callback.before_start(ctx)) - done = true; + goto done; sz_limit = damon_region_sz_limit(ctx); - while (!kdamond_need_stop(ctx) && !done) { - if (kdamond_wait_activation(ctx)) { - done = true; - continue; - } + while (!kdamond_need_stop(ctx)) { + if (kdamond_wait_activation(ctx)) + break; if (ctx->ops.prepare_access_checks) ctx->ops.prepare_access_checks(ctx); if (ctx->callback.after_sampling && - ctx->callback.after_sampling(ctx)) { - done = true; - continue; - } + ctx->callback.after_sampling(ctx)) + break; kdamond_usleep(ctx->attrs.sample_interval); @@ -1187,10 +1182,8 @@ static int kdamond_fn(void *data) max_nr_accesses / 10, sz_limit); if (ctx->callback.after_aggregation && - ctx->callback.after_aggregation(ctx)) { - done = true; - continue; - } + ctx->callback.after_aggregation(ctx)) + break; kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); @@ -1204,6 +1197,7 @@ static int kdamond_fn(void *data) sz_limit = damon_region_sz_limit(ctx); } } +done: damon_for_each_target(t, ctx) { damon_for_each_region_safe(r, next, t) damon_destroy_region(r, t); From 1da3345d7f53d2d80b2a33e07cd4ef592f0d16fa Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Tue, 13 Sep 2022 15:13:58 +0800 Subject: [PATCH 184/352] mm/memcontrol: use kstrtobool for swapaccount param parsing Use kstrtobool which is more powerful to handle all kinds of parameters like 'Yy1Nn0' or [oO][NnFf] for "on" and "off". Link: https://lkml.kernel.org/r/20220913071358.1812206-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1f204a2620543..ac6440daf2086 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7507,10 +7507,10 @@ bool mem_cgroup_swap_full(struct folio *folio) static int __init setup_swap_account(char *s) { - if (!strcmp(s, "1")) - cgroup_memory_noswap = false; - else if (!strcmp(s, "0")) - cgroup_memory_noswap = true; + bool res; + + if (!kstrtobool(s, &res)) + cgroup_memory_noswap = !res; return 1; } __setup("swapaccount=", setup_swap_account); From d13240be06b199749d716b7f46446ca9f6b5282f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 13 Sep 2022 15:20:48 -0700 Subject: [PATCH 185/352] mm/page_alloc.c: rename check_free_page() to free_page_is_bad() The name "check_free_page()" provides no information regarding its return value when the page is indeed found to be bad. Renaming it to "free_page_is_bad()" makes it clear that a `true' return value means the page was bad. And make it return a bool, not an int. Cc: Catalin Marinas Cc: ke.wang Cc: Matthew Wilcox Cc: Zhaoyang Huang Signed-off-by: Andrew Morton --- mm/page_alloc.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b41e393c1e8d1..76db4343cf6d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1285,20 +1285,20 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) return bad_reason; } -static void check_free_page_bad(struct page *page) +static void free_page_is_bad_report(struct page *page) { bad_page(page, page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); } -static inline int check_free_page(struct page *page) +static inline bool free_page_is_bad(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) - return 0; + return false; /* Something has gone sideways, find it */ - check_free_page_bad(page); - return 1; + free_page_is_bad_report(page); + return true; } static int free_tail_pages_check(struct page *head_page, struct page *page) @@ -1430,7 +1430,7 @@ static __always_inline bool free_pages_prepare(struct page *page, for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); - if (unlikely(check_free_page(page + i))) { + if (unlikely(free_page_is_bad(page + i))) { bad++; continue; } @@ -1442,7 +1442,7 @@ static __always_inline bool free_pages_prepare(struct page *page, if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); if (check_free) - bad += check_free_page(page); + bad += free_page_is_bad(page); if (bad) return false; @@ -1504,7 +1504,7 @@ static bool free_pcp_prepare(struct page *page, unsigned int order) static bool bulkfree_pcp_prepare(struct page *page) { if (debug_pagealloc_enabled_static()) - return check_free_page(page); + return free_page_is_bad(page); else return false; } @@ -1525,7 +1525,7 @@ static bool free_pcp_prepare(struct page *page, unsigned int order) static bool bulkfree_pcp_prepare(struct page *page) { - return check_free_page(page); + return free_page_is_bad(page); } #endif /* CONFIG_DEBUG_VM */ From 8d40853ee3f90d2b3ead10163e577bd7cea025b7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 13 Sep 2022 15:37:49 -0700 Subject: [PATCH 186/352] mm-page_allocc-rename-check_free_page-to-free_page_is_bad-fix don't use bool as int Cc: Catalin Marinas Cc: ke.wang Cc: Matthew Wilcox Cc: Zhaoyang Huang Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 76db4343cf6d1..5a14db2f1d5f6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1441,8 +1441,8 @@ static __always_inline bool free_pages_prepare(struct page *page, page->mapping = NULL; if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); - if (check_free) - bad += free_page_is_bad(page); + if (check_free && free_page_is_bad(page)) + bad++; if (bad) return false; From 719a95c0da099156b4fd423c887a052a59a7048c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 13 Sep 2022 15:30:38 -0700 Subject: [PATCH 187/352] mm/page_alloc.c: document bulkfree_pcp_prepare() return value Cc: Catalin Marinas Cc: ke.wang Cc: Matthew Wilcox Cc: Zhaoyang Huang Signed-off-by: Andrew Morton --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5a14db2f1d5f6..7e26b4d88810e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1501,6 +1501,7 @@ static bool free_pcp_prepare(struct page *page, unsigned int order) return free_pages_prepare(page, order, true, FPI_NONE); } +/* return true if this page has an inappropriate state */ static bool bulkfree_pcp_prepare(struct page *page) { if (debug_pagealloc_enabled_static()) From 49187aea772885adafffce9571dfbd07aac65af7 Mon Sep 17 00:00:00 2001 From: ze zuo Date: Tue, 13 Sep 2022 01:55:05 +0000 Subject: [PATCH 188/352] mm/mempolicy: use PAGE_ALIGN instead of open-coding it Replace the simple calculation with PAGE_ALIGN. Link: https://lkml.kernel.org/r/20220913015505.1998958-1-zuoze1@huawei.com Signed-off-by: ze zuo Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- mm/mempolicy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 143e2eaaa6ec5..a937eaec5b68d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1270,7 +1270,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (mode == MPOL_DEFAULT) flags &= ~MPOL_MF_STRICT; - len = (len + PAGE_SIZE - 1) & PAGE_MASK; + len = PAGE_ALIGN(len); end = start + len; if (end < start) @@ -1507,7 +1507,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le if (home_node >= MAX_NUMNODES || !node_online(home_node)) return -EINVAL; - len = (len + PAGE_SIZE - 1) & PAGE_MASK; + len = PAGE_ALIGN(len); end = start + len; if (end < start) From 156e38ac6fd5c3de8d0b9b1eaa05263b2a8d9555 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 14 Sep 2022 15:26:02 +0800 Subject: [PATCH 189/352] mm: hugetlb: simplify per-node sysfs creation and removal Patch series "simplify handling of per-node sysfs creation and removal", v4. This patch (of 2): The following commit offload per-node sysfs creation and removal to a kworker and did not say why it is needed. And it also said "I don't know that this is absolutely required". It seems like the author was not sure as well. Since it only complicates the code, this patch will revert the changes to simplify the code. 39da08cb074c ("hugetlb: offload per node attribute registrations") We could use memory hotplug notifier to do per-node sysfs creation and removal instead of inserting those operations to node registration and unregistration. Then, it can reduce the code coupling between node.c and hugetlb.c. Also, it can simplify the code. Link: https://lkml.kernel.org/r/20220914072603.60293-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220914072603.60293-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Mike Kravetz Acked-by: David Hildenbrand Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: Muchun Song Cc: Oscar Salvador Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- drivers/base/node.c | 139 +------------------------------------------ include/linux/node.h | 24 ++------ mm/hugetlb.c | 35 +++++++---- 3 files changed, 30 insertions(+), 168 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index eb0f43784c2b3..ed391cb09999b 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -587,64 +587,9 @@ static const struct attribute_group *node_dev_groups[] = { NULL }; -#ifdef CONFIG_HUGETLBFS -/* - * hugetlbfs per node attributes registration interface: - * When/if hugetlb[fs] subsystem initializes [sometime after this module], - * it will register its per node attributes for all online nodes with - * memory. It will also call register_hugetlbfs_with_node(), below, to - * register its attribute registration functions with this node driver. - * Once these hooks have been initialized, the node driver will call into - * the hugetlb module to [un]register attributes for hot-plugged nodes. - */ -static node_registration_func_t __hugetlb_register_node; -static node_registration_func_t __hugetlb_unregister_node; - -static inline bool hugetlb_register_node(struct node *node) -{ - if (__hugetlb_register_node && - node_state(node->dev.id, N_MEMORY)) { - __hugetlb_register_node(node); - return true; - } - return false; -} - -static inline void hugetlb_unregister_node(struct node *node) -{ - if (__hugetlb_unregister_node) - __hugetlb_unregister_node(node); -} - -void register_hugetlbfs_with_node(node_registration_func_t doregister, - node_registration_func_t unregister) -{ - __hugetlb_register_node = doregister; - __hugetlb_unregister_node = unregister; -} -#else -static inline void hugetlb_register_node(struct node *node) {} - -static inline void hugetlb_unregister_node(struct node *node) {} -#endif - static void node_device_release(struct device *dev) { - struct node *node = to_node(dev); - -#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS) - /* - * We schedule the work only when a memory section is - * onlined/offlined on this node. When we come here, - * all the memory on this node has been offlined, - * so we won't enqueue new work to this work. - * - * The work is using node->node_work, so we should - * flush work before freeing the memory. - */ - flush_work(&node->node_work); -#endif - kfree(node); + kfree(to_node(dev)); } /* @@ -665,11 +610,9 @@ static int register_node(struct node *node, int num) if (error) put_device(&node->dev); - else { - hugetlb_register_node(node); - + else compaction_register_node(node); - } + return error; } @@ -683,7 +626,6 @@ static int register_node(struct node *node, int num) void unregister_node(struct node *node) { compaction_unregister_node(node); - hugetlb_unregister_node(node); /* no-op, if memoryless node */ node_remove_accesses(node); node_remove_caches(node); device_unregister(&node->dev); @@ -905,74 +847,8 @@ void register_memory_blocks_under_node(int nid, unsigned long start_pfn, (void *)&nid, func); return; } - -#ifdef CONFIG_HUGETLBFS -/* - * Handle per node hstate attribute [un]registration on transistions - * to/from memoryless state. - */ -static void node_hugetlb_work(struct work_struct *work) -{ - struct node *node = container_of(work, struct node, node_work); - - /* - * We only get here when a node transitions to/from memoryless state. - * We can detect which transition occurred by examining whether the - * node has memory now. hugetlb_register_node() already check this - * so we try to register the attributes. If that fails, then the - * node has transitioned to memoryless, try to unregister the - * attributes. - */ - if (!hugetlb_register_node(node)) - hugetlb_unregister_node(node); -} - -static void init_node_hugetlb_work(int nid) -{ - INIT_WORK(&node_devices[nid]->node_work, node_hugetlb_work); -} - -static int node_memory_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - struct memory_notify *mnb = arg; - int nid = mnb->status_change_nid; - - switch (action) { - case MEM_ONLINE: - case MEM_OFFLINE: - /* - * offload per node hstate [un]registration to a work thread - * when transitioning to/from memoryless state. - */ - if (nid != NUMA_NO_NODE) - schedule_work(&node_devices[nid]->node_work); - break; - - case MEM_GOING_ONLINE: - case MEM_GOING_OFFLINE: - case MEM_CANCEL_ONLINE: - case MEM_CANCEL_OFFLINE: - default: - break; - } - - return NOTIFY_OK; -} -#endif /* CONFIG_HUGETLBFS */ #endif /* CONFIG_MEMORY_HOTPLUG */ -#if !defined(CONFIG_MEMORY_HOTPLUG) || !defined(CONFIG_HUGETLBFS) -static inline int node_memory_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - return NOTIFY_OK; -} - -static void init_node_hugetlb_work(int nid) { } - -#endif - int __register_one_node(int nid) { int error; @@ -991,8 +867,6 @@ int __register_one_node(int nid) } INIT_LIST_HEAD(&node_devices[nid]->access_list); - /* initialize work queue for memory hot plug */ - init_node_hugetlb_work(nid); node_init_caches(nid); return error; @@ -1063,13 +937,8 @@ static const struct attribute_group *cpu_root_attr_groups[] = { NULL, }; -#define NODE_CALLBACK_PRI 2 /* lower than SLAB */ void __init node_dev_init(void) { - static struct notifier_block node_memory_callback_nb = { - .notifier_call = node_memory_callback, - .priority = NODE_CALLBACK_PRI, - }; int ret, i; BUILD_BUG_ON(ARRAY_SIZE(node_state_attr) != NR_NODE_STATES); @@ -1079,8 +948,6 @@ void __init node_dev_init(void) if (ret) panic("%s() failed to register subsystem: %d\n", __func__, ret); - register_hotmemory_notifier(&node_memory_callback_nb); - /* * Create all node devices, which will properly link the node * to applicable memory block devices and already created cpu devices. diff --git a/include/linux/node.h b/include/linux/node.h index 9ec680dd607f7..427a5975cf405 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -2,15 +2,15 @@ /* * include/linux/node.h - generic node definition * - * This is mainly for topological representation. We define the - * basic 'struct node' here, which can be embedded in per-arch + * This is mainly for topological representation. We define the + * basic 'struct node' here, which can be embedded in per-arch * definitions of processors. * * Basic handling of the devices is done in drivers/base/node.c - * and system devices are handled in drivers/base/sys.c. + * and system devices are handled in drivers/base/sys.c. * * Nodes are exported via driverfs in the class/node/devices/ - * directory. + * directory. */ #ifndef _LINUX_NODE_H_ #define _LINUX_NODE_H_ @@ -18,7 +18,6 @@ #include #include #include -#include /** * struct node_hmem_attrs - heterogeneous memory performance attributes @@ -84,10 +83,6 @@ static inline void node_set_perf_attrs(unsigned int nid, struct node { struct device dev; struct list_head access_list; - -#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS) - struct work_struct node_work; -#endif #ifdef CONFIG_HMEM_REPORTING struct list_head cache_attrs; struct device *cache_dev; @@ -96,7 +91,6 @@ struct node { struct memory_block; extern struct node *node_devices[]; -typedef void (*node_registration_func_t)(struct node *); #if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA) void register_memory_blocks_under_node(int nid, unsigned long start_pfn, @@ -144,11 +138,6 @@ extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk); extern int register_memory_node_under_compute_node(unsigned int mem_nid, unsigned int cpu_nid, unsigned access); - -#ifdef CONFIG_HUGETLBFS -extern void register_hugetlbfs_with_node(node_registration_func_t doregister, - node_registration_func_t unregister); -#endif #else static inline void node_dev_init(void) { @@ -176,11 +165,6 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) static inline void unregister_memory_block_under_nodes(struct memory_block *mem_blk) { } - -static inline void register_hugetlbfs_with_node(node_registration_func_t reg, - node_registration_func_t unreg) -{ -} #endif #define to_node(device) container_of(device, struct node, dev) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7f9c591e28f5f..fc598c6211c88 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -4000,6 +4001,23 @@ static void hugetlb_register_node(struct node *node) } } +static int __meminit hugetlb_memory_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mnb = arg; + int nid = mnb->status_change_nid; + + if (nid == NUMA_NO_NODE) + return NOTIFY_DONE; + + if (action == MEM_GOING_ONLINE) + hugetlb_register_node(node_devices[nid]); + else if (action == MEM_CANCEL_ONLINE || action == MEM_OFFLINE) + hugetlb_unregister_node(node_devices[nid]); + + return NOTIFY_OK; +} + /* * hugetlb init time: register hstate attributes for all registered node * devices of nodes that have memory. All on-line nodes should have @@ -4009,18 +4027,11 @@ static void __init hugetlb_register_all_nodes(void) { int nid; - for_each_node_state(nid, N_MEMORY) { - struct node *node = node_devices[nid]; - if (node->dev.id == nid) - hugetlb_register_node(node); - } - - /* - * Let the node device driver know we're here so it can - * [un]register hstate attributes on node hotplug. - */ - register_hugetlbfs_with_node(hugetlb_register_node, - hugetlb_unregister_node); + get_online_mems(); + hotplug_memory_notifier(hugetlb_memory_callback, 0); + for_each_node_state(nid, N_MEMORY) + hugetlb_register_node(node_devices[nid]); + put_online_mems(); } #else /* !CONFIG_NUMA */ From 1d74392a0b17d40a87092abfa166b682739ebccb Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 14 Sep 2022 15:26:03 +0800 Subject: [PATCH 190/352] mm: hugetlb: eliminate memory-less nodes handling The memory-notify-based approach aims to handle meory-less nodes, however, it just adds the complexity of code as pointed by David in thread [1]. The handling of memory-less nodes is introduced by commit 4faf8d950ec4 ("hugetlb: handle memory hot-plug events"). >From its commit message, we cannot find any necessity of handling this case. So, we can simply register/unregister sysfs entries in register_node/unregister_node to simlify the code. BTW, hotplug callback added because in hugetlb_register_all_nodes() we register sysfs nodes only for N_MEMORY nodes, seeing commit 9b5e5d0fdc91, which said it was a preparation for handling memory-less nodes via memory hotplug. Since we want to remove memory hotplug, so make sure we only register per-node sysfs for online (N_ONLINE) nodes in hugetlb_register_all_nodes(). https://lore.kernel.org/linux-mm/60933ffc-b850-976c-78a0-0ee6e0ea9ef0@redhat.com/ [1] Link: https://lkml.kernel.org/r/20220914072603.60293-3-songmuchun@bytedance.com Suggested-by: David Hildenbrand Signed-off-by: Muchun Song Acked-by: David Hildenbrand Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: Mike Kravetz Cc: Oscar Salvador Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- drivers/base/node.c | 8 +++-- include/linux/hugetlb.h | 14 +++++++++ mm/hugetlb.c | 70 +++++++++++++++++------------------------ 3 files changed, 49 insertions(+), 43 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index ed391cb09999b..80b1e91b96081 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -20,6 +20,7 @@ #include #include #include +#include static struct bus_type node_subsys = { .name = "node", @@ -608,10 +609,12 @@ static int register_node(struct node *node, int num) node->dev.groups = node_dev_groups; error = device_register(&node->dev); - if (error) + if (error) { put_device(&node->dev); - else + } else { + hugetlb_register_node(node); compaction_register_node(node); + } return error; } @@ -625,6 +628,7 @@ static int register_node(struct node *node, int num) */ void unregister_node(struct node *node) { + hugetlb_unregister_node(node); compaction_unregister_node(node); node_remove_accesses(node); node_remove_caches(node); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 890f7b6a2eff7..a6fc49db0ce02 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -16,6 +16,7 @@ struct ctl_table; struct user_struct; struct mmu_gather; +struct node; #ifndef is_hugepd typedef struct { unsigned long pd; } hugepd_t; @@ -935,6 +936,11 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, } #endif +#ifdef CONFIG_NUMA +void hugetlb_register_node(struct node *node); +void hugetlb_unregister_node(struct node *node); +#endif + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; @@ -1109,6 +1115,14 @@ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { } + +static inline void hugetlb_register_node(struct node *node) +{ +} + +static inline void hugetlb_unregister_node(struct node *node) +{ +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fc598c6211c88..a066d0384a1a3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3871,24 +3871,8 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, return 0; } -static void __init hugetlb_sysfs_init(void) -{ - struct hstate *h; - int err; - - hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); - if (!hugepages_kobj) - return; - - for_each_hstate(h) { - err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, - hstate_kobjs, &hstate_attr_group); - if (err) - pr_err("HugeTLB: Unable to add hstate %s", h->name); - } -} - #ifdef CONFIG_NUMA +static bool hugetlb_sysfs_initialized __ro_after_init; /* * node_hstate/s - associate per node hstate attributes, via their kobjects, @@ -3944,7 +3928,7 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) * Unregister hstate attributes from a single node device. * No-op if no hstate attributes attached. */ -static void hugetlb_unregister_node(struct node *node) +void hugetlb_unregister_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; @@ -3974,12 +3958,15 @@ static void hugetlb_unregister_node(struct node *node) * Register hstate attributes for a single node device. * No-op if attributes already registered. */ -static void hugetlb_register_node(struct node *node) +void hugetlb_register_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; int err; + if (!hugetlb_sysfs_initialized) + return; + if (nhs->hugepages_kobj) return; /* already allocated */ @@ -4001,23 +3988,6 @@ static void hugetlb_register_node(struct node *node) } } -static int __meminit hugetlb_memory_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - struct memory_notify *mnb = arg; - int nid = mnb->status_change_nid; - - if (nid == NUMA_NO_NODE) - return NOTIFY_DONE; - - if (action == MEM_GOING_ONLINE) - hugetlb_register_node(node_devices[nid]); - else if (action == MEM_CANCEL_ONLINE || action == MEM_OFFLINE) - hugetlb_unregister_node(node_devices[nid]); - - return NOTIFY_OK; -} - /* * hugetlb init time: register hstate attributes for all registered node * devices of nodes that have memory. All on-line nodes should have @@ -4027,11 +3997,8 @@ static void __init hugetlb_register_all_nodes(void) { int nid; - get_online_mems(); - hotplug_memory_notifier(hugetlb_memory_callback, 0); - for_each_node_state(nid, N_MEMORY) + for_each_online_node(nid) hugetlb_register_node(node_devices[nid]); - put_online_mems(); } #else /* !CONFIG_NUMA */ @@ -4055,6 +4022,28 @@ static inline __init void hugetlb_cma_check(void) } #endif +static void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); + if (!hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, + hstate_kobjs, &hstate_attr_group); + if (err) + pr_err("HugeTLB: Unable to add hstate %s", h->name); + } + +#ifdef CONFIG_NUMA + hugetlb_sysfs_initialized = true; +#endif + hugetlb_register_all_nodes(); +} + static int __init hugetlb_init(void) { int i; @@ -4109,7 +4098,6 @@ static int __init hugetlb_init(void) report_hugepages(); hugetlb_sysfs_init(); - hugetlb_register_all_nodes(); hugetlb_cgroup_file_init(); #ifdef CONFIG_SMP From 7e22b773a169ce82c280b98360ecc60bb94c0973 Mon Sep 17 00:00:00 2001 From: Ke Sun Date: Wed, 14 Sep 2022 10:17:38 +0800 Subject: [PATCH 191/352] mm/filemap: make folio_put_wait_locked static It's only used in mm/filemap.c, since commit ("mm/migrate.c: rework migration_entry_wait() to not take a pageref"). Make it static. Link: https://lkml.kernel.org/r/20220914021738.3228011-1-sunke@kylinos.cn Signed-off-by: Ke Sun Reported-by: k2ci Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 1 - mm/filemap.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 32846b6306dbd..23125ab87ded2 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1039,7 +1039,6 @@ static inline int wait_on_page_locked_killable(struct page *page) return folio_wait_locked_killable(page_folio(page)); } -int folio_put_wait_locked(struct folio *folio, int state); void wait_on_page_writeback(struct page *page); void folio_wait_writeback(struct folio *folio); int folio_wait_writeback_killable(struct folio *folio); diff --git a/mm/filemap.c b/mm/filemap.c index aab125d423b8f..f27c93a581ab4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1460,7 +1460,7 @@ EXPORT_SYMBOL(folio_wait_bit_killable); * * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal. */ -int folio_put_wait_locked(struct folio *folio, int state) +static int folio_put_wait_locked(struct folio *folio, int state) { return folio_wait_bit_common(folio, PG_locked, state, DROP); } From c5a515b3dcf7dfb822f45f24665f3ab0998ba1fa Mon Sep 17 00:00:00 2001 From: XU pengfei Date: Wed, 14 Sep 2022 09:21:14 +0800 Subject: [PATCH 192/352] mm/hugetlb: remove unnecessary 'NULL' values from pointer Pointer variables allocate memory first, and then judge. There is no need to initialize the assignment. Link: https://lkml.kernel.org/r/20220914012113.6271-1-xupengfei@nfschina.com Signed-off-by: XU pengfei Reviewed-by: Muchun Song Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a066d0384a1a3..f9d2c047a7712 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -258,7 +258,7 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) static struct file_region * get_file_region_entry_from_cache(struct resv_map *resv, long from, long to) { - struct file_region *nrg = NULL; + struct file_region *nrg; VM_BUG_ON(resv->region_cache_count <= 0); @@ -340,7 +340,7 @@ static bool has_same_uncharge_info(struct file_region *rg, static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) { - struct file_region *nrg = NULL, *prg = NULL; + struct file_region *nrg, *prg; prg = list_prev_entry(rg, link); if (&prg->link != &resv->regions && prg->to == rg->from && From f0e4e9b1fdfd6e87b8a02c881e72a908b4ca6566 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:02 -0700 Subject: [PATCH 193/352] hugetlbfs: revert use i_mmap_rwsem to address page fault/truncate race Patch series "hugetlb: Use new vma lock for huge pmd sharing synchronization", v2. hugetlb fault scalability regressions have recently been reported [1]. This is not the first such report, as regressions were also noted when commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization") was added [2] in v5.7. At that time, a proposal to address the regression was suggested [3] but went nowhere. The regression and benefit of this patch series is not evident when using the vm_scalability benchmark reported in [2] on a recent kernel. Results from running, "./usemem -n 48 --prealloc --prefault -O -U 3448054972" 48 sample Avg next-20220913 next-20220913 next-20220913 unmodified revert i_mmap_sema locking vma sema locking, this series ----------------------------------------------------------------------------- 498150 KB/s 501934 KB/s 504793 KB/s The recent regression report [1] notes page fault and fork latency of shared hugetlb mappings. To measure this, I created two simple programs: 1) map a shared hugetlb area, write fault all pages, unmap area Do this in a continuous loop to measure faults per second 2) map a shared hugetlb area, write fault a few pages, fork and exit Do this in a continuous loop to measure forks per second These programs were run on a 48 CPU VM with 320GB memory. The shared mapping size was 250GB. For comparison, a single instance of the program was run. Then, multiple instances were run in parallel to introduce lock contention. Changing the locking scheme results in a significant performance benefit. test instances unmodified revert vma -------------------------------------------------------------------------- faults per sec 1 393043 395680 389932 faults per sec 24 71405 81191 79048 forks per sec 1 2802 2747 2725 forks per sec 24 439 536 500 Combined faults 24 1621 68070 53662 Combined forks 24 358 67 142 Combined test is when running both faulting program and forking program simultaneously. Patches 1 and 2 of this series revert c0d0381ade79 and 87bf91d39bb5 which depends on c0d0381ade79. Acquisition of i_mmap_rwsem is still required in the fault path to establish pmd sharing, so this is moved back to huge_pmd_share. With c0d0381ade79 reverted, this race is exposed: Faulting thread Unsharing thread ... ... ptep = huge_pte_offset() or ptep = huge_pte_alloc() ... i_mmap_lock_write lock page table ptep invalid <------------------------ huge_pmd_unshare() Could be in a previously unlock_page_table sharing process or worse i_mmap_unlock_write ... ptl = huge_pte_lock(ptep) get/update pte set_pte_at(pte, ptep) Reverting 87bf91d39bb5 exposes races in page fault/file truncation. When the new vma lock is put to use in patch 8, this will handle the fault/file truncation races. This is explained in patch 9 where code associated with these races is cleaned up. Patches 3 - 5 restructure existing code in preparation for using the new vma lock (rw semaphore) for pmd sharing synchronization. The idea is that this semaphore will be held in read mode for the duration of fault processing, and held in write mode for unmap operations which may call huge_pmd_unshare. Acquiring i_mmap_rwsem is also still required to synchronize huge pmd sharing. However it is only required in the fault path when setting up sharing, and will be acquired in huge_pmd_share(). Patch 6 adds the new vma lock and all supporting routines, but does not actually change code to use the new lock. Patch 7 refactors code in preparation for using the new lock. And, patch 8 finally adds code to make use of this new vma lock. Unfortunately, the fault code and truncate/hole punch code would naturally take locks in the opposite order which could lead to deadlock. Since the performance of page faults is more important, the truncation/hole punch code is modified to back out and take locks in the correct order if necessary. [1] https://lore.kernel.org/linux-mm/43faf292-245b-5db5-cce9-369d8fb6bd21@infradead.org/ [2] https://lore.kernel.org/lkml/20200622005551.GK5535@shao2-debian/ [3] https://lore.kernel.org/linux-mm/20200706202615.32111-1-mike.kravetz@oracle.com/ This patch (of 9): Commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization") added code to take i_mmap_rwsem in read mode for the duration of fault processing. The use of i_mmap_rwsem to prevent fault/truncate races depends on this. However, this has been shown to cause performance/scaling issues. As a result, that code will be reverted. Since the use i_mmap_rwsem to address page fault/truncate races depends on this, it must also be reverted. In a subsequent patch, code will be added to detect the fault/truncate race and back out operations as required. Link: https://lkml.kernel.org/r/20220914221810.95771-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20220914221810.95771-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 30 +++++++++--------------------- mm/hugetlb.c | 22 +++++++++++----------- 2 files changed, 20 insertions(+), 32 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f7a5b5124d8a9..a32031e751d14 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -419,9 +419,10 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve * maps and global counts. Page faults can not race with truncation - * in this routine. hugetlb_no_page() holds i_mmap_rwsem and prevents - * page faults in the truncated range by checking i_size. i_size is - * modified while holding i_mmap_rwsem. + * in this routine. hugetlb_no_page() prevents page faults in the + * truncated range. It checks i_size before allocation, and again after + * with the page table lock for the page held. The same lock must be + * acquired to unmap a page. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. * Only when releasing a page is the associated region/reserve map @@ -451,16 +452,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, u32 hash = 0; index = folio->index; - if (!truncate_op) { - /* - * Only need to hold the fault mutex in the - * hole punch case. This prevents races with - * page faults. Races are not possible in the - * case of truncation. - */ - hash = hugetlb_fault_mutex_hash(mapping, index); - mutex_lock(&hugetlb_fault_mutex_table[hash]); - } + hash = hugetlb_fault_mutex_hash(mapping, index); + mutex_lock(&hugetlb_fault_mutex_table[hash]); /* * If folio is mapped, it was faulted in after being @@ -504,8 +497,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, } folio_unlock(folio); - if (!truncate_op) - mutex_unlock(&hugetlb_fault_mutex_table[hash]); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); } folio_batch_release(&fbatch); cond_resched(); @@ -543,8 +535,8 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset) BUG_ON(offset & ~huge_page_mask(h)); pgoff = offset >> PAGE_SHIFT; - i_mmap_lock_write(mapping); i_size_write(inode, offset); + i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0, ZAP_FLAG_DROP_MARKER); @@ -703,11 +695,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, /* addr is the offset within the file (zero based) */ addr = index * hpage_size; - /* - * fault mutex taken here, protects against fault path - * and hole punch. inode_lock previously taken protects - * against truncation. - */ + /* mutex taken here, fault path and hole punch */ hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f9d2c047a7712..cbad0bab49ccd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5560,17 +5560,15 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, } /* - * We can not race with truncation due to holding i_mmap_rwsem. - * i_size is modified when holding i_mmap_rwsem, so check here - * once for faults beyond end of file. + * Use page lock to guard against racing truncation + * before we get page_table_lock. */ - size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) - goto out; - new_page = false; page = find_lock_page(mapping, idx); if (!page) { + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto out; /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { ret = hugetlb_handle_userfault(vma, mapping, idx, @@ -5666,6 +5664,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, } ptl = huge_pte_lock(h, mm, ptep); + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto backout; + ret = 0; /* If pte changed from under us, retry */ if (!pte_same(huge_ptep_get(ptep), old_pte)) @@ -5774,10 +5776,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold - * until finished with ptep. This serves two purposes: - * 1) It prevents huge_pmd_unshare from being called elsewhere - * and making the ptep no longer valid. - * 2) It synchronizes us with i_size modifications during truncation. + * until finished with ptep. This prevents huge_pmd_unshare from + * being called elsewhere and making the ptep no longer valid. * * ptep could have already be assigned via huge_pte_offset. That * is OK, as huge_pte_alloc will return the same value unless From ee4055326b23795de7efe414d74d2001f20853a3 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:03 -0700 Subject: [PATCH 194/352] hugetlbfs: revert use i_mmap_rwsem for more pmd sharing synchronization Commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization") added code to take i_mmap_rwsem in read mode for the duration of fault processing. However, this has been shown to cause performance/scaling issues. Revert the code and go back to only taking the semaphore in huge_pmd_share during the fault path. Keep the code that takes i_mmap_rwsem in write mode before calling try_to_unmap as this is required if huge_pmd_unshare is called. NOTE: Reverting this code does expose the following race condition. Faulting thread Unsharing thread ... ... ptep = huge_pte_offset() or ptep = huge_pte_alloc() ... i_mmap_lock_write lock page table ptep invalid <------------------------ huge_pmd_unshare() Could be in a previously unlock_page_table sharing process or worse i_mmap_unlock_write ... ptl = huge_pte_lock(ptep) get/update pte set_pte_at(pte, ptep) It is unknown if the above race was ever experienced by a user. It was discovered via code inspection when initially addressed. In subsequent patches, a new synchronization mechanism will be added to coordinate pmd sharing and eliminate this race. Link: https://lkml.kernel.org/r/20220914221810.95771-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 -- mm/hugetlb.c | 77 +++++++------------------------------------- mm/rmap.c | 8 +---- mm/userfaultfd.c | 11 ++----- 4 files changed, 15 insertions(+), 83 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a32031e751d14..dfb735a91bbbd 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -467,9 +467,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, if (unlikely(folio_mapped(folio))) { BUG_ON(truncate_op); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_lock_write(mapping); - mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vmdelete_list(&mapping->i_mmap, index * pages_per_huge_page(h), (index + 1) * pages_per_huge_page(h), diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cbad0bab49ccd..1c63a5099c986 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4770,7 +4770,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct hstate *h = hstate_vma(src_vma); unsigned long sz = huge_page_size(h); unsigned long npages = pages_per_huge_page(h); - struct address_space *mapping = src_vma->vm_file->f_mapping; struct mmu_notifier_range range; unsigned long last_addr_mask; int ret = 0; @@ -4782,14 +4781,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, mmu_notifier_invalidate_range_start(&range); mmap_assert_write_locked(src); raw_write_seqcount_begin(&src->write_protect_seq); - } else { - /* - * For shared mappings i_mmap_rwsem must be held to call - * huge_pte_alloc, otherwise the returned ptep could go - * away if part of a shared pmd and another thread calls - * huge_pmd_unshare. - */ - i_mmap_lock_read(mapping); } last_addr_mask = hugetlb_mask_last_page(h); @@ -4936,8 +4927,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, if (cow) { raw_write_seqcount_end(&src->write_protect_seq); mmu_notifier_invalidate_range_end(&range); - } else { - i_mmap_unlock_read(mapping); } return ret; @@ -5346,29 +5335,8 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * may get SIGKILLed if it later faults. */ if (outside_reserve) { - struct address_space *mapping = vma->vm_file->f_mapping; - pgoff_t idx; - u32 hash; - put_page(old_page); - /* - * Drop hugetlb_fault_mutex and i_mmap_rwsem before - * unmapping. unmapping needs to hold i_mmap_rwsem - * in write mode. Dropping i_mmap_rwsem in read mode - * here is OK as COW mappings do not interact with - * PMD sharing. - * - * Reacquire both after unmap operation. - */ - idx = vma_hugecache_offset(h, vma, haddr); - hash = hugetlb_fault_mutex_hash(mapping, idx); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); - unmap_ref_private(mm, vma, old_page, haddr); - - i_mmap_lock_read(mapping); - mutex_lock(&hugetlb_fault_mutex_table[hash]); spin_lock(ptl); ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && @@ -5523,9 +5491,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, */ hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); ret = handle_userfault(&vmf, reason); - i_mmap_lock_read(mapping); mutex_lock(&hugetlb_fault_mutex_table[hash]); return ret; @@ -5760,11 +5726,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (ptep) { - /* - * Since we hold no locks, ptep could be stale. That is - * OK as we are only making decisions based on content and - * not actually modifying content here. - */ entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { migration_entry_wait_huge(vma, ptep); @@ -5772,31 +5733,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); + } else { + ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); + if (!ptep) + return VM_FAULT_OOM; } - /* - * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold - * until finished with ptep. This prevents huge_pmd_unshare from - * being called elsewhere and making the ptep no longer valid. - * - * ptep could have already be assigned via huge_pte_offset. That - * is OK, as huge_pte_alloc will return the same value unless - * something has changed. - */ mapping = vma->vm_file->f_mapping; - i_mmap_lock_read(mapping); - ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); - if (!ptep) { - i_mmap_unlock_read(mapping); - return VM_FAULT_OOM; - } + idx = vma_hugecache_offset(h, vma, haddr); /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - idx = vma_hugecache_offset(h, vma, haddr); hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -5861,7 +5811,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(pagecache_page); } mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); return handle_userfault(&vmf, VM_UFFD_WP); } @@ -5905,7 +5854,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } out_mutex: mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); /* * Generally it's safe to hold refcount during waiting page lock. But * here we just wait to defer the next page fault to avoid busy loop and @@ -6745,12 +6693,10 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the - * code much cleaner. - * - * This routine must be called with i_mmap_rwsem held in at least read mode if - * sharing is possible. For hugetlbfs, this prevents removal of any page - * table entries associated with the address space. This is important as we - * are setting up sharing based on existing page table entries (mappings). + * code much cleaner. pmd allocation is essential for the shared case because + * pud has to be populated inside the same i_mmap_rwsem section - otherwise + * racing tasks could either miss the sharing (see huge_pte_offset) or select a + * bad pmd for sharing. */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) @@ -6764,7 +6710,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *pte; spinlock_t *ptl; - i_mmap_assert_locked(mapping); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -6794,6 +6740,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); + i_mmap_unlock_read(mapping); return pte; } @@ -6804,7 +6751,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, * indicated by page_count > 1, unmap is achieved by clearing pud and * decrementing the ref count. If count == 1, the pte page is not shared. * - * Called with page table lock held and i_mmap_rwsem held in write mode. + * Called with page table lock held. * * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user diff --git a/mm/rmap.c b/mm/rmap.c index 0b9264e58d256..2a08647a61fca 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -23,10 +23,9 @@ * inode->i_rwsem (while writing or truncating, not reading or faulting) * mm->mmap_lock * mapping->invalidate_lock (in filemap_fault) - * page->flags PG_locked (lock_page) * (see hugetlbfs below) + * page->flags PG_locked (lock_page) * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) * mapping->i_mmap_rwsem - * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) * anon_vma->rwsem * mm->page_table_lock or pte_lock * swap_lock (in swap_duplicate, swap_info_get) @@ -45,11 +44,6 @@ * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon) * ->tasklist_lock * pte map lock - * - * * hugetlbfs PageHuge() pages take locks in this order: - * mapping->i_mmap_rwsem - * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) - * page->flags PG_locked (lock_page) */ #include diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 9c035be2148bb..0fdbd2c05587d 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -379,14 +379,10 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, BUG_ON(dst_addr >= dst_start + len); /* - * Serialize via i_mmap_rwsem and hugetlb_fault_mutex. - * i_mmap_rwsem ensures the dst_pte remains valid even - * in the case of shared pmds. fault mutex prevents - * races with other faulting threads. + * Serialize via hugetlb_fault_mutex. */ - mapping = dst_vma->vm_file->f_mapping; - i_mmap_lock_read(mapping); idx = linear_page_index(dst_vma, dst_addr); + mapping = dst_vma->vm_file->f_mapping; hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -394,7 +390,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); if (!dst_pte) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); goto out_unlock; } @@ -402,7 +397,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, !huge_pte_none_mostly(huge_ptep_get(dst_pte))) { err = -EEXIST; mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); goto out_unlock; } @@ -411,7 +405,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, wp_copy); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); cond_resched(); From 8f28ea28f0b472e848f930c6b7c02d3c4df6dd41 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:04 -0700 Subject: [PATCH 195/352] hugetlb: rename remove_huge_page to hugetlb_delete_from_page_cache remove_huge_page removes a hugetlb page from the page cache. Change to hugetlb_delete_from_page_cache as it is a more descriptive name. huge_add_to_page_cache is global in scope, but only deals with hugetlb pages. For consistency and clarity, rename to hugetlb_add_to_page_cache. Link: https://lkml.kernel.org/r/20220914221810.95771-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 21 ++++++++++----------- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 8 ++++---- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index dfb735a91bbbd..edd69cc43ca5d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -364,7 +364,7 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, return -EINVAL; } -static void remove_huge_page(struct page *page) +static void hugetlb_delete_from_page_cache(struct page *page) { ClearPageDirty(page); ClearPageUptodate(page); @@ -478,15 +478,14 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, folio_lock(folio); /* * We must free the huge page and remove from page - * cache (remove_huge_page) BEFORE removing the - * region/reserve map (hugetlb_unreserve_pages). In - * rare out of memory conditions, removal of the - * region/reserve map could fail. Correspondingly, - * the subpool and global reserve usage count can need - * to be adjusted. + * cache BEFORE removing the region/reserve map + * (hugetlb_unreserve_pages). In rare out of memory + * conditions, removal of the region/reserve map could + * fail. Correspondingly, the subpool and global + * reserve usage count can need to be adjusted. */ VM_BUG_ON(HPageRestoreReserve(&folio->page)); - remove_huge_page(&folio->page); + hugetlb_delete_from_page_cache(&folio->page); freed++; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, @@ -723,7 +722,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, } clear_huge_page(page, addr, pages_per_huge_page(h)); __SetPageUptodate(page); - error = huge_add_to_page_cache(page, mapping, index); + error = hugetlb_add_to_page_cache(page, mapping, index); if (unlikely(error)) { restore_reserve_on_error(h, &pseudo_vma, addr, page); put_page(page); @@ -735,7 +734,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, SetHPageMigratable(page); /* - * unlock_page because locked by huge_add_to_page_cache() + * unlock_page because locked by hugetlb_add_to_page_cache() * put_page() due to reference from alloc_huge_page() */ unlock_page(page); @@ -980,7 +979,7 @@ static int hugetlbfs_error_remove_page(struct address_space *mapping, struct inode *inode = mapping->host; pgoff_t index = page->index; - remove_huge_page(page); + hugetlb_delete_from_page_cache(page); if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1))) hugetlb_fix_reserve_counts(inode); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a6fc49db0ce02..fcbb1c2a4fabd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -666,7 +666,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); -int huge_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct page *page); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1c63a5099c986..aea4a333d1829 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5430,7 +5430,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, return page != NULL; } -int huge_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx) { struct folio *folio = page_folio(page); @@ -5569,7 +5569,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, new_page = true; if (vma->vm_flags & VM_MAYSHARE) { - int err = huge_add_to_page_cache(page, mapping, idx); + int err = hugetlb_add_to_page_cache(page, mapping, idx); if (err) { /* * err can't be -EEXIST which implies someone @@ -5981,11 +5981,11 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, /* * Serialization between remove_inode_hugepages() and - * huge_add_to_page_cache() below happens through the + * hugetlb_add_to_page_cache() below happens through the * hugetlb_fault_mutex_table that here must be hold by * the caller. */ - ret = huge_add_to_page_cache(page, mapping, idx); + ret = hugetlb_add_to_page_cache(page, mapping, idx); if (ret) goto out_release_nounlock; page_in_pagecache = true; From 4a86f5ca7cad0c7c18c18537db0a3f35a1785b49 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:05 -0700 Subject: [PATCH 196/352] hugetlb: create remove_inode_single_folio to remove single file folio Create the new routine remove_inode_single_folio that will remove a single folio from a file. This is refactored code from remove_inode_hugepages. It checks for the uncommon case in which the folio is still mapped and unmaps. No functional change. This refactoring will be put to use and expanded upon in a subsequent patches. Link: https://lkml.kernel.org/r/20220914221810.95771-5-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 105 ++++++++++++++++++++++++++----------------- 1 file changed, 63 insertions(+), 42 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index edd69cc43ca5d..7112a9a9f54df 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -411,6 +411,60 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, } } +/* + * Called with hugetlb fault mutex held. + * Returns true if page was actually removed, false otherwise. + */ +static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, + struct address_space *mapping, + struct folio *folio, pgoff_t index, + bool truncate_op) +{ + bool ret = false; + + /* + * If folio is mapped, it was faulted in after being + * unmapped in caller. Unmap (again) while holding + * the fault mutex. The mutex will prevent faults + * until we finish removing the folio. + */ + if (unlikely(folio_mapped(folio))) { + i_mmap_lock_write(mapping); + hugetlb_vmdelete_list(&mapping->i_mmap, + index * pages_per_huge_page(h), + (index + 1) * pages_per_huge_page(h), + ZAP_FLAG_DROP_MARKER); + i_mmap_unlock_write(mapping); + } + + folio_lock(folio); + /* + * After locking page, make sure mapping is the same. + * We could have raced with page fault populate and + * backout code. + */ + if (folio_mapping(folio) == mapping) { + /* + * We must remove the folio from page cache before removing + * the region/ reserve map (hugetlb_unreserve_pages). In + * rare out of memory conditions, removal of the region/reserve + * map could fail. Correspondingly, the subpool and global + * reserve usage count can need to be adjusted. + */ + VM_BUG_ON(HPageRestoreReserve(&folio->page)); + hugetlb_delete_from_page_cache(&folio->page); + ret = true; + if (!truncate_op) { + if (unlikely(hugetlb_unreserve_pages(inode, index, + index + 1, 1))) + hugetlb_fix_reserve_counts(inode); + } + } + + folio_unlock(folio); + return ret; +} + /* * remove_inode_hugepages handles two distinct cases: truncation and hole * punch. There are subtle differences in operation for each case. @@ -418,11 +472,10 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve - * maps and global counts. Page faults can not race with truncation - * in this routine. hugetlb_no_page() prevents page faults in the - * truncated range. It checks i_size before allocation, and again after - * with the page table lock for the page held. The same lock must be - * acquired to unmap a page. + * maps and global counts. Page faults can race with truncation. + * During faults, hugetlb_no_page() checks i_size before page allocation, + * and again after obtaining page table lock. It will 'back out' + * allocations in the truncated range. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. * Only when releasing a page is the associated region/reserve map @@ -456,44 +509,12 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, mutex_lock(&hugetlb_fault_mutex_table[hash]); /* - * If folio is mapped, it was faulted in after being - * unmapped in caller. Unmap (again) now after taking - * the fault mutex. The mutex will prevent faults - * until we finish removing the folio. - * - * This race can only happen in the hole punch case. - * Getting here in a truncate operation is a bug. + * Remove folio that was part of folio_batch. */ - if (unlikely(folio_mapped(folio))) { - BUG_ON(truncate_op); - - i_mmap_lock_write(mapping); - hugetlb_vmdelete_list(&mapping->i_mmap, - index * pages_per_huge_page(h), - (index + 1) * pages_per_huge_page(h), - ZAP_FLAG_DROP_MARKER); - i_mmap_unlock_write(mapping); - } - - folio_lock(folio); - /* - * We must free the huge page and remove from page - * cache BEFORE removing the region/reserve map - * (hugetlb_unreserve_pages). In rare out of memory - * conditions, removal of the region/reserve map could - * fail. Correspondingly, the subpool and global - * reserve usage count can need to be adjusted. - */ - VM_BUG_ON(HPageRestoreReserve(&folio->page)); - hugetlb_delete_from_page_cache(&folio->page); - freed++; - if (!truncate_op) { - if (unlikely(hugetlb_unreserve_pages(inode, - index, index + 1, 1))) - hugetlb_fix_reserve_counts(inode); - } - - folio_unlock(folio); + if (remove_inode_single_folio(h, inode, mapping, folio, + index, truncate_op)) + freed++; + mutex_unlock(&hugetlb_fault_mutex_table[hash]); } folio_batch_release(&fbatch); From de13621262e3299d42cf97fcb175a92248a3b902 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:06 -0700 Subject: [PATCH 197/352] hugetlb: rename vma_shareable() and refactor code Rename the routine vma_shareable to vma_addr_pmd_shareable as it is checking a specific address within the vma. Refactor code to check if an aligned range is shareable as this will be needed in a subsequent patch. Link: https://lkml.kernel.org/r/20220914221810.95771-6-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- mm/hugetlb.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index aea4a333d1829..9dbbabcca8379 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6640,26 +6640,33 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, return saddr; } -static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) +static bool __vma_aligned_range_pmd_shareable(struct vm_area_struct *vma, + unsigned long start, unsigned long end) { - unsigned long base = addr & PUD_MASK; - unsigned long end = base + PUD_SIZE; - /* * check on proper vm_flags and page table alignment */ - if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end)) + if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, start, end)) return true; return false; } +static bool vma_addr_pmd_shareable(struct vm_area_struct *vma, + unsigned long addr) +{ + unsigned long start = addr & PUD_MASK; + unsigned long end = start + PUD_SIZE; + + return __vma_aligned_range_pmd_shareable(vma, start, end); +} + bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { #ifdef CONFIG_USERFAULTFD if (uffd_disable_huge_pmd_share(vma)) return false; #endif - return vma_shareable(vma, addr); + return vma_addr_pmd_shareable(vma, addr); } /* From b4c0fdc4aff61f65478d7bbf8b0fbc16b35c5de3 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:07 -0700 Subject: [PATCH 198/352] hugetlb: add vma based lock for pmd sharing Allocate a new hugetlb_vma_lock structure and hang off vm_private_data for synchronization use by vmas that could be involved in pmd sharing. This data structure contains a rw semaphore that is the primary tool used for synchronization. This new structure is ref counted, so that it can exist when NOT attached to a vma. This is only helpful in resolving lock ordering issues where code may need to obtain the vma_lock while there are no guarantees the vma may go away. By obtaining a ref on the structure, it can be guaranteed that at least the rw semaphore will not go away. Only add infrastructure for the new lock here. Actual use will be added in subsequent patches. Link: https://lkml.kernel.org/r/20220914221810.95771-7-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 43 ++++++++- kernel/fork.c | 6 +- mm/hugetlb.c | 202 ++++++++++++++++++++++++++++++++++++---- mm/rmap.c | 8 +- 4 files changed, 235 insertions(+), 24 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index fcbb1c2a4fabd..cfe15b32e2d43 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -115,6 +115,12 @@ struct file_region { #endif }; +struct hugetlb_vma_lock { + struct kref refs; + struct rw_semaphore rw_sema; + struct vm_area_struct *vma; +}; + extern struct resv_map *resv_map_alloc(void); void resv_map_release(struct kref *ref); @@ -127,7 +133,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, long min_hpages); void hugepage_put_subpool(struct hugepage_subpool *spool); -void reset_vma_resv_huge_pages(struct vm_area_struct *vma); +void hugetlb_dup_vma_private(struct vm_area_struct *vma); void clear_vma_resv_huge_pages(struct vm_area_struct *vma); int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *, @@ -215,6 +221,14 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags); +void hugetlb_vma_lock_read(struct vm_area_struct *vma); +void hugetlb_vma_unlock_read(struct vm_area_struct *vma); +void hugetlb_vma_lock_write(struct vm_area_struct *vma); +void hugetlb_vma_unlock_write(struct vm_area_struct *vma); +int hugetlb_vma_trylock_write(struct vm_area_struct *vma); +void hugetlb_vma_assert_locked(struct vm_area_struct *vma); +void hugetlb_vma_lock_release(struct kref *kref); + int pmd_huge(pmd_t pmd); int pud_huge(pud_t pud); unsigned long hugetlb_change_protection(struct vm_area_struct *vma, @@ -226,7 +240,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ -static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) +static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma) { } @@ -337,6 +351,31 @@ static inline int prepare_hugepage_range(struct file *file, return -EINVAL; } +static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma) +{ +} + +static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma) +{ +} + +static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma) +{ +} + +static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma) +{ +} + +static inline int hugetlb_vma_trylock_write(struct vm_area_struct *vma) +{ + return 1; +} + +static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma) +{ +} + static inline int pmd_huge(pmd_t pmd) { return 0; diff --git a/kernel/fork.c b/kernel/fork.c index 50460330306a8..3d788f759e5f1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -674,12 +674,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, } /* - * Clear hugetlb-related page reserves for children. This only - * affects MAP_PRIVATE mappings. Faults generated by the child - * are not guaranteed to succeed, even if read-only + * Copy/update hugetlb private vma information. */ if (is_vm_hugetlb_page(tmp)) - reset_vma_resv_huge_pages(tmp); + hugetlb_dup_vma_private(tmp); /* Link the vma into the MT */ mas.index = tmp->vm_start; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9dbbabcca8379..66496fc424f4b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -91,6 +91,8 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); +static void hugetlb_vma_lock_free(struct vm_area_struct *vma); +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static inline bool subpool_is_free(struct hugepage_subpool *spool) { @@ -859,7 +861,7 @@ __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) * faults in a MAP_PRIVATE mapping. Only the process that called mmap() * is guaranteed to have their future faults succeed. * - * With the exception of reset_vma_resv_huge_pages() which is called at fork(), + * With the exception of hugetlb_dup_vma_private() which is called at fork(), * the reserve counters are updated with the hugetlb_lock held. It is safe * to reset the VMA at fork() time as it is not in use yet and there is no * chance of the global counters getting corrupted as a result of the values. @@ -1006,12 +1008,20 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) return (get_vma_private_data(vma) & flag) != 0; } -/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ -void reset_vma_resv_huge_pages(struct vm_area_struct *vma) +void hugetlb_dup_vma_private(struct vm_area_struct *vma) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); + /* + * Clear vm_private_data + * - For MAP_PRIVATE mappings, this is the reserve map which does + * not apply to children. Faults generated by the children are + * not guaranteed to succeed, even if read-only. + * - For shared mappings this is a per-vma semaphore that may be + * allocated in a subsequent call to hugetlb_vm_op_open. + */ + vma->vm_private_data = (void *)0; if (!(vma->vm_flags & VM_MAYSHARE)) - vma->vm_private_data = (void *)0; + return; } /* @@ -1042,7 +1052,7 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma) kref_put(&reservations->refs, resv_map_release); } - reset_vma_resv_huge_pages(vma); + hugetlb_dup_vma_private(vma); } /* Returns true if the VMA has associated reserve pages */ @@ -4623,16 +4633,21 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) resv_map_dup_hugetlb_cgroup_uncharge_info(resv); kref_get(&resv->refs); } + + hugetlb_vma_lock_alloc(vma); } static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); - struct resv_map *resv = vma_resv_map(vma); + struct resv_map *resv; struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve, start, end; long gbl_reserve; + hugetlb_vma_lock_free(vma); + + resv = vma_resv_map(vma); if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return; @@ -6439,6 +6454,11 @@ bool hugetlb_reserve_pages(struct inode *inode, return false; } + /* + * vma specific semaphore used for pmd sharing synchronization + */ + hugetlb_vma_lock_alloc(vma); + /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page @@ -6462,12 +6482,11 @@ bool hugetlb_reserve_pages(struct inode *inode, resv_map = inode_resv_map(inode); chg = region_chg(resv_map, from, to, ®ions_needed); - } else { /* Private mapping. */ resv_map = resv_map_alloc(); if (!resv_map) - return false; + goto out_err; chg = to - from; @@ -6562,6 +6581,7 @@ bool hugetlb_reserve_pages(struct inode *inode, hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: + hugetlb_vma_lock_free(vma); if (!vma || vma->vm_flags & VM_MAYSHARE) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. @@ -6641,14 +6661,34 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, } static bool __vma_aligned_range_pmd_shareable(struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + bool check_vma_lock) { +#ifdef CONFIG_USERFAULTFD + if (uffd_disable_huge_pmd_share(vma)) + return false; +#endif /* * check on proper vm_flags and page table alignment */ - if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, start, end)) - return true; - return false; + if (!(vma->vm_flags & VM_MAYSHARE)) + return false; + if (check_vma_lock && !vma->vm_private_data) + return false; + if (!range_in_vma(vma, start, end)) + return false; + return true; +} + +static bool vma_pmd_shareable(struct vm_area_struct *vma) +{ + unsigned long start = ALIGN(vma->vm_start, PUD_SIZE), + end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); + + if (start >= end) + return false; + + return __vma_aligned_range_pmd_shareable(vma, start, end, false); } static bool vma_addr_pmd_shareable(struct vm_area_struct *vma, @@ -6657,15 +6697,11 @@ static bool vma_addr_pmd_shareable(struct vm_area_struct *vma, unsigned long start = addr & PUD_MASK; unsigned long end = start + PUD_SIZE; - return __vma_aligned_range_pmd_shareable(vma, start, end); + return __vma_aligned_range_pmd_shareable(vma, start, end, true); } bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { -#ifdef CONFIG_USERFAULTFD - if (uffd_disable_huge_pmd_share(vma)) - return false; -#endif return vma_addr_pmd_shareable(vma, addr); } @@ -6696,6 +6732,130 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, *end = ALIGN(*end, PUD_SIZE); } +static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) && + vma->vm_private_data; +} + +void hugetlb_vma_lock_read(struct vm_area_struct *vma) +{ + if (__vma_shareable_flags_pmd(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + down_read(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_unlock_read(struct vm_area_struct *vma) +{ + if (__vma_shareable_flags_pmd(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + up_read(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_lock_write(struct vm_area_struct *vma) +{ + if (__vma_shareable_flags_pmd(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + down_write(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_unlock_write(struct vm_area_struct *vma) +{ + if (__vma_shareable_flags_pmd(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + up_write(&vma_lock->rw_sema); + } +} + +int hugetlb_vma_trylock_write(struct vm_area_struct *vma) +{ + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + if (!__vma_shareable_flags_pmd(vma)) + return 1; + + return down_write_trylock(&vma_lock->rw_sema); +} + +void hugetlb_vma_assert_locked(struct vm_area_struct *vma) +{ + if (__vma_shareable_flags_pmd(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + lockdep_assert_held(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_lock_release(struct kref *kref) +{ + struct hugetlb_vma_lock *vma_lock = container_of(kref, + struct hugetlb_vma_lock, refs); + + kfree(vma_lock); +} + +static void hugetlb_vma_lock_free(struct vm_area_struct *vma) +{ + /* + * Only present in sharable vmas. See comment in + * __unmap_hugepage_range_final about how VM_SHARED could + * be set without VM_MAYSHARE. As a result, we need to + * check if either is set in the free path. + */ + if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED))) + return; + + if (vma->vm_private_data) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + /* + * vma_lock structure may or not be released, but it + * certainly will no longer be attached to vma so clear + * pointer. + */ + vma_lock->vma = NULL; + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); + vma->vm_private_data = NULL; + } +} + +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +{ + struct hugetlb_vma_lock *vma_lock; + + /* Only establish in (flags) sharable vmas */ + if (!vma || !(vma->vm_flags & VM_MAYSHARE)) + return; + + /* Should never get here with non-NULL vm_private_data */ + if (vma->vm_private_data) + return; + + /* Check size/alignment for pmd sharing possible */ + if (!vma_pmd_shareable(vma)) + return; + + vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); + if (!vma_lock) + /* + * If we can not allocate structure, then vma can not + * participate in pmd sharing. + */ + return; + + kref_init(&vma_lock->refs); + init_rwsem(&vma_lock->rw_sema); + vma_lock->vma = vma; + vma->vm_private_data = vma_lock; +} + /* * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the @@ -6782,6 +6942,14 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, } #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ +static void hugetlb_vma_lock_free(struct vm_area_struct *vma) +{ +} + +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +{ +} + pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { diff --git a/mm/rmap.c b/mm/rmap.c index 2a08647a61fca..0e179c823e0ac 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -24,7 +24,7 @@ * mm->mmap_lock * mapping->invalidate_lock (in filemap_fault) * page->flags PG_locked (lock_page) - * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) + * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below) * mapping->i_mmap_rwsem * anon_vma->rwsem * mm->page_table_lock or pte_lock @@ -44,6 +44,12 @@ * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon) * ->tasklist_lock * pte map lock + * + * hugetlbfs PageHuge() take locks in this order: + * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) + * vma_lock (hugetlb specific lock for pmd_sharing) + * mapping->i_mmap_rwsem (also used for hugetlb pmd sharing) + * page->flags PG_locked (lock_page) */ #include From 4ebf00b6439c516fa6a7b5d5e15c571524f728a6 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 26 Sep 2022 19:58:56 -0700 Subject: [PATCH 199/352] hugetlb: fix build issue for missing hugetlb_vma_lock_release Add a stub for hugetlb_vma_lock_release to build in the case CONFIG_HUGETLB_PAGE && !CONFIG_ARCH_WANT_HUGE_PMD_SHARE. Link: https://lkml.kernel.org/r/YyNUtA1vRASOE4+M@monkey Signed-off-by: Mike Kravetz Reported-by: Stephen Rothwell Signed-off-by: Andrew Morton --- mm/hugetlb.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 66496fc424f4b..07d599067cb3b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6942,6 +6942,11 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, } #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ + +void hugetlb_vma_lock_release(struct kref *kref) +{ +} + static void hugetlb_vma_lock_free(struct vm_area_struct *vma) { } From 00f53b790dd6c1642d0032ae3cf70a55b53becc2 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:08 -0700 Subject: [PATCH 200/352] hugetlb: create hugetlb_unmap_file_folio to unmap single file folio Create the new routine hugetlb_unmap_file_folio that will unmap a single file folio. This is refactored code from hugetlb_vmdelete_list. It is modified to do locking within the routine itself and check whether the page is mapped within a specific vma before unmapping. This refactoring will be put to use and expanded upon in a subsequent patch adding vma specific locking. Link: https://lkml.kernel.org/r/20220914221810.95771-8-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 123 +++++++++++++++++++++++++++++++++---------- 1 file changed, 94 insertions(+), 29 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7112a9a9f54df..3bb1772fce2f7 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -371,6 +371,94 @@ static void hugetlb_delete_from_page_cache(struct page *page) delete_from_page_cache(page); } +/* + * Called with i_mmap_rwsem held for inode based vma maps. This makes + * sure vma (and vm_mm) will not go away. We also hold the hugetlb fault + * mutex for the page in the mapping. So, we can not race with page being + * faulted into the vma. + */ +static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, + unsigned long addr, struct page *page) +{ + pte_t *ptep, pte; + + ptep = huge_pte_offset(vma->vm_mm, addr, + huge_page_size(hstate_vma(vma))); + + if (!ptep) + return false; + + pte = huge_ptep_get(ptep); + if (huge_pte_none(pte) || !pte_present(pte)) + return false; + + if (pte_page(pte) == page) + return true; + + return false; +} + +/* + * Can vma_offset_start/vma_offset_end overflow on 32-bit arches? + * No, because the interval tree returns us only those vmas + * which overlap the truncated area starting at pgoff, + * and no vma on a 32-bit arch can span beyond the 4GB. + */ +static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start) +{ + if (vma->vm_pgoff < start) + return (start - vma->vm_pgoff) << PAGE_SHIFT; + else + return 0; +} + +static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end) +{ + unsigned long t_end; + + if (!end) + return vma->vm_end; + + t_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start; + if (t_end > vma->vm_end) + t_end = vma->vm_end; + return t_end; +} + +/* + * Called with hugetlb fault mutex held. Therefore, no more mappings to + * this folio can be created while executing the routine. + */ +static void hugetlb_unmap_file_folio(struct hstate *h, + struct address_space *mapping, + struct folio *folio, pgoff_t index) +{ + struct rb_root_cached *root = &mapping->i_mmap; + struct page *page = &folio->page; + struct vm_area_struct *vma; + unsigned long v_start; + unsigned long v_end; + pgoff_t start, end; + + start = index * pages_per_huge_page(h); + end = (index + 1) * pages_per_huge_page(h); + + i_mmap_lock_write(mapping); + + vma_interval_tree_foreach(vma, root, start, end - 1) { + v_start = vma_offset_start(vma, start); + v_end = vma_offset_end(vma, end); + + if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) + continue; + + unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, + NULL, ZAP_FLAG_DROP_MARKER); + } + + i_mmap_unlock_write(mapping); +} + static void hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, zap_flags_t zap_flags) @@ -383,30 +471,13 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, * an inclusive "last". */ vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) { - unsigned long v_offset; + unsigned long v_start; unsigned long v_end; - /* - * Can the expression below overflow on 32-bit arches? - * No, because the interval tree returns us only those vmas - * which overlap the truncated area starting at pgoff, - * and no vma on a 32-bit arch can span beyond the 4GB. - */ - if (vma->vm_pgoff < start) - v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT; - else - v_offset = 0; - - if (!end) - v_end = vma->vm_end; - else { - v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) - + vma->vm_start; - if (v_end > vma->vm_end) - v_end = vma->vm_end; - } + v_start = vma_offset_start(vma, start); + v_end = vma_offset_end(vma, end); - unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, + unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, NULL, zap_flags); } } @@ -428,14 +499,8 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, * the fault mutex. The mutex will prevent faults * until we finish removing the folio. */ - if (unlikely(folio_mapped(folio))) { - i_mmap_lock_write(mapping); - hugetlb_vmdelete_list(&mapping->i_mmap, - index * pages_per_huge_page(h), - (index + 1) * pages_per_huge_page(h), - ZAP_FLAG_DROP_MARKER); - i_mmap_unlock_write(mapping); - } + if (unlikely(folio_mapped(folio))) + hugetlb_unmap_file_folio(h, mapping, folio, index); folio_lock(folio); /* From bfe910c1488bbcb8cbc2eec55bda8723743939b2 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:09 -0700 Subject: [PATCH 201/352] hugetlb: use new vma_lock for pmd sharing synchronization The new hugetlb vma lock is used to address this race: Faulting thread Unsharing thread ... ... ptep = huge_pte_offset() or ptep = huge_pte_alloc() ... i_mmap_lock_write lock page table ptep invalid <------------------------ huge_pmd_unshare() Could be in a previously unlock_page_table sharing process or worse i_mmap_unlock_write ... The vma_lock is used as follows: - During fault processing. The lock is acquired in read mode before doing a page table lock and allocation (huge_pte_alloc). The lock is held until code is finished with the page table entry (ptep). - The lock must be held in write mode whenever huge_pmd_unshare is called. Lock ordering issues come into play when unmapping a page from all vmas mapping the page. The i_mmap_rwsem must be held to search for the vmas, and the vma lock must be held before calling unmap which will call huge_pmd_unshare. This is done today in: - try_to_migrate_one and try_to_unmap_ for page migration and memory error handling. In these routines we 'try' to obtain the vma lock and fail to unmap if unsuccessful. Calling routines already deal with the failure of unmapping. - hugetlb_vmdelete_list for truncation and hole punch. This routine also tries to acquire the vma lock. If it fails, it skips the unmapping. However, we can not have file truncation or hole punch fail because of contention. After hugetlb_vmdelete_list, truncation and hole punch call remove_inode_hugepages. remove_inode_hugepages checks for mapped pages and call hugetlb_unmap_file_page to unmap them. hugetlb_unmap_file_page is designed to drop locks and reacquire in the correct order to guarantee unmap success. Link: https://lkml.kernel.org/r/20220914221810.95771-9-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Miaohe Lin Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 66 +++++++++++++++++++++++++++- mm/hugetlb.c | 102 +++++++++++++++++++++++++++++++++++++++---- mm/memory.c | 2 + mm/rmap.c | 100 +++++++++++++++++++++++++++--------------- mm/userfaultfd.c | 9 +++- 5 files changed, 233 insertions(+), 46 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3bb1772fce2f7..009ae539b9b24 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -434,6 +434,7 @@ static void hugetlb_unmap_file_folio(struct hstate *h, struct folio *folio, pgoff_t index) { struct rb_root_cached *root = &mapping->i_mmap; + struct hugetlb_vma_lock *vma_lock; struct page *page = &folio->page; struct vm_area_struct *vma; unsigned long v_start; @@ -444,7 +445,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h, end = (index + 1) * pages_per_huge_page(h); i_mmap_lock_write(mapping); - +retry: + vma_lock = NULL; vma_interval_tree_foreach(vma, root, start, end - 1) { v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); @@ -452,11 +454,63 @@ static void hugetlb_unmap_file_folio(struct hstate *h, if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) continue; + if (!hugetlb_vma_trylock_write(vma)) { + vma_lock = vma->vm_private_data; + /* + * If we can not get vma lock, we need to drop + * immap_sema and take locks in order. First, + * take a ref on the vma_lock structure so that + * we can be guaranteed it will not go away when + * dropping immap_sema. + */ + kref_get(&vma_lock->refs); + break; + } + unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, NULL, ZAP_FLAG_DROP_MARKER); + hugetlb_vma_unlock_write(vma); } i_mmap_unlock_write(mapping); + + if (vma_lock) { + /* + * Wait on vma_lock. We know it is still valid as we have + * a reference. We must 'open code' vma locking as we do + * not know if vma_lock is still attached to vma. + */ + down_write(&vma_lock->rw_sema); + i_mmap_lock_write(mapping); + + vma = vma_lock->vma; + if (!vma) { + /* + * If lock is no longer attached to vma, then just + * unlock, drop our reference and retry looking for + * other vmas. + */ + up_write(&vma_lock->rw_sema); + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); + goto retry; + } + + /* + * vma_lock is still attached to vma. Check to see if vma + * still maps page and if so, unmap. + */ + v_start = vma_offset_start(vma, start); + v_end = vma_offset_end(vma, end); + if (hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) + unmap_hugepage_range(vma, vma->vm_start + v_start, + v_end, NULL, + ZAP_FLAG_DROP_MARKER); + + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); + hugetlb_vma_unlock_write(vma); + + goto retry; + } } static void @@ -474,11 +528,21 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, unsigned long v_start; unsigned long v_end; + if (!hugetlb_vma_trylock_write(vma)) + continue; + v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, NULL, zap_flags); + + /* + * Note that vma lock only exists for shared/non-private + * vmas. Therefore, lock is not held when calling + * unmap_hugepage_range for private vmas. + */ + hugetlb_vma_unlock_write(vma); } } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 07d599067cb3b..2cd3f9837ed97 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4796,6 +4796,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, mmu_notifier_invalidate_range_start(&range); mmap_assert_write_locked(src); raw_write_seqcount_begin(&src->write_protect_seq); + } else { + /* + * For shared mappings the vma lock must be held before + * calling huge_pte_offset in the src vma. Otherwise, the + * returned ptep could go away if part of a shared pmd and + * another thread calls huge_pmd_unshare. + */ + hugetlb_vma_lock_read(src_vma); } last_addr_mask = hugetlb_mask_last_page(h); @@ -4942,6 +4950,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, if (cow) { raw_write_seqcount_end(&src->write_protect_seq); mmu_notifier_invalidate_range_end(&range); + } else { + hugetlb_vma_unlock_read(src_vma); } return ret; @@ -5000,6 +5010,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(&range); last_addr_mask = hugetlb_mask_last_page(h); /* Prevent race with file truncation */ + hugetlb_vma_lock_write(vma); i_mmap_lock_write(mapping); for (; old_addr < old_end; old_addr += sz, new_addr += sz) { src_pte = huge_pte_offset(mm, old_addr, sz); @@ -5031,6 +5042,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, flush_tlb_range(vma, old_end - len, old_end); mmu_notifier_invalidate_range_end(&range); i_mmap_unlock_write(mapping); + hugetlb_vma_unlock_write(vma); return len + old_addr - old_end; } @@ -5350,8 +5362,29 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, * may get SIGKILLed if it later faults. */ if (outside_reserve) { + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t idx; + u32 hash; + put_page(old_page); + /* + * Drop hugetlb_fault_mutex and vma_lock before + * unmapping. unmapping needs to hold vma_lock + * in write mode. Dropping vma_lock in read mode + * here is OK as COW mappings do not interact with + * PMD sharing. + * + * Reacquire both after unmap operation. + */ + idx = vma_hugecache_offset(h, vma, haddr); + hash = hugetlb_fault_mutex_hash(mapping, idx); + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + unmap_ref_private(mm, vma, old_page, haddr); + + mutex_lock(&hugetlb_fault_mutex_table[hash]); + hugetlb_vma_lock_read(vma); spin_lock(ptl); ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && @@ -5500,14 +5533,16 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, }; /* - * hugetlb_fault_mutex and i_mmap_rwsem must be + * vma_lock and hugetlb_fault_mutex must be * dropped before handling userfault. Reacquire * after handling fault to make calling code simpler. */ + hugetlb_vma_unlock_read(vma); hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); ret = handle_userfault(&vmf, reason); mutex_lock(&hugetlb_fault_mutex_table[hash]); + hugetlb_vma_lock_read(vma); return ret; } @@ -5741,6 +5776,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (ptep) { + /* + * Since we hold no locks, ptep could be stale. That is + * OK as we are only making decisions based on content and + * not actually modifying content here. + */ entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { migration_entry_wait_huge(vma, ptep); @@ -5748,23 +5788,35 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); - } else { - ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); - if (!ptep) - return VM_FAULT_OOM; } - mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, haddr); - /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, haddr); hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); + /* + * Acquire vma lock before calling huge_pte_alloc and hold + * until finished with ptep. This prevents huge_pmd_unshare from + * being called elsewhere and making the ptep no longer valid. + * + * ptep could have already be assigned via huge_pte_offset. That + * is OK, as huge_pte_alloc will return the same value unless + * something has changed. + */ + hugetlb_vma_lock_read(vma); + ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); + if (!ptep) { + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + return VM_FAULT_OOM; + } + entry = huge_ptep_get(ptep); /* PTE markers should be handled the same way as none pte */ if (huge_pte_none_mostly(entry)) { @@ -5825,6 +5877,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unlock_page(pagecache_page); put_page(pagecache_page); } + hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return handle_userfault(&vmf, VM_UFFD_WP); } @@ -5868,6 +5921,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(pagecache_page); } out_mutex: + hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* * Generally it's safe to hold refcount during waiting page lock. But @@ -6330,8 +6384,9 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, flush_cache_range(vma, range.start, range.end); mmu_notifier_invalidate_range_start(&range); - last_addr_mask = hugetlb_mask_last_page(h); + hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); + last_addr_mask = hugetlb_mask_last_page(h); for (; address < end; address += psize) { spinlock_t *ptl; ptep = huge_pte_offset(mm, address, psize); @@ -6430,6 +6485,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * See Documentation/mm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); mmu_notifier_invalidate_range_end(&range); return pages << h->order; @@ -6931,6 +6987,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pud = pud_offset(p4d, addr); i_mmap_assert_write_locked(vma->vm_file->f_mapping); + hugetlb_vma_assert_locked(vma); BUG_ON(page_count(virt_to_page(ptep)) == 0); if (page_count(virt_to_page(ptep)) == 1) return 0; @@ -6943,6 +7000,31 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ +void hugetlb_vma_lock_read(struct vm_area_struct *vma) +{ +} + +void hugetlb_vma_unlock_read(struct vm_area_struct *vma) +{ +} + +void hugetlb_vma_lock_write(struct vm_area_struct *vma) +{ +} + +void hugetlb_vma_unlock_write(struct vm_area_struct *vma) +{ +} + +int hugetlb_vma_trylock_write(struct vm_area_struct *vma) +{ + return 1; +} + +void hugetlb_vma_assert_locked(struct vm_area_struct *vma) +{ +} + void hugetlb_vma_lock_release(struct kref *kref) { } @@ -7324,6 +7406,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, start, end); mmu_notifier_invalidate_range_start(&range); + hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); for (address = start; address < end; address += PUD_SIZE) { ptep = huge_pte_offset(mm, address, sz); @@ -7335,6 +7418,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) } flush_hugetlb_tlb_range(vma, start, end); i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); /* * No need to call mmu_notifier_invalidate_range(), see * Documentation/mm/mmu_notifier.rst. diff --git a/mm/memory.c b/mm/memory.c index c01c12500169d..b3ed17219d772 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1684,10 +1684,12 @@ static void unmap_single_vma(struct mmu_gather *tlb, if (vma->vm_file) { zap_flags_t zap_flags = details ? details->zap_flags : 0; + hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); __unmap_hugepage_range_final(tlb, vma, start, end, NULL, zap_flags); i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); } } else unmap_page_range(tlb, vma, start, end, details); diff --git a/mm/rmap.c b/mm/rmap.c index 0e179c823e0ac..b6743c2b8b5f4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1551,24 +1551,39 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly * do this outside rmap routines. + * + * We also must hold hugetlb vma_lock in write mode. + * Lock order dictates acquiring vma_lock BEFORE + * i_mmap_rwsem. We can only try lock here and fail + * if unsuccessful. */ - VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED)); - if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) { - flush_tlb_range(vma, range.start, range.end); - mmu_notifier_invalidate_range(mm, range.start, - range.end); - - /* - * The ref count of the PMD page was dropped - * which is part of the way map counting - * is done for shared PMDs. Return 'true' - * here. When there is no other sharing, - * huge_pmd_unshare returns false and we will - * unmap the actual page and drop map count - * to zero. - */ - page_vma_mapped_walk_done(&pvmw); - break; + if (!anon) { + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); + if (!hugetlb_vma_trylock_write(vma)) { + page_vma_mapped_walk_done(&pvmw); + ret = false; + break; + } + if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { + hugetlb_vma_unlock_write(vma); + flush_tlb_range(vma, + range.start, range.end); + mmu_notifier_invalidate_range(mm, + range.start, range.end); + /* + * The ref count of the PMD page was + * dropped which is part of the way map + * counting is done for shared PMDs. + * Return 'true' here. When there is + * no other sharing, huge_pmd_unshare + * returns false and we will unmap the + * actual page and drop map count + * to zero. + */ + page_vma_mapped_walk_done(&pvmw); + break; + } + hugetlb_vma_unlock_write(vma); } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { @@ -1926,26 +1941,41 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly * do this outside rmap routines. + * + * We also must hold hugetlb vma_lock in write mode. + * Lock order dictates acquiring vma_lock BEFORE + * i_mmap_rwsem. We can only try lock here and + * fail if unsuccessful. */ - VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED)); - if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) { - flush_tlb_range(vma, range.start, range.end); - mmu_notifier_invalidate_range(mm, range.start, - range.end); - - /* - * The ref count of the PMD page was dropped - * which is part of the way map counting - * is done for shared PMDs. Return 'true' - * here. When there is no other sharing, - * huge_pmd_unshare returns false and we will - * unmap the actual page and drop map count - * to zero. - */ - page_vma_mapped_walk_done(&pvmw); - break; + if (!anon) { + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); + if (!hugetlb_vma_trylock_write(vma)) { + page_vma_mapped_walk_done(&pvmw); + ret = false; + break; + } + if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { + hugetlb_vma_unlock_write(vma); + flush_tlb_range(vma, + range.start, range.end); + mmu_notifier_invalidate_range(mm, + range.start, range.end); + + /* + * The ref count of the PMD page was + * dropped which is part of the way map + * counting is done for shared PMDs. + * Return 'true' here. When there is + * no other sharing, huge_pmd_unshare + * returns false and we will unmap the + * actual page and drop map count + * to zero. + */ + page_vma_mapped_walk_done(&pvmw); + break; + } + hugetlb_vma_unlock_write(vma); } - /* Nuke the hugetlb page table entry */ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0fdbd2c05587d..e24e8a47ce8a2 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -379,16 +379,21 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, BUG_ON(dst_addr >= dst_start + len); /* - * Serialize via hugetlb_fault_mutex. + * Serialize via vma_lock and hugetlb_fault_mutex. + * vma_lock ensures the dst_pte remains valid even + * in the case of shared pmds. fault mutex prevents + * races with other faulting threads. */ idx = linear_page_index(dst_vma, dst_addr); mapping = dst_vma->vm_file->f_mapping; hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); + hugetlb_vma_lock_read(dst_vma); err = -ENOMEM; dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); if (!dst_pte) { + hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out_unlock; } @@ -396,6 +401,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, if (mode != MCOPY_ATOMIC_CONTINUE && !huge_pte_none_mostly(huge_ptep_get(dst_pte))) { err = -EEXIST; + hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out_unlock; } @@ -404,6 +410,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, dst_addr, src_addr, mode, &page, wp_copy); + hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); cond_resched(); From dd1b1efb25d261b26e7a578d71e2ac0db9d5eeb9 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:10 -0700 Subject: [PATCH 202/352] hugetlb: clean up code checking for fault/truncation races With the new hugetlb vma lock in place, it can also be used to handle page fault races with file truncation. The lock is taken at the beginning of the code fault path in read mode. During truncation, it is taken in write mode for each vma which has the file mapped. The file's size (i_size) is modified before taking the vma lock to unmap. How are races handled? The page fault code checks i_size early in processing after taking the vma lock. If the fault is beyond i_size, the fault is aborted. If the fault is not beyond i_size the fault will continue and a new page will be added to the file. It could be that truncation code modifies i_size after the check in fault code. That is OK, as truncation code will soon remove the page. The truncation code will wait until the fault is finished, as it must obtain the vma lock in write mode. This patch cleans up/removes late checks in the fault paths that try to back out pages racing with truncation. As noted above, we just let the truncation code remove the pages. Link: https://lkml.kernel.org/r/20220914221810.95771-10-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Miaohe Lin Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 31 ++++++++++++------------------- mm/hugetlb.c | 27 ++++++--------------------- 2 files changed, 18 insertions(+), 40 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 009ae539b9b24..ed57a029eab09 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -568,26 +568,19 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, folio_lock(folio); /* - * After locking page, make sure mapping is the same. - * We could have raced with page fault populate and - * backout code. + * We must remove the folio from page cache before removing + * the region/ reserve map (hugetlb_unreserve_pages). In + * rare out of memory conditions, removal of the region/reserve + * map could fail. Correspondingly, the subpool and global + * reserve usage count can need to be adjusted. */ - if (folio_mapping(folio) == mapping) { - /* - * We must remove the folio from page cache before removing - * the region/ reserve map (hugetlb_unreserve_pages). In - * rare out of memory conditions, removal of the region/reserve - * map could fail. Correspondingly, the subpool and global - * reserve usage count can need to be adjusted. - */ - VM_BUG_ON(HPageRestoreReserve(&folio->page)); - hugetlb_delete_from_page_cache(&folio->page); - ret = true; - if (!truncate_op) { - if (unlikely(hugetlb_unreserve_pages(inode, index, - index + 1, 1))) - hugetlb_fix_reserve_counts(inode); - } + VM_BUG_ON(HPageRestoreReserve(&folio->page)); + hugetlb_delete_from_page_cache(&folio->page); + ret = true; + if (!truncate_op) { + if (unlikely(hugetlb_unreserve_pages(inode, index, + index + 1, 1))) + hugetlb_fix_reserve_counts(inode); } folio_unlock(folio); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2cd3f9837ed97..5ea0b1b0d1abf 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5562,6 +5562,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); bool new_page, new_pagecache_page = false; + bool reserve_alloc = false; /* * Currently, we are forced to kill the process in the event the @@ -5617,6 +5618,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, clear_huge_page(page, address, pages_per_huge_page(h)); __SetPageUptodate(page); new_page = true; + if (HPageRestoreReserve(page)) + reserve_alloc = true; if (vma->vm_flags & VM_MAYSHARE) { int err = hugetlb_add_to_page_cache(page, mapping, idx); @@ -5680,10 +5683,6 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, } ptl = huge_pte_lock(h, mm, ptep); - size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) - goto backout; - ret = 0; /* If pte changed from under us, retry */ if (!pte_same(huge_ptep_get(ptep), old_pte)) @@ -5727,10 +5726,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, backout: spin_unlock(ptl); backout_unlocked: - unlock_page(page); - /* restore reserve for newly allocated pages not in page cache */ if (new_page && !new_pagecache_page) restore_reserve_on_error(h, vma, haddr, page); + + unlock_page(page); put_page(page); goto out; } @@ -6062,26 +6061,12 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, ptl = huge_pte_lock(h, dst_mm, dst_pte); - /* - * Recheck the i_size after holding PT lock to make sure not - * to leave any page mapped (as page_mapped()) beyond the end - * of the i_size (remove_inode_hugepages() is strict about - * enforcing that). If we bail out here, we'll also leave a - * page in the radix tree in the vm_shared case beyond the end - * of the i_size, but remove_inode_hugepages() will take care - * of it as soon as we drop the hugetlb_fault_mutex_table. - */ - size = i_size_read(mapping->host) >> huge_page_shift(h); - ret = -EFAULT; - if (idx >= size) - goto out_release_unlock; - - ret = -EEXIST; /* * We allow to overwrite a pte marker: consider when both MISSING|WP * registered, we firstly wr-protect a none pte which has no page cache * page backing it, then access the page. */ + ret = -EEXIST; if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) goto out_release_unlock; From d34d28e4f8a7496fc0cabbc9ce4e76ced5474386 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 26 Sep 2022 19:58:56 -0700 Subject: [PATCH 203/352] hugetlb: fix reserve_alloc set but not used compiler warning Error path code checking the reserve_alloc variable was removed as it was determined it was not necessary. However, the variable itself was not removed at the same time. Remove now. Link: https://lkml.kernel.org/r/Yyj7HsJWfHDoU24U@monkey Signed-off-by: Mike Kravetz Reported-by: kernel test robot Signed-off-by: Andrew Morton --- mm/hugetlb.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5ea0b1b0d1abf..652d22270db3c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5562,7 +5562,6 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); bool new_page, new_pagecache_page = false; - bool reserve_alloc = false; /* * Currently, we are forced to kill the process in the event the @@ -5618,8 +5617,6 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, clear_huge_page(page, address, pages_per_huge_page(h)); __SetPageUptodate(page); new_page = true; - if (HPageRestoreReserve(page)) - reserve_alloc = true; if (vma->vm_flags & VM_MAYSHARE) { int err = hugetlb_add_to_page_cache(page, mapping, idx); From 78f853d58b54ad4bcb018f227f1fecca9952e7bf Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 15 Sep 2022 17:03:35 +0200 Subject: [PATCH 204/352] x86: add missing include to sparsemem.h Patch series "Add KernelMemorySanitizer infrastructure", v7. KernelMemorySanitizer (KMSAN) is a detector of errors related to uses of uninitialized memory. It relies on compile-time Clang instrumentation (similar to MSan in the userspace [1]) and tracks the state of every bit of kernel memory, being able to report an error if uninitialized value is used in a condition, dereferenced, or escapes to userspace, USB or DMA. KMSAN has reported more than 300 bugs in the past few years (recently fixed bugs: [2]), most of them with the help of syzkaller. Such bugs keep getting introduced into the kernel despite new compiler warnings and other analyses (the 6.0 cycle already resulted in several KMSAN-reported bugs, e.g. [3]). Mitigations like total stack and heap initialization are unfortunately very far from being deployable. The proposed patchset contains KMSAN runtime implementation together with small changes to other subsystems needed to make KMSAN work. The latter changes fall into several categories: 1. Changes and refactorings of existing code required to add KMSAN: - [01/43] x86: add missing include to sparsemem.h - [02/43] stackdepot: reserve 5 extra bits in depot_stack_handle_t - [03/43] instrumented.h: allow instrumenting both sides of copy_from_user() - [04/43] x86: asm: instrument usercopy in get_user() and __put_user_size() - [05/43] asm-generic: instrument usercopy in cacheflush.h - [10/43] libnvdimm/pfn_dev: increase MAX_STRUCT_PAGE_SIZE 2. KMSAN-related declarations in generic code, KMSAN runtime library, docs and configs: - [06/43] kmsan: add ReST documentation - [07/43] kmsan: introduce __no_sanitize_memory and __no_kmsan_checks - [09/43] x86: kmsan: pgtable: reduce vmalloc space - [11/43] kmsan: add KMSAN runtime core - [13/43] MAINTAINERS: add entry for KMSAN - [24/43] kmsan: add tests for KMSAN - [31/43] objtool: kmsan: list KMSAN API functions as uaccess-safe - [35/43] x86: kmsan: use __msan_ string functions where possible - [43/43] x86: kmsan: enable KMSAN builds for x86 3. Adding hooks from different subsystems to notify KMSAN about memory state changes: - [14/43] mm: kmsan: maintain KMSAN metadata for page - [15/43] mm: kmsan: call KMSAN hooks from SLUB code - [16/43] kmsan: handle task creation and exiting - [17/43] init: kmsan: call KMSAN initialization routines - [18/43] instrumented.h: add KMSAN support - [19/43] kmsan: add iomap support - [20/43] Input: libps2: mark data received in __ps2_command() as initialized - [21/43] dma: kmsan: unpoison DMA mappings - [34/43] x86: kmsan: handle open-coded assembly in lib/iomem.c - [36/43] x86: kmsan: sync metadata pages on page fault 4. Changes that prevent false reports by explicitly initializing memory, disabling optimized code that may trick KMSAN, selectively skipping instrumentation: - [08/43] kmsan: mark noinstr as __no_sanitize_memory - [12/43] kmsan: disable instrumentation of unsupported common kernel code - [22/43] virtio: kmsan: check/unpoison scatterlist in vring_map_one_sg() - [23/43] kmsan: handle memory sent to/from USB - [25/43] kmsan: disable strscpy() optimization under KMSAN - [26/43] crypto: kmsan: disable accelerated configs under KMSAN - [27/43] kmsan: disable physical page merging in biovec - [28/43] block: kmsan: skip bio block merging logic for KMSAN - [29/43] kcov: kmsan: unpoison area->list in kcov_remote_area_put() - [30/43] security: kmsan: fix interoperability with auto-initialization - [32/43] x86: kmsan: disable instrumentation of unsupported code - [33/43] x86: kmsan: skip shadow checks in __switch_to() - [37/43] x86: kasan: kmsan: support CONFIG_GENERIC_CSUM on x86, enable it for KASAN/KMSAN - [38/43] x86: fs: kmsan: disable CONFIG_DCACHE_WORD_ACCESS - [39/43] x86: kmsan: don't instrument stack walking functions - [40/43] entry: kmsan: introduce kmsan_unpoison_entry_regs() 5. Fixes for bugs detected with CONFIG_KMSAN_CHECK_PARAM_RETVAL: - [41/43] bpf: kmsan: initialize BPF registers with zeroes - [42/43] mm: fs: initialize fsdata passed to write_begin/write_end interface This patchset allows one to boot and run a defconfig+KMSAN kernel on a QEMU without known false positives. It however doesn't guarantee there are no false positives in drivers of certain devices or less tested subsystems, although KMSAN is actively tested on syzbot with a large config. By default, KMSAN enforces conservative checks of most kernel function parameters passed by value (via CONFIG_KMSAN_CHECK_PARAM_RETVAL, which maps to the -fsanitize-memory-param-retval compiler flag). As discussed in [4] and [5], passing uninitialized values as function parameters is considered undefined behavior, therefore KMSAN now reports such cases as errors. Several newly added patches fix known manifestations of these errors. This patch (of 43): Including sparsemem.h from other files (e.g. transitively via asm/pgtable_64_types.h) results in compilation errors due to unknown types: sparsemem.h:34:32: error: unknown type name 'phys_addr_t' extern int phys_to_target_node(phys_addr_t start); ^ sparsemem.h:36:39: error: unknown type name 'u64' extern int memory_add_physaddr_to_nid(u64 start); ^ Fix these errors by including linux/types.h from sparsemem.h This is required for the upcoming KMSAN patches. Link: https://lkml.kernel.org/r/20220915150417.722975-1-glider@google.com Link: https://lkml.kernel.org/r/20220915150417.722975-2-glider@google.com Signed-off-by: Dmitry Vyukov Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Cc: Andrey Konovalov Cc: Eric Biggers Signed-off-by: Andrew Morton --- arch/x86/include/asm/sparsemem.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 6a9ccc1b2be5d..64df897c0ee30 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_SPARSEMEM_H #define _ASM_X86_SPARSEMEM_H +#include + #ifdef CONFIG_SPARSEMEM /* * generic non-linear memory support: From 9593498515c6ac0b21224b81860808aefd43919d Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:36 +0200 Subject: [PATCH 205/352] stackdepot: reserve 5 extra bits in depot_stack_handle_t Some users (currently only KMSAN) may want to use spare bits in depot_stack_handle_t. Let them do so by adding @extra_bits to __stack_depot_save() to store arbitrary flags, and providing stack_depot_get_extra_bits() to retrieve those flags. Also adapt KASAN to the new prototype by passing extra_bits=0, as KASAN does not intend to store additional information in the stack handle. Link: https://lkml.kernel.org/r/20220915150417.722975-3-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 8 ++++++++ lib/stackdepot.c | 29 ++++++++++++++++++++++++----- mm/kasan/common.c | 2 +- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index bc2797955de90..9ca7798d7a318 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -14,9 +14,15 @@ #include typedef u32 depot_stack_handle_t; +/* + * Number of bits in the handle that stack depot doesn't use. Users may store + * information in them. + */ +#define STACK_DEPOT_EXTRA_BITS 5 depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, + unsigned int extra_bits, gfp_t gfp_flags, bool can_alloc); /* @@ -59,6 +65,8 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); +unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); + int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces); diff --git a/lib/stackdepot.c b/lib/stackdepot.c index e73fda23388d8..79e894cf84064 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -43,7 +43,8 @@ #define STACK_ALLOC_OFFSET_BITS (STACK_ALLOC_ORDER + PAGE_SHIFT - \ STACK_ALLOC_ALIGN) #define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - \ - STACK_ALLOC_NULL_PROTECTION_BITS - STACK_ALLOC_OFFSET_BITS) + STACK_ALLOC_NULL_PROTECTION_BITS - \ + STACK_ALLOC_OFFSET_BITS - STACK_DEPOT_EXTRA_BITS) #define STACK_ALLOC_SLABS_CAP 8192 #define STACK_ALLOC_MAX_SLABS \ (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \ @@ -56,6 +57,7 @@ union handle_parts { u32 slabindex : STACK_ALLOC_INDEX_BITS; u32 offset : STACK_ALLOC_OFFSET_BITS; u32 valid : STACK_ALLOC_NULL_PROTECTION_BITS; + u32 extra : STACK_DEPOT_EXTRA_BITS; }; }; @@ -77,6 +79,14 @@ static int next_slab_inited; static size_t depot_offset; static DEFINE_RAW_SPINLOCK(depot_lock); +unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle) +{ + union handle_parts parts = { .handle = handle }; + + return parts.extra; +} +EXPORT_SYMBOL(stack_depot_get_extra_bits); + static bool init_stack_slab(void **prealloc) { if (!*prealloc) @@ -140,6 +150,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) stack->handle.slabindex = depot_index; stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN; stack->handle.valid = 1; + stack->handle.extra = 0; memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); depot_offset += required_size; @@ -382,6 +393,7 @@ EXPORT_SYMBOL_GPL(stack_depot_fetch); * * @entries: Pointer to storage array * @nr_entries: Size of the storage array + * @extra_bits: Flags to store in unused bits of depot_stack_handle_t * @alloc_flags: Allocation gfp flags * @can_alloc: Allocate stack slabs (increased chance of failure if false) * @@ -393,6 +405,10 @@ EXPORT_SYMBOL_GPL(stack_depot_fetch); * If the stack trace in @entries is from an interrupt, only the portion up to * interrupt entry is saved. * + * Additional opaque flags can be passed in @extra_bits, stored in the unused + * bits of the stack handle, and retrieved using stack_depot_get_extra_bits() + * without calling stack_depot_fetch(). + * * Context: Any context, but setting @can_alloc to %false is required if * alloc_pages() cannot be used from the current context. Currently * this is the case from contexts where neither %GFP_ATOMIC nor @@ -402,10 +418,11 @@ EXPORT_SYMBOL_GPL(stack_depot_fetch); */ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, + unsigned int extra_bits, gfp_t alloc_flags, bool can_alloc) { struct stack_record *found = NULL, **bucket; - depot_stack_handle_t retval = 0; + union handle_parts retval = { .handle = 0 }; struct page *page = NULL; void *prealloc = NULL; unsigned long flags; @@ -489,9 +506,11 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, free_pages((unsigned long)prealloc, STACK_ALLOC_ORDER); } if (found) - retval = found->handle.handle; + retval.handle = found->handle.handle; fast_exit: - return retval; + retval.extra = extra_bits; + + return retval.handle; } EXPORT_SYMBOL_GPL(__stack_depot_save); @@ -511,6 +530,6 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t alloc_flags) { - return __stack_depot_save(entries, nr_entries, alloc_flags, true); + return __stack_depot_save(entries, nr_entries, 0, alloc_flags, true); } EXPORT_SYMBOL_GPL(stack_depot_save); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 50f4338b477f2..833bf2cfd2a39 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -43,7 +43,7 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc) unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); - return __stack_depot_save(entries, nr_entries, flags, can_alloc); + return __stack_depot_save(entries, nr_entries, 0, flags, can_alloc); } void kasan_set_track(struct kasan_track *track, gfp_t flags) From 77ec63d39b503dc2b7e85dc3f3bd6e4f76bec446 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:37 +0200 Subject: [PATCH 206/352] instrumented.h: allow instrumenting both sides of copy_from_user() Introduce instrument_copy_from_user_before() and instrument_copy_from_user_after() hooks to be invoked before and after the call to copy_from_user(). KASAN and KCSAN will be only using instrument_copy_from_user_before(), but for KMSAN we'll need to insert code after copy_from_user(). Link: https://lkml.kernel.org/r/20220915150417.722975-4-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/s390/lib/uaccess.c | 3 ++- include/linux/instrumented.h | 21 +++++++++++++++++++-- include/linux/uaccess.h | 19 ++++++++++++++----- lib/iov_iter.c | 9 ++++++--- lib/usercopy.c | 3 ++- 5 files changed, 43 insertions(+), 12 deletions(-) diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index d7b3b193d1088..58033dfcb6d45 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -81,8 +81,9 @@ unsigned long _copy_from_user_key(void *to, const void __user *from, might_fault(); if (!should_fail_usercopy()) { - instrument_copy_from_user(to, from, n); + instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user_key(to, from, n, key); + instrument_copy_from_user_after(to, from, n, res); } if (unlikely(res)) memset(to + (n - res), 0, res); diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index 42faebbaa202a..ee8f7d17d34f5 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -120,7 +120,7 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n) } /** - * instrument_copy_from_user - instrument writes of copy_from_user + * instrument_copy_from_user_before - add instrumentation before copy_from_user * * Instrument writes to kernel memory, that are due to copy_from_user (and * variants). The instrumentation should be inserted before the accesses. @@ -130,10 +130,27 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n) * @n number of bytes to copy */ static __always_inline void -instrument_copy_from_user(const void *to, const void __user *from, unsigned long n) +instrument_copy_from_user_before(const void *to, const void __user *from, unsigned long n) { kasan_check_write(to, n); kcsan_check_write(to, n); } +/** + * instrument_copy_from_user_after - add instrumentation after copy_from_user + * + * Instrument writes to kernel memory, that are due to copy_from_user (and + * variants). The instrumentation should be inserted after the accesses. + * + * @to destination address + * @from source address + * @n number of bytes to copy + * @left number of bytes not copied (as returned by copy_from_user) + */ +static __always_inline void +instrument_copy_from_user_after(const void *to, const void __user *from, + unsigned long n, unsigned long left) +{ +} + #endif /* _LINUX_INSTRUMENTED_H */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 47e5d374c7ebe..afb18f198843b 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -58,20 +58,28 @@ static __always_inline __must_check unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { - instrument_copy_from_user(to, from, n); + unsigned long res; + + instrument_copy_from_user_before(to, from, n); check_object_size(to, n, false); - return raw_copy_from_user(to, from, n); + res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); + return res; } static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { + unsigned long res; + might_fault(); + instrument_copy_from_user_before(to, from, n); if (should_fail_usercopy()) return n; - instrument_copy_from_user(to, from, n); check_object_size(to, n, false); - return raw_copy_from_user(to, from, n); + res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); + return res; } /** @@ -115,8 +123,9 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) unsigned long res = n; might_fault(); if (!should_fail_usercopy() && likely(access_ok(from, n))) { - instrument_copy_from_user(to, from, n); + instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); } if (unlikely(res)) memset(to + (n - res), 0, res); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index c63ce0eadfcb6..ed6d4d8633db8 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -174,13 +174,16 @@ static int copyout(void __user *to, const void *from, size_t n) static int copyin(void *to, const void __user *from, size_t n) { + size_t res = n; + if (should_fail_usercopy()) return n; if (access_ok(from, n)) { - instrument_copy_from_user(to, from, n); - n = raw_copy_from_user(to, from, n); + instrument_copy_from_user_before(to, from, n); + res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); } - return n; + return res; } static inline struct pipe_buffer *pipe_buf(const struct pipe_inode_info *pipe, diff --git a/lib/usercopy.c b/lib/usercopy.c index 7413dd300516e..1505a52f23a01 100644 --- a/lib/usercopy.c +++ b/lib/usercopy.c @@ -12,8 +12,9 @@ unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n unsigned long res = n; might_fault(); if (!should_fail_usercopy() && likely(access_ok(from, n))) { - instrument_copy_from_user(to, from, n); + instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); } if (unlikely(res)) memset(to + (n - res), 0, res); From a7c3036613afb34e66082014b409f0797a00ef9e Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:38 +0200 Subject: [PATCH 207/352] x86: asm: instrument usercopy in get_user() and put_user() Use hooks from instrumented.h to notify bug detection tools about usercopy events in variations of get_user() and put_user(). Link: https://lkml.kernel.org/r/20220915150417.722975-5-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/include/asm/uaccess.h | 22 +++++++++++++++------- include/linux/instrumented.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 913e593a3b45f..c1b8982899eca 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -5,6 +5,7 @@ * User space memory access functions */ #include +#include #include #include #include @@ -103,6 +104,7 @@ extern int __get_user_bad(void); : "=a" (__ret_gu), "=r" (__val_gu), \ ASM_CALL_CONSTRAINT \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ + instrument_get_user(__val_gu); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ __builtin_expect(__ret_gu, 0); \ }) @@ -192,9 +194,11 @@ extern void __put_user_nocheck_8(void); int __ret_pu; \ void __user *__ptr_pu; \ register __typeof__(*(ptr)) __val_pu asm("%"_ASM_AX); \ - __chk_user_ptr(ptr); \ - __ptr_pu = (ptr); \ - __val_pu = (x); \ + __typeof__(*(ptr)) __x = (x); /* eval x once */ \ + __typeof__(ptr) __ptr = (ptr); /* eval ptr once */ \ + __chk_user_ptr(__ptr); \ + __ptr_pu = __ptr; \ + __val_pu = __x; \ asm volatile("call __" #fn "_%P[size]" \ : "=c" (__ret_pu), \ ASM_CALL_CONSTRAINT \ @@ -202,6 +206,7 @@ extern void __put_user_nocheck_8(void); "r" (__val_pu), \ [size] "i" (sizeof(*(ptr))) \ :"ebx"); \ + instrument_put_user(__x, __ptr, sizeof(*(ptr))); \ __builtin_expect(__ret_pu, 0); \ }) @@ -248,23 +253,25 @@ extern void __put_user_nocheck_8(void); #define __put_user_size(x, ptr, size, label) \ do { \ + __typeof__(*(ptr)) __x = (x); /* eval x once */ \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ - __put_user_goto(x, ptr, "b", "iq", label); \ + __put_user_goto(__x, ptr, "b", "iq", label); \ break; \ case 2: \ - __put_user_goto(x, ptr, "w", "ir", label); \ + __put_user_goto(__x, ptr, "w", "ir", label); \ break; \ case 4: \ - __put_user_goto(x, ptr, "l", "ir", label); \ + __put_user_goto(__x, ptr, "l", "ir", label); \ break; \ case 8: \ - __put_user_goto_u64(x, ptr, label); \ + __put_user_goto_u64(__x, ptr, label); \ break; \ default: \ __put_user_bad(); \ } \ + instrument_put_user(__x, ptr, size); \ } while (0) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT @@ -305,6 +312,7 @@ do { \ default: \ (x) = __get_user_bad(); \ } \ + instrument_get_user(x); \ } while (0) #define __get_user_asm(x, addr, itype, ltype, label) \ diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index ee8f7d17d34f5..9f1dba8f717b0 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -153,4 +153,32 @@ instrument_copy_from_user_after(const void *to, const void __user *from, { } +/** + * instrument_get_user() - add instrumentation to get_user()-like macros + * + * get_user() and friends are fragile, so it may depend on the implementation + * whether the instrumentation happens before or after the data is copied from + * the userspace. + * + * @to destination variable, may not be address-taken + */ +#define instrument_get_user(to) \ +({ \ +}) + +/** + * instrument_put_user() - add instrumentation to put_user()-like macros + * + * put_user() and friends are fragile, so it may depend on the implementation + * whether the instrumentation happens before or after the data is copied from + * the userspace. + * + * @from source address + * @ptr userspace pointer to copy to + * @size number of bytes to copy + */ +#define instrument_put_user(from, ptr, size) \ +({ \ +}) + #endif /* _LINUX_INSTRUMENTED_H */ From 23af88ff9529f8c9802a6ccfc005befa154a6faf Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:39 +0200 Subject: [PATCH 208/352] asm-generic: instrument usercopy in cacheflush.h Notify memory tools about usercopy events in copy_to_user_page() and copy_from_user_page(). Link: https://lkml.kernel.org/r/20220915150417.722975-6-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/asm-generic/cacheflush.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index 4f07afacbc239..f46258d1a080f 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -2,6 +2,8 @@ #ifndef _ASM_GENERIC_CACHEFLUSH_H #define _ASM_GENERIC_CACHEFLUSH_H +#include + struct mm_struct; struct vm_area_struct; struct page; @@ -105,14 +107,22 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end) #ifndef copy_to_user_page #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ + instrument_copy_to_user((void __user *)dst, src, len); \ memcpy(dst, src, len); \ flush_icache_user_page(vma, page, vaddr, len); \ } while (0) #endif + #ifndef copy_from_user_page -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) +#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ + do { \ + instrument_copy_from_user_before(dst, (void __user *)src, \ + len); \ + memcpy(dst, src, len); \ + instrument_copy_from_user_after(dst, (void __user *)src, len, \ + 0); \ + } while (0) #endif #endif /* _ASM_GENERIC_CACHEFLUSH_H */ From d2c4c0e439d4db61d6fff7857fd2d4e13a3cb83b Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:40 +0200 Subject: [PATCH 209/352] kmsan: add ReST documentation Add Documentation/dev-tools/kmsan.rst and reference it in the dev-tools index. Link: https://lkml.kernel.org/r/20220915150417.722975-7-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/dev-tools/index.rst | 1 + Documentation/dev-tools/kmsan.rst | 427 ++++++++++++++++++++++++++++++ 2 files changed, 428 insertions(+) create mode 100644 Documentation/dev-tools/kmsan.rst diff --git a/Documentation/dev-tools/index.rst b/Documentation/dev-tools/index.rst index 4621eac290f46..6b0663075dc04 100644 --- a/Documentation/dev-tools/index.rst +++ b/Documentation/dev-tools/index.rst @@ -24,6 +24,7 @@ Documentation/dev-tools/testing-overview.rst kcov gcov kasan + kmsan ubsan kmemleak kcsan diff --git a/Documentation/dev-tools/kmsan.rst b/Documentation/dev-tools/kmsan.rst new file mode 100644 index 0000000000000..2a53a801198cb --- /dev/null +++ b/Documentation/dev-tools/kmsan.rst @@ -0,0 +1,427 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. Copyright (C) 2022, Google LLC. + +=================================== +The Kernel Memory Sanitizer (KMSAN) +=================================== + +KMSAN is a dynamic error detector aimed at finding uses of uninitialized +values. It is based on compiler instrumentation, and is quite similar to the +userspace `MemorySanitizer tool`_. + +An important note is that KMSAN is not intended for production use, because it +drastically increases kernel memory footprint and slows the whole system down. + +Usage +===== + +Building the kernel +------------------- + +In order to build a kernel with KMSAN you will need a fresh Clang (14.0.6+). +Please refer to `LLVM documentation`_ for the instructions on how to build Clang. + +Now configure and build the kernel with CONFIG_KMSAN enabled. + +Example report +-------------- + +Here is an example of a KMSAN report:: + + ===================================================== + BUG: KMSAN: uninit-value in test_uninit_kmsan_check_memory+0x1be/0x380 [kmsan_test] + test_uninit_kmsan_check_memory+0x1be/0x380 mm/kmsan/kmsan_test.c:273 + kunit_run_case_internal lib/kunit/test.c:333 + kunit_try_run_case+0x206/0x420 lib/kunit/test.c:374 + kunit_generic_run_threadfn_adapter+0x6d/0xc0 lib/kunit/try-catch.c:28 + kthread+0x721/0x850 kernel/kthread.c:327 + ret_from_fork+0x1f/0x30 ??:? + + Uninit was stored to memory at: + do_uninit_local_array+0xfa/0x110 mm/kmsan/kmsan_test.c:260 + test_uninit_kmsan_check_memory+0x1a2/0x380 mm/kmsan/kmsan_test.c:271 + kunit_run_case_internal lib/kunit/test.c:333 + kunit_try_run_case+0x206/0x420 lib/kunit/test.c:374 + kunit_generic_run_threadfn_adapter+0x6d/0xc0 lib/kunit/try-catch.c:28 + kthread+0x721/0x850 kernel/kthread.c:327 + ret_from_fork+0x1f/0x30 ??:? + + Local variable uninit created at: + do_uninit_local_array+0x4a/0x110 mm/kmsan/kmsan_test.c:256 + test_uninit_kmsan_check_memory+0x1a2/0x380 mm/kmsan/kmsan_test.c:271 + + Bytes 4-7 of 8 are uninitialized + Memory access of size 8 starts at ffff888083fe3da0 + + CPU: 0 PID: 6731 Comm: kunit_try_catch Tainted: G B E 5.16.0-rc3+ #104 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 + ===================================================== + +The report says that the local variable ``uninit`` was created uninitialized in +``do_uninit_local_array()``. The third stack trace corresponds to the place +where this variable was created. + +The first stack trace shows where the uninit value was used (in +``test_uninit_kmsan_check_memory()``). The tool shows the bytes which were left +uninitialized in the local variable, as well as the stack where the value was +copied to another memory location before use. + +A use of uninitialized value ``v`` is reported by KMSAN in the following cases: + - in a condition, e.g. ``if (v) { ... }``; + - in an indexing or pointer dereferencing, e.g. ``array[v]`` or ``*v``; + - when it is copied to userspace or hardware, e.g. ``copy_to_user(..., &v, ...)``; + - when it is passed as an argument to a function, and + ``CONFIG_KMSAN_CHECK_PARAM_RETVAL`` is enabled (see below). + +The mentioned cases (apart from copying data to userspace or hardware, which is +a security issue) are considered undefined behavior from the C11 Standard point +of view. + +Disabling the instrumentation +----------------------------- + +A function can be marked with ``__no_kmsan_checks``. Doing so makes KMSAN +ignore uninitialized values in that function and mark its output as initialized. +As a result, the user will not get KMSAN reports related to that function. + +Another function attribute supported by KMSAN is ``__no_sanitize_memory``. +Applying this attribute to a function will result in KMSAN not instrumenting +it, which can be helpful if we do not want the compiler to interfere with some +low-level code (e.g. that marked with ``noinstr`` which implicitly adds +``__no_sanitize_memory``). + +This however comes at a cost: stack allocations from such functions will have +incorrect shadow/origin values, likely leading to false positives. Functions +called from non-instrumented code may also receive incorrect metadata for their +parameters. + +As a rule of thumb, avoid using ``__no_sanitize_memory`` explicitly. + +It is also possible to disable KMSAN for a single file (e.g. main.o):: + + KMSAN_SANITIZE_main.o := n + +or for the whole directory:: + + KMSAN_SANITIZE := n + +in the Makefile. Think of this as applying ``__no_sanitize_memory`` to every +function in the file or directory. Most users won't need KMSAN_SANITIZE, unless +their code gets broken by KMSAN (e.g. runs at early boot time). + +Support +======= + +In order for KMSAN to work the kernel must be built with Clang, which so far is +the only compiler that has KMSAN support. The kernel instrumentation pass is +based on the userspace `MemorySanitizer tool`_. + +The runtime library only supports x86_64 at the moment. + +How KMSAN works +=============== + +KMSAN shadow memory +------------------- + +KMSAN associates a metadata byte (also called shadow byte) with every byte of +kernel memory. A bit in the shadow byte is set iff the corresponding bit of the +kernel memory byte is uninitialized. Marking the memory uninitialized (i.e. +setting its shadow bytes to ``0xff``) is called poisoning, marking it +initialized (setting the shadow bytes to ``0x00``) is called unpoisoning. + +When a new variable is allocated on the stack, it is poisoned by default by +instrumentation code inserted by the compiler (unless it is a stack variable +that is immediately initialized). Any new heap allocation done without +``__GFP_ZERO`` is also poisoned. + +Compiler instrumentation also tracks the shadow values as they are used along +the code. When needed, instrumentation code invokes the runtime library in +``mm/kmsan/`` to persist shadow values. + +The shadow value of a basic or compound type is an array of bytes of the same +length. When a constant value is written into memory, that memory is unpoisoned. +When a value is read from memory, its shadow memory is also obtained and +propagated into all the operations which use that value. For every instruction +that takes one or more values the compiler generates code that calculates the +shadow of the result depending on those values and their shadows. + +Example:: + + int a = 0xff; // i.e. 0x000000ff + int b; + int c = a | b; + +In this case the shadow of ``a`` is ``0``, shadow of ``b`` is ``0xffffffff``, +shadow of ``c`` is ``0xffffff00``. This means that the upper three bytes of +``c`` are uninitialized, while the lower byte is initialized. + +Origin tracking +--------------- + +Every four bytes of kernel memory also have a so-called origin mapped to them. +This origin describes the point in program execution at which the uninitialized +value was created. Every origin is associated with either the full allocation +stack (for heap-allocated memory), or the function containing the uninitialized +variable (for locals). + +When an uninitialized variable is allocated on stack or heap, a new origin +value is created, and that variable's origin is filled with that value. When a +value is read from memory, its origin is also read and kept together with the +shadow. For every instruction that takes one or more values, the origin of the +result is one of the origins corresponding to any of the uninitialized inputs. +If a poisoned value is written into memory, its origin is written to the +corresponding storage as well. + +Example 1:: + + int a = 42; + int b; + int c = a + b; + +In this case the origin of ``b`` is generated upon function entry, and is +stored to the origin of ``c`` right before the addition result is written into +memory. + +Several variables may share the same origin address, if they are stored in the +same four-byte chunk. In this case every write to either variable updates the +origin for all of them. We have to sacrifice precision in this case, because +storing origins for individual bits (and even bytes) would be too costly. + +Example 2:: + + int combine(short a, short b) { + union ret_t { + int i; + short s[2]; + } ret; + ret.s[0] = a; + ret.s[1] = b; + return ret.i; + } + +If ``a`` is initialized and ``b`` is not, the shadow of the result would be +0xffff0000, and the origin of the result would be the origin of ``b``. +``ret.s[0]`` would have the same origin, but it will never be used, because +that variable is initialized. + +If both function arguments are uninitialized, only the origin of the second +argument is preserved. + +Origin chaining +~~~~~~~~~~~~~~~ + +To ease debugging, KMSAN creates a new origin for every store of an +uninitialized value to memory. The new origin references both its creation stack +and the previous origin the value had. This may cause increased memory +consumption, so we limit the length of origin chains in the runtime. + +Clang instrumentation API +------------------------- + +Clang instrumentation pass inserts calls to functions defined in +``mm/kmsan/nstrumentation.c`` into the kernel code. + +Shadow manipulation +~~~~~~~~~~~~~~~~~~~ + +For every memory access the compiler emits a call to a function that returns a +pair of pointers to the shadow and origin addresses of the given memory:: + + typedef struct { + void *shadow, *origin; + } shadow_origin_ptr_t + + shadow_origin_ptr_t __msan_metadata_ptr_for_load_{1,2,4,8}(void *addr) + shadow_origin_ptr_t __msan_metadata_ptr_for_store_{1,2,4,8}(void *addr) + shadow_origin_ptr_t __msan_metadata_ptr_for_load_n(void *addr, uintptr_t size) + shadow_origin_ptr_t __msan_metadata_ptr_for_store_n(void *addr, uintptr_t size) + +The function name depends on the memory access size. + +The compiler makes sure that for every loaded value its shadow and origin +values are read from memory. When a value is stored to memory, its shadow and +origin are also stored using the metadata pointers. + +Handling locals +~~~~~~~~~~~~~~~ + +A special function is used to create a new origin value for a local variable and +set the origin of that variable to that value:: + + void __msan_poison_alloca(void *addr, uintptr_t size, char *descr) + +Access to per-task data +~~~~~~~~~~~~~~~~~~~~~~~ + +At the beginning of every instrumented function KMSAN inserts a call to +``__msan_get_context_state()``:: + + kmsan_context_state *__msan_get_context_state(void) + +``kmsan_context_state`` is declared in ``include/linux/kmsan.h``:: + + struct kmsan_context_state { + char param_tls[KMSAN_PARAM_SIZE]; + char retval_tls[KMSAN_RETVAL_SIZE]; + char va_arg_tls[KMSAN_PARAM_SIZE]; + char va_arg_origin_tls[KMSAN_PARAM_SIZE]; + u64 va_arg_overflow_size_tls; + char param_origin_tls[KMSAN_PARAM_SIZE]; + depot_stack_handle_t retval_origin_tls; + }; + +This structure is used by KMSAN to pass parameter shadows and origins between +instrumented functions (unless the parameters are checked immediately by +``CONFIG_KMSAN_CHECK_PARAM_RETVAL``). + +Passing uninitialized values to functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Clang's MemorySanitizer instrumentation has an option, +``-fsanitize-memory-param-retval``, which makes the compiler check function +parameters passed by value, as well as function return values. + +The option is controlled by ``CONFIG_KMSAN_CHECK_PARAM_RETVAL``, which is +enabled by default to let KMSAN report uninitialized values earlier. +Please refer to the `LKML discussion`_ for more details. + +Because of the way the checks are implemented in LLVM (they are only applied to +parameters marked as ``noundef``), not all parameters are guaranteed to be +checked, so we cannot give up the metadata storage in ``kmsan_context_state``. + +String functions +~~~~~~~~~~~~~~~~ + +The compiler replaces calls to ``memcpy()``/``memmove()``/``memset()`` with the +following functions. These functions are also called when data structures are +initialized or copied, making sure shadow and origin values are copied alongside +with the data:: + + void *__msan_memcpy(void *dst, void *src, uintptr_t n) + void *__msan_memmove(void *dst, void *src, uintptr_t n) + void *__msan_memset(void *dst, int c, uintptr_t n) + +Error reporting +~~~~~~~~~~~~~~~ + +For each use of a value the compiler emits a shadow check that calls +``__msan_warning()`` in the case that value is poisoned:: + + void __msan_warning(u32 origin) + +``__msan_warning()`` causes KMSAN runtime to print an error report. + +Inline assembly instrumentation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +KMSAN instruments every inline assembly output with a call to:: + + void __msan_instrument_asm_store(void *addr, uintptr_t size) + +, which unpoisons the memory region. + +This approach may mask certain errors, but it also helps to avoid a lot of +false positives in bitwise operations, atomics etc. + +Sometimes the pointers passed into inline assembly do not point to valid memory. +In such cases they are ignored at runtime. + + +Runtime library +--------------- + +The code is located in ``mm/kmsan/``. + +Per-task KMSAN state +~~~~~~~~~~~~~~~~~~~~ + +Every task_struct has an associated KMSAN task state that holds the KMSAN +context (see above) and a per-task flag disallowing KMSAN reports:: + + struct kmsan_context { + ... + bool allow_reporting; + struct kmsan_context_state cstate; + ... + } + + struct task_struct { + ... + struct kmsan_context kmsan; + ... + } + +KMSAN contexts +~~~~~~~~~~~~~~ + +When running in a kernel task context, KMSAN uses ``current->kmsan.cstate`` to +hold the metadata for function parameters and return values. + +But in the case the kernel is running in the interrupt, softirq or NMI context, +where ``current`` is unavailable, KMSAN switches to per-cpu interrupt state:: + + DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx); + +Metadata allocation +~~~~~~~~~~~~~~~~~~~ + +There are several places in the kernel for which the metadata is stored. + +1. Each ``struct page`` instance contains two pointers to its shadow and +origin pages:: + + struct page { + ... + struct page *shadow, *origin; + ... + }; + +At boot-time, the kernel allocates shadow and origin pages for every available +kernel page. This is done quite late, when the kernel address space is already +fragmented, so normal data pages may arbitrarily interleave with the metadata +pages. + +This means that in general for two contiguous memory pages their shadow/origin +pages may not be contiguous. Consequently, if a memory access crosses the +boundary of a memory block, accesses to shadow/origin memory may potentially +corrupt other pages or read incorrect values from them. + +In practice, contiguous memory pages returned by the same ``alloc_pages()`` +call will have contiguous metadata, whereas if these pages belong to two +different allocations their metadata pages can be fragmented. + +For the kernel data (``.data``, ``.bss`` etc.) and percpu memory regions +there also are no guarantees on metadata contiguity. + +In the case ``__msan_metadata_ptr_for_XXX_YYY()`` hits the border between two +pages with non-contiguous metadata, it returns pointers to fake shadow/origin regions:: + + char dummy_load_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); + char dummy_store_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); + +``dummy_load_page`` is zero-initialized, so reads from it always yield zeroes. +All stores to ``dummy_store_page`` are ignored. + +2. For vmalloc memory and modules, there is a direct mapping between the memory +range, its shadow and origin. KMSAN reduces the vmalloc area by 3/4, making only +the first quarter available to ``vmalloc()``. The second quarter of the vmalloc +area contains shadow memory for the first quarter, the third one holds the +origins. A small part of the fourth quarter contains shadow and origins for the +kernel modules. Please refer to ``arch/x86/include/asm/pgtable_64_types.h`` for +more details. + +When an array of pages is mapped into a contiguous virtual memory space, their +shadow and origin pages are similarly mapped into contiguous regions. + +References +========== + +E. Stepanov, K. Serebryany. `MemorySanitizer: fast detector of uninitialized +memory use in C++ +`_. +In Proceedings of CGO 2015. + +.. _MemorySanitizer tool: https://clang.llvm.org/docs/MemorySanitizer.html +.. _LLVM documentation: https://llvm.org/docs/GettingStarted.html +.. _LKML discussion: https://lore.kernel.org/all/20220614144853.3693273-1-glider@google.com/ From 7e1e4cb10a79ceffeb283e6d6c1518eb2f48b455 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:41 +0200 Subject: [PATCH 210/352] kmsan: introduce __no_sanitize_memory and __no_kmsan_checks __no_sanitize_memory is a function attribute that instructs KMSAN to skip a function during instrumentation. This is needed to e.g. implement the noinstr functions. __no_kmsan_checks is a function attribute that makes KMSAN ignore the uninitialized values coming from the function's inputs, and initialize the function's outputs. Functions marked with this attribute can't be inlined into functions not marked with it, and vice versa. This behavior is overridden by __always_inline. __SANITIZE_MEMORY__ is a macro that's defined iff the file is instrumented with KMSAN. This is not the same as CONFIG_KMSAN, which is defined for every file. Link: https://lkml.kernel.org/r/20220915150417.722975-8-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/compiler-clang.h | 23 +++++++++++++++++++++++ include/linux/compiler-gcc.h | 6 ++++++ 2 files changed, 29 insertions(+) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index c84fec767445d..4fa0cc4cbd2c8 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -51,6 +51,29 @@ #define __no_sanitize_undefined #endif +#if __has_feature(memory_sanitizer) +#define __SANITIZE_MEMORY__ +/* + * Unlike other sanitizers, KMSAN still inserts code into functions marked with + * no_sanitize("kernel-memory"). Using disable_sanitizer_instrumentation + * provides the behavior consistent with other __no_sanitize_ attributes, + * guaranteeing that __no_sanitize_memory functions remain uninstrumented. + */ +#define __no_sanitize_memory __disable_sanitizer_instrumentation + +/* + * The __no_kmsan_checks attribute ensures that a function does not produce + * false positive reports by: + * - initializing all local variables and memory stores in this function; + * - skipping all shadow checks; + * - passing initialized arguments to this function's callees. + */ +#define __no_kmsan_checks __attribute__((no_sanitize("kernel-memory"))) +#else +#define __no_sanitize_memory +#define __no_kmsan_checks +#endif + /* * Support for __has_feature(coverage_sanitizer) was added in Clang 13 together * with no_sanitize("coverage"). Prior versions of Clang support coverage diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 9b157b71036f1..f55a37efdb974 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -114,6 +114,12 @@ #define __SANITIZE_ADDRESS__ #endif +/* + * GCC does not support KMSAN. + */ +#define __no_sanitize_memory +#define __no_kmsan_checks + /* * Turn individual warnings and errors on and off locally, depending * on version. From 31cbca72ded22c0872e87fb4ce2984d23234e8a5 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:42 +0200 Subject: [PATCH 211/352] kmsan: mark noinstr as __no_sanitize_memory noinstr functions should never be instrumented, so make KMSAN skip them by applying the __no_sanitize_memory attribute. Link: https://lkml.kernel.org/r/20220915150417.722975-9-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/compiler_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 4f2a819fd60a3..015207a6e2bf5 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -229,7 +229,8 @@ struct ftrace_likely_data { /* Section for code which can't be instrumented at all */ #define noinstr \ noinline notrace __attribute((__section__(".noinstr.text"))) \ - __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage + __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage \ + __no_sanitize_memory #endif /* __KERNEL__ */ From 306f9051356056d4a5e0a2db2b6d7b771201bc87 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:43 +0200 Subject: [PATCH 212/352] x86: kmsan: pgtable: reduce vmalloc space KMSAN is going to use 3/4 of existing vmalloc space to hold the metadata, therefore we lower VMALLOC_END to make sure vmalloc() doesn't allocate past the first 1/4. Link: https://lkml.kernel.org/r/20220915150417.722975-10-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable_64_types.h | 47 ++++++++++++++++++++++++- arch/x86/mm/init_64.c | 2 +- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 70e360a2e5fb7..04f36063ad546 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -139,7 +139,52 @@ extern unsigned int ptrs_per_p4d; # define VMEMMAP_START __VMEMMAP_BASE_L4 #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ -#define VMALLOC_END (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1) +/* + * End of the region for which vmalloc page tables are pre-allocated. + * For non-KMSAN builds, this is the same as VMALLOC_END. + * For KMSAN builds, VMALLOC_START..VMEMORY_END is 4 times bigger than + * VMALLOC_START..VMALLOC_END (see below). + */ +#define VMEMORY_END (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1) + +#ifndef CONFIG_KMSAN +#define VMALLOC_END VMEMORY_END +#else +/* + * In KMSAN builds vmalloc area is four times smaller, and the remaining 3/4 + * are used to keep the metadata for virtual pages. The memory formerly + * belonging to vmalloc area is now laid out as follows: + * + * 1st quarter: VMALLOC_START to VMALLOC_END - new vmalloc area + * 2nd quarter: KMSAN_VMALLOC_SHADOW_START to + * VMALLOC_END+KMSAN_VMALLOC_SHADOW_OFFSET - vmalloc area shadow + * 3rd quarter: KMSAN_VMALLOC_ORIGIN_START to + * VMALLOC_END+KMSAN_VMALLOC_ORIGIN_OFFSET - vmalloc area origins + * 4th quarter: KMSAN_MODULES_SHADOW_START to KMSAN_MODULES_ORIGIN_START + * - shadow for modules, + * KMSAN_MODULES_ORIGIN_START to + * KMSAN_MODULES_ORIGIN_START + MODULES_LEN - origins for modules. + */ +#define VMALLOC_QUARTER_SIZE ((VMALLOC_SIZE_TB << 40) >> 2) +#define VMALLOC_END (VMALLOC_START + VMALLOC_QUARTER_SIZE - 1) + +/* + * vmalloc metadata addresses are calculated by adding shadow/origin offsets + * to vmalloc address. + */ +#define KMSAN_VMALLOC_SHADOW_OFFSET VMALLOC_QUARTER_SIZE +#define KMSAN_VMALLOC_ORIGIN_OFFSET (VMALLOC_QUARTER_SIZE << 1) + +#define KMSAN_VMALLOC_SHADOW_START (VMALLOC_START + KMSAN_VMALLOC_SHADOW_OFFSET) +#define KMSAN_VMALLOC_ORIGIN_START (VMALLOC_START + KMSAN_VMALLOC_ORIGIN_OFFSET) + +/* + * The shadow/origin for modules are placed one by one in the last 1/4 of + * vmalloc space. + */ +#define KMSAN_MODULES_SHADOW_START (VMALLOC_END + KMSAN_VMALLOC_ORIGIN_OFFSET + 1) +#define KMSAN_MODULES_ORIGIN_START (KMSAN_MODULES_SHADOW_START + MODULES_LEN) +#endif /* CONFIG_KMSAN */ #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) /* The module sections ends with the start of the fixmap */ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0fe690ebc269b..39b6bfcaa0ed4 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1287,7 +1287,7 @@ static void __init preallocate_vmalloc_pages(void) unsigned long addr; const char *lvl; - for (addr = VMALLOC_START; addr <= VMALLOC_END; addr = ALIGN(addr + 1, PGDIR_SIZE)) { + for (addr = VMALLOC_START; addr <= VMEMORY_END; addr = ALIGN(addr + 1, PGDIR_SIZE)) { pgd_t *pgd = pgd_offset_k(addr); p4d_t *p4d; pud_t *pud; From 62b2a5ae5f342e1e961737e4f5f6aa87ee3f992b Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:44 +0200 Subject: [PATCH 213/352] libnvdimm/pfn_dev: increase MAX_STRUCT_PAGE_SIZE KMSAN adds extra metadata fields to struct page, so it does not fit into 64 bytes anymore. This change leads to increased memory consumption of the nvdimm driver, regardless of whether the kernel is built with KMSAN or not. Link: https://lkml.kernel.org/r/20220915150417.722975-11-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/nvdimm/nd.h | 2 +- drivers/nvdimm/pfn_devs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index ec5219680092d..85ca5b4da3cf3 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -652,7 +652,7 @@ void devm_namespace_disable(struct device *dev, struct nd_namespace_common *ndns); #if IS_ENABLED(CONFIG_ND_CLAIM) /* max struct page size independent of kernel config */ -#define MAX_STRUCT_PAGE_SIZE 64 +#define MAX_STRUCT_PAGE_SIZE 128 int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap); #else static inline int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 0e92ab4b32833..61af072ac98f9 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -787,7 +787,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) * when populating the vmemmap. This *should* be equal to * PMD_SIZE for most architectures. * - * Also make sure size of struct page is less than 64. We + * Also make sure size of struct page is less than 128. We * want to make sure we use large enough size here so that * we don't have a dynamic reserve space depending on * struct page size. But we also want to make sure we notice From cd5e6f4d01b08384e1828fa3bbccfaa04979c2f8 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:45 +0200 Subject: [PATCH 214/352] kmsan: add KMSAN runtime core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For each memory location KernelMemorySanitizer maintains two types of metadata: 1. The so-called shadow of that location - а byte:byte mapping describing whether or not individual bits of memory are initialized (shadow is 0) or not (shadow is 1). 2. The origins of that location - а 4-byte:4-byte mapping containing 4-byte IDs of the stack traces where uninitialized values were created. Each struct page now contains pointers to two struct pages holding KMSAN metadata (shadow and origins) for the original struct page. Utility routines in mm/kmsan/core.c and mm/kmsan/shadow.c handle the metadata creation, addressing, copying and checking. mm/kmsan/report.c performs error reporting in the cases an uninitialized value is used in a way that leads to undefined behavior. KMSAN compiler instrumentation is responsible for tracking the metadata along with the kernel memory. mm/kmsan/instrumentation.c provides the implementation for instrumentation hooks that are called from files compiled with -fsanitize=kernel-memory. To aid parameter passing (also done at instrumentation level), each task_struct now contains a struct kmsan_task_state used to track the metadata of function parameters and return values for that task. Finally, this patch provides CONFIG_KMSAN that enables KMSAN, and declares CFLAGS_KMSAN, which are applied to files compiled with KMSAN. The KMSAN_SANITIZE:=n Makefile directive can be used to completely disable KMSAN instrumentation for certain files. Similarly, KMSAN_ENABLE_CHECKS:=n disables KMSAN checks and makes newly created stack memory initialized. Users can also use functions from include/linux/kmsan-checks.h to mark certain memory regions as uninitialized or initialized (this is called "poisoning" and "unpoisoning") or check that a particular region is initialized. Link: https://lkml.kernel.org/r/20220915150417.722975-12-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Makefile | 1 + include/linux/kmsan-checks.h | 64 +++++ include/linux/kmsan_types.h | 35 +++ include/linux/mm_types.h | 12 + include/linux/sched.h | 5 + lib/Kconfig.debug | 1 + lib/Kconfig.kmsan | 50 ++++ mm/Makefile | 1 + mm/kmsan/Makefile | 23 ++ mm/kmsan/core.c | 440 +++++++++++++++++++++++++++++++++++ mm/kmsan/hooks.c | 66 ++++++ mm/kmsan/instrumentation.c | 307 ++++++++++++++++++++++++ mm/kmsan/kmsan.h | 204 ++++++++++++++++ mm/kmsan/report.c | 219 +++++++++++++++++ mm/kmsan/shadow.c | 147 ++++++++++++ scripts/Makefile.kmsan | 8 + scripts/Makefile.lib | 9 + 17 files changed, 1592 insertions(+) create mode 100644 include/linux/kmsan-checks.h create mode 100644 include/linux/kmsan_types.h create mode 100644 lib/Kconfig.kmsan create mode 100644 mm/kmsan/Makefile create mode 100644 mm/kmsan/core.c create mode 100644 mm/kmsan/hooks.c create mode 100644 mm/kmsan/instrumentation.c create mode 100644 mm/kmsan/kmsan.h create mode 100644 mm/kmsan/report.c create mode 100644 mm/kmsan/shadow.c create mode 100644 scripts/Makefile.kmsan diff --git a/Makefile b/Makefile index 952d354069a43..c9f37d25ea634 100644 --- a/Makefile +++ b/Makefile @@ -1015,6 +1015,7 @@ include-y := scripts/Makefile.extrawarn include-$(CONFIG_DEBUG_INFO) += scripts/Makefile.debug include-$(CONFIG_KASAN) += scripts/Makefile.kasan include-$(CONFIG_KCSAN) += scripts/Makefile.kcsan +include-$(CONFIG_KMSAN) += scripts/Makefile.kmsan include-$(CONFIG_UBSAN) += scripts/Makefile.ubsan include-$(CONFIG_KCOV) += scripts/Makefile.kcov include-$(CONFIG_RANDSTRUCT) += scripts/Makefile.randstruct diff --git a/include/linux/kmsan-checks.h b/include/linux/kmsan-checks.h new file mode 100644 index 0000000000000..a6522a0c28df9 --- /dev/null +++ b/include/linux/kmsan-checks.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KMSAN checks to be used for one-off annotations in subsystems. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#ifndef _LINUX_KMSAN_CHECKS_H +#define _LINUX_KMSAN_CHECKS_H + +#include + +#ifdef CONFIG_KMSAN + +/** + * kmsan_poison_memory() - Mark the memory range as uninitialized. + * @address: address to start with. + * @size: size of buffer to poison. + * @flags: GFP flags for allocations done by this function. + * + * Until other data is written to this range, KMSAN will treat it as + * uninitialized. Error reports for this memory will reference the call site of + * kmsan_poison_memory() as origin. + */ +void kmsan_poison_memory(const void *address, size_t size, gfp_t flags); + +/** + * kmsan_unpoison_memory() - Mark the memory range as initialized. + * @address: address to start with. + * @size: size of buffer to unpoison. + * + * Until other data is written to this range, KMSAN will treat it as + * initialized. + */ +void kmsan_unpoison_memory(const void *address, size_t size); + +/** + * kmsan_check_memory() - Check the memory range for being initialized. + * @address: address to start with. + * @size: size of buffer to check. + * + * If any piece of the given range is marked as uninitialized, KMSAN will report + * an error. + */ +void kmsan_check_memory(const void *address, size_t size); + +#else + +static inline void kmsan_poison_memory(const void *address, size_t size, + gfp_t flags) +{ +} +static inline void kmsan_unpoison_memory(const void *address, size_t size) +{ +} +static inline void kmsan_check_memory(const void *address, size_t size) +{ +} + +#endif + +#endif /* _LINUX_KMSAN_CHECKS_H */ diff --git a/include/linux/kmsan_types.h b/include/linux/kmsan_types.h new file mode 100644 index 0000000000000..8bfa6c98176d4 --- /dev/null +++ b/include/linux/kmsan_types.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A minimal header declaring types added by KMSAN to existing kernel structs. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ +#ifndef _LINUX_KMSAN_TYPES_H +#define _LINUX_KMSAN_TYPES_H + +/* These constants are defined in the MSan LLVM instrumentation pass. */ +#define KMSAN_RETVAL_SIZE 800 +#define KMSAN_PARAM_SIZE 800 + +struct kmsan_context_state { + char param_tls[KMSAN_PARAM_SIZE]; + char retval_tls[KMSAN_RETVAL_SIZE]; + char va_arg_tls[KMSAN_PARAM_SIZE]; + char va_arg_origin_tls[KMSAN_PARAM_SIZE]; + u64 va_arg_overflow_size_tls; + char param_origin_tls[KMSAN_PARAM_SIZE]; + u32 retval_origin_tls; +}; + +#undef KMSAN_PARAM_SIZE +#undef KMSAN_RETVAL_SIZE + +struct kmsan_ctx { + struct kmsan_context_state cstate; + int kmsan_in_runtime; + bool allow_reporting; +}; + +#endif /* _LINUX_KMSAN_TYPES_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5c87d0f292a23..500e536796ca4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -224,6 +224,18 @@ struct page { not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ +#ifdef CONFIG_KMSAN + /* + * KMSAN metadata for this page: + * - shadow page: every bit indicates whether the corresponding + * bit of the original page is initialized (0) or not (1); + * - origin page: every 4 bytes contain an id of the stack trace + * where the uninitialized value was created. + */ + struct page *kmsan_shadow; + struct page *kmsan_origin; +#endif + #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index fbac3c19fe354..88a043f7235eb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -1362,6 +1363,10 @@ struct task_struct { #endif #endif +#ifdef CONFIG_KMSAN + struct kmsan_ctx kmsan_ctx; +#endif + #if IS_ENABLED(CONFIG_KUNIT) struct kunit *kunit_test; #endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 6d1544d9201e4..0129bee7de010 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -970,6 +970,7 @@ config DEBUG_STACKOVERFLOW source "lib/Kconfig.kasan" source "lib/Kconfig.kfence" +source "lib/Kconfig.kmsan" endmenu # "Memory Debugging" diff --git a/lib/Kconfig.kmsan b/lib/Kconfig.kmsan new file mode 100644 index 0000000000000..5b19dbd34d76e --- /dev/null +++ b/lib/Kconfig.kmsan @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: GPL-2.0-only +config HAVE_ARCH_KMSAN + bool + +config HAVE_KMSAN_COMPILER + # Clang versions <14.0.0 also support -fsanitize=kernel-memory, but not + # all the features necessary to build the kernel with KMSAN. + depends on CC_IS_CLANG && CLANG_VERSION >= 140000 + def_bool $(cc-option,-fsanitize=kernel-memory -mllvm -msan-disable-checks=1) + +config KMSAN + bool "KMSAN: detector of uninitialized values use" + depends on HAVE_ARCH_KMSAN && HAVE_KMSAN_COMPILER + depends on SLUB && DEBUG_KERNEL && !KASAN && !KCSAN + select STACKDEPOT + select STACKDEPOT_ALWAYS_INIT + help + KernelMemorySanitizer (KMSAN) is a dynamic detector of uses of + uninitialized values in the kernel. It is based on compiler + instrumentation provided by Clang and thus requires Clang to build. + + An important note is that KMSAN is not intended for production use, + because it drastically increases kernel memory footprint and slows + the whole system down. + + See for more details. + +if KMSAN + +config HAVE_KMSAN_PARAM_RETVAL + # -fsanitize-memory-param-retval is supported only by Clang >= 14. + depends on HAVE_KMSAN_COMPILER + def_bool $(cc-option,-fsanitize=kernel-memory -fsanitize-memory-param-retval) + +config KMSAN_CHECK_PARAM_RETVAL + bool "Check for uninitialized values passed to and returned from functions" + default y + depends on HAVE_KMSAN_PARAM_RETVAL + help + If the compiler supports -fsanitize-memory-param-retval, KMSAN will + eagerly check every function parameter passed by value and every + function return value. + + Disabling KMSAN_CHECK_PARAM_RETVAL will result in tracking shadow for + function parameters and return values across function borders. This + is a more relaxed mode, but it generates more instrumentation code and + may potentially report errors in corner cases when non-instrumented + functions call instrumented ones. + +endif diff --git a/mm/Makefile b/mm/Makefile index a731d1decbb12..cc23b00525848 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -89,6 +89,7 @@ obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_KFENCE) += kfence/ +obj-$(CONFIG_KMSAN) += kmsan/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o diff --git a/mm/kmsan/Makefile b/mm/kmsan/Makefile new file mode 100644 index 0000000000000..550ad8625e4f9 --- /dev/null +++ b/mm/kmsan/Makefile @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for KernelMemorySanitizer (KMSAN). +# +# +obj-y := core.o instrumentation.o hooks.o report.o shadow.o + +KMSAN_SANITIZE := n +KCOV_INSTRUMENT := n +UBSAN_SANITIZE := n + +# Disable instrumentation of KMSAN runtime with other tools. +CC_FLAGS_KMSAN_RUNTIME := -fno-stack-protector +CC_FLAGS_KMSAN_RUNTIME += $(call cc-option,-fno-conserve-stack) +CC_FLAGS_KMSAN_RUNTIME += -DDISABLE_BRANCH_PROFILING + +CFLAGS_REMOVE.o = $(CC_FLAGS_FTRACE) + +CFLAGS_core.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_hooks.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_instrumentation.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_report.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_shadow.o := $(CC_FLAGS_KMSAN_RUNTIME) diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c new file mode 100644 index 0000000000000..5330138fda5bc --- /dev/null +++ b/mm/kmsan/core.c @@ -0,0 +1,440 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN runtime library. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../slab.h" +#include "kmsan.h" + +bool kmsan_enabled __read_mostly; + +/* + * Per-CPU KMSAN context to be used in interrupts, where current->kmsan is + * unavaliable. + */ +DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx); + +void kmsan_internal_poison_memory(void *address, size_t size, gfp_t flags, + unsigned int poison_flags) +{ + u32 extra_bits = + kmsan_extra_bits(/*depth*/ 0, poison_flags & KMSAN_POISON_FREE); + bool checked = poison_flags & KMSAN_POISON_CHECK; + depot_stack_handle_t handle; + + handle = kmsan_save_stack_with_flags(flags, extra_bits); + kmsan_internal_set_shadow_origin(address, size, -1, handle, checked); +} + +void kmsan_internal_unpoison_memory(void *address, size_t size, bool checked) +{ + kmsan_internal_set_shadow_origin(address, size, 0, 0, checked); +} + +depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, + unsigned int extra) +{ + unsigned long entries[KMSAN_STACK_DEPTH]; + unsigned int nr_entries; + + nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0); + + /* Don't sleep (see might_sleep_if() in __alloc_pages_nodemask()). */ + flags &= ~__GFP_DIRECT_RECLAIM; + + return __stack_depot_save(entries, nr_entries, extra, flags, true); +} + +/* Copy the metadata following the memmove() behavior. */ +void kmsan_internal_memmove_metadata(void *dst, void *src, size_t n) +{ + depot_stack_handle_t old_origin = 0, new_origin = 0; + int src_slots, dst_slots, i, iter, step, skip_bits; + depot_stack_handle_t *origin_src, *origin_dst; + void *shadow_src, *shadow_dst; + u32 *align_shadow_src, shadow; + bool backwards; + + shadow_dst = kmsan_get_metadata(dst, KMSAN_META_SHADOW); + if (!shadow_dst) + return; + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(dst, n)); + + shadow_src = kmsan_get_metadata(src, KMSAN_META_SHADOW); + if (!shadow_src) { + /* + * @src is untracked: zero out destination shadow, ignore the + * origins, we're done. + */ + __memset(shadow_dst, 0, n); + return; + } + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(src, n)); + + __memmove(shadow_dst, shadow_src, n); + + origin_dst = kmsan_get_metadata(dst, KMSAN_META_ORIGIN); + origin_src = kmsan_get_metadata(src, KMSAN_META_ORIGIN); + KMSAN_WARN_ON(!origin_dst || !origin_src); + src_slots = (ALIGN((u64)src + n, KMSAN_ORIGIN_SIZE) - + ALIGN_DOWN((u64)src, KMSAN_ORIGIN_SIZE)) / + KMSAN_ORIGIN_SIZE; + dst_slots = (ALIGN((u64)dst + n, KMSAN_ORIGIN_SIZE) - + ALIGN_DOWN((u64)dst, KMSAN_ORIGIN_SIZE)) / + KMSAN_ORIGIN_SIZE; + KMSAN_WARN_ON((src_slots < 1) || (dst_slots < 1)); + KMSAN_WARN_ON((src_slots - dst_slots > 1) || + (dst_slots - src_slots < -1)); + + backwards = dst > src; + i = backwards ? min(src_slots, dst_slots) - 1 : 0; + iter = backwards ? -1 : 1; + + align_shadow_src = + (u32 *)ALIGN_DOWN((u64)shadow_src, KMSAN_ORIGIN_SIZE); + for (step = 0; step < min(src_slots, dst_slots); step++, i += iter) { + KMSAN_WARN_ON(i < 0); + shadow = align_shadow_src[i]; + if (i == 0) { + /* + * If @src isn't aligned on KMSAN_ORIGIN_SIZE, don't + * look at the first @src % KMSAN_ORIGIN_SIZE bytes + * of the first shadow slot. + */ + skip_bits = ((u64)src % KMSAN_ORIGIN_SIZE) * 8; + shadow = (shadow >> skip_bits) << skip_bits; + } + if (i == src_slots - 1) { + /* + * If @src + n isn't aligned on + * KMSAN_ORIGIN_SIZE, don't look at the last + * (@src + n) % KMSAN_ORIGIN_SIZE bytes of the + * last shadow slot. + */ + skip_bits = (((u64)src + n) % KMSAN_ORIGIN_SIZE) * 8; + shadow = (shadow << skip_bits) >> skip_bits; + } + /* + * Overwrite the origin only if the corresponding + * shadow is nonempty. + */ + if (origin_src[i] && (origin_src[i] != old_origin) && shadow) { + old_origin = origin_src[i]; + new_origin = kmsan_internal_chain_origin(old_origin); + /* + * kmsan_internal_chain_origin() may return + * NULL, but we don't want to lose the previous + * origin value. + */ + if (!new_origin) + new_origin = old_origin; + } + if (shadow) + origin_dst[i] = new_origin; + else + origin_dst[i] = 0; + } + /* + * If dst_slots is greater than src_slots (i.e. + * dst_slots == src_slots + 1), there is an extra origin slot at the + * beginning or end of the destination buffer, for which we take the + * origin from the previous slot. + * This is only done if the part of the source shadow corresponding to + * slot is non-zero. + * + * E.g. if we copy 8 aligned bytes that are marked as uninitialized + * and have origins o111 and o222, to an unaligned buffer with offset 1, + * these two origins are copied to three origin slots, so one of then + * needs to be duplicated, depending on the copy direction (@backwards) + * + * src shadow: |uuuu|uuuu|....| + * src origin: |o111|o222|....| + * + * backwards = 0: + * dst shadow: |.uuu|uuuu|u...| + * dst origin: |....|o111|o222| - fill the empty slot with o111 + * backwards = 1: + * dst shadow: |.uuu|uuuu|u...| + * dst origin: |o111|o222|....| - fill the empty slot with o222 + */ + if (src_slots < dst_slots) { + if (backwards) { + shadow = align_shadow_src[src_slots - 1]; + skip_bits = (((u64)dst + n) % KMSAN_ORIGIN_SIZE) * 8; + shadow = (shadow << skip_bits) >> skip_bits; + if (shadow) + /* src_slots > 0, therefore dst_slots is at least 2 */ + origin_dst[dst_slots - 1] = + origin_dst[dst_slots - 2]; + } else { + shadow = align_shadow_src[0]; + skip_bits = ((u64)dst % KMSAN_ORIGIN_SIZE) * 8; + shadow = (shadow >> skip_bits) << skip_bits; + if (shadow) + origin_dst[0] = origin_dst[1]; + } + } +} + +depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id) +{ + unsigned long entries[3]; + u32 extra_bits; + int depth; + bool uaf; + + if (!id) + return id; + /* + * Make sure we have enough spare bits in @id to hold the UAF bit and + * the chain depth. + */ + BUILD_BUG_ON( + (1 << STACK_DEPOT_EXTRA_BITS) <= (KMSAN_MAX_ORIGIN_DEPTH << 1)); + + extra_bits = stack_depot_get_extra_bits(id); + depth = kmsan_depth_from_eb(extra_bits); + uaf = kmsan_uaf_from_eb(extra_bits); + + /* + * Stop chaining origins once the depth reached KMSAN_MAX_ORIGIN_DEPTH. + * This mostly happens in the case structures with uninitialized padding + * are copied around many times. Origin chains for such structures are + * usually periodic, and it does not make sense to fully store them. + */ + if (depth == KMSAN_MAX_ORIGIN_DEPTH) + return id; + + depth++; + extra_bits = kmsan_extra_bits(depth, uaf); + + entries[0] = KMSAN_CHAIN_MAGIC_ORIGIN; + entries[1] = kmsan_save_stack_with_flags(GFP_ATOMIC, 0); + entries[2] = id; + /* + * @entries is a local var in non-instrumented code, so KMSAN does not + * know it is initialized. Explicitly unpoison it to avoid false + * positives when __stack_depot_save() passes it to instrumented code. + */ + kmsan_internal_unpoison_memory(entries, sizeof(entries), false); + return __stack_depot_save(entries, ARRAY_SIZE(entries), extra_bits, + GFP_ATOMIC, true); +} + +void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b, + u32 origin, bool checked) +{ + u64 address = (u64)addr; + void *shadow_start; + u32 *origin_start; + size_t pad = 0; + + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size)); + shadow_start = kmsan_get_metadata(addr, KMSAN_META_SHADOW); + if (!shadow_start) { + /* + * kmsan_metadata_is_contiguous() is true, so either all shadow + * and origin pages are NULL, or all are non-NULL. + */ + if (checked) { + pr_err("%s: not memsetting %ld bytes starting at %px, because the shadow is NULL\n", + __func__, size, addr); + KMSAN_WARN_ON(true); + } + return; + } + __memset(shadow_start, b, size); + + if (!IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) { + pad = address % KMSAN_ORIGIN_SIZE; + address -= pad; + size += pad; + } + size = ALIGN(size, KMSAN_ORIGIN_SIZE); + origin_start = + (u32 *)kmsan_get_metadata((void *)address, KMSAN_META_ORIGIN); + + for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++) + origin_start[i] = origin; +} + +struct page *kmsan_vmalloc_to_page_or_null(void *vaddr) +{ + struct page *page; + + if (!kmsan_internal_is_vmalloc_addr(vaddr) && + !kmsan_internal_is_module_addr(vaddr)) + return NULL; + page = vmalloc_to_page(vaddr); + if (pfn_valid(page_to_pfn(page))) + return page; + else + return NULL; +} + +void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr, + int reason) +{ + depot_stack_handle_t cur_origin = 0, new_origin = 0; + unsigned long addr64 = (unsigned long)addr; + depot_stack_handle_t *origin = NULL; + unsigned char *shadow = NULL; + int cur_off_start = -1; + int chunk_size; + size_t pos = 0; + + if (!size) + return; + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size)); + while (pos < size) { + chunk_size = min(size - pos, + PAGE_SIZE - ((addr64 + pos) % PAGE_SIZE)); + shadow = kmsan_get_metadata((void *)(addr64 + pos), + KMSAN_META_SHADOW); + if (!shadow) { + /* + * This page is untracked. If there were uninitialized + * bytes before, report them. + */ + if (cur_origin) { + kmsan_enter_runtime(); + kmsan_report(cur_origin, addr, size, + cur_off_start, pos - 1, user_addr, + reason); + kmsan_leave_runtime(); + } + cur_origin = 0; + cur_off_start = -1; + pos += chunk_size; + continue; + } + for (int i = 0; i < chunk_size; i++) { + if (!shadow[i]) { + /* + * This byte is unpoisoned. If there were + * poisoned bytes before, report them. + */ + if (cur_origin) { + kmsan_enter_runtime(); + kmsan_report(cur_origin, addr, size, + cur_off_start, pos + i - 1, + user_addr, reason); + kmsan_leave_runtime(); + } + cur_origin = 0; + cur_off_start = -1; + continue; + } + origin = kmsan_get_metadata((void *)(addr64 + pos + i), + KMSAN_META_ORIGIN); + KMSAN_WARN_ON(!origin); + new_origin = *origin; + /* + * Encountered new origin - report the previous + * uninitialized range. + */ + if (cur_origin != new_origin) { + if (cur_origin) { + kmsan_enter_runtime(); + kmsan_report(cur_origin, addr, size, + cur_off_start, pos + i - 1, + user_addr, reason); + kmsan_leave_runtime(); + } + cur_origin = new_origin; + cur_off_start = pos + i; + } + } + pos += chunk_size; + } + KMSAN_WARN_ON(pos != size); + if (cur_origin) { + kmsan_enter_runtime(); + kmsan_report(cur_origin, addr, size, cur_off_start, pos - 1, + user_addr, reason); + kmsan_leave_runtime(); + } +} + +bool kmsan_metadata_is_contiguous(void *addr, size_t size) +{ + char *cur_shadow = NULL, *next_shadow = NULL, *cur_origin = NULL, + *next_origin = NULL; + u64 cur_addr = (u64)addr, next_addr = cur_addr + PAGE_SIZE; + depot_stack_handle_t *origin_p; + bool all_untracked = false; + + if (!size) + return true; + + /* The whole range belongs to the same page. */ + if (ALIGN_DOWN(cur_addr + size - 1, PAGE_SIZE) == + ALIGN_DOWN(cur_addr, PAGE_SIZE)) + return true; + + cur_shadow = kmsan_get_metadata((void *)cur_addr, /*is_origin*/ false); + if (!cur_shadow) + all_untracked = true; + cur_origin = kmsan_get_metadata((void *)cur_addr, /*is_origin*/ true); + if (all_untracked && cur_origin) + goto report; + + for (; next_addr < (u64)addr + size; + cur_addr = next_addr, cur_shadow = next_shadow, + cur_origin = next_origin, next_addr += PAGE_SIZE) { + next_shadow = kmsan_get_metadata((void *)next_addr, false); + next_origin = kmsan_get_metadata((void *)next_addr, true); + if (all_untracked) { + if (next_shadow || next_origin) + goto report; + if (!next_shadow && !next_origin) + continue; + } + if (((u64)cur_shadow == ((u64)next_shadow - PAGE_SIZE)) && + ((u64)cur_origin == ((u64)next_origin - PAGE_SIZE))) + continue; + goto report; + } + return true; + +report: + pr_err("%s: attempting to access two shadow page ranges.\n", __func__); + pr_err("Access of size %ld at %px.\n", size, addr); + pr_err("Addresses belonging to different ranges: %px and %px\n", + (void *)cur_addr, (void *)next_addr); + pr_err("page[0].shadow: %px, page[1].shadow: %px\n", cur_shadow, + next_shadow); + pr_err("page[0].origin: %px, page[1].origin: %px\n", cur_origin, + next_origin); + origin_p = kmsan_get_metadata(addr, KMSAN_META_ORIGIN); + if (origin_p) { + pr_err("Origin: %08x\n", *origin_p); + kmsan_print_origin(*origin_p); + } else { + pr_err("Origin: unavailable\n"); + } + return false; +} diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c new file mode 100644 index 0000000000000..4ac62fa67a02a --- /dev/null +++ b/mm/kmsan/hooks.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN hooks for kernel subsystems. + * + * These functions handle creation of KMSAN metadata for memory allocations. + * + * Copyright (C) 2018-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#include +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "../slab.h" +#include "kmsan.h" + +/* + * Instrumented functions shouldn't be called under + * kmsan_enter_runtime()/kmsan_leave_runtime(), because this will lead to + * skipping effects of functions like memset() inside instrumented code. + */ + +/* Functions from kmsan-checks.h follow. */ +void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + /* The users may want to poison/unpoison random memory. */ + kmsan_internal_poison_memory((void *)address, size, flags, + KMSAN_POISON_NOCHECK); + kmsan_leave_runtime(); +} +EXPORT_SYMBOL(kmsan_poison_memory); + +void kmsan_unpoison_memory(const void *address, size_t size) +{ + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + ua_flags = user_access_save(); + kmsan_enter_runtime(); + /* The users may want to poison/unpoison random memory. */ + kmsan_internal_unpoison_memory((void *)address, size, + KMSAN_POISON_NOCHECK); + kmsan_leave_runtime(); + user_access_restore(ua_flags); +} +EXPORT_SYMBOL(kmsan_unpoison_memory); + +void kmsan_check_memory(const void *addr, size_t size) +{ + if (!kmsan_enabled) + return; + return kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0, + REASON_ANY); +} +EXPORT_SYMBOL(kmsan_check_memory); diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c new file mode 100644 index 0000000000000..280d154132684 --- /dev/null +++ b/mm/kmsan/instrumentation.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN compiler API. + * + * This file implements __msan_XXX hooks that Clang inserts into the code + * compiled with -fsanitize=kernel-memory. + * See Documentation/dev-tools/kmsan.rst for more information on how KMSAN + * instrumentation works. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#include "kmsan.h" +#include +#include +#include + +static inline bool is_bad_asm_addr(void *addr, uintptr_t size, bool is_store) +{ + if ((u64)addr < TASK_SIZE) + return true; + if (!kmsan_get_metadata(addr, KMSAN_META_SHADOW)) + return true; + return false; +} + +static inline struct shadow_origin_ptr +get_shadow_origin_ptr(void *addr, u64 size, bool store) +{ + unsigned long ua_flags = user_access_save(); + struct shadow_origin_ptr ret; + + ret = kmsan_get_shadow_origin_ptr(addr, size, store); + user_access_restore(ua_flags); + return ret; +} + +/* Get shadow and origin pointers for a memory load with non-standard size. */ +struct shadow_origin_ptr __msan_metadata_ptr_for_load_n(void *addr, + uintptr_t size) +{ + return get_shadow_origin_ptr(addr, size, /*store*/ false); +} +EXPORT_SYMBOL(__msan_metadata_ptr_for_load_n); + +/* Get shadow and origin pointers for a memory store with non-standard size. */ +struct shadow_origin_ptr __msan_metadata_ptr_for_store_n(void *addr, + uintptr_t size) +{ + return get_shadow_origin_ptr(addr, size, /*store*/ true); +} +EXPORT_SYMBOL(__msan_metadata_ptr_for_store_n); + +/* + * Declare functions that obtain shadow/origin pointers for loads and stores + * with fixed size. + */ +#define DECLARE_METADATA_PTR_GETTER(size) \ + struct shadow_origin_ptr __msan_metadata_ptr_for_load_##size( \ + void *addr) \ + { \ + return get_shadow_origin_ptr(addr, size, /*store*/ false); \ + } \ + EXPORT_SYMBOL(__msan_metadata_ptr_for_load_##size); \ + struct shadow_origin_ptr __msan_metadata_ptr_for_store_##size( \ + void *addr) \ + { \ + return get_shadow_origin_ptr(addr, size, /*store*/ true); \ + } \ + EXPORT_SYMBOL(__msan_metadata_ptr_for_store_##size) + +DECLARE_METADATA_PTR_GETTER(1); +DECLARE_METADATA_PTR_GETTER(2); +DECLARE_METADATA_PTR_GETTER(4); +DECLARE_METADATA_PTR_GETTER(8); + +/* + * Handle a memory store performed by inline assembly. KMSAN conservatively + * attempts to unpoison the outputs of asm() directives to prevent false + * positives caused by missed stores. + */ +void __msan_instrument_asm_store(void *addr, uintptr_t size) +{ + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + ua_flags = user_access_save(); + /* + * Most of the accesses are below 32 bytes. The two exceptions so far + * are clwb() (64 bytes) and FPU state (512 bytes). + * It's unlikely that the assembly will touch more than 512 bytes. + */ + if (size > 512) { + WARN_ONCE(1, "assembly store size too big: %ld\n", size); + size = 8; + } + if (is_bad_asm_addr(addr, size, /*is_store*/ true)) { + user_access_restore(ua_flags); + return; + } + kmsan_enter_runtime(); + /* Unpoisoning the memory on best effort. */ + kmsan_internal_unpoison_memory(addr, size, /*checked*/ false); + kmsan_leave_runtime(); + user_access_restore(ua_flags); +} +EXPORT_SYMBOL(__msan_instrument_asm_store); + +/* + * KMSAN instrumentation pass replaces LLVM memcpy, memmove and memset + * intrinsics with calls to respective __msan_ functions. We use + * get_param0_metadata() and set_retval_metadata() to store the shadow/origin + * values for the destination argument of these functions and use them for the + * functions' return values. + */ +static inline void get_param0_metadata(u64 *shadow, + depot_stack_handle_t *origin) +{ + struct kmsan_ctx *ctx = kmsan_get_context(); + + *shadow = *(u64 *)(ctx->cstate.param_tls); + *origin = ctx->cstate.param_origin_tls[0]; +} + +static inline void set_retval_metadata(u64 shadow, depot_stack_handle_t origin) +{ + struct kmsan_ctx *ctx = kmsan_get_context(); + + *(u64 *)(ctx->cstate.retval_tls) = shadow; + ctx->cstate.retval_origin_tls = origin; +} + +/* Handle llvm.memmove intrinsic. */ +void *__msan_memmove(void *dst, const void *src, uintptr_t n) +{ + depot_stack_handle_t origin; + void *result; + u64 shadow; + + get_param0_metadata(&shadow, &origin); + result = __memmove(dst, src, n); + if (!n) + /* Some people call memmove() with zero length. */ + return result; + if (!kmsan_enabled || kmsan_in_runtime()) + return result; + + kmsan_enter_runtime(); + kmsan_internal_memmove_metadata(dst, (void *)src, n); + kmsan_leave_runtime(); + + set_retval_metadata(shadow, origin); + return result; +} +EXPORT_SYMBOL(__msan_memmove); + +/* Handle llvm.memcpy intrinsic. */ +void *__msan_memcpy(void *dst, const void *src, uintptr_t n) +{ + depot_stack_handle_t origin; + void *result; + u64 shadow; + + get_param0_metadata(&shadow, &origin); + result = __memcpy(dst, src, n); + if (!n) + /* Some people call memcpy() with zero length. */ + return result; + + if (!kmsan_enabled || kmsan_in_runtime()) + return result; + + kmsan_enter_runtime(); + /* Using memmove instead of memcpy doesn't affect correctness. */ + kmsan_internal_memmove_metadata(dst, (void *)src, n); + kmsan_leave_runtime(); + + set_retval_metadata(shadow, origin); + return result; +} +EXPORT_SYMBOL(__msan_memcpy); + +/* Handle llvm.memset intrinsic. */ +void *__msan_memset(void *dst, int c, uintptr_t n) +{ + depot_stack_handle_t origin; + void *result; + u64 shadow; + + get_param0_metadata(&shadow, &origin); + result = __memset(dst, c, n); + if (!kmsan_enabled || kmsan_in_runtime()) + return result; + + kmsan_enter_runtime(); + /* + * Clang doesn't pass parameter metadata here, so it is impossible to + * use shadow of @c to set up the shadow for @dst. + */ + kmsan_internal_unpoison_memory(dst, n, /*checked*/ false); + kmsan_leave_runtime(); + + set_retval_metadata(shadow, origin); + return result; +} +EXPORT_SYMBOL(__msan_memset); + +/* + * Create a new origin from an old one. This is done when storing an + * uninitialized value to memory. When reporting an error, KMSAN unrolls and + * prints the whole chain of stores that preceded the use of this value. + */ +depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin) +{ + depot_stack_handle_t ret = 0; + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return ret; + + ua_flags = user_access_save(); + + /* Creating new origins may allocate memory. */ + kmsan_enter_runtime(); + ret = kmsan_internal_chain_origin(origin); + kmsan_leave_runtime(); + user_access_restore(ua_flags); + return ret; +} +EXPORT_SYMBOL(__msan_chain_origin); + +/* Poison a local variable when entering a function. */ +void __msan_poison_alloca(void *address, uintptr_t size, char *descr) +{ + depot_stack_handle_t handle; + unsigned long entries[4]; + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + ua_flags = user_access_save(); + entries[0] = KMSAN_ALLOCA_MAGIC_ORIGIN; + entries[1] = (u64)descr; + entries[2] = (u64)__builtin_return_address(0); + /* + * With frame pointers enabled, it is possible to quickly fetch the + * second frame of the caller stack without calling the unwinder. + * Without them, simply do not bother. + */ + if (IS_ENABLED(CONFIG_UNWINDER_FRAME_POINTER)) + entries[3] = (u64)__builtin_return_address(1); + else + entries[3] = 0; + + /* stack_depot_save() may allocate memory. */ + kmsan_enter_runtime(); + handle = stack_depot_save(entries, ARRAY_SIZE(entries), GFP_ATOMIC); + kmsan_leave_runtime(); + + kmsan_internal_set_shadow_origin(address, size, -1, handle, + /*checked*/ true); + user_access_restore(ua_flags); +} +EXPORT_SYMBOL(__msan_poison_alloca); + +/* Unpoison a local variable. */ +void __msan_unpoison_alloca(void *address, uintptr_t size) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + kmsan_enter_runtime(); + kmsan_internal_unpoison_memory(address, size, /*checked*/ true); + kmsan_leave_runtime(); +} +EXPORT_SYMBOL(__msan_unpoison_alloca); + +/* + * Report that an uninitialized value with the given origin was used in a way + * that constituted undefined behavior. + */ +void __msan_warning(u32 origin) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + kmsan_report(origin, /*address*/ 0, /*size*/ 0, + /*off_first*/ 0, /*off_last*/ 0, /*user_addr*/ 0, + REASON_ANY); + kmsan_leave_runtime(); +} +EXPORT_SYMBOL(__msan_warning); + +/* + * At the beginning of an instrumented function, obtain the pointer to + * `struct kmsan_context_state` holding the metadata for function parameters. + */ +struct kmsan_context_state *__msan_get_context_state(void) +{ + return &kmsan_get_context()->cstate; +} +EXPORT_SYMBOL(__msan_get_context_state); diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h new file mode 100644 index 0000000000000..97d48b45dba58 --- /dev/null +++ b/mm/kmsan/kmsan.h @@ -0,0 +1,204 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Functions used by the KMSAN runtime. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#ifndef __MM_KMSAN_KMSAN_H +#define __MM_KMSAN_KMSAN_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#define KMSAN_ALLOCA_MAGIC_ORIGIN 0xabcd0100 +#define KMSAN_CHAIN_MAGIC_ORIGIN 0xabcd0200 + +#define KMSAN_POISON_NOCHECK 0x0 +#define KMSAN_POISON_CHECK 0x1 +#define KMSAN_POISON_FREE 0x2 + +#define KMSAN_ORIGIN_SIZE 4 +#define KMSAN_MAX_ORIGIN_DEPTH 7 + +#define KMSAN_STACK_DEPTH 64 + +#define KMSAN_META_SHADOW (false) +#define KMSAN_META_ORIGIN (true) + +extern bool kmsan_enabled; +extern int panic_on_kmsan; + +/* + * KMSAN performs a lot of consistency checks that are currently enabled by + * default. BUG_ON is normally discouraged in the kernel, unless used for + * debugging, but KMSAN itself is a debugging tool, so it makes little sense to + * recover if something goes wrong. + */ +#define KMSAN_WARN_ON(cond) \ + ({ \ + const bool __cond = WARN_ON(cond); \ + if (unlikely(__cond)) { \ + WRITE_ONCE(kmsan_enabled, false); \ + if (panic_on_kmsan) { \ + /* Can't call panic() here because */ \ + /* of uaccess checks. */ \ + BUG(); \ + } \ + } \ + __cond; \ + }) + +/* + * A pair of metadata pointers to be returned by the instrumentation functions. + */ +struct shadow_origin_ptr { + void *shadow, *origin; +}; + +struct shadow_origin_ptr kmsan_get_shadow_origin_ptr(void *addr, u64 size, + bool store); +void *kmsan_get_metadata(void *addr, bool is_origin); + +enum kmsan_bug_reason { + REASON_ANY, + REASON_COPY_TO_USER, + REASON_SUBMIT_URB, +}; + +void kmsan_print_origin(depot_stack_handle_t origin); + +/** + * kmsan_report() - Report a use of uninitialized value. + * @origin: Stack ID of the uninitialized value. + * @address: Address at which the memory access happens. + * @size: Memory access size. + * @off_first: Offset (from @address) of the first byte to be reported. + * @off_last: Offset (from @address) of the last byte to be reported. + * @user_addr: When non-NULL, denotes the userspace address to which the kernel + * is leaking data. + * @reason: Error type from enum kmsan_bug_reason. + * + * kmsan_report() prints an error message for a consequent group of bytes + * sharing the same origin. If an uninitialized value is used in a comparison, + * this function is called once without specifying the addresses. When checking + * a memory range, KMSAN may call kmsan_report() multiple times with the same + * @address, @size, @user_addr and @reason, but different @off_first and + * @off_last corresponding to different @origin values. + */ +void kmsan_report(depot_stack_handle_t origin, void *address, int size, + int off_first, int off_last, const void *user_addr, + enum kmsan_bug_reason reason); + +DECLARE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx); + +static __always_inline struct kmsan_ctx *kmsan_get_context(void) +{ + return in_task() ? ¤t->kmsan_ctx : raw_cpu_ptr(&kmsan_percpu_ctx); +} + +/* + * When a compiler hook or KMSAN runtime function is invoked, it may make a + * call to instrumented code and eventually call itself recursively. To avoid + * that, we guard the runtime entry regions with + * kmsan_enter_runtime()/kmsan_leave_runtime() and exit the hook if + * kmsan_in_runtime() is true. + * + * Non-runtime code may occasionally get executed in nested IRQs from the + * runtime code (e.g. when called via smp_call_function_single()). Because some + * KMSAN routines may take locks (e.g. for memory allocation), we conservatively + * bail out instead of calling them. To minimize the effect of this (potentially + * missing initialization events) kmsan_in_runtime() is not checked in + * non-blocking runtime functions. + */ +static __always_inline bool kmsan_in_runtime(void) +{ + if ((hardirq_count() >> HARDIRQ_SHIFT) > 1) + return true; + return kmsan_get_context()->kmsan_in_runtime; +} + +static __always_inline void kmsan_enter_runtime(void) +{ + struct kmsan_ctx *ctx; + + ctx = kmsan_get_context(); + KMSAN_WARN_ON(ctx->kmsan_in_runtime++); +} + +static __always_inline void kmsan_leave_runtime(void) +{ + struct kmsan_ctx *ctx = kmsan_get_context(); + + KMSAN_WARN_ON(--ctx->kmsan_in_runtime); +} + +depot_stack_handle_t kmsan_save_stack(void); +depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, + unsigned int extra_bits); + +/* + * Pack and unpack the origin chain depth and UAF flag to/from the extra bits + * provided by the stack depot. + * The UAF flag is stored in the lowest bit, followed by the depth in the upper + * bits. + * set_dsh_extra_bits() is responsible for clamping the value. + */ +static __always_inline unsigned int kmsan_extra_bits(unsigned int depth, + bool uaf) +{ + return (depth << 1) | uaf; +} + +static __always_inline bool kmsan_uaf_from_eb(unsigned int extra_bits) +{ + return extra_bits & 1; +} + +static __always_inline unsigned int kmsan_depth_from_eb(unsigned int extra_bits) +{ + return extra_bits >> 1; +} + +/* + * kmsan_internal_ functions are supposed to be very simple and not require the + * kmsan_in_runtime() checks. + */ +void kmsan_internal_memmove_metadata(void *dst, void *src, size_t n); +void kmsan_internal_poison_memory(void *address, size_t size, gfp_t flags, + unsigned int poison_flags); +void kmsan_internal_unpoison_memory(void *address, size_t size, bool checked); +void kmsan_internal_set_shadow_origin(void *address, size_t size, int b, + u32 origin, bool checked); +depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id); + +bool kmsan_metadata_is_contiguous(void *addr, size_t size); +void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr, + int reason); + +struct page *kmsan_vmalloc_to_page_or_null(void *vaddr); + +/* + * kmsan_internal_is_module_addr() and kmsan_internal_is_vmalloc_addr() are + * non-instrumented versions of is_module_address() and is_vmalloc_addr() that + * are safe to call from KMSAN runtime without recursion. + */ +static inline bool kmsan_internal_is_module_addr(void *vaddr) +{ + return ((u64)vaddr >= MODULES_VADDR) && ((u64)vaddr < MODULES_END); +} + +static inline bool kmsan_internal_is_vmalloc_addr(void *addr) +{ + return ((u64)addr >= VMALLOC_START) && ((u64)addr < VMALLOC_END); +} + +#endif /* __MM_KMSAN_KMSAN_H */ diff --git a/mm/kmsan/report.c b/mm/kmsan/report.c new file mode 100644 index 0000000000000..02736ec757f2c --- /dev/null +++ b/mm/kmsan/report.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN error reporting routines. + * + * Copyright (C) 2019-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#include +#include +#include +#include +#include + +#include "kmsan.h" + +static DEFINE_RAW_SPINLOCK(kmsan_report_lock); +#define DESCR_SIZE 128 +/* Protected by kmsan_report_lock */ +static char report_local_descr[DESCR_SIZE]; +int panic_on_kmsan __read_mostly; + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "kmsan." +module_param_named(panic, panic_on_kmsan, int, 0); + +/* + * Skip internal KMSAN frames. + */ +static int get_stack_skipnr(const unsigned long stack_entries[], + int num_entries) +{ + int len, skip; + char buf[64]; + + for (skip = 0; skip < num_entries; ++skip) { + len = scnprintf(buf, sizeof(buf), "%ps", + (void *)stack_entries[skip]); + + /* Never show __msan_* or kmsan_* functions. */ + if ((strnstr(buf, "__msan_", len) == buf) || + (strnstr(buf, "kmsan_", len) == buf)) + continue; + + /* + * No match for runtime functions -- @skip entries to skip to + * get to first frame of interest. + */ + break; + } + + return skip; +} + +/* + * Currently the descriptions of locals generated by Clang look as follows: + * ----local_name@function_name + * We want to print only the name of the local, as other information in that + * description can be confusing. + * The meaningful part of the description is copied to a global buffer to avoid + * allocating memory. + */ +static char *pretty_descr(char *descr) +{ + int pos = 0, len = strlen(descr); + + for (int i = 0; i < len; i++) { + if (descr[i] == '@') + break; + if (descr[i] == '-') + continue; + report_local_descr[pos] = descr[i]; + if (pos + 1 == DESCR_SIZE) + break; + pos++; + } + report_local_descr[pos] = 0; + return report_local_descr; +} + +void kmsan_print_origin(depot_stack_handle_t origin) +{ + unsigned long *entries = NULL, *chained_entries = NULL; + unsigned int nr_entries, chained_nr_entries, skipnr; + void *pc1 = NULL, *pc2 = NULL; + depot_stack_handle_t head; + unsigned long magic; + char *descr = NULL; + unsigned int depth; + + if (!origin) + return; + + while (true) { + nr_entries = stack_depot_fetch(origin, &entries); + depth = kmsan_depth_from_eb(stack_depot_get_extra_bits(origin)); + magic = nr_entries ? entries[0] : 0; + if ((nr_entries == 4) && (magic == KMSAN_ALLOCA_MAGIC_ORIGIN)) { + descr = (char *)entries[1]; + pc1 = (void *)entries[2]; + pc2 = (void *)entries[3]; + pr_err("Local variable %s created at:\n", + pretty_descr(descr)); + if (pc1) + pr_err(" %pSb\n", pc1); + if (pc2) + pr_err(" %pSb\n", pc2); + break; + } + if ((nr_entries == 3) && (magic == KMSAN_CHAIN_MAGIC_ORIGIN)) { + /* + * Origin chains deeper than KMSAN_MAX_ORIGIN_DEPTH are + * not stored, so the output may be incomplete. + */ + if (depth == KMSAN_MAX_ORIGIN_DEPTH) + pr_err("\n\n"); + head = entries[1]; + origin = entries[2]; + pr_err("Uninit was stored to memory at:\n"); + chained_nr_entries = + stack_depot_fetch(head, &chained_entries); + kmsan_internal_unpoison_memory( + chained_entries, + chained_nr_entries * sizeof(*chained_entries), + /*checked*/ false); + skipnr = get_stack_skipnr(chained_entries, + chained_nr_entries); + stack_trace_print(chained_entries + skipnr, + chained_nr_entries - skipnr, 0); + pr_err("\n"); + continue; + } + pr_err("Uninit was created at:\n"); + if (nr_entries) { + skipnr = get_stack_skipnr(entries, nr_entries); + stack_trace_print(entries + skipnr, nr_entries - skipnr, + 0); + } else { + pr_err("(stack is not available)\n"); + } + break; + } +} + +void kmsan_report(depot_stack_handle_t origin, void *address, int size, + int off_first, int off_last, const void *user_addr, + enum kmsan_bug_reason reason) +{ + unsigned long stack_entries[KMSAN_STACK_DEPTH]; + int num_stack_entries, skipnr; + char *bug_type = NULL; + unsigned long ua_flags; + bool is_uaf; + + if (!kmsan_enabled) + return; + if (!current->kmsan_ctx.allow_reporting) + return; + if (!origin) + return; + + current->kmsan_ctx.allow_reporting = false; + ua_flags = user_access_save(); + raw_spin_lock(&kmsan_report_lock); + pr_err("=====================================================\n"); + is_uaf = kmsan_uaf_from_eb(stack_depot_get_extra_bits(origin)); + switch (reason) { + case REASON_ANY: + bug_type = is_uaf ? "use-after-free" : "uninit-value"; + break; + case REASON_COPY_TO_USER: + bug_type = is_uaf ? "kernel-infoleak-after-free" : + "kernel-infoleak"; + break; + case REASON_SUBMIT_URB: + bug_type = is_uaf ? "kernel-usb-infoleak-after-free" : + "kernel-usb-infoleak"; + break; + } + + num_stack_entries = + stack_trace_save(stack_entries, KMSAN_STACK_DEPTH, 1); + skipnr = get_stack_skipnr(stack_entries, num_stack_entries); + + pr_err("BUG: KMSAN: %s in %pSb\n", bug_type, + (void *)stack_entries[skipnr]); + stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, + 0); + pr_err("\n"); + + kmsan_print_origin(origin); + + if (size) { + pr_err("\n"); + if (off_first == off_last) + pr_err("Byte %d of %d is uninitialized\n", off_first, + size); + else + pr_err("Bytes %d-%d of %d are uninitialized\n", + off_first, off_last, size); + } + if (address) + pr_err("Memory access of size %d starts at %px\n", size, + address); + if (user_addr && reason == REASON_COPY_TO_USER) + pr_err("Data copied to user address %px\n", user_addr); + pr_err("\n"); + dump_stack_print_info(KERN_ERR); + pr_err("=====================================================\n"); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + raw_spin_unlock(&kmsan_report_lock); + if (panic_on_kmsan) + panic("kmsan.panic set ...\n"); + user_access_restore(ua_flags); + current->kmsan_ctx.allow_reporting = true; +} diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c new file mode 100644 index 0000000000000..acc5279acc3be --- /dev/null +++ b/mm/kmsan/shadow.c @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN shadow implementation. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "kmsan.h" + +#define shadow_page_for(page) ((page)->kmsan_shadow) + +#define origin_page_for(page) ((page)->kmsan_origin) + +static void *shadow_ptr_for(struct page *page) +{ + return page_address(shadow_page_for(page)); +} + +static void *origin_ptr_for(struct page *page) +{ + return page_address(origin_page_for(page)); +} + +static bool page_has_metadata(struct page *page) +{ + return shadow_page_for(page) && origin_page_for(page); +} + +static void set_no_shadow_origin_page(struct page *page) +{ + shadow_page_for(page) = NULL; + origin_page_for(page) = NULL; +} + +/* + * Dummy load and store pages to be used when the real metadata is unavailable. + * There are separate pages for loads and stores, so that every load returns a + * zero, and every store doesn't affect other loads. + */ +static char dummy_load_page[PAGE_SIZE] __aligned(PAGE_SIZE); +static char dummy_store_page[PAGE_SIZE] __aligned(PAGE_SIZE); + +static unsigned long vmalloc_meta(void *addr, bool is_origin) +{ + unsigned long addr64 = (unsigned long)addr, off; + + KMSAN_WARN_ON(is_origin && !IS_ALIGNED(addr64, KMSAN_ORIGIN_SIZE)); + if (kmsan_internal_is_vmalloc_addr(addr)) { + off = addr64 - VMALLOC_START; + return off + (is_origin ? KMSAN_VMALLOC_ORIGIN_START : + KMSAN_VMALLOC_SHADOW_START); + } + if (kmsan_internal_is_module_addr(addr)) { + off = addr64 - MODULES_VADDR; + return off + (is_origin ? KMSAN_MODULES_ORIGIN_START : + KMSAN_MODULES_SHADOW_START); + } + return 0; +} + +static struct page *virt_to_page_or_null(void *vaddr) +{ + if (kmsan_virt_addr_valid(vaddr)) + return virt_to_page(vaddr); + else + return NULL; +} + +struct shadow_origin_ptr kmsan_get_shadow_origin_ptr(void *address, u64 size, + bool store) +{ + struct shadow_origin_ptr ret; + void *shadow; + + /* + * Even if we redirect this memory access to the dummy page, it will + * go out of bounds. + */ + KMSAN_WARN_ON(size > PAGE_SIZE); + + if (!kmsan_enabled) + goto return_dummy; + + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(address, size)); + shadow = kmsan_get_metadata(address, KMSAN_META_SHADOW); + if (!shadow) + goto return_dummy; + + ret.shadow = shadow; + ret.origin = kmsan_get_metadata(address, KMSAN_META_ORIGIN); + return ret; + +return_dummy: + if (store) { + /* Ignore this store. */ + ret.shadow = dummy_store_page; + ret.origin = dummy_store_page; + } else { + /* This load will return zero. */ + ret.shadow = dummy_load_page; + ret.origin = dummy_load_page; + } + return ret; +} + +/* + * Obtain the shadow or origin pointer for the given address, or NULL if there's + * none. The caller must check the return value for being non-NULL if needed. + * The return value of this function should not depend on whether we're in the + * runtime or not. + */ +void *kmsan_get_metadata(void *address, bool is_origin) +{ + u64 addr = (u64)address, pad, off; + struct page *page; + + if (is_origin && !IS_ALIGNED(addr, KMSAN_ORIGIN_SIZE)) { + pad = addr % KMSAN_ORIGIN_SIZE; + addr -= pad; + } + address = (void *)addr; + if (kmsan_internal_is_vmalloc_addr(address) || + kmsan_internal_is_module_addr(address)) + return (void *)vmalloc_meta(address, is_origin); + + page = virt_to_page_or_null(address); + if (!page) + return NULL; + if (!page_has_metadata(page)) + return NULL; + off = addr % PAGE_SIZE; + + return (is_origin ? origin_ptr_for(page) : shadow_ptr_for(page)) + off; +} diff --git a/scripts/Makefile.kmsan b/scripts/Makefile.kmsan new file mode 100644 index 0000000000000..b5b0aa61322ec --- /dev/null +++ b/scripts/Makefile.kmsan @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +kmsan-cflags := -fsanitize=kernel-memory + +ifdef CONFIG_KMSAN_CHECK_PARAM_RETVAL +kmsan-cflags += -fsanitize-memory-param-retval +endif + +export CFLAGS_KMSAN := $(kmsan-cflags) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 3fb6a99e78c47..ac32429e93b73 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -157,6 +157,15 @@ _c_flags += $(if $(patsubst n%,, \ endif endif +ifeq ($(CONFIG_KMSAN),y) +_c_flags += $(if $(patsubst n%,, \ + $(KMSAN_SANITIZE_$(basetarget).o)$(KMSAN_SANITIZE)y), \ + $(CFLAGS_KMSAN)) +_c_flags += $(if $(patsubst n%,, \ + $(KMSAN_ENABLE_CHECKS_$(basetarget).o)$(KMSAN_ENABLE_CHECKS)y), \ + , -mllvm -msan-disable-checks=1) +endif + ifeq ($(CONFIG_UBSAN),y) _c_flags += $(if $(patsubst n%,, \ $(UBSAN_SANITIZE_$(basetarget).o)$(UBSAN_SANITIZE)$(CONFIG_UBSAN_SANITIZE_ALL)), \ From 0c72ada2f2e72e7f4c8991e0104207b6fed6a3e3 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:46 +0200 Subject: [PATCH 215/352] kmsan: disable instrumentation of unsupported common kernel code EFI stub cannot be linked with KMSAN runtime, so we disable instrumentation for it. Instrumenting kcov, stackdepot or lockdep leads to infinite recursion caused by instrumentation hooks calling instrumented code again. Link: https://lkml.kernel.org/r/20220915150417.722975-13-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/firmware/efi/libstub/Makefile | 1 + kernel/Makefile | 1 + kernel/locking/Makefile | 3 ++- lib/Makefile | 3 +++ 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index d0537573501e9..81432d0c904b1 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -46,6 +46,7 @@ GCOV_PROFILE := n # Sanitizer runtimes are unavailable and cannot be linked here. KASAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n UBSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y diff --git a/kernel/Makefile b/kernel/Makefile index 318789c728d32..d754e0be1176d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -38,6 +38,7 @@ KCOV_INSTRUMENT_kcov.o := n KASAN_SANITIZE_kcov.o := n KCSAN_SANITIZE_kcov.o := n UBSAN_SANITIZE_kcov.o := n +KMSAN_SANITIZE_kcov.o := n CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector # Don't instrument error handlers diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index d51cabf28f382..ea925731fa40f 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -5,8 +5,9 @@ KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o -# Avoid recursion lockdep -> KCSAN -> ... -> lockdep. +# Avoid recursion lockdep -> sanitizer -> ... -> lockdep. KCSAN_SANITIZE_lockdep.o := n +KMSAN_SANITIZE_lockdep.o := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) diff --git a/lib/Makefile b/lib/Makefile index d7d94102991b3..42e185cdecd03 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -270,6 +270,9 @@ obj-$(CONFIG_POLYNOMIAL) += polynomial.o CFLAGS_stackdepot.o += -fno-builtin obj-$(CONFIG_STACKDEPOT) += stackdepot.o KASAN_SANITIZE_stackdepot.o := n +# In particular, instrumenting stackdepot.c with KMSAN will result in infinite +# recursion. +KMSAN_SANITIZE_stackdepot.o := n KCOV_INSTRUMENT_stackdepot.o := n obj-$(CONFIG_REF_TRACKER) += ref_tracker.o From 1cac487a8c75dcafab19ea4224c3600feba9359e Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:47 +0200 Subject: [PATCH 216/352] MAINTAINERS: add entry for KMSAN Add entry for KMSAN maintainers/reviewers. Link: https://lkml.kernel.org/r/20220915150417.722975-14-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- MAINTAINERS | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 61345a9a9d055..6e7c9493c22f3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11370,6 +11370,19 @@ F: kernel/kmod.c F: lib/test_kmod.c F: tools/testing/selftests/kmod/ +KMSAN +M: Alexander Potapenko +R: Marco Elver +R: Dmitry Vyukov +L: kasan-dev@googlegroups.com +S: Maintained +F: Documentation/dev-tools/kmsan.rst +F: arch/*/include/asm/kmsan.h +F: include/linux/kmsan*.h +F: lib/Kconfig.kmsan +F: mm/kmsan/ +F: scripts/Makefile.kmsan + KPROBES M: Naveen N. Rao M: Anil S Keshavamurthy From d02f5e8bdafbbe1bd5b8589710d5b21b3e1a5e5d Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:48 +0200 Subject: [PATCH 217/352] mm: kmsan: maintain KMSAN metadata for page operations Insert KMSAN hooks that make the necessary bookkeeping changes: - poison page shadow and origins in alloc_pages()/free_page(); - clear page shadow and origins in clear_page(), copy_user_highpage(); - copy page metadata in copy_highpage(), wp_page_copy(); - handle vmap()/vunmap()/iounmap(); Link: https://lkml.kernel.org/r/20220915150417.722975-15-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/include/asm/page_64.h | 7 ++ arch/x86/mm/ioremap.c | 3 + include/linux/highmem.h | 3 + include/linux/kmsan.h | 145 +++++++++++++++++++++++++++++++++ mm/internal.h | 6 ++ mm/kmsan/hooks.c | 86 +++++++++++++++++++ mm/kmsan/shadow.c | 113 +++++++++++++++++++++++++ mm/memory.c | 2 + mm/page_alloc.c | 11 +++ mm/vmalloc.c | 20 ++++- 10 files changed, 394 insertions(+), 2 deletions(-) create mode 100644 include/linux/kmsan.h diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index baa70451b8df5..198e03e59ca19 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -8,6 +8,8 @@ #include #include +#include + /* duplicated to the one in bootmem.h */ extern unsigned long max_pfn; extern unsigned long phys_base; @@ -47,6 +49,11 @@ void clear_page_erms(void *page); static inline void clear_page(void *page) { + /* + * Clean up KMSAN metadata for the page being cleared. The assembly call + * below clobbers @page, so we perform unpoisoning before it. + */ + kmsan_unpoison_memory(page, PAGE_SIZE); alternative_call_2(clear_page_orig, clear_page_rep, X86_FEATURE_REP_GOOD, clear_page_erms, X86_FEATURE_ERMS, diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 1ad0228f8ceb9..78c5bc654cff5 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -479,6 +480,8 @@ void iounmap(volatile void __iomem *addr) return; } + kmsan_iounmap_page_range((unsigned long)addr, + (unsigned long)addr + get_vm_area_size(p)); memtype_free(p->phys_addr, p->phys_addr + get_vm_area_size(p)); /* Finally remove it */ diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 25679035ca283..e9912da5441b4 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -311,6 +312,7 @@ static inline void copy_user_highpage(struct page *to, struct page *from, vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_user_page(vto, vfrom, vaddr, to); + kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); } @@ -326,6 +328,7 @@ static inline void copy_highpage(struct page *to, struct page *from) vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_page(vto, vfrom); + kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); } diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h new file mode 100644 index 0000000000000..b36bf3db835ee --- /dev/null +++ b/include/linux/kmsan.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KMSAN API for subsystems. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ +#ifndef _LINUX_KMSAN_H +#define _LINUX_KMSAN_H + +#include +#include +#include + +struct page; + +#ifdef CONFIG_KMSAN + +/** + * kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call. + * @page: struct page pointer returned by alloc_pages(). + * @order: order of allocated struct page. + * @flags: GFP flags used by alloc_pages() + * + * KMSAN marks 1<<@order pages starting at @page as uninitialized, unless + * @flags contain __GFP_ZERO. + */ +void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags); + +/** + * kmsan_free_page() - Notify KMSAN about a free_pages() call. + * @page: struct page pointer passed to free_pages(). + * @order: order of deallocated struct page. + * + * KMSAN marks freed memory as uninitialized. + */ +void kmsan_free_page(struct page *page, unsigned int order); + +/** + * kmsan_copy_page_meta() - Copy KMSAN metadata between two pages. + * @dst: destination page. + * @src: source page. + * + * KMSAN copies the contents of metadata pages for @src into the metadata pages + * for @dst. If @dst has no associated metadata pages, nothing happens. + * If @src has no associated metadata pages, @dst metadata pages are unpoisoned. + */ +void kmsan_copy_page_meta(struct page *dst, struct page *src); + +/** + * kmsan_map_kernel_range_noflush() - Notify KMSAN about a vmap. + * @start: start of vmapped range. + * @end: end of vmapped range. + * @prot: page protection flags used for vmap. + * @pages: array of pages. + * @page_shift: page_shift passed to vmap_range_noflush(). + * + * KMSAN maps shadow and origin pages of @pages into contiguous ranges in + * vmalloc metadata address range. + */ +void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift); + +/** + * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap. + * @start: start of vunmapped range. + * @end: end of vunmapped range. + * + * KMSAN unmaps the contiguous metadata ranges created by + * kmsan_map_kernel_range_noflush(). + */ +void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end); + +/** + * kmsan_ioremap_page_range() - Notify KMSAN about a ioremap_page_range() call. + * @addr: range start. + * @end: range end. + * @phys_addr: physical range start. + * @prot: page protection flags used for ioremap_page_range(). + * @page_shift: page_shift argument passed to vmap_range_noflush(). + * + * KMSAN creates new metadata pages for the physical pages mapped into the + * virtual memory. + */ +void kmsan_ioremap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift); + +/** + * kmsan_iounmap_page_range() - Notify KMSAN about a iounmap_page_range() call. + * @start: range start. + * @end: range end. + * + * KMSAN unmaps the metadata pages for the given range and, unlike for + * vunmap_page_range(), also deallocates them. + */ +void kmsan_iounmap_page_range(unsigned long start, unsigned long end); + +#else + +static inline int kmsan_alloc_page(struct page *page, unsigned int order, + gfp_t flags) +{ + return 0; +} + +static inline void kmsan_free_page(struct page *page, unsigned int order) +{ +} + +static inline void kmsan_copy_page_meta(struct page *dst, struct page *src) +{ +} + +static inline void kmsan_vmap_pages_range_noflush(unsigned long start, + unsigned long end, + pgprot_t prot, + struct page **pages, + unsigned int page_shift) +{ +} + +static inline void kmsan_vunmap_range_noflush(unsigned long start, + unsigned long end) +{ +} + +static inline void kmsan_ioremap_page_range(unsigned long start, + unsigned long end, + phys_addr_t phys_addr, + pgprot_t prot, + unsigned int page_shift) +{ +} + +static inline void kmsan_iounmap_page_range(unsigned long start, + unsigned long end) +{ +} + +#endif + +#endif /* _LINUX_KMSAN_H */ diff --git a/mm/internal.h b/mm/internal.h index e5b2d40991816..883b09f024044 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -818,8 +818,14 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, } #endif +int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift); + void vunmap_range_noflush(unsigned long start, unsigned long end); +void __vunmap_range_noflush(unsigned long start, unsigned long end); + int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 4ac62fa67a02a..040111bb9f6a3 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -26,6 +27,91 @@ * skipping effects of functions like memset() inside instrumented code. */ +static unsigned long vmalloc_shadow(unsigned long addr) +{ + return (unsigned long)kmsan_get_metadata((void *)addr, + KMSAN_META_SHADOW); +} + +static unsigned long vmalloc_origin(unsigned long addr) +{ + return (unsigned long)kmsan_get_metadata((void *)addr, + KMSAN_META_ORIGIN); +} + +void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end) +{ + __vunmap_range_noflush(vmalloc_shadow(start), vmalloc_shadow(end)); + __vunmap_range_noflush(vmalloc_origin(start), vmalloc_origin(end)); + flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); + flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); +} + +/* + * This function creates new shadow/origin pages for the physical pages mapped + * into the virtual memory. If those physical pages already had shadow/origin, + * those are ignored. + */ +void kmsan_ioremap_page_range(unsigned long start, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift) +{ + gfp_t gfp_mask = GFP_KERNEL | __GFP_ZERO; + struct page *shadow, *origin; + unsigned long off = 0; + int nr; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + nr = (end - start) / PAGE_SIZE; + kmsan_enter_runtime(); + for (int i = 0; i < nr; i++, off += PAGE_SIZE) { + shadow = alloc_pages(gfp_mask, 1); + origin = alloc_pages(gfp_mask, 1); + __vmap_pages_range_noflush( + vmalloc_shadow(start + off), + vmalloc_shadow(start + off + PAGE_SIZE), prot, &shadow, + PAGE_SHIFT); + __vmap_pages_range_noflush( + vmalloc_origin(start + off), + vmalloc_origin(start + off + PAGE_SIZE), prot, &origin, + PAGE_SHIFT); + } + flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); + flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); + kmsan_leave_runtime(); +} + +void kmsan_iounmap_page_range(unsigned long start, unsigned long end) +{ + unsigned long v_shadow, v_origin; + struct page *shadow, *origin; + int nr; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + nr = (end - start) / PAGE_SIZE; + kmsan_enter_runtime(); + v_shadow = (unsigned long)vmalloc_shadow(start); + v_origin = (unsigned long)vmalloc_origin(start); + for (int i = 0; i < nr; + i++, v_shadow += PAGE_SIZE, v_origin += PAGE_SIZE) { + shadow = kmsan_vmalloc_to_page_or_null((void *)v_shadow); + origin = kmsan_vmalloc_to_page_or_null((void *)v_origin); + __vunmap_range_noflush(v_shadow, vmalloc_shadow(end)); + __vunmap_range_noflush(v_origin, vmalloc_origin(end)); + if (shadow) + __free_pages(shadow, 1); + if (origin) + __free_pages(origin, 1); + } + flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); + flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); + kmsan_leave_runtime(); +} + /* Functions from kmsan-checks.h follow. */ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) { diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index acc5279acc3be..8c81a059beea6 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -145,3 +145,116 @@ void *kmsan_get_metadata(void *address, bool is_origin) return (is_origin ? origin_ptr_for(page) : shadow_ptr_for(page)) + off; } + +void kmsan_copy_page_meta(struct page *dst, struct page *src) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + if (!dst || !page_has_metadata(dst)) + return; + if (!src || !page_has_metadata(src)) { + kmsan_internal_unpoison_memory(page_address(dst), PAGE_SIZE, + /*checked*/ false); + return; + } + + kmsan_enter_runtime(); + __memcpy(shadow_ptr_for(dst), shadow_ptr_for(src), PAGE_SIZE); + __memcpy(origin_ptr_for(dst), origin_ptr_for(src), PAGE_SIZE); + kmsan_leave_runtime(); +} + +void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags) +{ + bool initialized = (flags & __GFP_ZERO) || !kmsan_enabled; + struct page *shadow, *origin; + depot_stack_handle_t handle; + int pages = 1 << order; + + if (!page) + return; + + shadow = shadow_page_for(page); + origin = origin_page_for(page); + + if (initialized) { + __memset(page_address(shadow), 0, PAGE_SIZE * pages); + __memset(page_address(origin), 0, PAGE_SIZE * pages); + return; + } + + /* Zero pages allocated by the runtime should also be initialized. */ + if (kmsan_in_runtime()) + return; + + __memset(page_address(shadow), -1, PAGE_SIZE * pages); + kmsan_enter_runtime(); + handle = kmsan_save_stack_with_flags(flags, /*extra_bits*/ 0); + kmsan_leave_runtime(); + /* + * Addresses are page-aligned, pages are contiguous, so it's ok + * to just fill the origin pages with @handle. + */ + for (int i = 0; i < PAGE_SIZE * pages / sizeof(handle); i++) + ((depot_stack_handle_t *)page_address(origin))[i] = handle; +} + +void kmsan_free_page(struct page *page, unsigned int order) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + kmsan_internal_poison_memory(page_address(page), + PAGE_SIZE << compound_order(page), + GFP_KERNEL, + KMSAN_POISON_CHECK | KMSAN_POISON_FREE); + kmsan_leave_runtime(); +} + +void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift) +{ + unsigned long shadow_start, origin_start, shadow_end, origin_end; + struct page **s_pages, **o_pages; + int nr, mapped; + + if (!kmsan_enabled) + return; + + shadow_start = vmalloc_meta((void *)start, KMSAN_META_SHADOW); + shadow_end = vmalloc_meta((void *)end, KMSAN_META_SHADOW); + if (!shadow_start) + return; + + nr = (end - start) / PAGE_SIZE; + s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL); + o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL); + if (!s_pages || !o_pages) + goto ret; + for (int i = 0; i < nr; i++) { + s_pages[i] = shadow_page_for(pages[i]); + o_pages[i] = origin_page_for(pages[i]); + } + prot = __pgprot(pgprot_val(prot) | _PAGE_NX); + prot = PAGE_KERNEL; + + origin_start = vmalloc_meta((void *)start, KMSAN_META_ORIGIN); + origin_end = vmalloc_meta((void *)end, KMSAN_META_ORIGIN); + kmsan_enter_runtime(); + mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot, + s_pages, page_shift); + KMSAN_WARN_ON(mapped); + mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot, + o_pages, page_shift); + KMSAN_WARN_ON(mapped); + kmsan_leave_runtime(); + flush_tlb_kernel_range(shadow_start, shadow_end); + flush_tlb_kernel_range(origin_start, origin_end); + flush_cache_vmap(shadow_start, shadow_end); + flush_cache_vmap(origin_start, origin_end); + +ret: + kfree(s_pages); + kfree(o_pages); +} diff --git a/mm/memory.c b/mm/memory.c index b3ed17219d772..118e5f023597c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -3136,6 +3137,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) delayacct_wpcopy_end(); return 0; } + kmsan_copy_page_meta(new_page, old_page); } if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7e26b4d88810e..354077c8f4429 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -1400,6 +1401,7 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(PageTail(page), page); trace_mm_page_free(page, order); + kmsan_free_page(page, order); if (unlikely(PageHWPoison(page)) && !order) { /* @@ -3808,6 +3810,14 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, /* * Allocate a page from the given zone. Use pcplists for order-0 allocations. */ + +/* + * Do not instrument rmqueue() with KMSAN. This function may call + * __msan_poison_alloca() through a call to set_pfnblock_flags_mask(). + * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it + * may call rmqueue() again, which will result in a deadlock. + */ +__no_sanitize_memory static inline struct page *rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, @@ -5552,6 +5562,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, } trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); + kmsan_alloc_page(page, order, alloc_gfp); return page; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a991b909866f2..ccaa461998f3c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -320,6 +320,9 @@ int ioremap_page_range(unsigned long addr, unsigned long end, err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot), ioremap_max_page_shift); flush_cache_vmap(addr, end); + if (!err) + kmsan_ioremap_page_range(addr, end, phys_addr, prot, + ioremap_max_page_shift); return err; } @@ -416,7 +419,7 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, * * This is an internal function only. Do not use outside mm/. */ -void vunmap_range_noflush(unsigned long start, unsigned long end) +void __vunmap_range_noflush(unsigned long start, unsigned long end) { unsigned long next; pgd_t *pgd; @@ -438,6 +441,12 @@ void vunmap_range_noflush(unsigned long start, unsigned long end) arch_sync_kernel_mappings(start, end); } +void vunmap_range_noflush(unsigned long start, unsigned long end) +{ + kmsan_vunmap_range_noflush(start, end); + __vunmap_range_noflush(start, end); +} + /** * vunmap_range - unmap kernel virtual addresses * @addr: start of the VM area to unmap @@ -575,7 +584,7 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, * * This is an internal function only. Do not use outside mm/. */ -int vmap_pages_range_noflush(unsigned long addr, unsigned long end, +int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { unsigned int i, nr = (end - addr) >> PAGE_SHIFT; @@ -601,6 +610,13 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, return 0; } +int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); +} + /** * vmap_pages_range - map pages to a kernel virtual address * @addr: start of the VM area to map From a6c351f0210fb1b2d0f494bfd0da4ca97bd209c8 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:49 +0200 Subject: [PATCH 218/352] mm: kmsan: call KMSAN hooks from SLUB code In order to report uninitialized memory coming from heap allocations KMSAN has to poison them unless they're created with __GFP_ZERO. It's handy that we need KMSAN hooks in the places where init_on_alloc/init_on_free initialization is performed. In addition, we apply __no_kmsan_checks to get_freepointer_safe() to suppress reports when accessing freelist pointers that reside in freed objects. Link: https://lkml.kernel.org/r/20220915150417.722975-16-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 57 ++++++++++++++++++++++++++++++++ mm/kmsan/hooks.c | 76 +++++++++++++++++++++++++++++++++++++++++++ mm/slab.h | 1 + mm/slub.c | 17 ++++++++++ 4 files changed, 151 insertions(+) diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index b36bf3db835ee..5c4e0079054e6 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -14,6 +14,7 @@ #include struct page; +struct kmem_cache; #ifdef CONFIG_KMSAN @@ -48,6 +49,44 @@ void kmsan_free_page(struct page *page, unsigned int order); */ void kmsan_copy_page_meta(struct page *dst, struct page *src); +/** + * kmsan_slab_alloc() - Notify KMSAN about a slab allocation. + * @s: slab cache the object belongs to. + * @object: object pointer. + * @flags: GFP flags passed to the allocator. + * + * Depending on cache flags and GFP flags, KMSAN sets up the metadata of the + * newly created object, marking it as initialized or uninitialized. + */ +void kmsan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags); + +/** + * kmsan_slab_free() - Notify KMSAN about a slab deallocation. + * @s: slab cache the object belongs to. + * @object: object pointer. + * + * KMSAN marks the freed object as uninitialized. + */ +void kmsan_slab_free(struct kmem_cache *s, void *object); + +/** + * kmsan_kmalloc_large() - Notify KMSAN about a large slab allocation. + * @ptr: object pointer. + * @size: object size. + * @flags: GFP flags passed to the allocator. + * + * Similar to kmsan_slab_alloc(), but for large allocations. + */ +void kmsan_kmalloc_large(const void *ptr, size_t size, gfp_t flags); + +/** + * kmsan_kfree_large() - Notify KMSAN about a large slab deallocation. + * @ptr: object pointer. + * + * Similar to kmsan_slab_free(), but for large allocations. + */ +void kmsan_kfree_large(const void *ptr); + /** * kmsan_map_kernel_range_noflush() - Notify KMSAN about a vmap. * @start: start of vmapped range. @@ -114,6 +153,24 @@ static inline void kmsan_copy_page_meta(struct page *dst, struct page *src) { } +static inline void kmsan_slab_alloc(struct kmem_cache *s, void *object, + gfp_t flags) +{ +} + +static inline void kmsan_slab_free(struct kmem_cache *s, void *object) +{ +} + +static inline void kmsan_kmalloc_large(const void *ptr, size_t size, + gfp_t flags) +{ +} + +static inline void kmsan_kfree_large(const void *ptr) +{ +} + static inline void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, pgprot_t prot, diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 040111bb9f6a3..000703c563a4d 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -27,6 +27,82 @@ * skipping effects of functions like memset() inside instrumented code. */ +void kmsan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) +{ + if (unlikely(object == NULL)) + return; + if (!kmsan_enabled || kmsan_in_runtime()) + return; + /* + * There's a ctor or this is an RCU cache - do nothing. The memory + * status hasn't changed since last use. + */ + if (s->ctor || (s->flags & SLAB_TYPESAFE_BY_RCU)) + return; + + kmsan_enter_runtime(); + if (flags & __GFP_ZERO) + kmsan_internal_unpoison_memory(object, s->object_size, + KMSAN_POISON_CHECK); + else + kmsan_internal_poison_memory(object, s->object_size, flags, + KMSAN_POISON_CHECK); + kmsan_leave_runtime(); +} + +void kmsan_slab_free(struct kmem_cache *s, void *object) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + /* RCU slabs could be legally used after free within the RCU period */ + if (unlikely(s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))) + return; + /* + * If there's a constructor, freed memory must remain in the same state + * until the next allocation. We cannot save its state to detect + * use-after-free bugs, instead we just keep it unpoisoned. + */ + if (s->ctor) + return; + kmsan_enter_runtime(); + kmsan_internal_poison_memory(object, s->object_size, GFP_KERNEL, + KMSAN_POISON_CHECK | KMSAN_POISON_FREE); + kmsan_leave_runtime(); +} + +void kmsan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) +{ + if (unlikely(ptr == NULL)) + return; + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + if (flags & __GFP_ZERO) + kmsan_internal_unpoison_memory((void *)ptr, size, + /*checked*/ true); + else + kmsan_internal_poison_memory((void *)ptr, size, flags, + KMSAN_POISON_CHECK); + kmsan_leave_runtime(); +} + +void kmsan_kfree_large(const void *ptr) +{ + struct page *page; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + page = virt_to_head_page((void *)ptr); + KMSAN_WARN_ON(ptr != page_address(page)); + kmsan_internal_poison_memory((void *)ptr, + PAGE_SIZE << compound_order(page), + GFP_KERNEL, + KMSAN_POISON_CHECK | KMSAN_POISON_FREE); + kmsan_leave_runtime(); +} + static unsigned long vmalloc_shadow(unsigned long addr) { return (unsigned long)kmsan_get_metadata((void *)addr, diff --git a/mm/slab.h b/mm/slab.h index 4ec82bec15ecd..9d0afd2985df7 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -729,6 +729,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, memset(p[i], 0, s->object_size); kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, flags); + kmsan_slab_alloc(s, p[i], flags); } memcg_slab_post_alloc_hook(s, objcg, flags, size, p); diff --git a/mm/slub.c b/mm/slub.c index 6953c3367bc20..ce8310e131b34 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -359,6 +360,17 @@ static void prefetch_freepointer(const struct kmem_cache *s, void *object) prefetchw(object + s->offset); } +/* + * When running under KMSAN, get_freepointer_safe() may return an uninitialized + * pointer value in the case the current thread loses the race for the next + * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in + * slab_alloc_node() will fail, so the uninitialized value won't be used, but + * KMSAN will still check all arguments of cmpxchg because of imperfect + * handling of inline assembly. + * To work around this problem, we apply __no_kmsan_checks to ensure that + * get_freepointer_safe() returns initialized memory. + */ +__no_kmsan_checks static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) { unsigned long freepointer_addr; @@ -1709,6 +1721,7 @@ static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) ptr = kasan_kmalloc_large(ptr, size, flags); /* As ptr might get tagged, call kmemleak hook after KASAN. */ kmemleak_alloc(ptr, size, 1, flags); + kmsan_kmalloc_large(ptr, size, flags); return ptr; } @@ -1716,12 +1729,14 @@ static __always_inline void kfree_hook(void *x) { kmemleak_free(x); kasan_kfree_large(x); + kmsan_kfree_large(x); } static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x, bool init) { kmemleak_free_recursive(x, s->flags); + kmsan_slab_free(s, x); debug_check_no_locks_freed(x, s->object_size); @@ -5941,6 +5956,7 @@ static char *create_unique_id(struct kmem_cache *s) p += sprintf(p, "%07u", s->size); BUG_ON(p > name + ID_STR_LENGTH - 1); + kmsan_unpoison_memory(name, p - name); return name; } @@ -6042,6 +6058,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) al->name = name; al->next = alias_list; alias_list = al; + kmsan_unpoison_memory(al, sizeof(*al)); return 0; } From d328ef7a0046ab245cf202e817cfe317cd655f77 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:50 +0200 Subject: [PATCH 219/352] kmsan: handle task creation and exiting Tell KMSAN that a new task is created, so the tool creates a backing metadata structure for that task. Link: https://lkml.kernel.org/r/20220915150417.722975-17-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 21 +++++++++++++++++++++ kernel/exit.c | 2 ++ kernel/fork.c | 2 ++ mm/kmsan/core.c | 10 ++++++++++ mm/kmsan/hooks.c | 17 +++++++++++++++++ mm/kmsan/kmsan.h | 2 ++ 6 files changed, 54 insertions(+) diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index 5c4e0079054e6..354aee6f7b1a2 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -15,9 +15,22 @@ struct page; struct kmem_cache; +struct task_struct; #ifdef CONFIG_KMSAN +/** + * kmsan_task_create() - Initialize KMSAN state for the task. + * @task: task to initialize. + */ +void kmsan_task_create(struct task_struct *task); + +/** + * kmsan_task_exit() - Notify KMSAN that a task has exited. + * @task: task about to finish. + */ +void kmsan_task_exit(struct task_struct *task); + /** * kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call. * @page: struct page pointer returned by alloc_pages(). @@ -139,6 +152,14 @@ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); #else +static inline void kmsan_task_create(struct task_struct *task) +{ +} + +static inline void kmsan_task_exit(struct task_struct *task) +{ +} + static inline int kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags) { diff --git a/kernel/exit.c b/kernel/exit.c index 98a33bd7c25c5..1899d73bdfb72 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include @@ -742,6 +743,7 @@ void __noreturn do_exit(long code) WARN_ON(tsk->plug); kcov_task_exit(tsk); + kmsan_task_exit(tsk); coredump_task_exit(tsk); ptrace_event(PTRACE_EVENT_EXIT, code); diff --git a/kernel/fork.c b/kernel/fork.c index 3d788f759e5f1..3c2f8601b2b82 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -1023,6 +1024,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->worker_private = NULL; kcov_task_init(tsk); + kmsan_task_create(tsk); kmap_local_fork(tsk); #ifdef CONFIG_FAULT_INJECTION diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index 5330138fda5bc..112dce135c7f6 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -37,6 +37,16 @@ bool kmsan_enabled __read_mostly; */ DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx); +void kmsan_internal_task_create(struct task_struct *task) +{ + struct kmsan_ctx *ctx = &task->kmsan_ctx; + struct thread_info *info = current_thread_info(); + + __memset(ctx, 0, sizeof(*ctx)); + ctx->allow_reporting = true; + kmsan_internal_unpoison_memory(info, sizeof(*info), false); +} + void kmsan_internal_poison_memory(void *address, size_t size, gfp_t flags, unsigned int poison_flags) { diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 000703c563a4d..6f3e64b0b61f8 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -27,6 +27,23 @@ * skipping effects of functions like memset() inside instrumented code. */ +void kmsan_task_create(struct task_struct *task) +{ + kmsan_enter_runtime(); + kmsan_internal_task_create(task); + kmsan_leave_runtime(); +} + +void kmsan_task_exit(struct task_struct *task) +{ + struct kmsan_ctx *ctx = &task->kmsan_ctx; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + ctx->allow_reporting = false; +} + void kmsan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) { if (unlikely(object == NULL)) diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h index 97d48b45dba58..77ee068c04ae9 100644 --- a/mm/kmsan/kmsan.h +++ b/mm/kmsan/kmsan.h @@ -180,6 +180,8 @@ void kmsan_internal_set_shadow_origin(void *address, size_t size, int b, u32 origin, bool checked); depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id); +void kmsan_internal_task_create(struct task_struct *task); + bool kmsan_metadata_is_contiguous(void *addr, size_t size); void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr, int reason); From b4e5e8c6b5dbd5fbfe0cdc08886951fe1011c267 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:51 +0200 Subject: [PATCH 220/352] init: kmsan: call KMSAN initialization routines kmsan_init_shadow() scans the mappings created at boot time and creates metadata pages for those mappings. When the memblock allocator returns pages to pagealloc, we reserve 2/3 of those pages and use them as metadata for the remaining 1/3. Once KMSAN starts, every page allocated by pagealloc has its associated shadow and origin pages. kmsan_initialize() initializes the bookkeeping for init_task and enables KMSAN. Link: https://lkml.kernel.org/r/20220915150417.722975-18-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 36 +++++++ init/main.c | 3 + mm/kmsan/Makefile | 3 +- mm/kmsan/init.c | 235 ++++++++++++++++++++++++++++++++++++++++++ mm/kmsan/kmsan.h | 3 + mm/kmsan/shadow.c | 34 ++++++ mm/page_alloc.c | 4 + 7 files changed, 317 insertions(+), 1 deletion(-) create mode 100644 mm/kmsan/init.c diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index 354aee6f7b1a2..e00de976ee438 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -31,6 +31,28 @@ void kmsan_task_create(struct task_struct *task); */ void kmsan_task_exit(struct task_struct *task); +/** + * kmsan_init_shadow() - Initialize KMSAN shadow at boot time. + * + * Allocate and initialize KMSAN metadata for early allocations. + */ +void __init kmsan_init_shadow(void); + +/** + * kmsan_init_runtime() - Initialize KMSAN state and enable KMSAN. + */ +void __init kmsan_init_runtime(void); + +/** + * kmsan_memblock_free_pages() - handle freeing of memblock pages. + * @page: struct page to free. + * @order: order of @page. + * + * Freed pages are either returned to buddy allocator or held back to be used + * as metadata pages. + */ +bool __init kmsan_memblock_free_pages(struct page *page, unsigned int order); + /** * kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call. * @page: struct page pointer returned by alloc_pages(). @@ -152,6 +174,20 @@ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); #else +static inline void kmsan_init_shadow(void) +{ +} + +static inline void kmsan_init_runtime(void) +{ +} + +static inline bool kmsan_memblock_free_pages(struct page *page, + unsigned int order) +{ + return true; +} + static inline void kmsan_task_create(struct task_struct *task) { } diff --git a/init/main.c b/init/main.c index eebe0cad4e378..93b000f2de8d7 100644 --- a/init/main.c +++ b/init/main.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -837,6 +838,7 @@ static void __init mm_init(void) init_mem_debugging_and_hardening(); kfence_alloc_pool(); report_meminit(); + kmsan_init_shadow(); stack_depot_early_init(); mem_init(); mem_init_print_info(); @@ -857,6 +859,7 @@ static void __init mm_init(void) init_espfix_bsp(); /* Should be run after espfix64 is set up. */ pti_init(); + kmsan_init_runtime(); } #ifdef CONFIG_RANDOMIZE_KSTACK_OFFSET diff --git a/mm/kmsan/Makefile b/mm/kmsan/Makefile index 550ad8625e4f9..401acb1a491ce 100644 --- a/mm/kmsan/Makefile +++ b/mm/kmsan/Makefile @@ -3,7 +3,7 @@ # Makefile for KernelMemorySanitizer (KMSAN). # # -obj-y := core.o instrumentation.o hooks.o report.o shadow.o +obj-y := core.o instrumentation.o init.o hooks.o report.o shadow.o KMSAN_SANITIZE := n KCOV_INSTRUMENT := n @@ -18,6 +18,7 @@ CFLAGS_REMOVE.o = $(CC_FLAGS_FTRACE) CFLAGS_core.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_hooks.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_init.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_instrumentation.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_report.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_shadow.o := $(CC_FLAGS_KMSAN_RUNTIME) diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c new file mode 100644 index 0000000000000..7fb794242fad0 --- /dev/null +++ b/mm/kmsan/init.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN initialization routines. + * + * Copyright (C) 2017-2021 Google LLC + * Author: Alexander Potapenko + * + */ + +#include "kmsan.h" + +#include +#include +#include + +#include "../internal.h" + +#define NUM_FUTURE_RANGES 128 +struct start_end_pair { + u64 start, end; +}; + +static struct start_end_pair start_end_pairs[NUM_FUTURE_RANGES] __initdata; +static int future_index __initdata; + +/* + * Record a range of memory for which the metadata pages will be created once + * the page allocator becomes available. + */ +static void __init kmsan_record_future_shadow_range(void *start, void *end) +{ + u64 nstart = (u64)start, nend = (u64)end, cstart, cend; + bool merged = false; + + KMSAN_WARN_ON(future_index == NUM_FUTURE_RANGES); + KMSAN_WARN_ON((nstart >= nend) || !nstart || !nend); + nstart = ALIGN_DOWN(nstart, PAGE_SIZE); + nend = ALIGN(nend, PAGE_SIZE); + + /* + * Scan the existing ranges to see if any of them overlaps with + * [start, end). In that case, merge the two ranges instead of + * creating a new one. + * The number of ranges is less than 20, so there is no need to organize + * them into a more intelligent data structure. + */ + for (int i = 0; i < future_index; i++) { + cstart = start_end_pairs[i].start; + cend = start_end_pairs[i].end; + if ((cstart < nstart && cend < nstart) || + (cstart > nend && cend > nend)) + /* ranges are disjoint - do not merge */ + continue; + start_end_pairs[i].start = min(nstart, cstart); + start_end_pairs[i].end = max(nend, cend); + merged = true; + break; + } + if (merged) + return; + start_end_pairs[future_index].start = nstart; + start_end_pairs[future_index].end = nend; + future_index++; +} + +/* + * Initialize the shadow for existing mappings during kernel initialization. + * These include kernel text/data sections, NODE_DATA and future ranges + * registered while creating other data (e.g. percpu). + * + * Allocations via memblock can be only done before slab is initialized. + */ +void __init kmsan_init_shadow(void) +{ + const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); + phys_addr_t p_start, p_end; + u64 loop; + int nid; + + for_each_reserved_mem_range(loop, &p_start, &p_end) + kmsan_record_future_shadow_range(phys_to_virt(p_start), + phys_to_virt(p_end)); + /* Allocate shadow for .data */ + kmsan_record_future_shadow_range(_sdata, _edata); + + for_each_online_node(nid) + kmsan_record_future_shadow_range( + NODE_DATA(nid), (char *)NODE_DATA(nid) + nd_size); + + for (int i = 0; i < future_index; i++) + kmsan_init_alloc_meta_for_range( + (void *)start_end_pairs[i].start, + (void *)start_end_pairs[i].end); +} + +struct metadata_page_pair { + struct page *shadow, *origin; +}; +static struct metadata_page_pair held_back[MAX_ORDER] __initdata; + +/* + * Eager metadata allocation. When the memblock allocator is freeing pages to + * pagealloc, we use 2/3 of them as metadata for the remaining 1/3. + * We store the pointers to the returned blocks of pages in held_back[] grouped + * by their order: when kmsan_memblock_free_pages() is called for the first + * time with a certain order, it is reserved as a shadow block, for the second + * time - as an origin block. On the third time the incoming block receives its + * shadow and origin ranges from the previously saved shadow and origin blocks, + * after which held_back[order] can be used again. + * + * At the very end there may be leftover blocks in held_back[]. They are + * collected later by kmsan_memblock_discard(). + */ +bool kmsan_memblock_free_pages(struct page *page, unsigned int order) +{ + struct page *shadow, *origin; + + if (!held_back[order].shadow) { + held_back[order].shadow = page; + return false; + } + if (!held_back[order].origin) { + held_back[order].origin = page; + return false; + } + shadow = held_back[order].shadow; + origin = held_back[order].origin; + kmsan_setup_meta(page, shadow, origin, order); + + held_back[order].shadow = NULL; + held_back[order].origin = NULL; + return true; +} + +#define MAX_BLOCKS 8 +struct smallstack { + struct page *items[MAX_BLOCKS]; + int index; + int order; +}; + +static struct smallstack collect = { + .index = 0, + .order = MAX_ORDER, +}; + +static void smallstack_push(struct smallstack *stack, struct page *pages) +{ + KMSAN_WARN_ON(stack->index == MAX_BLOCKS); + stack->items[stack->index] = pages; + stack->index++; +} +#undef MAX_BLOCKS + +static struct page *smallstack_pop(struct smallstack *stack) +{ + struct page *ret; + + KMSAN_WARN_ON(stack->index == 0); + stack->index--; + ret = stack->items[stack->index]; + stack->items[stack->index] = NULL; + return ret; +} + +static void do_collection(void) +{ + struct page *page, *shadow, *origin; + + while (collect.index >= 3) { + page = smallstack_pop(&collect); + shadow = smallstack_pop(&collect); + origin = smallstack_pop(&collect); + kmsan_setup_meta(page, shadow, origin, collect.order); + __free_pages_core(page, collect.order); + } +} + +static void collect_split(void) +{ + struct smallstack tmp = { + .order = collect.order - 1, + .index = 0, + }; + struct page *page; + + if (!collect.order) + return; + while (collect.index) { + page = smallstack_pop(&collect); + smallstack_push(&tmp, &page[0]); + smallstack_push(&tmp, &page[1 << tmp.order]); + } + __memcpy(&collect, &tmp, sizeof(tmp)); +} + +/* + * Memblock is about to go away. Split the page blocks left over in held_back[] + * and return 1/3 of that memory to the system. + */ +static void kmsan_memblock_discard(void) +{ + /* + * For each order=N: + * - push held_back[N].shadow and .origin to @collect; + * - while there are >= 3 elements in @collect, do garbage collection: + * - pop 3 ranges from @collect; + * - use two of them as shadow and origin for the third one; + * - repeat; + * - split each remaining element from @collect into 2 ranges of + * order=N-1, + * - repeat. + */ + collect.order = MAX_ORDER - 1; + for (int i = MAX_ORDER - 1; i >= 0; i--) { + if (held_back[i].shadow) + smallstack_push(&collect, held_back[i].shadow); + if (held_back[i].origin) + smallstack_push(&collect, held_back[i].origin); + held_back[i].shadow = NULL; + held_back[i].origin = NULL; + do_collection(); + collect_split(); + } +} + +void __init kmsan_init_runtime(void) +{ + /* Assuming current is init_task */ + kmsan_internal_task_create(current); + kmsan_memblock_discard(); + pr_info("Starting KernelMemorySanitizer\n"); + pr_info("ATTENTION: KMSAN is a debugging tool! Do not use it on production machines!\n"); + kmsan_enabled = true; +} diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h index 77ee068c04ae9..7019c46d33a74 100644 --- a/mm/kmsan/kmsan.h +++ b/mm/kmsan/kmsan.h @@ -67,6 +67,7 @@ struct shadow_origin_ptr { struct shadow_origin_ptr kmsan_get_shadow_origin_ptr(void *addr, u64 size, bool store); void *kmsan_get_metadata(void *addr, bool is_origin); +void __init kmsan_init_alloc_meta_for_range(void *start, void *end); enum kmsan_bug_reason { REASON_ANY, @@ -187,6 +188,8 @@ void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr, int reason); struct page *kmsan_vmalloc_to_page_or_null(void *vaddr); +void kmsan_setup_meta(struct page *page, struct page *shadow, + struct page *origin, int order); /* * kmsan_internal_is_module_addr() and kmsan_internal_is_vmalloc_addr() are diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 8c81a059beea6..6e90a806a7045 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -258,3 +258,37 @@ void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, kfree(s_pages); kfree(o_pages); } + +/* Allocate metadata for pages allocated at boot time. */ +void __init kmsan_init_alloc_meta_for_range(void *start, void *end) +{ + struct page *shadow_p, *origin_p; + void *shadow, *origin; + struct page *page; + u64 size; + + start = (void *)ALIGN_DOWN((u64)start, PAGE_SIZE); + size = ALIGN((u64)end - (u64)start, PAGE_SIZE); + shadow = memblock_alloc(size, PAGE_SIZE); + origin = memblock_alloc(size, PAGE_SIZE); + for (u64 addr = 0; addr < size; addr += PAGE_SIZE) { + page = virt_to_page_or_null((char *)start + addr); + shadow_p = virt_to_page_or_null((char *)shadow + addr); + set_no_shadow_origin_page(shadow_p); + shadow_page_for(page) = shadow_p; + origin_p = virt_to_page_or_null((char *)origin + addr); + set_no_shadow_origin_page(origin_p); + origin_page_for(page) = origin_p; + } +} + +void kmsan_setup_meta(struct page *page, struct page *shadow, + struct page *origin, int order) +{ + for (int i = 0; i < (1 << order); i++) { + set_no_shadow_origin_page(&shadow[i]); + set_no_shadow_origin_page(&origin[i]); + shadow_page_for(&page[i]) = &shadow[i]; + origin_page_for(&page[i]) = &origin[i]; + } +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 354077c8f4429..62d88813f3cfb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1809,6 +1809,10 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, { if (early_page_uninitialised(pfn)) return; + if (!kmsan_memblock_free_pages(page, order)) { + /* KMSAN will take care of these pages. */ + return; + } __free_pages_core(page, order); } From 0afd3a73cdc8a5ff0e8d8c11dcca49fb1986c34f Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:52 +0200 Subject: [PATCH 221/352] instrumented.h: add KMSAN support To avoid false positives, KMSAN needs to unpoison the data copied from the userspace. To detect infoleaks - check the memory buffer passed to copy_to_user(). Link: https://lkml.kernel.org/r/20220915150417.722975-19-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/instrumented.h | 18 ++++++++++++----- include/linux/kmsan-checks.h | 19 ++++++++++++++++++ mm/kmsan/hooks.c | 38 ++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+), 5 deletions(-) diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index 9f1dba8f717b0..501fa84867494 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -2,7 +2,7 @@ /* * This header provides generic wrappers for memory access instrumentation that - * the compiler cannot emit for: KASAN, KCSAN. + * the compiler cannot emit for: KASAN, KCSAN, KMSAN. */ #ifndef _LINUX_INSTRUMENTED_H #define _LINUX_INSTRUMENTED_H @@ -10,6 +10,7 @@ #include #include #include +#include #include /** @@ -117,6 +118,7 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n) { kasan_check_read(from, n); kcsan_check_read(from, n); + kmsan_copy_to_user(to, from, n, 0); } /** @@ -151,6 +153,7 @@ static __always_inline void instrument_copy_from_user_after(const void *to, const void __user *from, unsigned long n, unsigned long left) { + kmsan_unpoison_memory(to, n - left); } /** @@ -162,10 +165,14 @@ instrument_copy_from_user_after(const void *to, const void __user *from, * * @to destination variable, may not be address-taken */ -#define instrument_get_user(to) \ -({ \ +#define instrument_get_user(to) \ +({ \ + u64 __tmp = (u64)(to); \ + kmsan_unpoison_memory(&__tmp, sizeof(__tmp)); \ + to = __tmp; \ }) + /** * instrument_put_user() - add instrumentation to put_user()-like macros * @@ -177,8 +184,9 @@ instrument_copy_from_user_after(const void *to, const void __user *from, * @ptr userspace pointer to copy to * @size number of bytes to copy */ -#define instrument_put_user(from, ptr, size) \ -({ \ +#define instrument_put_user(from, ptr, size) \ +({ \ + kmsan_copy_to_user(ptr, &from, sizeof(from), 0); \ }) #endif /* _LINUX_INSTRUMENTED_H */ diff --git a/include/linux/kmsan-checks.h b/include/linux/kmsan-checks.h index a6522a0c28df9..c4cae333deec5 100644 --- a/include/linux/kmsan-checks.h +++ b/include/linux/kmsan-checks.h @@ -46,6 +46,21 @@ void kmsan_unpoison_memory(const void *address, size_t size); */ void kmsan_check_memory(const void *address, size_t size); +/** + * kmsan_copy_to_user() - Notify KMSAN about a data transfer to userspace. + * @to: destination address in the userspace. + * @from: source address in the kernel. + * @to_copy: number of bytes to copy. + * @left: number of bytes not copied. + * + * If this is a real userspace data transfer, KMSAN checks the bytes that were + * actually copied to ensure there was no information leak. If @to belongs to + * the kernel space (which is possible for compat syscalls), KMSAN just copies + * the metadata. + */ +void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, + size_t left); + #else static inline void kmsan_poison_memory(const void *address, size_t size, @@ -58,6 +73,10 @@ static inline void kmsan_unpoison_memory(const void *address, size_t size) static inline void kmsan_check_memory(const void *address, size_t size) { } +static inline void kmsan_copy_to_user(void __user *to, const void *from, + size_t to_copy, size_t left) +{ +} #endif diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 6f3e64b0b61f8..5c0eb25d984d7 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -205,6 +205,44 @@ void kmsan_iounmap_page_range(unsigned long start, unsigned long end) kmsan_leave_runtime(); } +void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, + size_t left) +{ + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + /* + * At this point we've copied the memory already. It's hard to check it + * before copying, as the size of actually copied buffer is unknown. + */ + + /* copy_to_user() may copy zero bytes. No need to check. */ + if (!to_copy) + return; + /* Or maybe copy_to_user() failed to copy anything. */ + if (to_copy <= left) + return; + + ua_flags = user_access_save(); + if ((u64)to < TASK_SIZE) { + /* This is a user memory access, check it. */ + kmsan_internal_check_memory((void *)from, to_copy - left, to, + REASON_COPY_TO_USER); + } else { + /* Otherwise this is a kernel memory access. This happens when a + * compat syscall passes an argument allocated on the kernel + * stack to a real syscall. + * Don't check anything, just copy the shadow of the copied + * bytes. + */ + kmsan_internal_memmove_metadata((void *)to, (void *)from, + to_copy - left); + } + user_access_restore(ua_flags); +} +EXPORT_SYMBOL(kmsan_copy_to_user); + /* Functions from kmsan-checks.h follow. */ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) { From fcf187766e525693481f80995efd984cca3e411a Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:53 +0200 Subject: [PATCH 222/352] kmsan: add iomap support Functions from lib/iomap.c interact with hardware, so KMSAN must ensure that: - every read function returns an initialized value - every write function checks values before sending them to hardware. Link: https://lkml.kernel.org/r/20220915150417.722975-20-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/iomap.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/lib/iomap.c b/lib/iomap.c index fbaa3e8f19d6c..4f8b31baa5752 100644 --- a/lib/iomap.c +++ b/lib/iomap.c @@ -6,6 +6,7 @@ */ #include #include +#include #include @@ -70,26 +71,35 @@ static void bad_io_access(unsigned long port, const char *access) #define mmio_read64be(addr) swab64(readq(addr)) #endif +/* + * Here and below, we apply __no_kmsan_checks to functions reading data from + * hardware, to ensure that KMSAN marks their return values as initialized. + */ +__no_kmsan_checks unsigned int ioread8(const void __iomem *addr) { IO_COND(addr, return inb(port), return readb(addr)); return 0xff; } +__no_kmsan_checks unsigned int ioread16(const void __iomem *addr) { IO_COND(addr, return inw(port), return readw(addr)); return 0xffff; } +__no_kmsan_checks unsigned int ioread16be(const void __iomem *addr) { IO_COND(addr, return pio_read16be(port), return mmio_read16be(addr)); return 0xffff; } +__no_kmsan_checks unsigned int ioread32(const void __iomem *addr) { IO_COND(addr, return inl(port), return readl(addr)); return 0xffffffff; } +__no_kmsan_checks unsigned int ioread32be(const void __iomem *addr) { IO_COND(addr, return pio_read32be(port), return mmio_read32be(addr)); @@ -142,18 +152,21 @@ static u64 pio_read64be_hi_lo(unsigned long port) return lo | (hi << 32); } +__no_kmsan_checks u64 ioread64_lo_hi(const void __iomem *addr) { IO_COND(addr, return pio_read64_lo_hi(port), return readq(addr)); return 0xffffffffffffffffULL; } +__no_kmsan_checks u64 ioread64_hi_lo(const void __iomem *addr) { IO_COND(addr, return pio_read64_hi_lo(port), return readq(addr)); return 0xffffffffffffffffULL; } +__no_kmsan_checks u64 ioread64be_lo_hi(const void __iomem *addr) { IO_COND(addr, return pio_read64be_lo_hi(port), @@ -161,6 +174,7 @@ u64 ioread64be_lo_hi(const void __iomem *addr) return 0xffffffffffffffffULL; } +__no_kmsan_checks u64 ioread64be_hi_lo(const void __iomem *addr) { IO_COND(addr, return pio_read64be_hi_lo(port), @@ -188,22 +202,32 @@ EXPORT_SYMBOL(ioread64be_hi_lo); void iowrite8(u8 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, outb(val,port), writeb(val, addr)); } void iowrite16(u16 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, outw(val,port), writew(val, addr)); } void iowrite16be(u16 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write16be(val,port), mmio_write16be(val, addr)); } void iowrite32(u32 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, outl(val,port), writel(val, addr)); } void iowrite32be(u32 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write32be(val,port), mmio_write32be(val, addr)); } EXPORT_SYMBOL(iowrite8); @@ -239,24 +263,32 @@ static void pio_write64be_hi_lo(u64 val, unsigned long port) void iowrite64_lo_hi(u64 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write64_lo_hi(val, port), writeq(val, addr)); } void iowrite64_hi_lo(u64 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write64_hi_lo(val, port), writeq(val, addr)); } void iowrite64be_lo_hi(u64 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write64be_lo_hi(val, port), mmio_write64be(val, addr)); } void iowrite64be_hi_lo(u64 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write64be_hi_lo(val, port), mmio_write64be(val, addr)); } @@ -328,14 +360,20 @@ static inline void mmio_outsl(void __iomem *addr, const u32 *src, int count) void ioread8_rep(const void __iomem *addr, void *dst, unsigned long count) { IO_COND(addr, insb(port,dst,count), mmio_insb(addr, dst, count)); + /* KMSAN must treat values read from devices as initialized. */ + kmsan_unpoison_memory(dst, count); } void ioread16_rep(const void __iomem *addr, void *dst, unsigned long count) { IO_COND(addr, insw(port,dst,count), mmio_insw(addr, dst, count)); + /* KMSAN must treat values read from devices as initialized. */ + kmsan_unpoison_memory(dst, count * 2); } void ioread32_rep(const void __iomem *addr, void *dst, unsigned long count) { IO_COND(addr, insl(port,dst,count), mmio_insl(addr, dst, count)); + /* KMSAN must treat values read from devices as initialized. */ + kmsan_unpoison_memory(dst, count * 4); } EXPORT_SYMBOL(ioread8_rep); EXPORT_SYMBOL(ioread16_rep); @@ -343,14 +381,20 @@ EXPORT_SYMBOL(ioread32_rep); void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(src, count); IO_COND(addr, outsb(port, src, count), mmio_outsb(addr, src, count)); } void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(src, count * 2); IO_COND(addr, outsw(port, src, count), mmio_outsw(addr, src, count)); } void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(src, count * 4); IO_COND(addr, outsl(port, src,count), mmio_outsl(addr, src, count)); } EXPORT_SYMBOL(iowrite8_rep); From a23e20682504c70be8752d5a36bdecee4dcfea6d Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:54 +0200 Subject: [PATCH 223/352] input: libps2: mark data received in __ps2_command() as initialized KMSAN does not know that the device initializes certain bytes in ps2dev->cmdbuf. Call kmsan_unpoison_memory() to explicitly mark them as initialized. Link: https://lkml.kernel.org/r/20220915150417.722975-21-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/input/serio/libps2.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/input/serio/libps2.c b/drivers/input/serio/libps2.c index 250e213cc80c6..3e19344eda93c 100644 --- a/drivers/input/serio/libps2.c +++ b/drivers/input/serio/libps2.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -294,9 +295,11 @@ int __ps2_command(struct ps2dev *ps2dev, u8 *param, unsigned int command) serio_pause_rx(ps2dev->serio); - if (param) + if (param) { for (i = 0; i < receive; i++) param[i] = ps2dev->cmdbuf[(receive - 1) - i]; + kmsan_unpoison_memory(param, receive); + } if (ps2dev->cmdcnt && (command != PS2_CMD_RESET_BAT || ps2dev->cmdcnt != 1)) { From 4681650684f4b643c96ca95c195498948b595116 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:55 +0200 Subject: [PATCH 224/352] dma: kmsan: unpoison DMA mappings KMSAN doesn't know about DMA memory writes performed by devices. We unpoison such memory when it's mapped to avoid false positive reports. Link: https://lkml.kernel.org/r/20220915150417.722975-22-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 41 ++++++++++++++++++++++++++++++ kernel/dma/mapping.c | 10 +++++--- mm/kmsan/hooks.c | 59 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 3 deletions(-) diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index e00de976ee438..dac296da45c55 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -9,6 +9,7 @@ #ifndef _LINUX_KMSAN_H #define _LINUX_KMSAN_H +#include #include #include #include @@ -16,6 +17,7 @@ struct page; struct kmem_cache; struct task_struct; +struct scatterlist; #ifdef CONFIG_KMSAN @@ -172,6 +174,35 @@ void kmsan_ioremap_page_range(unsigned long addr, unsigned long end, */ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); +/** + * kmsan_handle_dma() - Handle a DMA data transfer. + * @page: first page of the buffer. + * @offset: offset of the buffer within the first page. + * @size: buffer size. + * @dir: one of possible dma_data_direction values. + * + * Depending on @direction, KMSAN: + * * checks the buffer, if it is copied to device; + * * initializes the buffer, if it is copied from device; + * * does both, if this is a DMA_BIDIRECTIONAL transfer. + */ +void kmsan_handle_dma(struct page *page, size_t offset, size_t size, + enum dma_data_direction dir); + +/** + * kmsan_handle_dma_sg() - Handle a DMA transfer using scatterlist. + * @sg: scatterlist holding DMA buffers. + * @nents: number of scatterlist entries. + * @dir: one of possible dma_data_direction values. + * + * Depending on @direction, KMSAN: + * * checks the buffers in the scatterlist, if they are copied to device; + * * initializes the buffers, if they are copied from device; + * * does both, if this is a DMA_BIDIRECTIONAL transfer. + */ +void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, + enum dma_data_direction dir); + #else static inline void kmsan_init_shadow(void) @@ -254,6 +285,16 @@ static inline void kmsan_iounmap_page_range(unsigned long start, { } +static inline void kmsan_handle_dma(struct page *page, size_t offset, + size_t size, enum dma_data_direction dir) +{ +} + +static inline void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, + enum dma_data_direction dir) +{ +} + #endif #endif /* _LINUX_KMSAN_H */ diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 49cbf3e33de71..a8400aa9bcd4e 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -156,6 +157,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else addr = ops->map_page(dev, page, offset, size, dir, attrs); + kmsan_handle_dma(page, offset, size, dir); debug_dma_map_page(dev, page, offset, size, dir, addr, attrs); return addr; @@ -194,11 +196,13 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, else ents = ops->map_sg(dev, sg, nents, dir, attrs); - if (ents > 0) + if (ents > 0) { + kmsan_handle_dma_sg(sg, nents, dir); debug_dma_map_sg(dev, sg, nents, ents, dir, attrs); - else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && - ents != -EIO && ents != -EREMOTEIO)) + } else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && + ents != -EIO && ents != -EREMOTEIO)) { return -EIO; + } return ents; } diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 5c0eb25d984d7..563c09443a37a 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -10,10 +10,12 @@ */ #include +#include #include #include #include #include +#include #include #include @@ -243,6 +245,63 @@ void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, } EXPORT_SYMBOL(kmsan_copy_to_user); +static void kmsan_handle_dma_page(const void *addr, size_t size, + enum dma_data_direction dir) +{ + switch (dir) { + case DMA_BIDIRECTIONAL: + kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0, + REASON_ANY); + kmsan_internal_unpoison_memory((void *)addr, size, + /*checked*/ false); + break; + case DMA_TO_DEVICE: + kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0, + REASON_ANY); + break; + case DMA_FROM_DEVICE: + kmsan_internal_unpoison_memory((void *)addr, size, + /*checked*/ false); + break; + case DMA_NONE: + break; + } +} + +/* Helper function to handle DMA data transfers. */ +void kmsan_handle_dma(struct page *page, size_t offset, size_t size, + enum dma_data_direction dir) +{ + u64 page_offset, to_go, addr; + + if (PageHighMem(page)) + return; + addr = (u64)page_address(page) + offset; + /* + * The kernel may occasionally give us adjacent DMA pages not belonging + * to the same allocation. Process them separately to avoid triggering + * internal KMSAN checks. + */ + while (size > 0) { + page_offset = addr % PAGE_SIZE; + to_go = min(PAGE_SIZE - page_offset, (u64)size); + kmsan_handle_dma_page((void *)addr, to_go, dir); + addr += to_go; + size -= to_go; + } +} + +void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, + enum dma_data_direction dir) +{ + struct scatterlist *item; + int i; + + for_each_sg(sg, item, nents, i) + kmsan_handle_dma(sg_page(item), item->offset, item->length, + dir); +} + /* Functions from kmsan-checks.h follow. */ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) { From c00946dca5742c330e9e0e2cc9309bff203672e2 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:56 +0200 Subject: [PATCH 225/352] virtio: kmsan: check/unpoison scatterlist in vring_map_one_sg() If vring doesn't use the DMA API, KMSAN is unable to tell whether the memory is initialized by hardware. Explicitly call kmsan_handle_dma() from vring_map_one_sg() in this case to prevent false positives. Link: https://lkml.kernel.org/r/20220915150417.722975-23-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Michael S. Tsirkin Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/virtio/virtio_ring.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 4620e9d79dde8..8974c34b40fda 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -352,8 +353,15 @@ static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg, enum dma_data_direction direction) { - if (!vq->use_dma_api) + if (!vq->use_dma_api) { + /* + * If DMA is not used, KMSAN doesn't know that the scatterlist + * is initialized by the hardware. Explicitly check/unpoison it + * depending on the direction. + */ + kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, direction); return (dma_addr_t)sg_phys(sg); + } /* * We can't use dma_map_sg, because we don't use scatterlists in From 4b589aa601dd1435ea373b4372fef3b57555d414 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:57 +0200 Subject: [PATCH 226/352] kmsan: handle memory sent to/from USB Depending on the value of is_out kmsan_handle_urb() KMSAN either marks the data copied to the kernel from a USB device as initialized, or checks the data sent to the device for being initialized. Link: https://lkml.kernel.org/r/20220915150417.722975-24-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/usb/core/urb.c | 2 ++ include/linux/kmsan.h | 15 +++++++++++++++ mm/kmsan/hooks.c | 16 ++++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/drivers/usb/core/urb.c b/drivers/usb/core/urb.c index 33d62d7e3929f..9f3c54032556e 100644 --- a/drivers/usb/core/urb.c +++ b/drivers/usb/core/urb.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -426,6 +427,7 @@ int usb_submit_urb(struct urb *urb, gfp_t mem_flags) URB_SETUP_MAP_SINGLE | URB_SETUP_MAP_LOCAL | URB_DMA_SG_COMBINED); urb->transfer_flags |= (is_out ? URB_DIR_OUT : URB_DIR_IN); + kmsan_handle_urb(urb, is_out); if (xfertype != USB_ENDPOINT_XFER_CONTROL && dev->state < USB_STATE_CONFIGURED) diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index dac296da45c55..c473e0e21683c 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -18,6 +18,7 @@ struct page; struct kmem_cache; struct task_struct; struct scatterlist; +struct urb; #ifdef CONFIG_KMSAN @@ -203,6 +204,16 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size, void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, enum dma_data_direction dir); +/** + * kmsan_handle_urb() - Handle a USB data transfer. + * @urb: struct urb pointer. + * @is_out: data transfer direction (true means output to hardware). + * + * If @is_out is true, KMSAN checks the transfer buffer of @urb. Otherwise, + * KMSAN initializes the transfer buffer. + */ +void kmsan_handle_urb(const struct urb *urb, bool is_out); + #else static inline void kmsan_init_shadow(void) @@ -295,6 +306,10 @@ static inline void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, { } +static inline void kmsan_handle_urb(const struct urb *urb, bool is_out) +{ +} + #endif #endif /* _LINUX_KMSAN_H */ diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 563c09443a37a..79d7e73e2cfd8 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "../internal.h" #include "../slab.h" @@ -245,6 +246,21 @@ void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, } EXPORT_SYMBOL(kmsan_copy_to_user); +/* Helper function to check an URB. */ +void kmsan_handle_urb(const struct urb *urb, bool is_out) +{ + if (!urb) + return; + if (is_out) + kmsan_internal_check_memory(urb->transfer_buffer, + urb->transfer_buffer_length, + /*user_addr*/ 0, REASON_SUBMIT_URB); + else + kmsan_internal_unpoison_memory(urb->transfer_buffer, + urb->transfer_buffer_length, + /*checked*/ false); +} + static void kmsan_handle_dma_page(const void *addr, size_t size, enum dma_data_direction dir) { From 00e49eb5e34dcd04c1d5cb731216794e46bf5fdf Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:58 +0200 Subject: [PATCH 227/352] kmsan: add tests for KMSAN The testing module triggers KMSAN warnings in different cases and checks that the errors are properly reported, using console probes to capture the tool's output. Link: https://lkml.kernel.org/r/20220915150417.722975-25-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/Kconfig.kmsan | 12 + mm/kmsan/Makefile | 4 + mm/kmsan/kmsan_test.c | 581 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 597 insertions(+) create mode 100644 mm/kmsan/kmsan_test.c diff --git a/lib/Kconfig.kmsan b/lib/Kconfig.kmsan index 5b19dbd34d76e..b2489dd6503fa 100644 --- a/lib/Kconfig.kmsan +++ b/lib/Kconfig.kmsan @@ -47,4 +47,16 @@ config KMSAN_CHECK_PARAM_RETVAL may potentially report errors in corner cases when non-instrumented functions call instrumented ones. +config KMSAN_KUNIT_TEST + tristate "KMSAN integration test suite" if !KUNIT_ALL_TESTS + default KUNIT_ALL_TESTS + depends on TRACEPOINTS && KUNIT + help + Test suite for KMSAN, testing various error detection scenarios, + and checking that reports are correctly output to console. + + Say Y here if you want the test to be built into the kernel and run + during boot; say M if you want the test to build as a module; say N + if you are unsure. + endif diff --git a/mm/kmsan/Makefile b/mm/kmsan/Makefile index 401acb1a491ce..98eab2856626f 100644 --- a/mm/kmsan/Makefile +++ b/mm/kmsan/Makefile @@ -22,3 +22,7 @@ CFLAGS_init.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_instrumentation.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_report.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_shadow.o := $(CC_FLAGS_KMSAN_RUNTIME) + +obj-$(CONFIG_KMSAN_KUNIT_TEST) += kmsan_test.o +KMSAN_SANITIZE_kmsan_test.o := y +CFLAGS_kmsan_test.o += $(call cc-disable-warning, uninitialized) diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c new file mode 100644 index 0000000000000..9a29ea2dbfb9b --- /dev/null +++ b/mm/kmsan/kmsan_test.c @@ -0,0 +1,581 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test cases for KMSAN. + * For each test case checks the presence (or absence) of generated reports. + * Relies on 'console' tracepoint to capture reports as they appear in the + * kernel log. + * + * Copyright (C) 2021-2022, Google LLC. + * Author: Alexander Potapenko + * + */ + +#include +#include "kmsan.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_PER_CPU(int, per_cpu_var); + +/* Report as observed from console. */ +static struct { + spinlock_t lock; + bool available; + bool ignore; /* Stop console output collection. */ + char header[256]; +} observed = { + .lock = __SPIN_LOCK_UNLOCKED(observed.lock), +}; + +/* Probe for console output: obtains observed lines of interest. */ +static void probe_console(void *ignore, const char *buf, size_t len) +{ + unsigned long flags; + + if (observed.ignore) + return; + spin_lock_irqsave(&observed.lock, flags); + + if (strnstr(buf, "BUG: KMSAN: ", len)) { + /* + * KMSAN report and related to the test. + * + * The provided @buf is not NUL-terminated; copy no more than + * @len bytes and let strscpy() add the missing NUL-terminator. + */ + strscpy(observed.header, buf, + min(len + 1, sizeof(observed.header))); + WRITE_ONCE(observed.available, true); + observed.ignore = true; + } + spin_unlock_irqrestore(&observed.lock, flags); +} + +/* Check if a report related to the test exists. */ +static bool report_available(void) +{ + return READ_ONCE(observed.available); +} + +/* Information we expect in a report. */ +struct expect_report { + const char *error_type; /* Error type. */ + /* + * Kernel symbol from the error header, or NULL if no report is + * expected. + */ + const char *symbol; +}; + +/* Check observed report matches information in @r. */ +static bool report_matches(const struct expect_report *r) +{ + typeof(observed.header) expected_header; + unsigned long flags; + bool ret = false; + const char *end; + char *cur; + + /* Doubled-checked locking. */ + if (!report_available() || !r->symbol) + return (!report_available() && !r->symbol); + + /* Generate expected report contents. */ + + /* Title */ + cur = expected_header; + end = &expected_header[sizeof(expected_header) - 1]; + + cur += scnprintf(cur, end - cur, "BUG: KMSAN: %s", r->error_type); + + scnprintf(cur, end - cur, " in %s", r->symbol); + /* The exact offset won't match, remove it; also strip module name. */ + cur = strchr(expected_header, '+'); + if (cur) + *cur = '\0'; + + spin_lock_irqsave(&observed.lock, flags); + if (!report_available()) + goto out; /* A new report is being captured. */ + + /* Finally match expected output to what we actually observed. */ + ret = strstr(observed.header, expected_header); +out: + spin_unlock_irqrestore(&observed.lock, flags); + + return ret; +} + +/* ===== Test cases ===== */ + +/* Prevent replacing branch with select in LLVM. */ +static noinline void check_true(char *arg) +{ + pr_info("%s is true\n", arg); +} + +static noinline void check_false(char *arg) +{ + pr_info("%s is false\n", arg); +} + +#define USE(x) \ + do { \ + if (x) \ + check_true(#x); \ + else \ + check_false(#x); \ + } while (0) + +#define EXPECTATION_ETYPE_FN(e, reason, fn) \ + struct expect_report e = { \ + .error_type = reason, \ + .symbol = fn, \ + } + +#define EXPECTATION_NO_REPORT(e) EXPECTATION_ETYPE_FN(e, NULL, NULL) +#define EXPECTATION_UNINIT_VALUE_FN(e, fn) \ + EXPECTATION_ETYPE_FN(e, "uninit-value", fn) +#define EXPECTATION_UNINIT_VALUE(e) EXPECTATION_UNINIT_VALUE_FN(e, __func__) +#define EXPECTATION_USE_AFTER_FREE(e) \ + EXPECTATION_ETYPE_FN(e, "use-after-free", __func__) + +/* Test case: ensure that kmalloc() returns uninitialized memory. */ +static void test_uninit_kmalloc(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + int *ptr; + + kunit_info(test, "uninitialized kmalloc test (UMR report)\n"); + ptr = kmalloc(sizeof(*ptr), GFP_KERNEL); + USE(*ptr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that kmalloc'ed memory becomes initialized after memset(). + */ +static void test_init_kmalloc(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + int *ptr; + + kunit_info(test, "initialized kmalloc test (no reports)\n"); + ptr = kmalloc(sizeof(*ptr), GFP_KERNEL); + memset(ptr, 0, sizeof(*ptr)); + USE(*ptr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that kzalloc() returns initialized memory. */ +static void test_init_kzalloc(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + int *ptr; + + kunit_info(test, "initialized kzalloc test (no reports)\n"); + ptr = kzalloc(sizeof(*ptr), GFP_KERNEL); + USE(*ptr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that local variables are uninitialized by default. */ +static void test_uninit_stack_var(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + volatile int cond; + + kunit_info(test, "uninitialized stack variable (UMR report)\n"); + USE(cond); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that local variables with initializers are initialized. */ +static void test_init_stack_var(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + volatile int cond = 1; + + kunit_info(test, "initialized stack variable (no reports)\n"); + USE(cond); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static noinline void two_param_fn_2(int arg1, int arg2) +{ + USE(arg1); + USE(arg2); +} + +static noinline void one_param_fn(int arg) +{ + two_param_fn_2(arg, arg); + USE(arg); +} + +static noinline void two_param_fn(int arg1, int arg2) +{ + int init = 0; + + one_param_fn(init); + USE(arg1); + USE(arg2); +} + +static void test_params(struct kunit *test) +{ +#ifdef CONFIG_KMSAN_CHECK_PARAM_RETVAL + /* + * With eager param/retval checking enabled, KMSAN will report an error + * before the call to two_param_fn(). + */ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_params"); +#else + EXPECTATION_UNINIT_VALUE_FN(expect, "two_param_fn"); +#endif + volatile int uninit, init = 1; + + kunit_info(test, + "uninit passed through a function parameter (UMR report)\n"); + two_param_fn(uninit, init); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static int signed_sum3(int a, int b, int c) +{ + return a + b + c; +} + +/* + * Test case: ensure that uninitialized values are tracked through function + * arguments. + */ +static void test_uninit_multiple_params(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + volatile char b = 3, c; + volatile int a; + + kunit_info(test, "uninitialized local passed to fn (UMR report)\n"); + USE(signed_sum3(a, b, c)); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Helper function to make an array uninitialized. */ +static noinline void do_uninit_local_array(char *array, int start, int stop) +{ + volatile char uninit; + + for (int i = start; i < stop; i++) + array[i] = uninit; +} + +/* + * Test case: ensure kmsan_check_memory() reports an error when checking + * uninitialized memory. + */ +static void test_uninit_kmsan_check_memory(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_uninit_kmsan_check_memory"); + volatile char local_array[8]; + + kunit_info( + test, + "kmsan_check_memory() called on uninit local (UMR report)\n"); + do_uninit_local_array((char *)local_array, 5, 7); + + kmsan_check_memory((char *)local_array, 8); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: check that a virtual memory range created with vmap() from + * initialized pages is still considered as initialized. + */ +static void test_init_kmsan_vmap_vunmap(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + const int npages = 2; + struct page **pages; + void *vbuf; + + kunit_info(test, "pages initialized via vmap (no reports)\n"); + + pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL); + for (int i = 0; i < npages; i++) + pages[i] = alloc_page(GFP_KERNEL); + vbuf = vmap(pages, npages, VM_MAP, PAGE_KERNEL); + memset(vbuf, 0xfe, npages * PAGE_SIZE); + for (int i = 0; i < npages; i++) + kmsan_check_memory(page_address(pages[i]), PAGE_SIZE); + + if (vbuf) + vunmap(vbuf); + for (int i = 0; i < npages; i++) { + if (pages[i]) + __free_page(pages[i]); + } + kfree(pages); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that memset() can initialize a buffer allocated via + * vmalloc(). + */ +static void test_init_vmalloc(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + int npages = 8; + char *buf; + + kunit_info(test, "vmalloc buffer can be initialized (no reports)\n"); + buf = vmalloc(PAGE_SIZE * npages); + buf[0] = 1; + memset(buf, 0xfe, PAGE_SIZE * npages); + USE(buf[0]); + for (int i = 0; i < npages; i++) + kmsan_check_memory(&buf[PAGE_SIZE * i], PAGE_SIZE); + vfree(buf); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that use-after-free reporting works. */ +static void test_uaf(struct kunit *test) +{ + EXPECTATION_USE_AFTER_FREE(expect); + volatile int value; + volatile int *var; + + kunit_info(test, "use-after-free in kmalloc-ed buffer (UMR report)\n"); + var = kmalloc(80, GFP_KERNEL); + var[3] = 0xfeedface; + kfree((int *)var); + /* Copy the invalid value before checking it. */ + value = var[3]; + USE(value); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that uninitialized values are propagated through per-CPU + * memory. + */ +static void test_percpu_propagate(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + volatile int uninit, check; + + kunit_info(test, + "uninit local stored to per_cpu memory (UMR report)\n"); + + this_cpu_write(per_cpu_var, uninit); + check = this_cpu_read(per_cpu_var); + USE(check); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that passing uninitialized values to printk() leads to an + * error report. + */ +static void test_printk(struct kunit *test) +{ +#ifdef CONFIG_KMSAN_CHECK_PARAM_RETVAL + /* + * With eager param/retval checking enabled, KMSAN will report an error + * before the call to pr_info(). + */ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_printk"); +#else + EXPECTATION_UNINIT_VALUE_FN(expect, "number"); +#endif + volatile int uninit; + + kunit_info(test, "uninit local passed to pr_info() (UMR report)\n"); + pr_info("%px contains %d\n", &uninit, uninit); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that memcpy() correctly copies uninitialized values between + * aligned `src` and `dst`. + */ +static void test_memcpy_aligned_to_aligned(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_memcpy_aligned_to_aligned"); + volatile int uninit_src; + volatile int dst = 0; + + kunit_info( + test, + "memcpy()ing aligned uninit src to aligned dst (UMR report)\n"); + memcpy((void *)&dst, (void *)&uninit_src, sizeof(uninit_src)); + kmsan_check_memory((void *)&dst, sizeof(dst)); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that memcpy() correctly copies uninitialized values between + * aligned `src` and unaligned `dst`. + * + * Copying aligned 4-byte value to an unaligned one leads to touching two + * aligned 4-byte values. This test case checks that KMSAN correctly reports an + * error on the first of the two values. + */ +static void test_memcpy_aligned_to_unaligned(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_memcpy_aligned_to_unaligned"); + volatile int uninit_src; + volatile char dst[8] = { 0 }; + + kunit_info( + test, + "memcpy()ing aligned uninit src to unaligned dst (UMR report)\n"); + memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src)); + kmsan_check_memory((void *)dst, 4); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that memcpy() correctly copies uninitialized values between + * aligned `src` and unaligned `dst`. + * + * Copying aligned 4-byte value to an unaligned one leads to touching two + * aligned 4-byte values. This test case checks that KMSAN correctly reports an + * error on the second of the two values. + */ +static void test_memcpy_aligned_to_unaligned2(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, + "test_memcpy_aligned_to_unaligned2"); + volatile int uninit_src; + volatile char dst[8] = { 0 }; + + kunit_info( + test, + "memcpy()ing aligned uninit src to unaligned dst - part 2 (UMR report)\n"); + memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src)); + kmsan_check_memory((void *)&dst[4], sizeof(uninit_src)); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static noinline void fibonacci(int *array, int size, int start) { + if (start < 2 || (start == size)) + return; + array[start] = array[start - 1] + array[start - 2]; + fibonacci(array, size, start + 1); +} + +static void test_long_origin_chain(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, + "test_long_origin_chain"); + /* (KMSAN_MAX_ORIGIN_DEPTH * 2) recursive calls to fibonacci(). */ + volatile int accum[KMSAN_MAX_ORIGIN_DEPTH * 2 + 2]; + int last = ARRAY_SIZE(accum) - 1; + + kunit_info( + test, + "origin chain exceeding KMSAN_MAX_ORIGIN_DEPTH (UMR report)\n"); + /* + * We do not set accum[1] to 0, so the uninitializedness will be carried + * over to accum[2..last]. + */ + accum[0] = 1; + fibonacci((int *)accum, ARRAY_SIZE(accum), 2); + kmsan_check_memory((void *)&accum[last], sizeof(int)); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static struct kunit_case kmsan_test_cases[] = { + KUNIT_CASE(test_uninit_kmalloc), + KUNIT_CASE(test_init_kmalloc), + KUNIT_CASE(test_init_kzalloc), + KUNIT_CASE(test_uninit_stack_var), + KUNIT_CASE(test_init_stack_var), + KUNIT_CASE(test_params), + KUNIT_CASE(test_uninit_multiple_params), + KUNIT_CASE(test_uninit_kmsan_check_memory), + KUNIT_CASE(test_init_kmsan_vmap_vunmap), + KUNIT_CASE(test_init_vmalloc), + KUNIT_CASE(test_uaf), + KUNIT_CASE(test_percpu_propagate), + KUNIT_CASE(test_printk), + KUNIT_CASE(test_memcpy_aligned_to_aligned), + KUNIT_CASE(test_memcpy_aligned_to_unaligned), + KUNIT_CASE(test_memcpy_aligned_to_unaligned2), + KUNIT_CASE(test_long_origin_chain), + {}, +}; + +/* ===== End test cases ===== */ + +static int test_init(struct kunit *test) +{ + unsigned long flags; + + spin_lock_irqsave(&observed.lock, flags); + observed.header[0] = '\0'; + observed.ignore = false; + observed.available = false; + spin_unlock_irqrestore(&observed.lock, flags); + + return 0; +} + +static void test_exit(struct kunit *test) +{ +} + +static void register_tracepoints(struct tracepoint *tp, void *ignore) +{ + check_trace_callback_type_console(probe_console); + if (!strcmp(tp->name, "console")) + WARN_ON(tracepoint_probe_register(tp, probe_console, NULL)); +} + +static void unregister_tracepoints(struct tracepoint *tp, void *ignore) +{ + if (!strcmp(tp->name, "console")) + tracepoint_probe_unregister(tp, probe_console, NULL); +} + +static int kmsan_suite_init(struct kunit_suite *suite) +{ + /* + * Because we want to be able to build the test as a module, we need to + * iterate through all known tracepoints, since the static registration + * won't work here. + */ + for_each_kernel_tracepoint(register_tracepoints, NULL); + return 0; +} + +static void kmsan_suite_exit(struct kunit_suite *suite) +{ + for_each_kernel_tracepoint(unregister_tracepoints, NULL); + tracepoint_synchronize_unregister(); +} + +static struct kunit_suite kmsan_test_suite = { + .name = "kmsan", + .test_cases = kmsan_test_cases, + .init = test_init, + .exit = test_exit, + .suite_init = kmsan_suite_init, + .suite_exit = kmsan_suite_exit, +}; +kunit_test_suites(&kmsan_test_suite); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Alexander Potapenko "); From 51d69e6bccd9f6a394d4eff077c50cd5207d0ea8 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:59 +0200 Subject: [PATCH 228/352] kmsan: disable strscpy() optimization under KMSAN Disable the efficient 8-byte reading under KMSAN to avoid false positives. Link: https://lkml.kernel.org/r/20220915150417.722975-26-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/string.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lib/string.c b/lib/string.c index 6f334420f6871..3371d26a0e390 100644 --- a/lib/string.c +++ b/lib/string.c @@ -197,6 +197,14 @@ ssize_t strscpy(char *dest, const char *src, size_t count) max = 0; #endif + /* + * read_word_at_a_time() below may read uninitialized bytes after the + * trailing zero and use them in comparisons. Disable this optimization + * under KMSAN to prevent false positive reports. + */ + if (IS_ENABLED(CONFIG_KMSAN)) + max = 0; + while (max >= sizeof(unsigned long)) { unsigned long c, data; From 0dc225ff49fec371b46ec0074f0e538b5a6b9840 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:00 +0200 Subject: [PATCH 229/352] crypto: kmsan: disable accelerated configs under KMSAN KMSAN is unable to understand when initialized values come from assembly. Disable accelerated configs in KMSAN builds to prevent false positive reports. Link: https://lkml.kernel.org/r/20220915150417.722975-27-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- crypto/Kconfig | 30 ++++++++++++++++++++++++++++++ drivers/net/Kconfig | 1 + 2 files changed, 31 insertions(+) diff --git a/crypto/Kconfig b/crypto/Kconfig index bb427a835e44a..182fb817ebb52 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -319,6 +319,7 @@ config CRYPTO_CURVE25519 config CRYPTO_CURVE25519_X86 tristate "x86_64 accelerated Curve25519 scalar multiplication library" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_LIB_CURVE25519_GENERIC select CRYPTO_ARCH_HAVE_LIB_CURVE25519 @@ -367,11 +368,13 @@ config CRYPTO_AEGIS128 config CRYPTO_AEGIS128_SIMD bool "Support SIMD acceleration for AEGIS-128" depends on CRYPTO_AEGIS128 && ((ARM || ARM64) && KERNEL_MODE_NEON) + depends on !KMSAN # avoid false positives from assembly default y config CRYPTO_AEGIS128_AESNI_SSE2 tristate "AEGIS-128 AEAD algorithm (x86_64 AESNI+SSE2 implementation)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_AEAD select CRYPTO_SIMD help @@ -517,6 +520,7 @@ config CRYPTO_NHPOLY1305 config CRYPTO_NHPOLY1305_SSE2 tristate "NHPoly1305 hash function (x86_64 SSE2 implementation)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_NHPOLY1305 help SSE2 optimized implementation of the hash function used by the @@ -525,6 +529,7 @@ config CRYPTO_NHPOLY1305_SSE2 config CRYPTO_NHPOLY1305_AVX2 tristate "NHPoly1305 hash function (x86_64 AVX2 implementation)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_NHPOLY1305 help AVX2 optimized implementation of the hash function used by the @@ -649,6 +654,7 @@ config CRYPTO_CRC32C config CRYPTO_CRC32C_INTEL tristate "CRC32c INTEL hardware acceleration" depends on X86 + depends on !KMSAN # avoid false positives from assembly select CRYPTO_HASH help In Intel processor with SSE4.2 supported, the processor will @@ -689,6 +695,7 @@ config CRYPTO_CRC32 config CRYPTO_CRC32_PCLMUL tristate "CRC32 PCLMULQDQ hardware acceleration" depends on X86 + depends on !KMSAN # avoid false positives from assembly select CRYPTO_HASH select CRC32 help @@ -748,6 +755,7 @@ config CRYPTO_BLAKE2B config CRYPTO_BLAKE2S_X86 bool "BLAKE2s digest algorithm (x86 accelerated version)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_LIB_BLAKE2S_GENERIC select CRYPTO_ARCH_HAVE_LIB_BLAKE2S @@ -762,6 +770,7 @@ config CRYPTO_CRCT10DIF config CRYPTO_CRCT10DIF_PCLMUL tristate "CRCT10DIF PCLMULQDQ hardware acceleration" depends on X86 && 64BIT && CRC_T10DIF + depends on !KMSAN # avoid false positives from assembly select CRYPTO_HASH help For x86_64 processors with SSE4.2 and PCLMULQDQ supported, @@ -831,6 +840,7 @@ config CRYPTO_POLY1305 config CRYPTO_POLY1305_X86_64 tristate "Poly1305 authenticator algorithm (x86_64/SSE2/AVX2)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_LIB_POLY1305_GENERIC select CRYPTO_ARCH_HAVE_LIB_POLY1305 help @@ -920,6 +930,7 @@ config CRYPTO_SHA1 config CRYPTO_SHA1_SSSE3 tristate "SHA1 digest algorithm (SSSE3/AVX/AVX2/SHA-NI)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SHA1 select CRYPTO_HASH help @@ -931,6 +942,7 @@ config CRYPTO_SHA1_SSSE3 config CRYPTO_SHA256_SSSE3 tristate "SHA256 digest algorithm (SSSE3/AVX/AVX2/SHA-NI)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SHA256 select CRYPTO_HASH help @@ -943,6 +955,7 @@ config CRYPTO_SHA256_SSSE3 config CRYPTO_SHA512_SSSE3 tristate "SHA512 digest algorithm (SSSE3/AVX/AVX2)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SHA512 select CRYPTO_HASH help @@ -1168,6 +1181,7 @@ config CRYPTO_WP512 config CRYPTO_GHASH_CLMUL_NI_INTEL tristate "GHASH hash function (CLMUL-NI accelerated)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_CRYPTD help This is the x86_64 CLMUL-NI accelerated implementation of @@ -1228,6 +1242,7 @@ config CRYPTO_AES_TI config CRYPTO_AES_NI_INTEL tristate "AES cipher algorithms (AES-NI)" depends on X86 + depends on !KMSAN # avoid false positives from assembly select CRYPTO_AEAD select CRYPTO_LIB_AES select CRYPTO_ALGAPI @@ -1369,6 +1384,7 @@ config CRYPTO_BLOWFISH_COMMON config CRYPTO_BLOWFISH_X86_64 tristate "Blowfish cipher algorithm (x86_64)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_BLOWFISH_COMMON imply CRYPTO_CTR @@ -1399,6 +1415,7 @@ config CRYPTO_CAMELLIA config CRYPTO_CAMELLIA_X86_64 tristate "Camellia cipher algorithm (x86_64)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER imply CRYPTO_CTR help @@ -1415,6 +1432,7 @@ config CRYPTO_CAMELLIA_X86_64 config CRYPTO_CAMELLIA_AESNI_AVX_X86_64 tristate "Camellia cipher algorithm (x86_64/AES-NI/AVX)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_CAMELLIA_X86_64 select CRYPTO_SIMD @@ -1433,6 +1451,7 @@ config CRYPTO_CAMELLIA_AESNI_AVX_X86_64 config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64 tristate "Camellia cipher algorithm (x86_64/AES-NI/AVX2)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_CAMELLIA_AESNI_AVX_X86_64 help Camellia cipher algorithm module (x86_64/AES-NI/AVX2). @@ -1478,6 +1497,7 @@ config CRYPTO_CAST5 config CRYPTO_CAST5_AVX_X86_64 tristate "CAST5 (CAST-128) cipher algorithm (x86_64/AVX)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_CAST5 select CRYPTO_CAST_COMMON @@ -1501,6 +1521,7 @@ config CRYPTO_CAST6 config CRYPTO_CAST6_AVX_X86_64 tristate "CAST6 (CAST-256) cipher algorithm (x86_64/AVX)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_CAST6 select CRYPTO_CAST_COMMON @@ -1534,6 +1555,7 @@ config CRYPTO_DES_SPARC64 config CRYPTO_DES3_EDE_X86_64 tristate "Triple DES EDE cipher algorithm (x86-64)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_LIB_DES imply CRYPTO_CTR @@ -1604,6 +1626,7 @@ config CRYPTO_CHACHA20 config CRYPTO_CHACHA20_X86_64 tristate "ChaCha stream cipher algorithms (x86_64/SSSE3/AVX2/AVX-512VL)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_LIB_CHACHA_GENERIC select CRYPTO_ARCH_HAVE_LIB_CHACHA @@ -1674,6 +1697,7 @@ config CRYPTO_SERPENT config CRYPTO_SERPENT_SSE2_X86_64 tristate "Serpent cipher algorithm (x86_64/SSE2)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_SERPENT select CRYPTO_SIMD @@ -1693,6 +1717,7 @@ config CRYPTO_SERPENT_SSE2_X86_64 config CRYPTO_SERPENT_SSE2_586 tristate "Serpent cipher algorithm (i586/SSE2)" depends on X86 && !64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_SERPENT select CRYPTO_SIMD @@ -1712,6 +1737,7 @@ config CRYPTO_SERPENT_SSE2_586 config CRYPTO_SERPENT_AVX_X86_64 tristate "Serpent cipher algorithm (x86_64/AVX)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_SERPENT select CRYPTO_SIMD @@ -1732,6 +1758,7 @@ config CRYPTO_SERPENT_AVX_X86_64 config CRYPTO_SERPENT_AVX2_X86_64 tristate "Serpent cipher algorithm (x86_64/AVX2)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SERPENT_AVX_X86_64 help Serpent cipher algorithm, by Anderson, Biham & Knudsen. @@ -1876,6 +1903,7 @@ config CRYPTO_TWOFISH_586 config CRYPTO_TWOFISH_X86_64 tristate "Twofish cipher algorithm (x86_64)" depends on (X86 || UML_X86) && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_ALGAPI select CRYPTO_TWOFISH_COMMON imply CRYPTO_CTR @@ -1893,6 +1921,7 @@ config CRYPTO_TWOFISH_X86_64 config CRYPTO_TWOFISH_X86_64_3WAY tristate "Twofish cipher algorithm (x86_64, 3-way parallel)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_TWOFISH_COMMON select CRYPTO_TWOFISH_X86_64 @@ -1913,6 +1942,7 @@ config CRYPTO_TWOFISH_X86_64_3WAY config CRYPTO_TWOFISH_AVX_X86_64 tristate "Twofish cipher algorithm (x86_64/AVX)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_SIMD select CRYPTO_TWOFISH_COMMON diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 94c889802566a..2aaf02bfe6f7e 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -76,6 +76,7 @@ config WIREGUARD tristate "WireGuard secure network tunnel" depends on NET && INET depends on IPV6 || !IPV6 + depends on !KMSAN # KMSAN doesn't support the crypto configs below select NET_UDP_TUNNEL select DST_CACHE select CRYPTO From 4f4f0628217fe82b92d087ed387d692e72248765 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:01 +0200 Subject: [PATCH 230/352] kmsan: disable physical page merging in biovec KMSAN metadata for adjacent physical pages may not be adjacent, therefore accessing such pages together may lead to metadata corruption. We disable merging pages in biovec to prevent such corruptions. Link: https://lkml.kernel.org/r/20220915150417.722975-28-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- block/blk.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/block/blk.h b/block/blk.h index d7142c4d2fefb..af02b93c1dba5 100644 --- a/block/blk.h +++ b/block/blk.h @@ -88,6 +88,13 @@ static inline bool biovec_phys_mergeable(struct request_queue *q, phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset; phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset; + /* + * Merging adjacent physical pages may not work correctly under KMSAN + * if their metadata pages aren't adjacent. Just disable merging. + */ + if (IS_ENABLED(CONFIG_KMSAN)) + return false; + if (addr1 + vec1->bv_len != addr2) return false; if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2->bv_page)) From 480f8838e6330e2fd80e0cc8f6df2757022826c2 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:02 +0200 Subject: [PATCH 231/352] block: kmsan: skip bio block merging logic for KMSAN KMSAN doesn't allow treating adjacent memory pages as such, if they were allocated by different alloc_pages() calls. The block layer however does so: adjacent pages end up being used together. To prevent this, make page_is_mergeable() return false under KMSAN. Link: https://lkml.kernel.org/r/20220915150417.722975-29-glider@google.com Signed-off-by: Alexander Potapenko Suggested-by: Eric Biggers Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- block/bio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/bio.c b/block/bio.c index 6c6110f7054e1..7d10476ceefd4 100644 --- a/block/bio.c +++ b/block/bio.c @@ -869,6 +869,8 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); if (*same_page) return true; + else if (IS_ENABLED(CONFIG_KMSAN)) + return false; return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE); } From 0a75c8f238919766b73259eba5e396d92b669710 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:03 +0200 Subject: [PATCH 232/352] kcov: kmsan: unpoison area->list in kcov_remote_area_put() KMSAN does not instrument kernel/kcov.c for performance reasons (with CONFIG_KCOV=y virtually every place in the kernel invokes kcov instrumentation). Therefore the tool may miss writes from kcov.c that initialize memory. When CONFIG_DEBUG_LIST is enabled, list pointers from kernel/kcov.c are passed to instrumented helpers in lib/list_debug.c, resulting in false positives. To work around these reports, we unpoison the contents of area->list after initializing it. Link: https://lkml.kernel.org/r/20220915150417.722975-30-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- kernel/kcov.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/kcov.c b/kernel/kcov.c index e19c84b02452e..e5cd09fd8a050 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -152,6 +153,12 @@ static void kcov_remote_area_put(struct kcov_remote_area *area, INIT_LIST_HEAD(&area->list); area->size = size; list_add(&area->list, &kcov_remote_areas); + /* + * KMSAN doesn't instrument this file, so it may not know area->list + * is initialized. Unpoison it explicitly to avoid reports in + * kcov_remote_area_get(). + */ + kmsan_unpoison_memory(&area->list, sizeof(area->list)); } static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) From 30664e5e37b9b0da98bbe9ad72b35668891f63d7 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:04 +0200 Subject: [PATCH 233/352] security: kmsan: fix interoperability with auto-initialization Heap and stack initialization is great, but not when we are trying uses of uninitialized memory. When the kernel is built with KMSAN, having kernel memory initialization enabled may introduce false negatives. We disable CONFIG_INIT_STACK_ALL_PATTERN and CONFIG_INIT_STACK_ALL_ZERO under CONFIG_KMSAN, making it impossible to auto-initialize stack variables in KMSAN builds. We also disable CONFIG_INIT_ON_ALLOC_DEFAULT_ON and CONFIG_INIT_ON_FREE_DEFAULT_ON to prevent accidental use of heap auto-initialization. We however still let the users enable heap auto-initialization at boot-time (by setting init_on_alloc=1 or init_on_free=1), in which case a warning is printed. Link: https://lkml.kernel.org/r/20220915150417.722975-31-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 ++++ security/Kconfig.hardening | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 62d88813f3cfb..ef3f51c9815de 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -938,6 +938,10 @@ void init_mem_debugging_and_hardening(void) else static_branch_disable(&init_on_free); + if (IS_ENABLED(CONFIG_KMSAN) && + (_init_on_alloc_enabled_early || _init_on_free_enabled_early)) + pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n"); + #ifdef CONFIG_DEBUG_PAGEALLOC if (!debug_pagealloc_enabled()) return; diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening index bd2aabb2c60f9..2739a6776454e 100644 --- a/security/Kconfig.hardening +++ b/security/Kconfig.hardening @@ -106,6 +106,7 @@ choice config INIT_STACK_ALL_PATTERN bool "pattern-init everything (strongest)" depends on CC_HAS_AUTO_VAR_INIT_PATTERN + depends on !KMSAN help Initializes everything on the stack (including padding) with a specific debug value. This is intended to eliminate @@ -124,6 +125,7 @@ choice config INIT_STACK_ALL_ZERO bool "zero-init everything (strongest and safest)" depends on CC_HAS_AUTO_VAR_INIT_ZERO + depends on !KMSAN help Initializes everything on the stack (including padding) with a zero value. This is intended to eliminate all @@ -218,6 +220,7 @@ config STACKLEAK_RUNTIME_DISABLE config INIT_ON_ALLOC_DEFAULT_ON bool "Enable heap memory zeroing on allocation by default" + depends on !KMSAN help This has the effect of setting "init_on_alloc=1" on the kernel command line. This can be disabled with "init_on_alloc=0". @@ -230,6 +233,7 @@ config INIT_ON_ALLOC_DEFAULT_ON config INIT_ON_FREE_DEFAULT_ON bool "Enable heap memory zeroing on free by default" + depends on !KMSAN help This has the effect of setting "init_on_free=1" on the kernel command line. This can be disabled with "init_on_free=0". From 0aab0c965c9049c7288b204b7aa679140091d183 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:05 +0200 Subject: [PATCH 234/352] objtool: kmsan: list KMSAN API functions as uaccess-safe KMSAN inserts API function calls in a lot of places (function entries and exits, local variables, memory accesses), so they may get called from the uaccess regions as well. KMSAN API functions are used to update the metadata (shadow/origin pages) for kernel memory accesses. The metadata pages for kernel pointers are also located in the kernel memory, so touching them is not a problem. For userspace pointers, no metadata is allocated. If an API function is supposed to read or modify the metadata, it does so for kernel pointers and ignores userspace pointers. If an API function is supposed to return a pair of metadata pointers for the instrumentation to use (like all __msan_metadata_ptr_for_TYPE_SIZE() functions do), it returns the allocated metadata for kernel pointers and special dummy buffers residing in the kernel memory for userspace pointers. As a result, none of KMSAN API functions perform userspace accesses, but since they might be called from UACCESS regions they use user_access_save/restore(). Link: https://lkml.kernel.org/r/20220915150417.722975-32-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/objtool/check.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index e55fdf952a3a1..7c048c11ce7da 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1062,6 +1062,26 @@ static const char *uaccess_safe_builtin[] = { "__sanitizer_cov_trace_cmp4", "__sanitizer_cov_trace_cmp8", "__sanitizer_cov_trace_switch", + /* KMSAN */ + "kmsan_copy_to_user", + "kmsan_report", + "kmsan_unpoison_entry_regs", + "kmsan_unpoison_memory", + "__msan_chain_origin", + "__msan_get_context_state", + "__msan_instrument_asm_store", + "__msan_metadata_ptr_for_load_1", + "__msan_metadata_ptr_for_load_2", + "__msan_metadata_ptr_for_load_4", + "__msan_metadata_ptr_for_load_8", + "__msan_metadata_ptr_for_load_n", + "__msan_metadata_ptr_for_store_1", + "__msan_metadata_ptr_for_store_2", + "__msan_metadata_ptr_for_store_4", + "__msan_metadata_ptr_for_store_8", + "__msan_metadata_ptr_for_store_n", + "__msan_poison_alloca", + "__msan_warning", /* UBSAN */ "ubsan_type_mismatch_common", "__ubsan_handle_type_mismatch", From 5aa79e67e9ee805aa86e42ce895a8c4789ba602a Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:06 +0200 Subject: [PATCH 235/352] x86: kmsan: disable instrumentation of unsupported code Instrumenting some files with KMSAN will result in kernel being unable to link, boot or crashing at runtime for various reasons (e.g. infinite recursion caused by instrumentation hooks calling instrumented code again). Completely omit KMSAN instrumentation in the following places: - arch/x86/boot and arch/x86/realmode/rm, as KMSAN doesn't work for i386; - arch/x86/entry/vdso, which isn't linked with KMSAN runtime; - three files in arch/x86/kernel - boot problems; - arch/x86/mm/cpu_entry_area.c - recursion. Link: https://lkml.kernel.org/r/20220915150417.722975-33-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/boot/Makefile | 1 + arch/x86/boot/compressed/Makefile | 1 + arch/x86/entry/vdso/Makefile | 3 +++ arch/x86/kernel/Makefile | 2 ++ arch/x86/kernel/cpu/Makefile | 1 + arch/x86/mm/Makefile | 2 ++ arch/x86/realmode/rm/Makefile | 1 + 7 files changed, 11 insertions(+) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index ffec8bb01ba8c..9860ca5979f8a 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -12,6 +12,7 @@ # Sanitizer runtimes are unavailable and cannot be linked for early boot code. KASAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y # Kernel does not boot with kcov instrumentation here. diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 35ce1a64068b7..3a261abb6d158 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -20,6 +20,7 @@ # Sanitizer runtimes are unavailable and cannot be linked for early boot code. KASAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 12f6c4d714cd6..ce4eb7e44e5b8 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -11,6 +11,9 @@ include $(srctree)/lib/vdso/Makefile # Sanitizer runtimes are unavailable and cannot be linked here. KASAN_SANITIZE := n +KMSAN_SANITIZE_vclock_gettime.o := n +KMSAN_SANITIZE_vgetcpu.o := n + UBSAN_SANITIZE := n KCSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a20a5ebfacd73..ac564c5d7b1f0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -33,6 +33,8 @@ KASAN_SANITIZE_sev.o := n # With some compiler versions the generated code results in boot hangs, caused # by several compilation units. To be safe, disable all instrumentation. KCSAN_SANITIZE := n +KMSAN_SANITIZE_head$(BITS).o := n +KMSAN_SANITIZE_nmi.o := n # If instrumentation of this dir is enabled, boot hangs during first second. # Probably could be more selective here, but note that files related to irqs, diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 9661e3e802be5..f10a921ee7565 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -12,6 +12,7 @@ endif # If these files are instrumented, boot hangs during the first second. KCOV_INSTRUMENT_common.o := n KCOV_INSTRUMENT_perf_event.o := n +KMSAN_SANITIZE_common.o := n # As above, instrumenting secondary CPU boot code causes boot hangs. KCSAN_SANITIZE_common.o := n diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 829c1409ffbde..afb6f7187dad0 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -14,6 +14,8 @@ KASAN_SANITIZE_pgprot.o := n # Disable KCSAN entirely, because otherwise we get warnings that some functions # reference __initdata sections. KCSAN_SANITIZE := n +# Avoid recursion by not calling KMSAN hooks for CEA code. +KMSAN_SANITIZE_cpu_entry_area.o := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_mem_encrypt.o = -pg diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 83f1b6a56449f..f614009d3e4e2 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -10,6 +10,7 @@ # Sanitizer runtimes are unavailable and cannot be linked here. KASAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. From d28af13e493945b39944d299208f9d744a32e5d5 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:07 +0200 Subject: [PATCH 236/352] x86: kmsan: skip shadow checks in __switch_to() When instrumenting functions, KMSAN obtains the per-task state (mostly pointers to metadata for function arguments and return values) once per function at its beginning, using the `current` pointer. Every time the instrumented function calls another function, this state (`struct kmsan_context_state`) is updated with shadow/origin data of the passed and returned values. When `current` changes in the low-level arch code, instrumented code can not notice that, and will still refer to the old state, possibly corrupting it or using stale data. This may result in false positive reports. To deal with that, we need to apply __no_kmsan_checks to the functions performing context switching - this will result in skipping all KMSAN shadow checks and marking newly created values as initialized, preventing all false positive reports in those functions. False negatives are still possible, but we expect them to be rare and impersistent. Link: https://lkml.kernel.org/r/20220915150417.722975-34-glider@google.com Suggested-by: Marco Elver Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/kernel/process_64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 1962008fe7437..6b3418bff3261 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -553,6 +553,7 @@ void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32) * Kprobes not supported here. Set the probe on schedule instead. * Function graph tracer not supported too. */ +__no_kmsan_checks __visible __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { From b61252296f904e8b98d3301a3e4dd3baf7122d77 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:08 +0200 Subject: [PATCH 237/352] x86: kmsan: handle open-coded assembly in lib/iomem.c KMSAN cannot intercept memory accesses within asm() statements. That's why we add kmsan_unpoison_memory() and kmsan_check_memory() to hint it how to handle memory copied from/to I/O memory. Link: https://lkml.kernel.org/r/20220915150417.722975-35-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/lib/iomem.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/lib/iomem.c b/arch/x86/lib/iomem.c index 3e2f33fc33de2..e0411a3774d49 100644 --- a/arch/x86/lib/iomem.c +++ b/arch/x86/lib/iomem.c @@ -1,6 +1,7 @@ #include #include #include +#include #define movs(type,to,from) \ asm volatile("movs" type:"=&D" (to), "=&S" (from):"0" (to), "1" (from):"memory") @@ -37,6 +38,8 @@ static void string_memcpy_fromio(void *to, const volatile void __iomem *from, si n-=2; } rep_movs(to, (const void *)from, n); + /* KMSAN must treat values read from devices as initialized. */ + kmsan_unpoison_memory(to, n); } static void string_memcpy_toio(volatile void __iomem *to, const void *from, size_t n) @@ -44,6 +47,8 @@ static void string_memcpy_toio(volatile void __iomem *to, const void *from, size if (unlikely(!n)) return; + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(from, n); /* Align any unaligned destination IO */ if (unlikely(1 & (unsigned long)to)) { movs("b", to, from); From ddcbd097eb8a20cb5d8691b15e2990654431bc01 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:09 +0200 Subject: [PATCH 238/352] x86: kmsan: use __msan_ string functions where possible. Unless stated otherwise (by explicitly calling __memcpy(), __memset() or __memmove()) we want all string functions to call their __msan_ versions (e.g. __msan_memcpy() instead of memcpy()), so that shadow and origin values are updated accordingly. Bootloader must still use the default string functions to avoid crashes. Link: https://lkml.kernel.org/r/20220915150417.722975-36-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/include/asm/string_64.h | 23 +++++++++++++++++++++-- include/linux/fortify-string.h | 2 ++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 6e450827f677a..3b87d889b6e16 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -11,11 +11,23 @@ function. */ #define __HAVE_ARCH_MEMCPY 1 +#if defined(__SANITIZE_MEMORY__) +#undef memcpy +void *__msan_memcpy(void *dst, const void *src, size_t size); +#define memcpy __msan_memcpy +#else extern void *memcpy(void *to, const void *from, size_t len); +#endif extern void *__memcpy(void *to, const void *from, size_t len); #define __HAVE_ARCH_MEMSET +#if defined(__SANITIZE_MEMORY__) +extern void *__msan_memset(void *s, int c, size_t n); +#undef memset +#define memset __msan_memset +#else void *memset(void *s, int c, size_t n); +#endif void *__memset(void *s, int c, size_t n); #define __HAVE_ARCH_MEMSET16 @@ -55,7 +67,13 @@ static inline void *memset64(uint64_t *s, uint64_t v, size_t n) } #define __HAVE_ARCH_MEMMOVE +#if defined(__SANITIZE_MEMORY__) +#undef memmove +void *__msan_memmove(void *dest, const void *src, size_t len); +#define memmove __msan_memmove +#else void *memmove(void *dest, const void *src, size_t count); +#endif void *__memmove(void *dest, const void *src, size_t count); int memcmp(const void *cs, const void *ct, size_t count); @@ -64,8 +82,7 @@ char *strcpy(char *dest, const char *src); char *strcat(char *dest, const char *src); int strcmp(const char *cs, const char *ct); -#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) - +#if (defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)) /* * For files that not instrumented (e.g. mm/slub.c) we * should use not instrumented version of mem* functions. @@ -73,7 +90,9 @@ int strcmp(const char *cs, const char *ct); #undef memcpy #define memcpy(dst, src, len) __memcpy(dst, src, len) +#undef memmove #define memmove(dst, src, len) __memmove(dst, src, len) +#undef memset #define memset(s, c, n) __memset(s, c, n) #ifndef __NO_FORTIFY diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 3b401fa0f3746..6c8a1a29d0b63 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -285,8 +285,10 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size, * __builtin_object_size() must be captured here to avoid evaluating argument * side-effects further into the macro layers. */ +#ifndef CONFIG_KMSAN #define memset(p, c, s) __fortify_memset_chk(p, c, s, \ __builtin_object_size(p, 0), __builtin_object_size(p, 1)) +#endif /* * To make sure the compiler can enforce protection against buffer overflows, From 43625d36d6e666761d084951a5752a81ad818847 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:10 +0200 Subject: [PATCH 239/352] x86: kmsan: sync metadata pages on page fault KMSAN assumes shadow and origin pages for every allocated page are accessible. For pages between [VMALLOC_START, VMALLOC_END] those metadata pages start at KMSAN_VMALLOC_SHADOW_START and KMSAN_VMALLOC_ORIGIN_START, therefore we must sync a bigger memory region. Link: https://lkml.kernel.org/r/20220915150417.722975-37-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/mm/fault.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fa71a5d12e872..d728791be8ace 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -260,7 +260,7 @@ static noinline int vmalloc_fault(unsigned long address) } NOKPROBE_SYMBOL(vmalloc_fault); -void arch_sync_kernel_mappings(unsigned long start, unsigned long end) +static void __arch_sync_kernel_mappings(unsigned long start, unsigned long end) { unsigned long addr; @@ -284,6 +284,27 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end) } } +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) +{ + __arch_sync_kernel_mappings(start, end); +#ifdef CONFIG_KMSAN + /* + * KMSAN maintains two additional metadata page mappings for the + * [VMALLOC_START, VMALLOC_END) range. These mappings start at + * KMSAN_VMALLOC_SHADOW_START and KMSAN_VMALLOC_ORIGIN_START and + * have to be synced together with the vmalloc memory mapping. + */ + if (start >= VMALLOC_START && end < VMALLOC_END) { + __arch_sync_kernel_mappings( + start - VMALLOC_START + KMSAN_VMALLOC_SHADOW_START, + end - VMALLOC_START + KMSAN_VMALLOC_SHADOW_START); + __arch_sync_kernel_mappings( + start - VMALLOC_START + KMSAN_VMALLOC_ORIGIN_START, + end - VMALLOC_START + KMSAN_VMALLOC_ORIGIN_START); + } +#endif +} + static bool low_pfn(unsigned long pfn) { return pfn < max_low_pfn; From 6361d651646c7e9c5eddf322302978085f66851c Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:11 +0200 Subject: [PATCH 240/352] x86: kasan: kmsan: support CONFIG_GENERIC_CSUM on x86, enable it for KASAN/KMSAN This is needed to allow memory tools like KASAN and KMSAN see the memory accesses from the checksum code. Without CONFIG_GENERIC_CSUM the tools can't see memory accesses originating from handwritten assembly code. For KASAN it's a question of detecting more bugs, for KMSAN using the C implementation also helps avoid false positives originating from seemingly uninitialized checksum values. Link: https://lkml.kernel.org/r/20220915150417.722975-38-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 4 ++++ arch/x86/include/asm/checksum.h | 16 ++++++++++------ arch/x86/lib/Makefile | 2 ++ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 674d694a665ef..eb158eb8a985f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -325,6 +325,10 @@ config GENERIC_ISA_DMA def_bool y depends on ISA_DMA_API +config GENERIC_CSUM + bool + default y if KMSAN || KASAN + config GENERIC_BUG def_bool y depends on BUG diff --git a/arch/x86/include/asm/checksum.h b/arch/x86/include/asm/checksum.h index bca625a60186c..6df6ece8a28ec 100644 --- a/arch/x86/include/asm/checksum.h +++ b/arch/x86/include/asm/checksum.h @@ -1,9 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1 -#define HAVE_CSUM_COPY_USER -#define _HAVE_ARCH_CSUM_AND_COPY -#ifdef CONFIG_X86_32 -# include +#ifdef CONFIG_GENERIC_CSUM +# include #else -# include +# define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1 +# define HAVE_CSUM_COPY_USER +# define _HAVE_ARCH_CSUM_AND_COPY +# ifdef CONFIG_X86_32 +# include +# else +# include +# endif #endif diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f76747862bd2e..7ba5f61d72735 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -65,7 +65,9 @@ ifneq ($(CONFIG_X86_CMPXCHG64),y) endif else obj-y += iomap_copy_64.o +ifneq ($(CONFIG_GENERIC_CSUM),y) lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o +endif lib-y += clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o From 3cfba3f4dd237deda044e1d74457e00b8d9b61d1 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:12 +0200 Subject: [PATCH 241/352] x86: fs: kmsan: disable CONFIG_DCACHE_WORD_ACCESS dentry_string_cmp() calls read_word_at_a_time(), which might read uninitialized bytes to optimize string comparisons. Disabling CONFIG_DCACHE_WORD_ACCESS should prohibit this optimization, as well as (probably) similar ones. Link: https://lkml.kernel.org/r/20220915150417.722975-39-glider@google.com Signed-off-by: Alexander Potapenko Suggested-by: Andrey Konovalov Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index eb158eb8a985f..edf7f12935d72 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -129,7 +129,9 @@ config X86 select CLKEVT_I8253 select CLOCKSOURCE_VALIDATE_LAST_CYCLE select CLOCKSOURCE_WATCHDOG - select DCACHE_WORD_ACCESS + # Word-size accesses may read uninitialized data past the trailing \0 + # in strings and cause false KMSAN reports. + select DCACHE_WORD_ACCESS if !KMSAN select DYNAMIC_SIGFRAME select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT From 7b8fdab588ced71585dc0897fa0f3b65408c93a6 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:13 +0200 Subject: [PATCH 242/352] x86: kmsan: don't instrument stack walking functions Upon function exit, KMSAN marks local variables as uninitialized. Further function calls may result in the compiler creating the stack frame where these local variables resided. This results in frame pointers being marked as uninitialized data, which is normally correct, because they are not stack-allocated. However stack unwinding functions are supposed to read and dereference the frame pointers, in which case KMSAN might be reporting uses of uninitialized values. To work around that, we mark update_stack_state(), unwind_next_frame() and show_trace_log_lvl() with __no_kmsan_checks, preventing all KMSAN reports inside those functions and making them return initialized values. Link: https://lkml.kernel.org/r/20220915150417.722975-40-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/kernel/dumpstack.c | 6 ++++++ arch/x86/kernel/unwind_frame.c | 11 +++++++++++ 2 files changed, 17 insertions(+) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index afae4dd774951..476eb504084e4 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -177,6 +177,12 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, } } +/* + * This function reads pointers from the stack and dereferences them. The + * pointers may not have their KMSAN shadow set up properly, which may result + * in false positive reports. Disable instrumentation to avoid those. + */ +__no_kmsan_checks static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, const char *log_lvl) { diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 8e1c50c86e5db..d8ba93778ae32 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -183,6 +183,16 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp) } #endif +/* + * While walking the stack, KMSAN may stomp on stale locals from other + * functions that were marked as uninitialized upon function exit, and + * now hold the call frame information for the current function (e.g. the frame + * pointer). Because KMSAN does not specifically mark call frames as + * initialized, false positive reports are possible. To prevent such reports, + * we mark the functions scanning the stack (here and below) with + * __no_kmsan_checks. + */ +__no_kmsan_checks static bool update_stack_state(struct unwind_state *state, unsigned long *next_bp) { @@ -250,6 +260,7 @@ static bool update_stack_state(struct unwind_state *state, return true; } +__no_kmsan_checks bool unwind_next_frame(struct unwind_state *state) { struct pt_regs *regs; From 57de8c656e066ef7ae1587909f5c18a12b747431 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:14 +0200 Subject: [PATCH 243/352] entry: kmsan: introduce kmsan_unpoison_entry_regs() struct pt_regs passed into IRQ entry code is set up by uninstrumented asm functions, therefore KMSAN may not notice the registers are initialized. kmsan_unpoison_entry_regs() unpoisons the contents of struct pt_regs, preventing potential false positives. Unlike kmsan_unpoison_memory(), it can be called under kmsan_in_runtime(), which is often the case in IRQ entry code. Link: https://lkml.kernel.org/r/20220915150417.722975-41-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 15 +++++++++++++++ kernel/entry/common.c | 5 +++++ mm/kmsan/hooks.c | 26 ++++++++++++++++++++++++++ 3 files changed, 46 insertions(+) diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index c473e0e21683c..e38ae3c346184 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -214,6 +214,17 @@ void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, */ void kmsan_handle_urb(const struct urb *urb, bool is_out); +/** + * kmsan_unpoison_entry_regs() - Handle pt_regs in low-level entry code. + * @regs: struct pt_regs pointer received from assembly code. + * + * KMSAN unpoisons the contents of the passed pt_regs, preventing potential + * false positive reports. Unlike kmsan_unpoison_memory(), + * kmsan_unpoison_entry_regs() can be called from the regions where + * kmsan_in_runtime() returns true, which is the case in early entry code. + */ +void kmsan_unpoison_entry_regs(const struct pt_regs *regs); + #else static inline void kmsan_init_shadow(void) @@ -310,6 +321,10 @@ static inline void kmsan_handle_urb(const struct urb *urb, bool is_out) { } +static inline void kmsan_unpoison_entry_regs(const struct pt_regs *regs) +{ +} + #endif #endif /* _LINUX_KMSAN_H */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 063068a9ea9b3..846add8394c41 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +25,7 @@ static __always_inline void __enter_from_user_mode(struct pt_regs *regs) user_exit_irqoff(); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); trace_hardirqs_off_finish(); instrumentation_end(); } @@ -352,6 +354,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) lockdep_hardirqs_off(CALLER_ADDR0); ct_irq_enter(); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); trace_hardirqs_off_finish(); instrumentation_end(); @@ -367,6 +370,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) */ lockdep_hardirqs_off(CALLER_ADDR0); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); rcu_irq_enter_check_tick(); trace_hardirqs_off_finish(); instrumentation_end(); @@ -452,6 +456,7 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs) ct_nmi_enter(); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); trace_hardirqs_off_finish(); ftrace_nmi_enter(); instrumentation_end(); diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 79d7e73e2cfd8..35f6b6e6a908c 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -348,6 +348,32 @@ void kmsan_unpoison_memory(const void *address, size_t size) } EXPORT_SYMBOL(kmsan_unpoison_memory); +/* + * Version of kmsan_unpoison_memory() that can be called from within the KMSAN + * runtime. + * + * Non-instrumented IRQ entry functions receive struct pt_regs from assembly + * code. Those regs need to be unpoisoned, otherwise using them will result in + * false positives. + * Using kmsan_unpoison_memory() is not an option in entry code, because the + * return value of in_task() is inconsistent - as a result, certain calls to + * kmsan_unpoison_memory() are ignored. kmsan_unpoison_entry_regs() ensures that + * the registers are unpoisoned even if kmsan_in_runtime() is true in the early + * entry code. + */ +void kmsan_unpoison_entry_regs(const struct pt_regs *regs) +{ + unsigned long ua_flags; + + if (!kmsan_enabled) + return; + + ua_flags = user_access_save(); + kmsan_internal_unpoison_memory((void *)regs, sizeof(*regs), + KMSAN_POISON_NOCHECK); + user_access_restore(ua_flags); +} + void kmsan_check_memory(const void *addr, size_t size) { if (!kmsan_enabled) From 8a30f7ef4375bc86cbe5e1f9ce227a70b9240460 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:15 +0200 Subject: [PATCH 244/352] bpf: kmsan: initialize BPF registers with zeroes When executing BPF programs, certain registers may get passed uninitialized to helper functions. E.g. when performing a JMP_CALL, registers BPF_R1-BPF_R5 are always passed to the helper, no matter how many of them are actually used. Passing uninitialized values as function parameters is technically undefined behavior, so we work around it by always initializing the registers. Link: https://lkml.kernel.org/r/20220915150417.722975-42-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c1e10d088dbb7..547d139ab98af 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2002,7 +2002,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_EXT_REG]; \ + u64 regs[MAX_BPF_EXT_REG] = {}; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ From d1e1c70a51636f1a9a2fb914aa1c208148bbf9f6 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:16 +0200 Subject: [PATCH 245/352] mm: fs: initialize fsdata passed to write_begin/write_end interface Functions implementing the a_ops->write_end() interface accept the `void *fsdata` parameter that is supposed to be initialized by the corresponding a_ops->write_begin() (which accepts `void **fsdata`). However not all a_ops->write_begin() implementations initialize `fsdata` unconditionally, so it may get passed uninitialized to a_ops->write_end(), resulting in undefined behavior. Fix this by initializing fsdata with NULL before the call to write_begin(), rather than doing so in all possible a_ops implementations. This patch covers only the following cases found by running x86 KMSAN under syzkaller: - generic_perform_write() - cont_expand_zero() and generic_cont_expand_simple() - page_symlink() Other cases of passing uninitialized fsdata may persist in the codebase. Link: https://lkml.kernel.org/r/20220915150417.722975-43-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/buffer.c | 4 ++-- fs/namei.c | 2 +- mm/filemap.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index b4c9fff3ab6c9..b55252078e7bf 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2341,7 +2341,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; struct page *page; - void *fsdata; + void *fsdata = NULL; int err; err = inode_newsize_ok(inode, size); @@ -2367,7 +2367,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, const struct address_space_operations *aops = mapping->a_ops; unsigned int blocksize = i_blocksize(inode); struct page *page; - void *fsdata; + void *fsdata = NULL; pgoff_t index, curidx; loff_t curpos; unsigned zerofrom, offset, len; diff --git a/fs/namei.c b/fs/namei.c index 53b4bc094db23..076ae96ca0b14 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -5088,7 +5088,7 @@ int page_symlink(struct inode *inode, const char *symname, int len) const struct address_space_operations *aops = mapping->a_ops; bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS); struct page *page; - void *fsdata; + void *fsdata = NULL; int err; unsigned int flags; diff --git a/mm/filemap.c b/mm/filemap.c index f27c93a581ab4..ec17bd1a3bb77 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3720,7 +3720,7 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) unsigned long offset; /* Offset into pagecache page */ unsigned long bytes; /* Bytes to write to page */ size_t copied; /* Bytes copied from user */ - void *fsdata; + void *fsdata = NULL; offset = (pos & (PAGE_SIZE - 1)); bytes = min_t(unsigned long, PAGE_SIZE - offset, From da3d0535655a0d81ee1552804ef48716ec0db157 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:17 +0200 Subject: [PATCH 246/352] x86: kmsan: enable KMSAN builds for x86 Make KMSAN usable by adding the necessary Kconfig bits. Also declare x86-specific functions checking address validity in arch/x86/include/asm/kmsan.h. Link: https://lkml.kernel.org/r/20220915150417.722975-44-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 1 + arch/x86/include/asm/kmsan.h | 55 ++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 arch/x86/include/asm/kmsan.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index edf7f12935d72..dce94e84199d3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -169,6 +169,7 @@ config X86 select HAVE_ARCH_KASAN if X86_64 select HAVE_ARCH_KASAN_VMALLOC if X86_64 select HAVE_ARCH_KFENCE + select HAVE_ARCH_KMSAN if X86_64 select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT diff --git a/arch/x86/include/asm/kmsan.h b/arch/x86/include/asm/kmsan.h new file mode 100644 index 0000000000000..a790b865d0a68 --- /dev/null +++ b/arch/x86/include/asm/kmsan.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * x86 KMSAN support. + * + * Copyright (C) 2022, Google LLC + * Author: Alexander Potapenko + */ + +#ifndef _ASM_X86_KMSAN_H +#define _ASM_X86_KMSAN_H + +#ifndef MODULE + +#include +#include + +/* + * Taken from arch/x86/mm/physaddr.h to avoid using an instrumented version. + */ +static inline bool kmsan_phys_addr_valid(unsigned long addr) +{ + if (IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) + return !(addr >> boot_cpu_data.x86_phys_bits); + else + return true; +} + +/* + * Taken from arch/x86/mm/physaddr.c to avoid using an instrumented version. + */ +static inline bool kmsan_virt_addr_valid(void *addr) +{ + unsigned long x = (unsigned long)addr; + unsigned long y = x - __START_KERNEL_map; + + /* use the carry flag to determine if x was < __START_KERNEL_map */ + if (unlikely(x > y)) { + x = y + phys_base; + + if (y >= KERNEL_IMAGE_SIZE) + return false; + } else { + x = y + (__START_KERNEL_map - PAGE_OFFSET); + + /* carry flag will be set if starting x was >= PAGE_OFFSET */ + if ((x > y) || !kmsan_phys_addr_valid(x)) + return false; + } + + return pfn_valid(x >> PAGE_SHIFT); +} + +#endif /* !MODULE */ + +#endif /* _ASM_X86_KMSAN_H */ From 9aa056dbfa5db3683de433f0716705974c66718a Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Thu, 15 Sep 2022 22:22:36 +0800 Subject: [PATCH 247/352] mm/damon/sysfs: avoid call damon_target_has_pid() repeatedly In damon_sysfs_destroy_targets(), we call damon_target_has_pid() to check whether the 'ctx' include a valid pid, but there no need to call damon_target_has_pid() to check repeatedly, just need call it once. Link: https://lkml.kernel.org/r/20220915142237.92529-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index d947e27be74bc..5004269a0cdfb 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2143,9 +2143,13 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) { struct damon_target *t, *next; + bool has_pid = false; + + if (damon_target_has_pid(ctx)) + has_pid = true; damon_for_each_target_safe(t, next, ctx) { - if (damon_target_has_pid(ctx)) + if (has_pid) put_pid(t->pid); damon_destroy_target(t); } From 63ff814161f125151a3a509c21557e8cae0f3343 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 16 Sep 2022 21:35:35 +0800 Subject: [PATCH 248/352] mm/damon/sysfs: more simplified code calls damon_target_has_pid() This patch further simplifies the damon_target_has_pid() call and makes the damon_sysfs_destroy_targets() look cleaner. Link: https://lkml.kernel.org/r/20220916133535.7428-1-xhao@linux.alibaba.com Fixes: 49fb890735ab (mm/damon/sysfs: avoid call damon_target_has_pid() repeatedly) Signed-off-by: Xin Hao Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 5004269a0cdfb..cc22c155f3583 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2143,10 +2143,7 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) { struct damon_target *t, *next; - bool has_pid = false; - - if (damon_target_has_pid(ctx)) - has_pid = true; + bool has_pid = damon_target_has_pid(ctx); damon_for_each_target_safe(t, next, ctx) { if (has_pid) From ed1f187b0c5ed03fa21a865185219d9b2575a8fe Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Thu, 15 Sep 2022 13:30:41 +0000 Subject: [PATCH 249/352] mm/damon: simplify scheme create in lru_sort.c In damon_lru_sort_new_hot_scheme() and damon_lru_sort_new_cold_scheme(), they have so much in common, so we can combine them into a single function, and we just need to distinguish their differences. Link: https://lkml.kernel.org/r/20220915133041.71819-1-sj@kernel.org Signed-off-by: Xin Hao Signed-off-by: SeongJae Park Suggested-by: SeongJae Park Reviewed-by: Xin Hao Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 07a0908963fd0..96300ceb65ebb 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -132,6 +132,18 @@ DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_cold_stat, lru_sort_tried_cold_regions, lru_sorted_cold_regions, cold_quota_exceeds); +struct damos_access_pattern damon_lru_sort_stub_pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* no matter its access frequency */ + .min_nr_accesses = 0, + .max_nr_accesses = UINT_MAX, + /* no matter its age */ + .min_age_region = 0, + .max_age_region = UINT_MAX, +}; + static struct damon_ctx *ctx; static struct damon_target *target; @@ -157,36 +169,19 @@ static struct damos *damon_lru_sort_new_scheme( /* Create a DAMON-based operation scheme for hot memory regions */ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) { - struct damos_access_pattern pattern = { - /* Find regions having PAGE_SIZE or larger size */ - .min_sz_region = PAGE_SIZE, - .max_sz_region = ULONG_MAX, - /* and accessed for more than the threshold */ - .min_nr_accesses = hot_thres, - .max_nr_accesses = UINT_MAX, - /* no matter its age */ - .min_age_region = 0, - .max_age_region = UINT_MAX, - }; + struct damos_access_pattern pattern = damon_lru_sort_stub_pattern; + pattern.min_nr_accesses = hot_thres; return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_PRIO); } /* Create a DAMON-based operation scheme for cold memory regions */ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) { - struct damos_access_pattern pattern = { - /* Find regions having PAGE_SIZE or larger size */ - .min_sz_region = PAGE_SIZE, - .max_sz_region = ULONG_MAX, - /* and not accessed at all */ - .min_nr_accesses = 0, - .max_nr_accesses = 0, - /* for min_age or more micro-seconds */ - .min_age_region = cold_thres, - .max_age_region = UINT_MAX, - }; + struct damos_access_pattern pattern = damon_lru_sort_stub_pattern; + pattern.max_nr_accesses = 0; + pattern.min_age_region = cold_thres; return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO); } From 0f5e614674317094df19bcb9bd3c964d5596dc07 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Sat, 17 Sep 2022 20:12:28 +0800 Subject: [PATCH 250/352] mm/damon: change damon_lru_sort_stub_pattern to static damon_lru_sort_stub_pattern is only used in lru_sort.c now, change it to static. Link: https://lkml.kernel.org/r/20220917121228.1889699-1-yangyingliang@huawei.com Fixes: 27812ec2a990 ("mm/damon: simplify scheme create in lru_sort.c") Signed-off-by: Yang Yingliang Cc: SeongJae Park Cc: Xin Hao Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 96300ceb65ebb..a91c1e364fc7b 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -132,7 +132,7 @@ DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_cold_stat, lru_sort_tried_cold_regions, lru_sorted_cold_regions, cold_quota_exceeds); -struct damos_access_pattern damon_lru_sort_stub_pattern = { +static struct damos_access_pattern damon_lru_sort_stub_pattern = { /* Find regions having PAGE_SIZE or larger size */ .min_sz_region = PAGE_SIZE, .max_sz_region = ULONG_MAX, From 85d11245b0ede17f3acbc5517bb3e189968e0ac9 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 15 Sep 2022 19:33:41 +0800 Subject: [PATCH 251/352] mm/damon: use 'struct damon_target *' instead of 'void *' in target_valid() We could use 'struct damon_target *' directly instead of 'void *' in target_valid() operation to make code simple. Link: https://lkml.kernel.org/r/1663241621-13293-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- mm/damon/vaddr.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index c5dc0c77c7722..1dda8d0068e54 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -346,7 +346,7 @@ struct damon_operations { unsigned long (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); - bool (*target_valid)(void *target); + bool (*target_valid)(struct damon_target *t); void (*cleanup)(struct damon_ctx *context); }; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 3f84584f99826..f53c2ff2bcc8a 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -593,9 +593,8 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) * Functions for the target validity check and cleanup */ -static bool damon_va_target_valid(void *target) +static bool damon_va_target_valid(struct damon_target *t) { - struct damon_target *t = target; struct task_struct *task; task = damon_get_task_struct(t); From 4e0d3156497c17a969eef6785cbcf07a6c9bcdb0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Sep 2022 10:41:56 +0100 Subject: [PATCH 252/352] mm: add PSI accounting around ->read_folio and ->readahead calls Patch series "improve pagecache PSI annotations", v2. Currently the VM tries to abuse the block layer submission path for the page cache PSI annotations. This series instead annotates the ->read_folio and ->readahead calls in the core VM code, and then only deals with the odd direct add_to_page_cache_lru calls manually. This patch (of 5): PSI tries to account for the cost of bringing back in pages discarded by the MM LRU management. Currently the prime place for that is hooked into the bio submission path, which is a rather bad place: - it does not actually account I/O for non-block file systems, of which we have many - it adds overhead and a layering violation to the block layer Add the accounting into the two places in the core MM code that read pages into an address space by calling into ->read_folio and ->readahead so that the entire file system operations are covered, to broaden the coverage and allow removing the accounting in the block layer going forward. As psi_memstall_enter can deal with nested calls this will not lead to double accounting even while the bio annotations are still present. Link: https://lkml.kernel.org/r/20220915094200.139713-1-hch@lst.de Link: https://lkml.kernel.org/r/20220915094200.139713-2-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Cc: Chao Yu Cc: Chris Mason Cc: David Sterba Cc: Gao Xiang Cc: Jens Axboe Cc: Josef Bacik Cc: Matthew Wilcox Cc: Suren Baghdasaryan Cc: Gao Xiang Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 2 ++ mm/filemap.c | 7 +++++++ mm/readahead.c | 22 ++++++++++++++++++---- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 23125ab87ded2..bbccb40442224 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1169,6 +1169,8 @@ struct readahead_control { pgoff_t _index; unsigned int _nr_pages; unsigned int _batch_count; + bool _workingset; + unsigned long _pflags; }; #define DEFINE_READAHEAD(ractl, f, r, m, i) \ diff --git a/mm/filemap.c b/mm/filemap.c index ec17bd1a3bb77..08341616ae7a1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2390,6 +2390,8 @@ static void filemap_get_read_batch(struct address_space *mapping, static int filemap_read_folio(struct file *file, filler_t filler, struct folio *folio) { + bool workingset = folio_test_workingset(folio); + unsigned long pflags; int error; /* @@ -2398,8 +2400,13 @@ static int filemap_read_folio(struct file *file, filler_t filler, * fails. */ folio_clear_error(folio); + /* Start the actual read. The read will unlock the page. */ + if (unlikely(workingset)) + psi_memstall_enter(&pflags); error = filler(file, folio); + if (unlikely(workingset)) + psi_memstall_leave(&pflags); if (error) return error; diff --git a/mm/readahead.c b/mm/readahead.c index fdcd28cbd92de..b10f0cf81d804 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -122,6 +122,7 @@ #include #include #include +#include #include #include #include @@ -152,6 +153,8 @@ static void read_pages(struct readahead_control *rac) if (!readahead_count(rac)) return; + if (unlikely(rac->_workingset)) + psi_memstall_enter(&rac->_pflags); blk_start_plug(&plug); if (aops->readahead) { @@ -179,6 +182,9 @@ static void read_pages(struct readahead_control *rac) } blk_finish_plug(&plug); + if (unlikely(rac->_workingset)) + psi_memstall_leave(&rac->_pflags); + rac->_workingset = false; BUG_ON(readahead_count(rac)); } @@ -252,6 +258,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, } if (i == nr_to_read - lookahead_size) folio_set_readahead(folio); + ractl->_workingset |= folio_test_workingset(folio); ractl->_nr_pages++; } @@ -480,11 +487,14 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, if (index == mark) folio_set_readahead(folio); err = filemap_add_folio(ractl->mapping, folio, index, gfp); - if (err) + if (err) { folio_put(folio); - else - ractl->_nr_pages += 1UL << order; - return err; + return err; + } + + ractl->_nr_pages += 1UL << order; + ractl->_workingset |= folio_test_workingset(folio); + return 0; } void page_cache_ra_order(struct readahead_control *ractl, @@ -826,6 +836,10 @@ void readahead_expand(struct readahead_control *ractl, put_page(page); return; } + if (unlikely(PageWorkingset(page)) && !ractl->_workingset) { + ractl->_workingset = true; + psi_memstall_enter(&ractl->_pflags); + } ractl->_nr_pages++; if (ra) { ra->size++; From ae29a4033307d6693959214e0f658d89a8ef4c01 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Sep 2022 10:41:57 +0100 Subject: [PATCH 253/352] sched/psi: export psi_memstall_{enter,leave} To properly account for all refaults from file system logic, file systems need to call psi_memstall_enter directly, so export it. Link: https://lkml.kernel.org/r/20220915094200.139713-3-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Cc: Chao Yu Cc: Chris Mason Cc: David Sterba Cc: Gao Xiang Cc: Gao Xiang Cc: Jens Axboe Cc: Josef Bacik Cc: Matthew Wilcox Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- kernel/sched/psi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ecb4b4ff4ce0a..7f6030091aeee 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -917,6 +917,7 @@ void psi_memstall_enter(unsigned long *flags) rq_unlock_irq(rq, &rf); } +EXPORT_SYMBOL_GPL(psi_memstall_enter); /** * psi_memstall_leave - mark the end of an memory stall section @@ -946,6 +947,7 @@ void psi_memstall_leave(unsigned long *flags) rq_unlock_irq(rq, &rf); } +EXPORT_SYMBOL_GPL(psi_memstall_leave); #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgroup) From 031c5f0ef34ae36cfe00f7515997a942a4ba2627 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Sep 2022 10:41:58 +0100 Subject: [PATCH 254/352] btrfs: add manual PSI accounting for compressed reads btrfs compressed reads try to always read the entire compressed chunk, even if only a subset is requested. Currently this is covered by the magic PSI accounting underneath submit_bio, but that is about to go away. Instead add manual psi_memstall_{enter,leave} annotations. Note that for readahead this really should be using readahead_expand, but the additionals reads are also done for plain ->read_folio where readahead_expand can't work, so this overall logic is left as-is for now. Link: https://lkml.kernel.org/r/20220915094200.139713-4-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: David Sterba Acked-by: Johannes Weiner Cc: Chao Yu Cc: Chris Mason Cc: Gao Xiang Cc: Gao Xiang Cc: Jens Axboe Cc: Josef Bacik Cc: Matthew Wilcox Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- fs/btrfs/compression.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 05d228efcd31a..8d4811561eba7 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -518,7 +519,8 @@ static u64 bio_end_offset(struct bio *bio) */ static noinline int add_ra_bio_pages(struct inode *inode, u64 compressed_end, - struct compressed_bio *cb) + struct compressed_bio *cb, + unsigned long *pflags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long end_index; @@ -587,6 +589,9 @@ static noinline int add_ra_bio_pages(struct inode *inode, continue; } + if (PageWorkingset(page)) + psi_memstall_enter(pflags); + ret = set_page_extent_mapped(page); if (ret < 0) { unlock_page(page); @@ -673,6 +678,8 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; + /* Initialize to 1 to make skip psi_memstall_leave unless needed */ + unsigned long pflags = 1; blk_status_t ret; int ret2; int i; @@ -728,7 +735,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, goto fail; } - add_ra_bio_pages(inode, em_start + em_len, cb); + add_ra_bio_pages(inode, em_start + em_len, cb, &pflags); /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; @@ -809,6 +816,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } } + if (!pflags) + psi_memstall_leave(&pflags); + if (refcount_dec_and_test(&cb->pending_ios)) finish_compressed_bio_read(cb); return; From f21e117d9a774e54b99956e071d6be341df9000f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Sep 2022 10:41:59 +0100 Subject: [PATCH 255/352] erofs: add manual PSI accounting for the compressed address space erofs uses an additional address space for compressed data read from disk in addition to the one directly associated with the inode. Reading into the lower address space is open coded using add_to_page_cache_lru instead of using the filemap.c helper for page allocation micro-optimizations, which means it is not covered by the MM PSI annotations for ->read_folio and ->readahead, so add manual ones instead. Link: https://lkml.kernel.org/r/20220915094200.139713-5-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Acked-by: Gao Xiang Cc: Chao Yu Cc: Chris Mason Cc: David Sterba Cc: Gao Xiang Cc: Jens Axboe Cc: Josef Bacik Cc: Matthew Wilcox Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- fs/erofs/zdata.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 5792ca9e0d5ef..143a101a36887 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -7,6 +7,7 @@ #include "zdata.h" #include "compress.h" #include +#include #include @@ -1365,6 +1366,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; + /* initialize to 1 to make skip psi_memstall_leave unless needed */ + unsigned long pflags = 1; bi_private = jobqueueset_init(sb, q, fgq, force_fg); qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; @@ -1414,10 +1417,15 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, if (bio && (cur != last_index + 1 || last_bdev != mdev.m_bdev)) { submit_bio_retry: + if (!pflags) + psi_memstall_leave(&pflags); submit_bio(bio); bio = NULL; } + if (unlikely(PageWorkingset(page))) + psi_memstall_enter(&pflags); + if (!bio) { bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOIO); @@ -1445,8 +1453,11 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, move_to_bypass_jobqueue(pcl, qtail, owned_head); } while (owned_head != Z_EROFS_PCLUSTER_TAIL); - if (bio) + if (bio) { + if (!pflags) + psi_memstall_leave(&pflags); submit_bio(bio); + } /* * although background is preferred, no one is pending for submission. From b33d9724c2da010330ed3d1b69ee59784439f19d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Sep 2022 10:42:00 +0100 Subject: [PATCH 256/352] block: remove PSI accounting from the bio layer PSI accounting is now done by the VM code, where it should have been since the beginning. Link: https://lkml.kernel.org/r/20220915094200.139713-6-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Cc: Chao Yu Cc: Chris Mason Cc: David Sterba Cc: Gao Xiang Cc: Gao Xiang Cc: Jens Axboe Cc: Josef Bacik Cc: Matthew Wilcox Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- block/bio.c | 8 -------- block/blk-core.c | 17 ----------------- fs/direct-io.c | 2 -- include/linux/blk_types.h | 1 - 4 files changed, 28 deletions(-) diff --git a/block/bio.c b/block/bio.c index 7d10476ceefd4..22ab8e3adbeb7 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1067,9 +1067,6 @@ void __bio_add_page(struct bio *bio, struct page *page, bio->bi_iter.bi_size += len; bio->bi_vcnt++; - - if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page))) - bio_set_flag(bio, BIO_WORKINGSET); } EXPORT_SYMBOL_GPL(__bio_add_page); @@ -1279,9 +1276,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) * fit into the bio, or are requested in @iter, whatever is smaller. If * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. - * - * It's intended for direct IO, so doesn't do PSI tracking, the caller is - * responsible for setting BIO_WORKINGSET if necessary. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { @@ -1297,8 +1291,6 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); - /* don't account direct I/O as memory stall */ - bio_clear_flag(bio, BIO_WORKINGSET); return bio->bi_vcnt ? 0 : ret; } EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); diff --git a/block/blk-core.c b/block/blk-core.c index a0d1104c5590c..9e19195af6f5b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -829,22 +828,6 @@ void submit_bio(struct bio *bio) count_vm_events(PGPGOUT, bio_sectors(bio)); } - /* - * If we're reading data that is part of the userspace workingset, count - * submission time as memory stall. When the device is congested, or - * the submitting cgroup IO-throttled, submission can be a significant - * part of overall IO time. - */ - if (unlikely(bio_op(bio) == REQ_OP_READ && - bio_flagged(bio, BIO_WORKINGSET))) { - unsigned long pflags; - - psi_memstall_enter(&pflags); - submit_bio_noacct(bio); - psi_memstall_leave(&pflags); - return; - } - submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); diff --git a/fs/direct-io.c b/fs/direct-io.c index 05c044c55374e..1a2ddbe482264 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -421,8 +421,6 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) unsigned long flags; bio->bi_private = dio; - /* don't account direct I/O as memory stall */ - bio_clear_flag(bio, BIO_WORKINGSET); spin_lock_irqsave(&dio->bio_lock, flags); dio->refcount++; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1ef99790f6ed3..8b1858df21752 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -321,7 +321,6 @@ enum { BIO_NO_PAGE_REF, /* don't put release vec pages */ BIO_CLONED, /* doesn't own data */ BIO_BOUNCED, /* bio is a bounce bio */ - BIO_WORKINGSET, /* contains userspace workingset pages */ BIO_QUIET, /* Make BIO Quiet */ BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ BIO_REFFED, /* bio has elevated ->bi_cnt */ From 0a42f616f4637eac2d258d0a081f57129e3e2c6a Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 15 Sep 2022 10:10:23 +0800 Subject: [PATCH 257/352] mm/damon/reclaim: change damon_reclaim_wmarks to static damon_reclaim_wmarks is only used in reclaim.c now, change it to static. Link: https://lkml.kernel.org/r/20220915021024.4177940-1-yangyingliang@huawei.com Fixes: 89dd02d8abd1 ("mm/damon/reclaim: use watermarks parameters generator macro") Signed-off-by: Yang Yingliang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 1acf808e16242..039fa55e0ae9c 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -64,7 +64,7 @@ static struct damos_quota damon_reclaim_quota = { }; DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_reclaim_quota); -struct damos_watermarks damon_reclaim_wmarks = { +static struct damos_watermarks damon_reclaim_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = 5000000, /* 5 seconds */ .high = 500, /* 50 percent */ From 051ded92d0c4120406d6a89dd0c1f29d1c233995 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 15 Sep 2022 10:10:24 +0800 Subject: [PATCH 258/352] mm/damon/lru_sort: change damon_lru_sort_wmarks to static damon_lru_sort_wmarks is only used in lru_sort.c now, change it to static. Link: https://lkml.kernel.org/r/20220915021024.4177940-2-yangyingliang@huawei.com Fixes: 189aa3d58206 ("mm/damon/lru_sort: use watermarks parameters generator macro") Signed-off-by: Yang Yingliang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index a91c1e364fc7b..4a40054ba03bf 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -77,7 +77,7 @@ static struct damos_quota damon_lru_sort_quota = { }; DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(damon_lru_sort_quota); -struct damos_watermarks damon_lru_sort_wmarks = { +static struct damos_watermarks damon_lru_sort_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = 5000000, /* 5 seconds */ .high = 200, /* 20 percent */ From 23d673d8a438e2440053a1ea54638b6054808950 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Thu, 15 Sep 2022 09:16:02 +0800 Subject: [PATCH 259/352] mm/secretmem: add __init annotation to secretmem_init() It's a fs_initcall entry, add __init annotation to it. Link: https://lkml.kernel.org/r/20220915011602.176967-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Cc: Mike Rapoport Signed-off-by: Andrew Morton --- mm/secretmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/secretmem.c b/mm/secretmem.c index 3f71540997958..6a44efb673b2c 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -276,7 +276,7 @@ static struct file_system_type secretmem_fs = { .kill_sb = kill_anon_super, }; -static int secretmem_init(void) +static int __init secretmem_init(void) { int ret = 0; From 3d62020a1e795471849508ba660fdfaf1fb92a4a Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Fri, 16 Sep 2022 23:20:35 +0800 Subject: [PATCH 260/352] mm/damon: return void from damon_set_schemes() There is no point in returning an int from damon_set_schemes(). It always returns 0 which is meaningless for the caller, so change it to return void directly. Link: https://lkml.kernel.org/r/1663341635-12675-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- mm/damon/core.c | 5 +---- mm/damon/dbgfs.c | 8 +++----- mm/damon/lru_sort.c | 4 +--- mm/damon/reclaim.c | 4 +--- 5 files changed, 7 insertions(+), 16 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 1dda8d0068e54..e7808a84675fb 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -541,7 +541,7 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs); -int damon_set_schemes(struct damon_ctx *ctx, +void damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); bool damon_is_registered_ops(enum damon_ops_id id); diff --git a/mm/damon/core.c b/mm/damon/core.c index a843673c11cfc..9c80c6eb00c24 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -454,10 +454,8 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) * * This function should not be called while the kdamond of the context is * running. - * - * Return: 0 if success, or negative error code otherwise. */ -int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, +void damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes) { struct damos *s, *next; @@ -467,7 +465,6 @@ int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, damon_destroy_scheme(s); for (i = 0; i < nr_schemes; i++) damon_add_scheme(ctx, schemes[i]); - return 0; } /** diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index c00eba4448d85..6f0ae7d3ae39b 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -307,11 +307,9 @@ static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, goto unlock_out; } - ret = damon_set_schemes(ctx, schemes, nr_schemes); - if (!ret) { - ret = count; - nr_schemes = 0; - } + damon_set_schemes(ctx, schemes, nr_schemes); + ret = count; + nr_schemes = 0; unlock_out: mutex_unlock(&ctx->kdamond_lock); diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 4a40054ba03bf..d7eb72b41cb67 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -203,9 +203,7 @@ static int damon_lru_sort_apply_parameters(void) scheme = damon_lru_sort_new_hot_scheme(hot_thres); if (!scheme) return -ENOMEM; - err = damon_set_schemes(ctx, &scheme, 1); - if (err) - return err; + damon_set_schemes(ctx, &scheme, 1); cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval; scheme = damon_lru_sort_new_cold_scheme(cold_thres); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 039fa55e0ae9c..3d59ab11b7b39 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -155,9 +155,7 @@ static int damon_reclaim_apply_parameters(void) scheme = damon_reclaim_new_scheme(); if (!scheme) return -ENOMEM; - err = damon_set_schemes(ctx, &scheme, 1); - if (err) - return err; + damon_set_schemes(ctx, &scheme, 1); if (monitor_region_start > monitor_region_end) return -EINVAL; From 643a67045cde547e00dd932660ee9e31f7b3ea24 Mon Sep 17 00:00:00 2001 From: Chih-En Lin Date: Fri, 16 Sep 2022 17:04:34 +0800 Subject: [PATCH 261/352] mm/page_table_check: fix typos Link: https://lkml.kernel.org/r/20220916090434.701194-1-shiyn.lin@gmail.com Signed-off-by: Chih-En Lin Acked-by: Pasha Tatashin Signed-off-by: Andrew Morton --- mm/page_table_check.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 903db62794d3c..433dbce13fe1d 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -53,7 +53,7 @@ static struct page_table_check *get_page_table_check(struct page_ext *page_ext) } /* - * An enty is removed from the page table, decrement the counters for that page + * An entry is removed from the page table, decrement the counters for that page * verify that it is of correct type and counters do not become negative. */ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, @@ -87,7 +87,7 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, } /* - * A new enty is added to the page table, increment the counters for that page + * A new entry is added to the page table, increment the counters for that page * verify that it is of correct type and is not being mapped with a different * type to a different process. */ From 5702dada93f17108a580355e6ef4bafa2d932730 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:42 +0800 Subject: [PATCH 262/352] mm/page_alloc: ensure kswapd doesn't accidentally go to sleep Patch series "A few cleanup patches for mm", v2. This series contains a few cleanup patches to remove the obsolete comments and functions, use helper macro to improve readability and so on. More details can be found in the respective changelogs. This patch (of 16): If ALLOC_KSWAPD is set, wake_all_kswapds() will be called to ensure kswapd doesn't accidentally go to sleep. But when reserve_flags is set, alloc_flags will be overwritten and ALLOC_KSWAPD is thus lost. Preserve the ALLOC_KSWAPD flag in alloc_flags to ensure kswapd won't go to sleep accidentally. Link: https://lkml.kernel.org/r/20220916072257.9639-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220916072257.9639-2-linmiaohe@huawei.com Fixes: 0a79cdad5eb2 ("mm: use alloc_flags to record if kswapd can wake") Signed-off-by: Miaohe Lin Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Anshuman Khandual Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef3f51c9815de..35e18514215ac 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5148,7 +5148,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); if (reserve_flags) - alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags); + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) | + (alloc_flags & ALLOC_KSWAPD); /* * Reset the nodemask and zonelist iterators if memory policies can be From 96b6d15acc031b0ea94d65e56d4e7fe60d1e0ff6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:43 +0800 Subject: [PATCH 263/352] mm/page_alloc: make zone_pcp_update() static Since commit b92ca18e8ca5 ("mm/page_alloc: disassociate the pcp->high from pcp->batch"), zone_pcp_update() is only used in mm/page_alloc.c. Move zone_pcp_update() up to avoid forward declaration and then make it static. No functional change intended. Link: https://lkml.kernel.org/r/20220916072257.9639-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/internal.h | 1 - mm/page_alloc.c | 22 +++++++++++----------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 883b09f024044..94d8a976c2e25 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -367,7 +367,6 @@ extern int user_min_free_kbytes; extern void free_unref_page(struct page *page, unsigned int order); extern void free_unref_page_list(struct list_head *list); -extern void zone_pcp_update(struct zone *zone, int cpu_online); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 35e18514215ac..04559870ed126 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7234,6 +7234,17 @@ void __meminit setup_zone_pageset(struct zone *zone) zone_set_pageset_high_and_batch(zone, 0); } +/* + * The zone indicated has a new number of managed_pages; batch sizes and percpu + * page high values need to be recalculated. + */ +static void zone_pcp_update(struct zone *zone, int cpu_online) +{ + mutex_lock(&pcp_batch_high_lock); + zone_set_pageset_high_and_batch(zone, cpu_online); + mutex_unlock(&pcp_batch_high_lock); +} + /* * Allocate per cpu pagesets and initialize them. * Before this call only boot pagesets were available. @@ -9465,17 +9476,6 @@ void free_contig_range(unsigned long pfn, unsigned long nr_pages) } EXPORT_SYMBOL(free_contig_range); -/* - * The zone indicated has a new number of managed_pages; batch sizes and percpu - * page high values need to be recalculated. - */ -void zone_pcp_update(struct zone *zone, int cpu_online) -{ - mutex_lock(&pcp_batch_high_lock); - zone_set_pageset_high_and_batch(zone, cpu_online); - mutex_unlock(&pcp_batch_high_lock); -} - /* * Effectively disable pcplists for the zone by setting the high limit to 0 * and draining all cpus. A concurrent page freeing on another CPU that's about From 41e79918885623ffebe4c29811e1a316e9ebf7d5 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:44 +0800 Subject: [PATCH 264/352] mm: remove obsolete macro NR_PCP_ORDER_MASK and NR_PCP_ORDER_WIDTH Since commit 8b10b465d0e1 ("mm/page_alloc: free pages in a single pass during bulk free"), they're not used anymore. Remove them. Link: https://lkml.kernel.org/r/20220916072257.9639-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 7 ------- mm/page_alloc.c | 1 - 2 files changed, 8 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c69c081568227..3ff1e757d5aab 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -564,13 +564,6 @@ enum zone_watermarks { #define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1)) #define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP) -/* - * Shift to encode migratetype and order in the same integer, with order - * in the least significant bits. - */ -#define NR_PCP_ORDER_WIDTH 8 -#define NR_PCP_ORDER_MASK ((1<_watermark[WMARK_MIN] + z->watermark_boost) #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 04559870ed126..bb2906829599b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1584,7 +1584,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, order = pindex_to_order(pindex); nr_pages = 1 << order; - BUILD_BUG_ON(MAX_ORDER >= (1< Date: Fri, 16 Sep 2022 15:22:45 +0800 Subject: [PATCH 265/352] mm/page_alloc: remove obsolete comment in zone_statistics() Since commit 43c95bcc51e4 ("mm/page_alloc: reduce duration that IRQs are disabled for VM counters"), zone_statistics() is not called with interrupts disabled. Update the corresponding comment. Link: https://lkml.kernel.org/r/20220916072257.9639-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bb2906829599b..762619463bb57 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3671,8 +3671,6 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt) /* * Update NUMA hit/miss statistics - * - * Must be called with interrupts disabled. */ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, long nr_account) From 2df9b197bba28e21d6e4a14f12253b2a2b4eefcc Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:46 +0800 Subject: [PATCH 266/352] mm/page_alloc: add __init annotations to init_mem_debugging_and_hardening() It's only called by mm_init(). Add __init annotations to it. Link: https://lkml.kernel.org/r/20220916072257.9639-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- mm/page_alloc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 364bcadb4d202..c2277f5aba9eb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3093,7 +3093,7 @@ extern int apply_to_existing_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); -extern void init_mem_debugging_and_hardening(void); +extern void __init init_mem_debugging_and_hardening(void); #ifdef CONFIG_PAGE_POISONING extern void __kernel_poison_pages(struct page *page, int numpages); extern void __kernel_unpoison_pages(struct page *page, int numpages); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 762619463bb57..27ef763bb59f8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -903,7 +903,7 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, * order of appearance. So we need to first gather the full picture of what was * enabled, and then make decisions. */ -void init_mem_debugging_and_hardening(void) +void __init init_mem_debugging_and_hardening(void) { bool page_poisoning_requested = false; From 5ca5a02aafde986fc94f6a0520fb6aa89924712b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:47 +0800 Subject: [PATCH 267/352] mm/page_alloc: fix freeing static percpu memory The size of struct per_cpu_zonestat can be 0 on !SMP && !NUMA. In that case, zone->per_cpu_zonestats will always equal to boot_zonestats. But in zone_pcp_reset(), zone->per_cpu_zonestats is freed via free_percpu() directly without checking against boot_zonestats first. boot_zonestats will be released by free_percpu() unexpectedly. Link: https://lkml.kernel.org/r/20220916072257.9639-7-linmiaohe@huawei.com Fixes: 28f836b6777b ("mm/page_alloc: split per cpu page lists and zone stats") Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Anshuman Khandual Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 27ef763bb59f8..cad2357709481 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -9505,9 +9505,11 @@ void zone_pcp_reset(struct zone *zone) drain_zonestat(zone, pzstats); } free_percpu(zone->per_cpu_pageset); - free_percpu(zone->per_cpu_zonestats); zone->per_cpu_pageset = &boot_pageset; - zone->per_cpu_zonestats = &boot_zonestats; + if (zone->per_cpu_zonestats != &boot_zonestats) { + free_percpu(zone->per_cpu_zonestats); + zone->per_cpu_zonestats = &boot_zonestats; + } } } From 6d159915e6e3ee031beade718e5abd241948ead8 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:48 +0800 Subject: [PATCH 268/352] mm: remove obsolete pgdat_is_empty() There's no caller. Remove it. Link: https://lkml.kernel.org/r/20220916072257.9639-8-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3ff1e757d5aab..4c8510f26b02b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1241,11 +1241,6 @@ static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) return pgdat->node_start_pfn + pgdat->node_spanned_pages; } -static inline bool pgdat_is_empty(pg_data_t *pgdat) -{ - return !pgdat->node_start_pfn && !pgdat->node_spanned_pages; -} - #include void build_all_zonelists(pg_data_t *pgdat); From 5ea45d2571e3a1dba2984b7f826fb6f6cbc5828a Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:49 +0800 Subject: [PATCH 269/352] mm/page_alloc: add missing is_migrate_isolate() check in set_page_guard() In MIGRATE_ISOLATE case, zone freepage state shouldn't be modified as caller will take care of it. Add missing is_migrate_isolate() here to avoid possible unbalanced freepage state. This would happen if someone isolates the block, and then we face an MCE failure/soft-offline on a page within that block. __mod_zone_freepage_state() will be triggered via below call trace which already had been triggered back when block was isolated: take_page_off_buddy break_down_buddy_pages set_page_guard Link: https://lkml.kernel.org/r/20220916072257.9639-9-linmiaohe@huawei.com Fixes: 06be6ff3d2ec ("mm,hwpoison: rework soft offline for free pages") Signed-off-by: Miaohe Lin Reviewed-by: Oscar Salvador Reviewed-by: David Hildenbrand Cc: Anshuman Khandual Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cad2357709481..8fcc905ef317a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -873,7 +873,8 @@ static inline bool set_page_guard(struct zone *zone, struct page *page, INIT_LIST_HEAD(&page->buddy_list); set_page_private(page, order); /* Guard pages are not available for any usage */ - __mod_zone_freepage_state(zone, -(1 << order), migratetype); + if (!is_migrate_isolate(migratetype)) + __mod_zone_freepage_state(zone, -(1 << order), migratetype); return true; } From 76e93371863b22fc70a6c80228783b6cb0455d9c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:50 +0800 Subject: [PATCH 270/352] mm/page_alloc: use local variable zone_idx directly Use local variable zone_idx directly since it holds the exact value of zone_idx(). No functional change intended. Link: https://lkml.kernel.org/r/20220916072257.9639-10-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8fcc905ef317a..83b2cb93d6fdc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6873,7 +6873,7 @@ void __ref memmap_init_zone_device(struct zone *zone, unsigned long start = jiffies; int nid = pgdat->node_id; - if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE)) + if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE)) return; /* From b8ae683f6e1e693a015b8386c0460abcd790951b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:51 +0800 Subject: [PATCH 271/352] mm, memory_hotplug: remove obsolete generic_free_nodedata() Commit 390511e1476e ("mm, memory_hotplug: drop arch_free_nodedata") drops the last caller of generic_free_nodedata(). Remove it too. Link: https://lkml.kernel.org/r/20220916072257.9639-11-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 51052969dbfe4..9fcbf57065957 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -43,11 +43,6 @@ extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); ({ \ memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); \ }) -/* - * This definition is just for error path in node hotadd. - * For node hotremove, we have to replace this. - */ -#define generic_free_nodedata(pgdat) kfree(pgdat) extern pg_data_t *node_data[]; static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) @@ -63,9 +58,6 @@ static inline pg_data_t *generic_alloc_nodedata(int nid) BUG(); return NULL; } -static inline void generic_free_nodedata(pg_data_t *pgdat) -{ -} static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) { } From 90a721a34aff7f80e20444f5c867a0cdf33778d3 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:52 +0800 Subject: [PATCH 272/352] mm/page_alloc: make boot_nodestats static It's only used in mm/page_alloc.c now. Make it static. Link: https://lkml.kernel.org/r/20220916072257.9639-12-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/internal.h | 2 -- mm/page_alloc.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 94d8a976c2e25..b3002e03c28f0 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -836,8 +836,6 @@ int migrate_device_coherent_page(struct page *page); */ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); -DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); - extern bool mirrored_kernelcore; static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 83b2cb93d6fdc..6bdc98c7019fb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6571,7 +6571,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta #define BOOT_PAGESET_BATCH 1 static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset); static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats); -DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); +static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); static void __build_all_zonelists(void *data) { From 13d0f288f50991621d766e58f7c75f28ad157168 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:53 +0800 Subject: [PATCH 273/352] mm/page_alloc: use helper macro SZ_1{K,M} Use helper macro SZ_1K and SZ_1M to do the size conversion. Minor readability improvement. Link: https://lkml.kernel.org/r/20220916072257.9639-13-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Anshuman Khandual Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/page_alloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6bdc98c7019fb..67ec8a2e1db2b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7048,7 +7048,7 @@ static int zone_batchsize(struct zone *zone) * size is striking a balance between allocation latency * and zone lock contention. */ - batch = min(zone_managed_pages(zone) >> 10, (1024 * 1024) / PAGE_SIZE); + batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE); batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; @@ -8523,8 +8523,8 @@ void __init mem_init_print_info(void) #endif ")\n", K(nr_free_pages()), K(physpages), - codesize >> 10, datasize >> 10, rosize >> 10, - (init_data_size + init_code_size) >> 10, bss_size >> 10, + codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K, + (init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K, K(physpages - totalram_pages() - totalcma_pages), K(totalcma_pages) #ifdef CONFIG_HIGHMEM @@ -9049,8 +9049,8 @@ void *__init alloc_large_system_hash(const char *tablename, numentries -= arch_reserved_kernel_pages(); /* It isn't necessary when PAGE_SIZE >= 1MB */ - if (PAGE_SHIFT < 20) - numentries = round_up(numentries, (1<<20)/PAGE_SIZE); + if (PAGE_SIZE < SZ_1M) + numentries = round_up(numentries, SZ_1M / PAGE_SIZE); #if __BITS_PER_LONG > 32 if (!high_limit) { From f6f96e23a77144a438d2bbd6eea865fd6a27584b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:54 +0800 Subject: [PATCH 274/352] mm/page_alloc: init local variable buddy_pfn The local variable buddy_pfn could be passed to buddy_merge_likely() without initialization if the passed in order is MAX_ORDER - 1. This looks buggy but buddy_pfn won't be used in this case as there's a order >= MAX_ORDER - 2 check. Init buddy_pfn to 0 anyway to avoid possible future misuse. Link: https://lkml.kernel.org/r/20220916072257.9639-14-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 67ec8a2e1db2b..652b3d7660035 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1113,7 +1113,7 @@ static inline void __free_one_page(struct page *page, int migratetype, fpi_t fpi_flags) { struct capture_control *capc = task_capc(zone); - unsigned long buddy_pfn; + unsigned long buddy_pfn = 0; unsigned long combined_pfn; struct page *buddy; bool to_tail; From a8984f98ac85a8d336e17e230d0f21572b201127 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:55 +0800 Subject: [PATCH 275/352] mm/page_alloc: use costly_order in WARN_ON_ONCE_GFP() There's no need to check whether order > PAGE_ALLOC_COSTLY_ORDER again. Minor readability improvement. Link: https://lkml.kernel.org/r/20220916072257.9639-15-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 652b3d7660035..3d2ad5c197d56 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5272,7 +5272,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * so that we can identify them and convert them to something * else. */ - WARN_ON_ONCE_GFP(order > PAGE_ALLOC_COSTLY_ORDER, gfp_mask); + WARN_ON_ONCE_GFP(costly_order, gfp_mask); /* * Help non-failing allocations by giving them access to memory From e2f5056c150db3058b534cad0faea0ffecda1239 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:56 +0800 Subject: [PATCH 276/352] mm/page_alloc: remove obsolete gfpflags_normal_context() Since commit dacb5d8875cc ("tcp: fix page frag corruption on page fault"), there's no caller of gfpflags_normal_context(). Remove it as this helper is strictly tied to the sk page frag usage and there won't be other user in the future. Link: https://lkml.kernel.org/r/20220916072257.9639-16-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/gfp.h | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index ea6cb9399152e..ef4aea3b356e7 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -36,29 +36,6 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) return !!(gfp_flags & __GFP_DIRECT_RECLAIM); } -/** - * gfpflags_normal_context - is gfp_flags a normal sleepable context? - * @gfp_flags: gfp_flags to test - * - * Test whether @gfp_flags indicates that the allocation is from the - * %current context and allowed to sleep. - * - * An allocation being allowed to block doesn't mean it owns the %current - * context. When direct reclaim path tries to allocate memory, the - * allocation context is nested inside whatever %current was doing at the - * time of the original allocation. The nested allocation may be allowed - * to block but modifying anything %current owns can corrupt the outer - * context's expectations. - * - * %true result from this function indicates that the allocation context - * can sleep and use anything that's associated with %current. - */ -static inline bool gfpflags_normal_context(const gfp_t gfp_flags) -{ - return (gfp_flags & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC)) == - __GFP_DIRECT_RECLAIM; -} - #ifdef CONFIG_HIGHMEM #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM #else From d335288640111ab8e59c9161c26aea806355fdee Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 19 Sep 2022 17:30:51 +0800 Subject: [PATCH 277/352] mm-page_alloc-remove-obsolete-gfpflags_normal_context-fix fix htmldocs Link: https://lkml.kernel.org/r/1bc55727-9b66-0e9e-c306-f10c4716ea89@huawei.com Reported-by: Stephen Rothwell Signed-off-by: Andrew Morton --- Documentation/core-api/mm-api.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index 1ebcc6c3fafe7..f5dde5bceaeaf 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -19,9 +19,6 @@ User Space Memory Access Memory Allocation Controls ========================== -.. kernel-doc:: include/linux/gfp.h - :internal: - .. kernel-doc:: include/linux/gfp_types.h :doc: Page mobility and placement hints From 20e08a1e33dbc6aa592ad31343aa8a2319cd807d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:57 +0800 Subject: [PATCH 278/352] mm/page_alloc: fix obsolete comment in deferred_pfn_valid() There are no architectures that can have holes in the memory map within a pageblock since commit 859a85ddf90e ("mm: remove pfn_valid_within() and CONFIG_HOLES_IN_ZONE"). Update the corresponding comment. Link: https://lkml.kernel.org/r/20220916072257.9639-17-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3d2ad5c197d56..d7b20bf09c1c1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1929,11 +1929,7 @@ static inline void __init pgdat_init_report_one_done(void) /* * Returns true if page needs to be initialized or freed to buddy allocator. * - * First we check if pfn is valid on architectures where it is possible to have - * holes within pageblock_nr_pages. On systems where it is not possible, this - * function is optimized out. - * - * Then, we check if a current large page is valid by only checking the validity + * We check if a current large page is valid by only checking the validity * of the head pfn. */ static inline bool __init deferred_pfn_valid(unsigned long pfn) From 24b35bb9a673e1738138ce44bf7afbee66138615 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 16 Sep 2022 14:46:38 -0700 Subject: [PATCH 279/352] hugetlb: freeze allocated pages before creating hugetlb pages When creating hugetlb pages, the hugetlb code must first allocate contiguous pages from a low level allocator such as buddy, cma or memblock. The pages returned from these low level allocators are ref counted. This creates potential issues with other code taking speculative references on these pages before they can be transformed to a hugetlb page. This issue has been addressed with methods and code such as that provided in [1]. Recent discussions about vmemmap freeing [2] have indicated that it would be beneficial to freeze all sub pages, including the head page of pages returned from low level allocators before converting to a hugetlb page. This helps avoid races if we want to replace the page containing vmemmap for the head page. There have been proposals to change at least the buddy allocator to return frozen pages as described at [3]. If such a change is made, it can be employed by the hugetlb code. However, as mentioned above hugetlb uses several low level allocators so each would need to be modified to return frozen pages. For now, we can manually freeze the returned pages. This is done in two places: 1) alloc_buddy_huge_page, only the returned head page is ref counted. We freeze the head page, retrying once in the VERY rare case where there may be an inflated ref count. 2) prep_compound_gigantic_page, for gigantic pages the current code freezes all pages except the head page. New code will simply freeze the head page as well. In a few other places, code checks for inflated ref counts on newly allocated hugetlb pages. With the modifications to freeze after allocating, this code can be removed. After hugetlb pages are freshly allocated, they are often added to the hugetlb free lists. Since these pages were previously ref counted, this was done via put_page() which would end up calling the hugetlb destructor: free_huge_page. With changes to freeze pages, we simply call free_huge_page directly to add the pages to the free list. In a few other places, freshly allocated hugetlb pages were immediately put into use, and the expectation was they were already ref counted. In these cases, we must manually ref count the page. [1] https://lore.kernel.org/linux-mm/20210622021423.154662-3-mike.kravetz@oracle.com/ [2] https://lore.kernel.org/linux-mm/20220802180309.19340-1-joao.m.martins@oracle.com/ [3] https://lore.kernel.org/linux-mm/20220809171854.3725722-1-willy@infradead.org/ Link: https://lkml.kernel.org/r/20220916214638.155744-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Oscar Salvador Reviewed-by: Muchun Song Reviewed-by: Miaohe Lin Cc: Joao Martins Cc: Matthew Wilcox Cc: Michal Hocko Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/hugetlb.c | 102 +++++++++++++++++++-------------------------------- 1 file changed, 38 insertions(+), 64 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 652d22270db3c..0c3c2c5ef994b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1787,9 +1787,8 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, /* we rely on prep_new_huge_page to set the destructor */ set_compound_order(page, order); - __ClearPageReserved(page); __SetPageHead(page); - for (i = 1; i < nr_pages; i++) { + for (i = 0; i < nr_pages; i++) { p = nth_page(page, i); /* @@ -1830,17 +1829,19 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, } else { VM_BUG_ON_PAGE(page_count(p), p); } - set_compound_head(p, page); + if (i != 0) + set_compound_head(p, page); } atomic_set(compound_mapcount_ptr(page), -1); atomic_set(compound_pincount_ptr(page), 0); return true; out_error: - /* undo tail page modifications made above */ - for (j = 1; j < i; j++) { + /* undo page modifications made above */ + for (j = 0; j < i; j++) { p = nth_page(page, j); - clear_compound_head(p); + if (j != 0) + clear_compound_head(p); set_page_refcounted(p); } /* need to clear PG_reserved on remaining tail pages */ @@ -1936,6 +1937,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int order = huge_page_order(h); struct page *page; bool alloc_try_hard = true; + bool retry = true; /* * By default we always try hard to allocate the page with @@ -1951,7 +1953,21 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); +retry: page = __alloc_pages(gfp_mask, order, nid, nmask); + + /* Freeze head page */ + if (!page_ref_freeze(page, 1)) { + __free_pages(page, order); + if (retry) { /* retry once */ + retry = false; + goto retry; + } + /* WOW! twice in a row. */ + pr_warn("HugeTLB head page unexpected inflated ref count\n"); + page = NULL; + } + if (page) __count_vm_event(HTLB_BUDDY_PGALLOC); else @@ -1979,6 +1995,9 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, /* * Common helper to allocate a fresh hugetlb page. All specific allocators * should use this function to get new hugetlb pages + * + * Note that returned page is 'frozen': ref count of head page and all tail + * pages is zero. */ static struct page *alloc_fresh_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, @@ -2036,7 +2055,7 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, if (!page) return 0; - put_page(page); /* free it into the hugepage allocator */ + free_huge_page(page); /* free it into the hugepage allocator */ return 1; } @@ -2193,10 +2212,9 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) * Allocates a fresh surplus page from the page allocator. */ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nmask, bool zero_ref) + int nid, nodemask_t *nmask) { struct page *page = NULL; - bool retry = false; if (hstate_is_gigantic(h)) return NULL; @@ -2206,7 +2224,6 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, goto out_unlock; spin_unlock_irq(&hugetlb_lock); -retry: page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); if (!page) return NULL; @@ -2222,34 +2239,10 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { SetHPageTemporary(page); spin_unlock_irq(&hugetlb_lock); - put_page(page); + free_huge_page(page); return NULL; } - if (zero_ref) { - /* - * Caller requires a page with zero ref count. - * We will drop ref count here. If someone else is holding - * a ref, the page will be freed when they drop it. Abuse - * temporary page flag to accomplish this. - */ - SetHPageTemporary(page); - if (!put_page_testzero(page)) { - /* - * Unexpected inflated ref count on freshly allocated - * huge. Retry once. - */ - pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n"); - spin_unlock_irq(&hugetlb_lock); - if (retry) - return NULL; - - retry = true; - goto retry; - } - ClearHPageTemporary(page); - } - h->surplus_huge_pages++; h->surplus_huge_pages_node[page_to_nid(page)]++; @@ -2271,6 +2264,9 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, if (!page) return NULL; + /* fresh huge pages are frozen */ + set_page_refcounted(page); + /* * We do not account these pages as surplus because they are only * temporary and will be released properly on the last reference @@ -2298,14 +2294,14 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, gfp_t gfp = gfp_mask | __GFP_NOWARN; gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false); + page = alloc_surplus_huge_page(h, gfp, nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } if (!page) - page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false); + page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); return page; } @@ -2375,7 +2371,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), - NUMA_NO_NODE, NULL, true); + NUMA_NO_NODE, NULL); if (!page) { alloc_ok = false; break; @@ -2737,7 +2733,6 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nid = page_to_nid(old_page); - bool alloc_retry = false; struct page *new_page; int ret = 0; @@ -2748,30 +2743,9 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, * the pool. This simplifies and let us do most of the processing * under the lock. */ -alloc_retry: new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); if (!new_page) return -ENOMEM; - /* - * If all goes well, this page will be directly added to the free - * list in the pool. For this the ref count needs to be zero. - * Attempt to drop now, and retry once if needed. It is VERY - * unlikely there is another ref on the page. - * - * If someone else has a reference to the page, it will be freed - * when they drop their ref. Abuse temporary page flag to accomplish - * this. Retry once if there is an inflated ref count. - */ - SetHPageTemporary(new_page); - if (!put_page_testzero(new_page)) { - if (alloc_retry) - return -EBUSY; - - alloc_retry = true; - goto alloc_retry; - } - ClearHPageTemporary(new_page); - __prep_new_huge_page(h, new_page); retry: @@ -2951,6 +2925,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, } spin_lock_irq(&hugetlb_lock); list_add(&page->lru, &h->hugepage_activelist); + set_page_refcounted(page); /* Fall through */ } hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); @@ -3055,7 +3030,7 @@ static void __init gather_bootmem_prealloc(void) if (prep_compound_gigantic_page(page, huge_page_order(h))) { WARN_ON(PageReserved(page)); prep_new_huge_page(h, page, page_to_nid(page)); - put_page(page); /* add to the hugepage allocator */ + free_huge_page(page); /* add to the hugepage allocator */ } else { /* VERY unlikely inflated ref count on a tail page */ free_gigantic_page(page, huge_page_order(h)); @@ -3087,7 +3062,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) &node_states[N_MEMORY], NULL); if (!page) break; - put_page(page); /* free it into the hugepage allocator */ + free_huge_page(page); /* free it into the hugepage allocator */ } cond_resched(); } @@ -3478,9 +3453,8 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) else prep_compound_page(subpage, target_hstate->order); set_page_private(subpage, 0); - set_page_refcounted(subpage); prep_new_huge_page(target_hstate, subpage, nid); - put_page(subpage); + free_huge_page(subpage); } mutex_unlock(&target_hstate->resize_lock); From b526ab775fd6c6cc4e8b8e5289c41fca094d65b3 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 21 Sep 2022 13:27:02 -0700 Subject: [PATCH 280/352] hugetlb-freeze-allocated-pages-before-creating-hugetlb-pages-v3 fix NULL pointer dereference in alloc_buddy_huge_page caused by not checking for page before attempting to freeze. Thanks Naoya. Link: https://lkml.kernel.org/r/20220921202702.106069-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Oscar Salvador Reviewed-by: Muchun Song Reviewed-by: Miaohe Lin Cc: Joao Martins Cc: Matthew Wilcox Cc: Michal Hocko Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0c3c2c5ef994b..bc68bb67e821a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1957,7 +1957,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, page = __alloc_pages(gfp_mask, order, nid, nmask); /* Freeze head page */ - if (!page_ref_freeze(page, 1)) { + if (page && !page_ref_freeze(page, 1)) { __free_pages(page, order); if (retry) { /* retry once */ retry = false; From 85447804db309a74590c3bc51744706431c1bdb7 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Sat, 17 Sep 2022 21:56:54 +0800 Subject: [PATCH 281/352] mm/damon: rename damon_pageout_score() to damon_cold_score() In the beginning there is only one damos_action 'DAMOS_PAGEOUT' that need to get the coldness score of a region for a scheme, which using damon_pageout_score() to do that. But now there are also other damos_action actions need the coldness score, so rename it to damon_cold_score() to make more sense. Link: https://lkml.kernel.org/r/1663423014-28907-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 2 +- mm/damon/ops-common.h | 2 +- mm/damon/paddr.c | 4 ++-- mm/damon/vaddr.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 9310df72e1c54..75409601f9349 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -130,7 +130,7 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r, return hotness; } -int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, +int damon_cold_score(struct damon_ctx *c, struct damon_region *r, struct damos *s) { int hotness = damon_hot_score(c, r, s); diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index 52329ff361cd0..8d82d37222042 100644 --- a/mm/damon/ops-common.h +++ b/mm/damon/ops-common.h @@ -12,7 +12,7 @@ struct page *damon_get_page(unsigned long pfn); void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr); void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr); -int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, +int damon_cold_score(struct damon_ctx *c, struct damon_region *r, struct damos *s); int damon_hot_score(struct damon_ctx *c, struct damon_region *r, struct damos *s); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index dfeebffe82f44..e1a4315c4be6a 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -287,11 +287,11 @@ static int damon_pa_scheme_score(struct damon_ctx *context, { switch (scheme->action) { case DAMOS_PAGEOUT: - return damon_pageout_score(context, r, scheme); + return damon_cold_score(context, r, scheme); case DAMOS_LRU_PRIO: return damon_hot_score(context, r, scheme); case DAMOS_LRU_DEPRIO: - return damon_pageout_score(context, r, scheme); + return damon_cold_score(context, r, scheme); default: break; } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index f53c2ff2bcc8a..ea94e0b2c3113 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -673,7 +673,7 @@ static int damon_va_scheme_score(struct damon_ctx *context, switch (scheme->action) { case DAMOS_PAGEOUT: - return damon_pageout_score(context, r, scheme); + return damon_cold_score(context, r, scheme); default: break; } From 0b75f78557eb3d3b0d51771219e372dbc94b3a6d Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Sun, 18 Sep 2022 19:13:48 -0700 Subject: [PATCH 282/352] hugetlb: simplify hugetlb handling in follow_page_mask During discussions of this series [1], it was suggested that hugetlb handling code in follow_page_mask could be simplified. At the beginning of follow_page_mask, there currently is a call to follow_huge_addr which 'may' handle hugetlb pages. ia64 is the only architecture which provides a follow_huge_addr routine that does not return error. Instead, at each level of the page table a check is made for a hugetlb entry. If a hugetlb entry is found, a call to a routine associated with that entry is made. Currently, there are two checks for hugetlb entries at each page table level. The first check is of the form: if (p?d_huge()) page = follow_huge_p?d(); the second check is of the form: if (is_hugepd()) page = follow_huge_pd(). We can replace these checks, as well as the special handling routines such as follow_huge_p?d() and follow_huge_pd() with a single routine to handle hugetlb vmas. A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the beginning of follow_page_mask. hugetlb_follow_page_mask will use the existing routine huge_pte_offset to walk page tables looking for hugetlb entries. huge_pte_offset can be overwritten by architectures, and already handles special cases such as hugepd entries. [1] https://lore.kernel.org/linux-mm/cover.1661240170.git.baolin.wang@linux.alibaba.com/ Link: https://lkml.kernel.org/r/20220919021348.22151-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Aneesh Kumar K.V Cc: Christophe Leroy Cc: Michael Ellerman Cc: Muchun Song Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- arch/ia64/mm/hugetlbpage.c | 15 --- arch/powerpc/mm/hugetlbpage.c | 37 ------- include/linux/hugetlb.h | 50 ++-------- mm/gup.c | 80 +++------------ mm/hugetlb.c | 182 ++++++++++++---------------------- 5 files changed, 86 insertions(+), 278 deletions(-) diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index f993cb36c0626..380d2f3966c98 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -91,21 +91,6 @@ int prepare_hugepage_range(struct file *file, return 0; } -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write) -{ - struct page *page; - pte_t *ptep; - - if (REGION_NUMBER(addr) != RGN_HPAGE) - return ERR_PTR(-EINVAL); - - ptep = huge_pte_offset(mm, addr, HPAGE_SIZE); - if (!ptep || pte_none(*ptep)) - return NULL; - page = pte_page(*ptep); - page += ((addr & ~HPAGE_MASK) >> PAGE_SHIFT); - return page; -} int pmd_huge(pmd_t pmd) { return 0; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index bc84a594ca62c..b0e037c75c122 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -506,43 +506,6 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, } while (addr = next, addr != end); } -struct page *follow_huge_pd(struct vm_area_struct *vma, - unsigned long address, hugepd_t hpd, - int flags, int pdshift) -{ - pte_t *ptep; - spinlock_t *ptl; - struct page *page = NULL; - unsigned long mask; - int shift = hugepd_shift(hpd); - struct mm_struct *mm = vma->vm_mm; - -retry: - /* - * hugepage directory entries are protected by mm->page_table_lock - * Use this instead of huge_pte_lockptr - */ - ptl = &mm->page_table_lock; - spin_lock(ptl); - - ptep = hugepte_offset(hpd, address, pdshift); - if (pte_present(*ptep)) { - mask = (1UL << shift) - 1; - page = pte_page(*ptep); - page += ((address & mask) >> PAGE_SHIFT); - if (flags & FOLL_GET) - get_page(page); - } else { - if (is_hugetlb_entry_migration(*ptep)) { - spin_unlock(ptl); - __migration_entry_wait(mm, ptep, ptl); - goto retry; - } - } - spin_unlock(ptl); - return page; -} - bool __init arch_hugetlb_valid_size(unsigned long size) { int shift = __ffs(size); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index cfe15b32e2d43..32d45e96a8946 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -149,6 +149,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, unsigned long len); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *, struct vm_area_struct *); +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, unsigned long *, long, unsigned int, @@ -209,17 +211,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end); -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write); -struct page *follow_huge_pd(struct vm_area_struct *vma, - unsigned long address, hugepd_t hpd, - int flags, int pdshift); -struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, - int flags); -struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int flags); -struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address, - pgd_t *pgd, int flags); void hugetlb_vma_lock_read(struct vm_area_struct *vma); void hugetlb_vma_unlock_read(struct vm_area_struct *vma); @@ -272,6 +263,12 @@ static inline void adjust_range_if_pmd_sharing_possible( { } +static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/ +} + static inline long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, @@ -282,12 +279,6 @@ static inline long follow_hugetlb_page(struct mm_struct *mm, return 0; } -static inline struct page *follow_huge_addr(struct mm_struct *mm, - unsigned long address, int write) -{ - return ERR_PTR(-EINVAL); -} - static inline int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, @@ -320,31 +311,6 @@ static inline void hugetlb_show_meminfo_node(int nid) { } -static inline struct page *follow_huge_pd(struct vm_area_struct *vma, - unsigned long address, hugepd_t hpd, int flags, - int pdshift) -{ - return NULL; -} - -static inline struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, - unsigned long address, int flags) -{ - return NULL; -} - -static inline struct page *follow_huge_pud(struct mm_struct *mm, - unsigned long address, pud_t *pud, int flags) -{ - return NULL; -} - -static inline struct page *follow_huge_pgd(struct mm_struct *mm, - unsigned long address, pgd_t *pgd, int flags) -{ - return NULL; -} - static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { diff --git a/mm/gup.c b/mm/gup.c index bfded683e64e0..b580b5510a60a 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -537,18 +537,6 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return ERR_PTR(-EINVAL); - - /* - * Considering PTE level hugetlb, like continuous-PTE hugetlb on - * ARM64 architecture. - */ - if (is_vm_hugetlb_page(vma)) { - page = follow_huge_pmd_pte(vma, address, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - retry: if (unlikely(pmd_bad(*pmd))) return no_page_table(vma, flags); @@ -680,20 +668,6 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, pmdval = READ_ONCE(*pmd); if (pmd_none(pmdval)) return no_page_table(vma, flags); - if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) { - page = follow_huge_pmd_pte(vma, address, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - if (is_hugepd(__hugepd(pmd_val(pmdval)))) { - page = follow_huge_pd(vma, address, - __hugepd(pmd_val(pmdval)), flags, - PMD_SHIFT); - if (page) - return page; - return no_page_table(vma, flags); - } retry: if (!pmd_present(pmdval)) { /* @@ -783,20 +757,6 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, pud = pud_offset(p4dp, address); if (pud_none(*pud)) return no_page_table(vma, flags); - if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) { - page = follow_huge_pud(mm, address, pud, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - if (is_hugepd(__hugepd(pud_val(*pud)))) { - page = follow_huge_pd(vma, address, - __hugepd(pud_val(*pud)), flags, - PUD_SHIFT); - if (page) - return page; - return no_page_table(vma, flags); - } if (pud_devmap(*pud)) { ptl = pud_lock(mm, pud); page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap); @@ -816,7 +776,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, struct follow_page_context *ctx) { p4d_t *p4d; - struct page *page; p4d = p4d_offset(pgdp, address); if (p4d_none(*p4d)) @@ -825,14 +784,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, if (unlikely(p4d_bad(*p4d))) return no_page_table(vma, flags); - if (is_hugepd(__hugepd(p4d_val(*p4d)))) { - page = follow_huge_pd(vma, address, - __hugepd(p4d_val(*p4d)), flags, - P4D_SHIFT); - if (page) - return page; - return no_page_table(vma, flags); - } return follow_pud_mask(vma, address, p4d, flags, ctx); } @@ -870,10 +821,18 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, ctx->page_mask = 0; - /* make this handle hugepd */ - page = follow_huge_addr(mm, address, flags & FOLL_WRITE); - if (!IS_ERR(page)) { - WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN)); + /* + * Call hugetlb_follow_page_mask for hugetlb vmas as it will use + * special hugetlb page table walking code. This eliminates the + * need to check for hugetlb entries in the general walking code. + * + * hugetlb_follow_page_mask is only for follow_page() handling here. + * Ordinary GUP uses follow_hugetlb_page for hugetlb processing. + */ + if (is_vm_hugetlb_page(vma)) { + page = hugetlb_follow_page_mask(vma, address, flags); + if (!page) + page = no_page_table(vma, flags); return page; } @@ -882,21 +841,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) return no_page_table(vma, flags); - if (pgd_huge(*pgd)) { - page = follow_huge_pgd(mm, address, pgd, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - if (is_hugepd(__hugepd(pgd_val(*pgd)))) { - page = follow_huge_pd(vma, address, - __hugepd(pgd_val(*pgd)), flags, - PGDIR_SHIFT); - if (page) - return page; - return no_page_table(vma, flags); - } - return follow_p4d_mask(vma, address, pgd, flags, ctx); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bc68bb67e821a..8613db3f754b0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6130,6 +6130,72 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte, return false; } +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + struct hstate *h = hstate_vma(vma); + struct mm_struct *mm = vma->vm_mm; + unsigned long haddr = address & huge_page_mask(h); + struct page *page = NULL; + spinlock_t *ptl; + pte_t *pte, entry; + + /* + * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via + * follow_hugetlb_page(). + */ + if (WARN_ON_ONCE(flags & FOLL_PIN)) + return NULL; + +retry: + /* + * vma lock prevents racing with another thread doing a pmd unshare. + * This keeps pte as returned by huge_pte_offset valid. + */ + hugetlb_vma_lock_read(vma); + + pte = huge_pte_offset(mm, haddr, huge_page_size(h)); + if (!pte) { + hugetlb_vma_unlock_read(vma); + return NULL; + } + + ptl = huge_pte_lock(h, mm, pte); + entry = huge_ptep_get(pte); + if (pte_present(entry)) { + page = pte_page(entry) + + ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); + /* + * Note that page may be a sub-page, and with vmemmap + * optimizations the page struct may be read only. + * try_grab_page() will increase the ref count on the + * head page, so this will be OK. + * + * try_grab_page() should always succeed here, because we hold + * the ptl lock and have verified pte_present(). + */ + if (WARN_ON_ONCE(!try_grab_page(page, flags))) { + page = NULL; + goto out; + } + } else { + if (is_hugetlb_entry_migration(entry)) { + spin_unlock(ptl); + hugetlb_vma_unlock_read(vma); + __migration_entry_wait_huge(pte, ptl); + goto retry; + } + /* + * hwpoisoned entry is treated as no_page_table in + * follow_page_mask(). + */ + } +out: + spin_unlock(ptl); + hugetlb_vma_unlock_read(vma); + return page; +} + long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, unsigned long *nr_pages, @@ -7120,122 +7186,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) * These functions are overwritable if your architecture needs its own * behavior. */ -struct page * __weak -follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write) -{ - return ERR_PTR(-EINVAL); -} - -struct page * __weak -follow_huge_pd(struct vm_area_struct *vma, - unsigned long address, hugepd_t hpd, int flags, int pdshift) -{ - WARN(1, "hugepd follow called with no support for hugepage directory format\n"); - return NULL; -} - -struct page * __weak -follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags) -{ - struct hstate *h = hstate_vma(vma); - struct mm_struct *mm = vma->vm_mm; - struct page *page = NULL; - spinlock_t *ptl; - pte_t *ptep, pte; - - /* - * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via - * follow_hugetlb_page(). - */ - if (WARN_ON_ONCE(flags & FOLL_PIN)) - return NULL; - -retry: - ptep = huge_pte_offset(mm, address, huge_page_size(h)); - if (!ptep) - return NULL; - - ptl = huge_pte_lock(h, mm, ptep); - pte = huge_ptep_get(ptep); - if (pte_present(pte)) { - page = pte_page(pte) + - ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); - /* - * try_grab_page() should always succeed here, because: a) we - * hold the pmd (ptl) lock, and b) we've just checked that the - * huge pmd (head) page is present in the page tables. The ptl - * prevents the head page and tail pages from being rearranged - * in any way. So this page must be available at this point, - * unless the page refcount overflowed: - */ - if (WARN_ON_ONCE(!try_grab_page(page, flags))) { - page = NULL; - goto out; - } - } else { - if (is_hugetlb_entry_migration(pte)) { - spin_unlock(ptl); - __migration_entry_wait_huge(ptep, ptl); - goto retry; - } - /* - * hwpoisoned entry is treated as no_page_table in - * follow_page_mask(). - */ - } -out: - spin_unlock(ptl); - return page; -} - -struct page * __weak -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int flags) -{ - struct page *page = NULL; - spinlock_t *ptl; - pte_t pte; - - if (WARN_ON_ONCE(flags & FOLL_PIN)) - return NULL; - -retry: - ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud); - if (!pud_huge(*pud)) - goto out; - pte = huge_ptep_get((pte_t *)pud); - if (pte_present(pte)) { - page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); - if (WARN_ON_ONCE(!try_grab_page(page, flags))) { - page = NULL; - goto out; - } - } else { - if (is_hugetlb_entry_migration(pte)) { - spin_unlock(ptl); - __migration_entry_wait(mm, (pte_t *)pud, ptl); - goto retry; - } - /* - * hwpoisoned entry is treated as no_page_table in - * follow_page_mask(). - */ - } -out: - spin_unlock(ptl); - return page; -} - -struct page * __weak -follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags) -{ - if (flags & (FOLL_GET | FOLL_PIN)) - return NULL; - - return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); -} - int isolate_hugetlb(struct page *page, struct list_head *list) { int ret = 0; From f41e2fa68b9f2813084cddd528b47e91d3b9b389 Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Sun, 18 Sep 2022 02:56:40 +0000 Subject: [PATCH 283/352] mm/page_alloc: update comments for rmqueue() Since commit 44042b449872 ("mm/page_alloc: allow high-order pages to be stored on the per-cpu lists"), the per-cpu page allocators (PCP) is not only for order-0 pages. Update the comments. Link: https://lkml.kernel.org/r/20220918025640.208586-1-ran.xiaokai@zte.com.cn Signed-off-by: Ran Xiaokai Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d7b20bf09c1c1..34b1009744411 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3810,7 +3810,8 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, } /* - * Allocate a page from the given zone. Use pcplists for order-0 allocations. + * Allocate a page from the given zone. + * Use pcplists for THP or "cheap" high-order allocations. */ /* From 1b33de5fdb9fe9170759ba097bcdeb53471410f5 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 21 Sep 2022 18:13:56 +0900 Subject: [PATCH 284/352] mm,hwpoison,hugetlb,memory_hotplug: hotremove memory section with hwpoisoned hugepage Patch series " mm, hwpoison: improve handling workload related to hugetlb and memory_hotplug", v3. This patchset tries to solve the issue among memory_hotplug, hugetlb and hwpoison. Based on the review over v2 by Miaohe (thank you!), 1/4 takes another approach to prevent hwpoisoned hugepages to be migrated (i.e. the corrupted data is accessed) in memory hotremove. In this patchset, memory hotplug handles hwpoison pages like below: - hwpoison pages should not prevent memory hotremove, - memory block with hwpoison pages should not be onlined. This patch (of 4): HWPoisoned page is not supposed to be accessed once marked, but currently such accesses can happen during memory hotremove because do_migrate_range() can be called before dissolve_free_huge_pages() is called. Clear HPageMigratable for hwpoisoned hugepages to prevent them from being migrated. This should be done in hugetlb_lock to avoid race against isolate_hugetlb(). get_hwpoison_huge_page() needs to have a flag to show it's called from unpoison to take refcount of hwpoisoned hugepages, so add it. Link: https://lkml.kernel.org/r/20220921091359.25889-1-naoya.horiguchi@linux.dev Link: https://lkml.kernel.org/r/20220921091359.25889-2-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reported-by: Miaohe Lin Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 4 ++-- mm/memory-failure.c | 12 ++++++++++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 32d45e96a8946..2f4d7ad6b470c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -183,7 +183,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); int isolate_hugetlb(struct page *page, struct list_head *list); -int get_hwpoison_huge_page(struct page *page, bool *hugetlb); +int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags); void putback_active_hugepage(struct page *page); void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); @@ -391,7 +391,7 @@ static inline int isolate_hugetlb(struct page *page, struct list_head *list) return -EBUSY; } -static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb) +static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison) { return 0; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8613db3f754b0..3b9d03c1970c4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7204,7 +7204,7 @@ int isolate_hugetlb(struct page *page, struct list_head *list) return ret; } -int get_hwpoison_huge_page(struct page *page, bool *hugetlb) +int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison) { int ret = 0; @@ -7214,7 +7214,7 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb) *hugetlb = true; if (HPageFreed(page)) ret = 0; - else if (HPageMigratable(page)) + else if (HPageMigratable(page) || unpoison) ret = get_page_unless_zero(page); else ret = -EBUSY; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 145bb561ddb3a..5942e1c0407e1 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1244,7 +1244,7 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags) int ret = 0; bool hugetlb = false; - ret = get_hwpoison_huge_page(head, &hugetlb); + ret = get_hwpoison_huge_page(head, &hugetlb, false); if (hugetlb) return ret; @@ -1334,7 +1334,7 @@ static int __get_unpoison_page(struct page *page) int ret = 0; bool hugetlb = false; - ret = get_hwpoison_huge_page(head, &hugetlb); + ret = get_hwpoison_huge_page(head, &hugetlb, true); if (hugetlb) return ret; @@ -1815,6 +1815,13 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) goto out; } + /* + * Clearing HPageMigratable for hwpoisoned hugepages to prevent them + * from being migrated by memory hotremove. + */ + if (count_increased) + ClearHPageMigratable(head); + return ret; out: if (count_increased) @@ -1862,6 +1869,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb if (hwpoison_filter(p)) { hugetlb_clear_page_hwpoison(head); + SetHPageMigratable(head); unlock_page(head); if (res == 1) put_page(head); From aa3e1ec495ab2fc3f22fc9240ad53a23b21fe0a1 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 21 Sep 2022 18:13:57 +0900 Subject: [PATCH 285/352] mm/hwpoison: move definitions of num_poisoned_pages_* to memory-failure.c These interfaces will be used by drivers/base/core.c by later patch, so as a preparatory work move them to more common header file visible to the file. Link: https://lkml.kernel.org/r/20220921091359.25889-3-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Cc: David Hildenbrand Cc: Jane Chu Cc: Miaohe Lin Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- arch/parisc/kernel/pdt.c | 3 +-- include/linux/mm.h | 5 +++++ include/linux/swapops.h | 24 ++---------------------- mm/memory-failure.c | 10 ++++++++++ 4 files changed, 18 insertions(+), 24 deletions(-) diff --git a/arch/parisc/kernel/pdt.c b/arch/parisc/kernel/pdt.c index e391b175f5ece..fdc880e2575a5 100644 --- a/arch/parisc/kernel/pdt.c +++ b/arch/parisc/kernel/pdt.c @@ -18,8 +18,7 @@ #include #include #include -#include -#include +#include #include #include diff --git a/include/linux/mm.h b/include/linux/mm.h index c2277f5aba9eb..80a2d800f272f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3279,11 +3279,16 @@ extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags); +extern void num_poisoned_pages_inc(void); #else static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) { return 0; } + +static inline void num_poisoned_pages_inc(void) +{ +} #endif #ifndef arch_memory_failure diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 86b95ccb81bb7..3ba9bf56899da 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -581,8 +581,6 @@ static inline int is_pmd_migration_entry(pmd_t pmd) #ifdef CONFIG_MEMORY_FAILURE -extern atomic_long_t num_poisoned_pages __read_mostly; - /* * Support for hardware poisoned pages */ @@ -597,17 +595,7 @@ static inline int is_hwpoison_entry(swp_entry_t entry) return swp_type(entry) == SWP_HWPOISON; } -static inline void num_poisoned_pages_inc(void) -{ - atomic_long_inc(&num_poisoned_pages); -} - -static inline void num_poisoned_pages_sub(long i) -{ - atomic_long_sub(i, &num_poisoned_pages); -} - -#else /* CONFIG_MEMORY_FAILURE */ +#else static inline swp_entry_t make_hwpoison_entry(struct page *page) { @@ -618,15 +606,7 @@ static inline int is_hwpoison_entry(swp_entry_t swp) { return 0; } - -static inline void num_poisoned_pages_inc(void) -{ -} - -static inline void num_poisoned_pages_sub(long i) -{ -} -#endif /* CONFIG_MEMORY_FAILURE */ +#endif static inline int non_swap_entry(swp_entry_t entry) { diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 5942e1c0407e1..aa6ce685b8636 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -74,6 +74,16 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); static bool hw_memory_failure __read_mostly = false; +static inline void num_poisoned_pages_inc(void) +{ + atomic_long_inc(&num_poisoned_pages); +} + +static inline void num_poisoned_pages_sub(long i) +{ + atomic_long_sub(i, &num_poisoned_pages); +} + /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, From e0b23b151b7c996212783b79009f9faf685edb35 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 21 Sep 2022 18:13:58 +0900 Subject: [PATCH 286/352] mm/hwpoison: pass pfn to num_poisoned_pages_*() No functional change. Link: https://lkml.kernel.org/r/20220921091359.25889-4-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- arch/parisc/kernel/pdt.c | 2 +- include/linux/mm.h | 4 ++-- mm/memory-failure.c | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/parisc/kernel/pdt.c b/arch/parisc/kernel/pdt.c index fdc880e2575a5..80943a00e2459 100644 --- a/arch/parisc/kernel/pdt.c +++ b/arch/parisc/kernel/pdt.c @@ -231,7 +231,7 @@ void __init pdc_pdt_init(void) /* mark memory page bad */ memblock_reserve(pdt_entry[i] & PAGE_MASK, PAGE_SIZE); - num_poisoned_pages_inc(); + num_poisoned_pages_inc(addr >> PAGE_SHIFT); } } diff --git a/include/linux/mm.h b/include/linux/mm.h index 80a2d800f272f..2bb5d1596041e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3279,14 +3279,14 @@ extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags); -extern void num_poisoned_pages_inc(void); +extern void num_poisoned_pages_inc(unsigned long pfn); #else static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) { return 0; } -static inline void num_poisoned_pages_inc(void) +static inline void num_poisoned_pages_inc(unsigned long pfn) { } #endif diff --git a/mm/memory-failure.c b/mm/memory-failure.c index aa6ce685b8636..a069d43bc87f2 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -74,12 +74,12 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); static bool hw_memory_failure __read_mostly = false; -static inline void num_poisoned_pages_inc(void) +static inline void num_poisoned_pages_inc(unsigned long pfn) { atomic_long_inc(&num_poisoned_pages); } -static inline void num_poisoned_pages_sub(long i) +static inline void num_poisoned_pages_sub(unsigned long pfn, long i) { atomic_long_sub(i, &num_poisoned_pages); } @@ -125,7 +125,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo if (release) put_page(page); page_ref_inc(page); - num_poisoned_pages_inc(); + num_poisoned_pages_inc(page_to_pfn(page)); return true; } @@ -1194,7 +1194,7 @@ static void action_result(unsigned long pfn, enum mf_action_page_type type, { trace_memory_failure_event(pfn, type, result); - num_poisoned_pages_inc(); + num_poisoned_pages_inc(pfn); pr_err("%#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); } @@ -1741,7 +1741,7 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) llist_add(&raw_hwp->node, head); /* the first error event will be counted in action_result(). */ if (ret) - num_poisoned_pages_inc(); + num_poisoned_pages_inc(page_to_pfn(page)); } else { /* * Failed to save raw error info. We no longer trace all @@ -2414,7 +2414,7 @@ int unpoison_memory(unsigned long pfn) unlock_mutex: mutex_unlock(&mf_mutex); if (!ret || freeit) { - num_poisoned_pages_sub(count); + num_poisoned_pages_sub(pfn, count); unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", page_to_pfn(p), &unpoison_rs); } From f1e8a99e7b5104f2992517f2645c258c36d3b152 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 21 Sep 2022 18:13:59 +0900 Subject: [PATCH 287/352] mm/hwpoison: introduce per-memory_block hwpoison counter Currently PageHWPoison flag does not behave well when experiencing memory hotremove/hotplug. Any data field in struct page is unreliable when the associated memory is offlined, and the current mechanism can't tell whether a memory section is onlined because a new memory devices is installed or because previous failed offline operations are undone. Especially if there's a hwpoisoned memory, it's unclear what the best option is. So introduce a new mechanism to make struct memory_block remember that a memory block has hwpoisoned memory inside it. And make any online event fail if the onlined memory block contains hwpoison. struct memory_block is freed and reallocated over ACPI-based hotremove/hotplug, but not over sysfs-based hotremove/hotplug. So it's desirable to implement hwpoison counter on this struct. Note that clear_hwpoisoned_pages() is relocated to be called earlier than now, just before unregistering struct memory_block. Otherwise, the per-memory_block hwpoison counter is freed and we fail to adjust global hwpoison counter properly. Link: https://lkml.kernel.org/r/20220921091359.25889-5-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Cc: David Hildenbrand Cc: Jane Chu Cc: Miaohe Lin Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- drivers/base/memory.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/memory.h | 3 +++ include/linux/mm.h | 8 ++++++++ mm/internal.h | 8 -------- mm/memory-failure.c | 31 ++++++++++--------------------- mm/sparse.c | 2 -- 6 files changed, 57 insertions(+), 31 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 9aa0da991cfb9..c9bde4c4ffdfe 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -183,6 +183,9 @@ static int memory_block_online(struct memory_block *mem) struct zone *zone; int ret; + if (atomic_long_read(&mem->nr_hwpoison)) + return -EHWPOISON; + zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group, start_pfn, nr_pages); @@ -864,6 +867,7 @@ void remove_memory_block_devices(unsigned long start, unsigned long size) mem = find_memory_block_by_id(block_id); if (WARN_ON_ONCE(!mem)) continue; + clear_hwpoisoned_pages(atomic_long_read(&mem->nr_hwpoison)); unregister_memory_block_under_nodes(mem); remove_memory_block(mem); } @@ -1164,3 +1168,35 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, } return ret; } + +#ifdef CONFIG_MEMORY_FAILURE + +void memblk_nr_poison_inc(unsigned long pfn) +{ + const unsigned long block_id = pfn_to_block_id(pfn); + struct memory_block *mem = find_memory_block_by_id(block_id); + + if (mem) + atomic_long_inc(&mem->nr_hwpoison); +} + +void memblk_nr_poison_sub(unsigned long pfn, long i) +{ + const unsigned long block_id = pfn_to_block_id(pfn); + struct memory_block *mem = find_memory_block_by_id(block_id); + + if (mem) + atomic_long_sub(i, &mem->nr_hwpoison); +} + +unsigned long memblk_nr_poison(unsigned long pfn) +{ + const unsigned long block_id = pfn_to_block_id(pfn); + struct memory_block *mem = find_memory_block_by_id(block_id); + + if (mem) + return atomic_long_read(&mem->nr_hwpoison); + return 0; +} + +#endif diff --git a/include/linux/memory.h b/include/linux/memory.h index aa619464a1df0..74e6b3ad947f6 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -85,6 +85,9 @@ struct memory_block { unsigned long nr_vmemmap_pages; struct memory_group *group; /* group (if any) for this block */ struct list_head group_next; /* next block inside memory group */ +#ifdef CONFIG_MEMORY_FAILURE + atomic_long_t nr_hwpoison; +#endif }; int arch_get_memory_phys_device(unsigned long start_pfn); diff --git a/include/linux/mm.h b/include/linux/mm.h index 2bb5d1596041e..2fe42bb9a517a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3280,6 +3280,10 @@ extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags); extern void num_poisoned_pages_inc(unsigned long pfn); +extern void memblk_nr_poison_inc(unsigned long pfn); +extern void memblk_nr_poison_sub(unsigned long pfn, long i); +extern unsigned long memblk_nr_poison(unsigned long pfn); +extern void clear_hwpoisoned_pages(long nr_poison); #else static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) { @@ -3289,6 +3293,10 @@ static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) static inline void num_poisoned_pages_inc(unsigned long pfn) { } + +static inline void clear_hwpoisoned_pages(long nr_poison) +{ +} #endif #ifndef arch_memory_failure diff --git a/mm/internal.h b/mm/internal.h index b3002e03c28f0..42ba8b96cab5b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -708,14 +708,6 @@ extern u64 hwpoison_filter_flags_value; extern u64 hwpoison_filter_memcg; extern u32 hwpoison_filter_enable; -#ifdef CONFIG_MEMORY_FAILURE -void clear_hwpoisoned_pages(struct page *memmap, int nr_pages); -#else -static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) -{ -} -#endif - extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a069d43bc87f2..03479895086df 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -74,14 +74,17 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); static bool hw_memory_failure __read_mostly = false; -static inline void num_poisoned_pages_inc(unsigned long pfn) +void num_poisoned_pages_inc(unsigned long pfn) { atomic_long_inc(&num_poisoned_pages); + memblk_nr_poison_inc(pfn); } static inline void num_poisoned_pages_sub(unsigned long pfn, long i) { atomic_long_sub(i, &num_poisoned_pages); + if (pfn != -1UL) + memblk_nr_poison_sub(pfn, i); } /* @@ -2414,6 +2417,10 @@ int unpoison_memory(unsigned long pfn) unlock_mutex: mutex_unlock(&mf_mutex); if (!ret || freeit) { + /* + * TODO: per-memory_block counter might break when the page + * size to be unpoisoned is larger than a memory_block. + */ num_poisoned_pages_sub(pfn, count); unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", page_to_pfn(p), &unpoison_rs); @@ -2618,25 +2625,7 @@ int soft_offline_page(unsigned long pfn, int flags) return ret; } -void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) +void clear_hwpoisoned_pages(long nr_poison) { - int i, total = 0; - - /* - * A further optimization is to have per section refcounted - * num_poisoned_pages. But that would need more space per memmap, so - * for now just do a quick global check to speed up this routine in the - * absence of bad pages. - */ - if (atomic_long_read(&num_poisoned_pages) == 0) - return; - - for (i = 0; i < nr_pages; i++) { - if (PageHWPoison(&memmap[i])) { - total++; - ClearPageHWPoison(&memmap[i]); - } - } - if (total) - num_poisoned_pages_sub(total); + num_poisoned_pages_sub(-1UL, nr_poison); } diff --git a/mm/sparse.c b/mm/sparse.c index e5a8a3a0edd74..2779b419ef2a2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -926,8 +926,6 @@ void sparse_remove_section(struct mem_section *ms, unsigned long pfn, unsigned long nr_pages, unsigned long map_offset, struct vmem_altmap *altmap) { - clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset, - nr_pages - map_offset); section_deactivate(pfn, nr_pages, altmap); } #endif /* CONFIG_MEMORY_HOTPLUG */ From 52140a8f94217a08d3cd098be35e1cb7bcbd2741 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 23 Sep 2022 17:18:27 +0900 Subject: [PATCH 288/352] mm-hwpoison-introduce-per-memory_block-hwpoison-counter-fix > > All errors (new ones prefixed by >>): > > drivers/base/memory.c: In function 'memory_block_online': > >> drivers/base/memory.c:186:34: error: 'struct memory_block' has no member named 'nr_hwpoison' > 186 | if (atomic_long_read(&mem->nr_hwpoison)) > | ^~ > drivers/base/memory.c: In function 'remove_memory_block_devices': > drivers/base/memory.c:870:61: error: 'struct memory_block' has no member named 'nr_hwpoison' > 870 | clear_hwpoisoned_pages(atomic_long_read(&mem->nr_hwpoison)); > | ^~ > I should've used the accessor memblk_nr_poison() to acccess to ->nr_hwpoison. Link: https://lkml.kernel.org/r/20220923081827.GA1357512@ik1-406-35019.vs.sakura.ne.jp Reported-by: kernel test robot Signed-off-by: Andrew Morton --- drivers/base/memory.c | 9 +++++++-- include/linux/mm.h | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index c9bde4c4ffdfe..f470bbfc68d02 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -183,7 +183,7 @@ static int memory_block_online(struct memory_block *mem) struct zone *zone; int ret; - if (atomic_long_read(&mem->nr_hwpoison)) + if (memblk_nr_poison(start_pfn)) return -EHWPOISON; zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group, @@ -867,7 +867,7 @@ void remove_memory_block_devices(unsigned long start, unsigned long size) mem = find_memory_block_by_id(block_id); if (WARN_ON_ONCE(!mem)) continue; - clear_hwpoisoned_pages(atomic_long_read(&mem->nr_hwpoison)); + clear_hwpoisoned_pages(memblk_nr_poison(start)); unregister_memory_block_under_nodes(mem); remove_memory_block(mem); } @@ -1199,4 +1199,9 @@ unsigned long memblk_nr_poison(unsigned long pfn) return 0; } +#else +unsigned long memblk_nr_poison(unsigned long pfn) +{ + return 0; +} #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 2fe42bb9a517a..5445943bbb4b9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3282,7 +3282,6 @@ extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags); extern void num_poisoned_pages_inc(unsigned long pfn); extern void memblk_nr_poison_inc(unsigned long pfn); extern void memblk_nr_poison_sub(unsigned long pfn, long i); -extern unsigned long memblk_nr_poison(unsigned long pfn); extern void clear_hwpoisoned_pages(long nr_poison); #else static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) @@ -3298,6 +3297,7 @@ static inline void clear_hwpoisoned_pages(long nr_poison) { } #endif +extern unsigned long memblk_nr_poison(unsigned long pfn); #ifndef arch_memory_failure static inline int arch_memory_failure(unsigned long pfn, int flags) From 7eea8711e650b45d1aee8009e6e29b3f46de2387 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Tue, 20 Sep 2022 16:35:30 +0000 Subject: [PATCH 289/352] mm/damon/sysfs: return 'err' value when call kstrtoul() failed We had better return the 'err' value when calling kstrtoul() failed, so the user will know why it really fails, there do little change, let it return the 'err' value when failed. Link: https://lkml.kernel.org/r/6329ebe0.050a0220.ec4bd.297cSMTPIN_ADDED_BROKEN@mx.google.com Suggested-by: SeongJae Park Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Reviewed-by: Xin Hao Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 46 ++++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index cc22c155f3583..9f1219a67e3f1 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -58,7 +58,7 @@ static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr, err = kstrtoul(buf, 0, &min); if (err) - return -EINVAL; + return err; range->min = min; return count; @@ -83,7 +83,7 @@ static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr, err = kstrtoul(buf, 0, &max); if (err) - return -EINVAL; + return err; range->max = max; return count; @@ -291,9 +291,7 @@ static ssize_t interval_us_store(struct kobject *kobj, struct damon_sysfs_watermarks, kobj); int err = kstrtoul(buf, 0, &watermarks->interval_us); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t high_show(struct kobject *kobj, @@ -312,9 +310,7 @@ static ssize_t high_store(struct kobject *kobj, struct damon_sysfs_watermarks, kobj); int err = kstrtoul(buf, 0, &watermarks->high); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t mid_show(struct kobject *kobj, @@ -333,9 +329,7 @@ static ssize_t mid_store(struct kobject *kobj, struct damon_sysfs_watermarks, kobj); int err = kstrtoul(buf, 0, &watermarks->mid); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t low_show(struct kobject *kobj, @@ -354,9 +348,7 @@ static ssize_t low_store(struct kobject *kobj, struct damon_sysfs_watermarks, kobj); int err = kstrtoul(buf, 0, &watermarks->low); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static void damon_sysfs_watermarks_release(struct kobject *kobj) @@ -437,9 +429,7 @@ static ssize_t sz_permil_store(struct kobject *kobj, struct damon_sysfs_weights, kobj); int err = kstrtouint(buf, 0, &weights->sz); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t nr_accesses_permil_show(struct kobject *kobj, @@ -458,9 +448,7 @@ static ssize_t nr_accesses_permil_store(struct kobject *kobj, struct damon_sysfs_weights, kobj); int err = kstrtouint(buf, 0, &weights->nr_accesses); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t age_permil_show(struct kobject *kobj, @@ -479,9 +467,7 @@ static ssize_t age_permil_store(struct kobject *kobj, struct damon_sysfs_weights, kobj); int err = kstrtouint(buf, 0, &weights->age); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static void damon_sysfs_weights_release(struct kobject *kobj) @@ -1111,9 +1097,7 @@ static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr, struct damon_sysfs_region, kobj); int err = kstrtoul(buf, 0, ®ion->start); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -1132,9 +1116,7 @@ static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr, struct damon_sysfs_region, kobj); int err = kstrtoul(buf, 0, ®ion->end); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static void damon_sysfs_region_release(struct kobject *kobj) @@ -1528,7 +1510,7 @@ static ssize_t sample_us_store(struct kobject *kobj, int err = kstrtoul(buf, 0, &us); if (err) - return -EINVAL; + return err; intervals->sample_us = us; return count; @@ -1552,7 +1534,7 @@ static ssize_t aggr_us_store(struct kobject *kobj, struct kobj_attribute *attr, int err = kstrtoul(buf, 0, &us); if (err) - return -EINVAL; + return err; intervals->aggr_us = us; return count; @@ -1576,7 +1558,7 @@ static ssize_t update_us_store(struct kobject *kobj, int err = kstrtoul(buf, 0, &us); if (err) - return -EINVAL; + return err; intervals->update_us = us; return count; From 1e1a891addbb929e1ed42e8ce2e4fdc0f26434f3 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 20 Sep 2022 16:53:22 +0000 Subject: [PATCH 290/352] mm/damon: deduplicate damon_{reclaim,lru_sort}_apply_parameters() The bodies of damon_{reclaim,lru_sort}_apply_parameters() contain duplicates. This commit adds a common function damon_set_region_biggest_system_ram_default() to remove the duplicates. Link: https://lkml.kernel.org/r/6329f00d.a70a0220.9bb29.3678SMTPIN_ADDED_BROKEN@mx.google.com Signed-off-by: Kaixu Xia Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 ++- mm/damon/core.c | 35 ++++++++++++++++++++++++++++++++++- mm/damon/lru_sort.c | 13 +++---------- mm/damon/reclaim.c | 13 +++---------- 4 files changed, 42 insertions(+), 22 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index e7808a84675fb..ed5470f50babd 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -557,7 +557,8 @@ static inline bool damon_target_has_pid(const struct damon_ctx *ctx) int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); -bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end); +int damon_set_region_biggest_system_ram_default(struct damon_target *t, + unsigned long *start, unsigned long *end); #endif /* CONFIG_DAMON */ diff --git a/mm/damon/core.c b/mm/damon/core.c index 9c80c6eb00c24..4de8c7c529794 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1245,7 +1245,8 @@ static int walk_system_ram(struct resource *res, void *arg) * Find biggest 'System RAM' resource and store its start and end address in * @start and @end, respectively. If no System RAM is found, returns false. */ -bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end) +static bool damon_find_biggest_system_ram(unsigned long *start, + unsigned long *end) { struct damon_system_ram_region arg = {}; @@ -1259,6 +1260,38 @@ bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end) return true; } +/** + * damon_set_region_biggest_system_ram_default() - Set the region of the given + * monitoring target as requested, or biggest 'System RAM'. + * @t: The monitoring target to set the region. + * @start: The pointer to the start address of the region. + * @end: The pointer to the end address of the region. + * + * This function sets the region of @t as requested by @start and @end. If the + * values of @start and @end are zero, however, this function finds the biggest + * 'System RAM' resource and sets the region to cover the resource. In the + * latter case, this function saves the start and end addresses of the resource + * in @start and @end, respectively. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_set_region_biggest_system_ram_default(struct damon_target *t, + unsigned long *start, unsigned long *end) +{ + struct damon_addr_range addr_range; + + if (*start > *end) + return -EINVAL; + + if (!*start && !*end && + !damon_find_biggest_system_ram(start, end)) + return -EINVAL; + + addr_range.start = *start; + addr_range.end = *end; + return damon_set_regions(t, &addr_range, 1); +} + static int __init damon_init(void) { damon_region_cache = KMEM_CACHE(damon_region, 0); diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index d7eb72b41cb67..efbc2bda8b9cd 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -188,7 +188,6 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) static int damon_lru_sort_apply_parameters(void) { struct damos *scheme; - struct damon_addr_range addr_range; unsigned int hot_thres, cold_thres; int err = 0; @@ -211,15 +210,9 @@ static int damon_lru_sort_apply_parameters(void) return -ENOMEM; damon_add_scheme(ctx, scheme); - if (monitor_region_start > monitor_region_end) - return -EINVAL; - if (!monitor_region_start && !monitor_region_end && - !damon_find_biggest_system_ram(&monitor_region_start, - &monitor_region_end)) - return -EINVAL; - addr_range.start = monitor_region_start; - addr_range.end = monitor_region_end; - return damon_set_regions(target, &addr_range, 1); + return damon_set_region_biggest_system_ram_default(target, + &monitor_region_start, + &monitor_region_end); } static int damon_lru_sort_turn(bool on) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 3d59ab11b7b39..162c9b1ca00fd 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -144,7 +144,6 @@ static struct damos *damon_reclaim_new_scheme(void) static int damon_reclaim_apply_parameters(void) { struct damos *scheme; - struct damon_addr_range addr_range; int err = 0; err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs); @@ -157,15 +156,9 @@ static int damon_reclaim_apply_parameters(void) return -ENOMEM; damon_set_schemes(ctx, &scheme, 1); - if (monitor_region_start > monitor_region_end) - return -EINVAL; - if (!monitor_region_start && !monitor_region_end && - !damon_find_biggest_system_ram(&monitor_region_start, - &monitor_region_end)) - return -EINVAL; - addr_range.start = monitor_region_start; - addr_range.end = monitor_region_end; - return damon_set_regions(target, &addr_range, 1); + return damon_set_region_biggest_system_ram_default(target, + &monitor_region_start, + &monitor_region_end); } static int damon_reclaim_turn(bool on) From a508f1526bbd14436bcc2a1f41e2a94944c24e0d Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 20 Sep 2022 02:06:33 +0800 Subject: [PATCH 291/352] mm: memcontrol: use memcg_kmem_enabled in count_objcg_event Patch series "mm: memcontrol: cleanup and optimize for two accounting params", v2. This patch (of 2): There are currently two helpers for checking if cgroup kmem accounting is enabled: - mem_cgroup_kmem_disabled - memcg_kmem_enabled mem_cgroup_kmem_disabled is a simple helper that returns true if cgroup.memory=nokmem is specified, otherwise returns false. memcg_kmem_enabled is a bit different, it returns true if cgroup.memory=nokmem is not specified and there was at least one non-root memory control enabled cgroup ever created. This help improve performance when kmem accounting was not actually activated. And it's optimized with static branch. The usage of mem_cgroup_kmem_disabled is for sub-systems that need to preallocate data for kmem accounting since they could be initialized before kmem accounting is activated. But count_objcg_event doesn't need that, so using memcg_kmem_enabled is better here. Link: https://lkml.kernel.org/r/20220919180634.45958-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20220919180634.45958-2-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Muchun Song Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3fd50420d0ace..e1644a24009c8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1778,7 +1778,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, { struct mem_cgroup *memcg; - if (mem_cgroup_kmem_disabled()) + if (!memcg_kmem_enabled()) return; rcu_read_lock(); From 566b4a7fb135c257b4921c896dc96d9359bcebf2 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 20 Sep 2022 02:06:34 +0800 Subject: [PATCH 292/352] mm: memcontrol: make cgroup_memory_noswap a static key cgroup_memory_noswap is used in many hot path, so make it a static key to lower the kernel overhead. Using 8G of ZRAM as SWAP, benchmark using `perf stat -d -d -d --repeat 100` with the following code snip in a non-root cgroup: #include #include #include #include #define MB 1024UL * 1024UL int main(int argc, char **argv){ void *p = mmap(NULL, 8000 * MB, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); memset(p, 0xff, 8000 * MB); madvise(p, 8000 * MB, MADV_PAGEOUT); memset(p, 0xff, 8000 * MB); return 0; } Before: 7,021.43 msec task-clock # 0.967 CPUs utilized ( +- 0.03% ) 4,010 context-switches # 573.853 /sec ( +- 0.01% ) 0 cpu-migrations # 0.000 /sec 2,052,057 page-faults # 293.661 K/sec ( +- 0.00% ) 12,616,546,027 cycles # 1.805 GHz ( +- 0.06% ) (39.92%) 156,823,666 stalled-cycles-frontend # 1.25% frontend cycles idle ( +- 0.10% ) (40.25%) 310,130,812 stalled-cycles-backend # 2.47% backend cycles idle ( +- 4.39% ) (40.73%) 18,692,516,591 instructions # 1.49 insn per cycle # 0.01 stalled cycles per insn ( +- 0.04% ) (40.75%) 4,907,447,976 branches # 702.283 M/sec ( +- 0.05% ) (40.30%) 13,002,578 branch-misses # 0.26% of all branches ( +- 0.08% ) (40.48%) 7,069,786,296 L1-dcache-loads # 1.012 G/sec ( +- 0.03% ) (40.32%) 649,385,847 L1-dcache-load-misses # 9.13% of all L1-dcache accesses ( +- 0.07% ) (40.10%) 1,485,448,688 L1-icache-loads # 212.576 M/sec ( +- 0.15% ) (39.49%) 31,628,457 L1-icache-load-misses # 2.13% of all L1-icache accesses ( +- 0.40% ) (39.57%) 6,667,311 dTLB-loads # 954.129 K/sec ( +- 0.21% ) (39.50%) 5,668,555 dTLB-load-misses # 86.40% of all dTLB cache accesses ( +- 0.12% ) (39.03%) 765 iTLB-loads # 109.476 /sec ( +- 21.81% ) (39.44%) 4,370,351 iTLB-load-misses # 214320.09% of all iTLB cache accesses ( +- 1.44% ) (39.86%) 149,207,254 L1-dcache-prefetches # 21.352 M/sec ( +- 0.13% ) (40.27%) 7.25869 +- 0.00203 seconds time elapsed ( +- 0.03% ) After: 6,576.16 msec task-clock # 0.953 CPUs utilized ( +- 0.10% ) 4,020 context-switches # 605.595 /sec ( +- 0.01% ) 0 cpu-migrations # 0.000 /sec 2,052,056 page-faults # 309.133 K/sec ( +- 0.00% ) 11,967,619,180 cycles # 1.803 GHz ( +- 0.36% ) (38.76%) 161,259,240 stalled-cycles-frontend # 1.38% frontend cycles idle ( +- 0.27% ) (36.58%) 253,605,302 stalled-cycles-backend # 2.16% backend cycles idle ( +- 4.45% ) (34.78%) 19,328,171,892 instructions # 1.65 insn per cycle # 0.01 stalled cycles per insn ( +- 0.10% ) (31.46%) 5,213,967,902 branches # 785.461 M/sec ( +- 0.18% ) (30.68%) 12,385,170 branch-misses # 0.24% of all branches ( +- 0.26% ) (34.13%) 7,271,687,822 L1-dcache-loads # 1.095 G/sec ( +- 0.12% ) (35.29%) 649,873,045 L1-dcache-load-misses # 8.93% of all L1-dcache accesses ( +- 0.11% ) (41.41%) 1,950,037,608 L1-icache-loads # 293.764 M/sec ( +- 0.33% ) (43.11%) 31,365,566 L1-icache-load-misses # 1.62% of all L1-icache accesses ( +- 0.39% ) (45.89%) 6,767,809 dTLB-loads # 1.020 M/sec ( +- 0.47% ) (48.42%) 6,339,590 dTLB-load-misses # 95.43% of all dTLB cache accesses ( +- 0.50% ) (46.60%) 736 iTLB-loads # 110.875 /sec ( +- 1.79% ) (48.60%) 4,314,836 iTLB-load-misses # 518653.73% of all iTLB cache accesses ( +- 0.63% ) (42.91%) 144,950,156 L1-dcache-prefetches # 21.836 M/sec ( +- 0.37% ) (41.39%) 6.89935 +- 0.00703 seconds time elapsed ( +- 0.10% ) The performance is clearly better. There is no significant hotspot improvement according to perf report, as there are quite a few callers of memcg_swap_enabled and do_memsw_account (which calls memcg_swap_enabled). Many pieces of minor optimizations resulted in lower overhead for the branch predictor, and bettter performance. Link: https://lkml.kernel.org/r/20220919180634.45958-3-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Michal Hocko Acked-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Muchun Song Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/memcontrol.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ac6440daf2086..6b74bbdc26596 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -90,9 +90,18 @@ static bool cgroup_memory_nokmem __ro_after_init; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP -static bool cgroup_memory_noswap __ro_after_init; +static bool cgroup_memory_noswap __initdata; + +static DEFINE_STATIC_KEY_FALSE(memcg_swap_enabled_key); +static inline bool memcg_swap_enabled(void) +{ + return static_branch_likely(&memcg_swap_enabled_key); +} #else -#define cgroup_memory_noswap 1 +static inline bool memcg_swap_enabled(void) +{ + return false; +} #endif #ifdef CONFIG_CGROUP_WRITEBACK @@ -102,7 +111,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { - return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; + return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg_swap_enabled(); } #define THRESHOLDS_EVENTS_TARGET 128 @@ -7370,7 +7379,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); - if (!cgroup_memory_noswap && memcg != swap_memcg) { + if (memcg_swap_enabled() && memcg != swap_memcg) { if (!mem_cgroup_is_root(swap_memcg)) page_counter_charge(&swap_memcg->memsw, nr_entries); page_counter_uncharge(&memcg->memsw, nr_entries); @@ -7422,7 +7431,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) memcg = mem_cgroup_id_get_online(memcg); - if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) && + if (memcg_swap_enabled() && !mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); @@ -7454,7 +7463,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { - if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) { + if (memcg_swap_enabled() && !mem_cgroup_is_root(memcg)) { if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) page_counter_uncharge(&memcg->swap, nr_pages); else @@ -7470,7 +7479,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { long nr_swap_pages = get_nr_swap_pages(); - if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (!memcg_swap_enabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, @@ -7487,7 +7496,7 @@ bool mem_cgroup_swap_full(struct folio *folio) if (vm_swap_full()) return true; - if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (!memcg_swap_enabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false; memcg = folio_memcg(folio); @@ -7795,6 +7804,8 @@ static int __init mem_cgroup_swap_init(void) if (cgroup_memory_noswap) return 0; + static_branch_enable(&memcg_swap_enabled_key); + WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) From 784a09e3dc71453ea0c550109283d948c76141c7 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 12:21:13 +0800 Subject: [PATCH 293/352] mm: hugetlb: fix UAF in hugetlb_handle_userfault The vma_lock and hugetlb_fault_mutex are dropped before handling userfault and reacquire them again after handle_userfault(), but reacquire the vma_lock could lead to UAF[1,2] due to the following race, hugetlb_fault hugetlb_no_page /*unlock vma_lock */ hugetlb_handle_userfault handle_userfault /* unlock mm->mmap_lock*/ vm_mmap_pgoff do_mmap mmap_region munmap_vma_range /* clean old vma */ /* lock vma_lock again <--- UAF */ /* unlock vma_lock */ Since the vma_lock will unlock immediately after hugetlb_handle_userfault(), let's drop the unneeded lock and unlock in hugetlb_handle_userfault() to fix the issue. [1] https://lore.kernel.org/linux-mm/000000000000d5e00a05e834962e@google.com/ [2] https://lore.kernel.org/linux-mm/20220921014457.1668-1-liuzixian4@huawei.com/ Link: https://lkml.kernel.org/r/20220923042113.137273-1-liushixin2@huawei.com Fixes: 1a1aad8a9b7b ("userfaultfd: hugetlbfs: add userfaultfd hugetlb hook") Signed-off-by: Liu Shixin Signed-off-by: Kefeng Wang Reported-by: syzbot+193f9cee8638750b23cf@syzkaller.appspotmail.com Reported-by: Liu Zixian Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: John Hubbard Cc: Muchun Song Cc: Sidhartha Kumar Cc: [4.14+] Signed-off-by: Andrew Morton --- mm/hugetlb.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3b9d03c1970c4..12370be17fc8f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5489,7 +5489,6 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, unsigned long addr, unsigned long reason) { - vm_fault_t ret; u32 hash; struct vm_fault vmf = { .vma = vma, @@ -5507,18 +5506,14 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, }; /* - * vma_lock and hugetlb_fault_mutex must be - * dropped before handling userfault. Reacquire - * after handling fault to make calling code simpler. + * vma_lock and hugetlb_fault_mutex must be dropped before handling + * userfault. Also mmap_lock could be dropped due to handling + * userfault, any vma operation should be careful from here. */ hugetlb_vma_unlock_read(vma); hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - ret = handle_userfault(&vmf, reason); - mutex_lock(&hugetlb_fault_mutex_table[hash]); - hugetlb_vma_lock_read(vma); - - return ret; + return handle_userfault(&vmf, reason); } static vm_fault_t hugetlb_no_page(struct mm_struct *mm, @@ -5536,6 +5531,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); bool new_page, new_pagecache_page = false; + u32 hash = hugetlb_fault_mutex_hash(mapping, idx); /* * Currently, we are forced to kill the process in the event the @@ -5546,7 +5542,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", current->pid); - return ret; + goto out; } /* @@ -5560,12 +5556,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (idx >= size) goto out; /* Check for page in userfault range */ - if (userfaultfd_missing(vma)) { - ret = hugetlb_handle_userfault(vma, mapping, idx, + if (userfaultfd_missing(vma)) + return hugetlb_handle_userfault(vma, mapping, idx, flags, haddr, address, VM_UFFD_MISSING); - goto out; - } page = alloc_huge_page(vma, haddr, 0); if (IS_ERR(page)) { @@ -5631,10 +5625,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (userfaultfd_minor(vma)) { unlock_page(page); put_page(page); - ret = hugetlb_handle_userfault(vma, mapping, idx, + return hugetlb_handle_userfault(vma, mapping, idx, flags, haddr, address, VM_UFFD_MINOR); - goto out; } } @@ -5692,6 +5685,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, unlock_page(page); out: + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); return ret; backout: @@ -5789,11 +5784,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = huge_ptep_get(ptep); /* PTE markers should be handled the same way as none pte */ - if (huge_pte_none_mostly(entry)) { - ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, + if (huge_pte_none_mostly(entry)) + /* + * hugetlb_no_page will drop vma lock and hugetlb fault + * mutex internally, which make us return immediately. + */ + return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, entry, flags); - goto out_mutex; - } ret = 0; From 0eca727499f56fa8cc37e016a3be8a25c631fd79 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:27:31 -0700 Subject: [PATCH 294/352] mm/khugepaged: check compound_order() in collapse_pte_mapped_thp() By the time we lock a page in collapse_pte_mapped_thp(), the page mapped by the address pushed onto the slot's .pte_mapped_thp[] array might have changed arbitrarily since we last looked at it. We revalidate that the page is still the head of a compound page, but we don't revalidate if the compound page is of order HPAGE_PMD_ORDER before applying rmap and page table updates. Since the kernel now supports large folios of arbitrary order, and since replacing page's pte mappings by a pmd mapping only makes sense for compound pages of order HPAGE_PMD_ORDER, revalidate that the compound order is indeed of order HPAGE_PMD_ORDER before proceeding. Link: https://lore.kernel.org/linux-mm/CAHbLzkon+2ky8v9ywGcsTUgXM_B35jt5NThYqQKXW2YV_GUacw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220922222731.1124481-1-zokeefe@google.com Signed-off-by: Zach O'Keefe Suggested-by: Yang Shi Reviewed-by: Yang Shi Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/khugepaged.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 57af2c841b410..40fd9f7b3ed3b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1399,6 +1399,9 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (!PageHead(hpage)) goto drop_hpage; + if (compound_order(hpage) != HPAGE_PMD_ORDER) + goto drop_hpage; + if (find_pmd_or_thp_or_none(mm, haddr, &pmd) != SCAN_SUCCEED) goto drop_hpage; From 446f5725bf3cbc5ad6846d44255c0cd98fd2ab8c Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 11:46:50 -0700 Subject: [PATCH 295/352] mm/madvise: MADV_COLLAPSE return EAGAIN when page cannot be isolated MADV_COLLAPSE is a best-effort request that attempts to set an actionable errno value if the request cannot be fulfilled at the time. EAGAIN should be used to communicate that a resource was temporarily unavailable, but that the user may try again immediately. SCAN_DEL_PAGE_LRU is an internal result code used when a page cannot be isolated from it's LRU list. Since this, like SCAN_PAGE_LRU, is likely a transitory state, make MADV_COLLAPSE return EAGAIN so that users know they may reattempt the operation. Another important scenario to consider is race with khugepaged. khugepaged might isolate a page while MADV_COLLAPSE is interested in it. Even though racing with khugepaged might mean that the memory has already been collapsed, signalling an errno that is non-intrinsic to that memory or arguments provided to madvise(2) lets the user know that future attempts might (and in this case likely would) succeed, and avoids false-negative assumptions by the user. Link: https://lkml.kernel.org/r/20220922184651.1016461-1-zokeefe@google.com Fixes: 7d8faaf15545 ("mm/madvise: introduce MADV_COLLAPSE sync hugepage collapse") Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/khugepaged.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 40fd9f7b3ed3b..b3ebe90a66d99 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2372,6 +2372,7 @@ static int madvise_collapse_errno(enum scan_result r) /* Resource temporary unavailable - trying again might succeed */ case SCAN_PAGE_LOCK: case SCAN_PAGE_LRU: + case SCAN_DEL_PAGE_LRU: return -EAGAIN; /* * Other: Trying again likely not to succeed / error intrinsic to @@ -2454,6 +2455,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, case SCAN_PAGE_LOCK: case SCAN_PAGE_COMPOUND: case SCAN_PAGE_LRU: + case SCAN_DEL_PAGE_LRU: last_fail = result; break; default: From 10bda7335f59305a71fbe979be12c9ab6c7c57f1 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 11:46:51 -0700 Subject: [PATCH 296/352] selftests/vm: retry on EAGAIN for MADV_COLLAPSE selftest MADV_COLLAPSE is a best-effort request that will set errno to an actionable value if the request cannot be performed. For example, if pages are not found on the LRU, or if they are currently locked by something else, MADV_COLLAPSE will fail and set errno to EAGAIN to inform callers that they may try again. Since the khugepaged selftest is the first public use of MADV_COLLAPSE, set a best practice of checking errno and retrying on EAGAIN. Link: https://lkml.kernel.org/r/20220922184651.1016461-2-zokeefe@google.com Fixes: 9330694de59f ("selftests/vm: add MADV_COLLAPSE collapse context to selftests") Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/khugepaged.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index b77b1e28cdb38..b55dc331af139 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -1,4 +1,5 @@ #define _GNU_SOURCE +#include #include #include #include @@ -477,6 +478,26 @@ static void fill_memory(int *p, unsigned long start, unsigned long end) p[i * page_size / sizeof(*p)] = i + 0xdead0000; } +/* + * MADV_COLLAPSE is a best-effort request and may fail if an internal + * resource is temporarily unavailable, in which case it will set errno to + * EAGAIN. In such a case, immediately reattempt the operation one more + * time. + */ +static int madvise_collapse_retry(void *p, unsigned long size) +{ + bool retry = true; + int ret; + +retry: + ret = madvise(p, size, MADV_COLLAPSE); + if (ret && errno == EAGAIN && retry) { + retry = false; + goto retry; + } + return ret; +} + /* * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with * validate_memory()'able contents. @@ -531,7 +552,7 @@ static void madvise_collapse(const char *msg, char *p, int nr_hpages, /* Clear VM_NOHUGEPAGE */ madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); - ret = madvise(p, nr_hpages * hpage_pmd_size, MADV_COLLAPSE); + ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size); if (((bool)ret) == expect) fail("Fail: Bad return value"); else if (check_huge(p, nr_hpages) != expect) From 073671ea447df5e36a93e5d6fcecc6b5bfe94fd4 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:37 -0700 Subject: [PATCH 297/352] mm/shmem: add flag to enforce shmem THP in hugepage_vma_check() Patch series "mm: add file/shmem support to MADV_COLLAPSE", v4. This series builds on top of the previous "mm: userspace hugepage collapse" series which introduced the MADV_COLLAPSE madvise mode and added support for private, anonymous mappings[2], by adding support for file and shmem backed memory to CONFIG_READ_ONLY_THP_FOR_FS=y kernels. File and shmem support have been added with effort to align with existing MADV_COLLAPSE semantics and policy decisions[3]. Collapse of shmem-backed memory ignores kernel-guiding directives and heuristics including all sysfs settings (transparent_hugepage/shmem_enabled), and tmpfs huge= mount options (shmem always supports large folios). Like anonymous mappings, on successful return of MADV_COLLAPSE on file/shmem memory, the contents of memory mapped by the addresses provided will be synchronously pmd-mapped THPs. This functionality unlocks two important uses: (1) Immediately back executable text by THPs. Current support provided by CONFIG_READ_ONLY_THP_FOR_FS may take a long time on a large system which might impair services from serving at their full rated load after (re)starting. Tricks like mremap(2)'ing text onto anonymous memory to immediately realize iTLB performance prevents page sharing and demand paging, both of which increase steady state memory footprint. Now, we can have the best of both worlds: Peak upfront performance and lower RAM footprints. (2) userfaultfd-based live migration of virtual machines satisfy UFFD faults by fetching native-sized pages over the network (to avoid latency of transferring an entire hugepage). However, after guest memory has been fully copied to the new host, MADV_COLLAPSE can be used to immediately increase guest performance. khugepaged has received a small improvement by association and can now detect and collapse pte-mapped THPs. However, there is still work to be done along the file collapse path. Compound pages of arbitrary order still needs to be supported and THP collapse needs to be converted to using folios in general. Eventually, we'd like to move away from the read-only and executable-mapped constraints currently imposed on eligible files and support any inode claiming huge folio support. That said, I think the series as-is covers enough to claim that MADV_COLLAPSE supports file/shmem memory. Patches 1-3 Implement the guts of the series. Patch 4 Is a tracepoint for debugging. Patches 5-9 Refactor existing khugepaged selftests to work with new memory types + new collapse tests. Patch 10 Adds a userfaultfd selftest mode to mimic a functional test of UFFDIO_REGISTER_MODE_MINOR+MADV_COLLAPSE live migration. (v4 note: "userfaultfd shmem" selftest is failing as of Sep 22 mm-unstable) [1] https://lore.kernel.org/linux-mm/YyiK8YvVcrtZo0z3@google.com/ [2] https://lore.kernel.org/linux-mm/20220706235936.2197195-1-zokeefe@google.com/ [3] https://lore.kernel.org/linux-mm/YtBmhaiPHUTkJml8@google.com/ [4] https://lore.kernel.org/linux-mm/20220922222731.1124481-1-zokeefe@google.com/ [5] https://lore.kernel.org/linux-mm/20220922184651.1016461-1-zokeefe@google.com/ This patch (of 10): Extend 'mm/thp: add flag to enforce sysfs THP in hugepage_vma_check()' to shmem, allowing callers to ignore /sys/kernel/transparent_hugepage/shmem_enabled and tmpfs huge= mount. This is intended to be used by MADV_COLLAPSE, and the rationale is analogous to the anon/file case: MADV_COLLAPSE is not coupled to directives that advise the kernel's decisions on when THPs should be considered eligible. shmem/tmpfs always claims large folio support, regardless of sysfs or mount options. [shy828301@gmail.com: test shmem_huge_force explicitly] Link: https://lore.kernel.org/linux-mm/CAHbLzko3A5-TpS0BgBeKkx5cuOkWgLvWXQH=TdgW-baO4rPtdg@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220922224046.1143204-1-zokeefe@google.com Link: https://lkml.kernel.org/r/20220907144521.3115321-2-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-2-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 10 ++++++---- mm/huge_memory.c | 2 +- mm/shmem.c | 18 ++++++++++-------- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f24071e3c826e..d500ea967dc73 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -92,11 +92,13 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); -extern bool shmem_is_huge(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index); -static inline bool shmem_huge_enabled(struct vm_area_struct *vma) +extern bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, + pgoff_t index, bool shmem_huge_force); +static inline bool shmem_huge_enabled(struct vm_area_struct *vma, + bool shmem_huge_force) { - return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff); + return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff, + shmem_huge_force); } extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4938defe4e732..1cc4a5f4791e9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -119,7 +119,7 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, * own flags. */ if (!in_pf && shmem_file(vma->vm_file)) - return shmem_huge_enabled(vma); + return shmem_huge_enabled(vma, !enforce_sysfs); /* Enforce sysfs THP requirements as necessary */ if (enforce_sysfs && diff --git a/mm/shmem.c b/mm/shmem.c index 275899bacbeaf..cabe48d55a64d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -462,20 +462,22 @@ static bool shmem_confirm_swap(struct address_space *mapping, static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; -bool shmem_is_huge(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index) +bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, + pgoff_t index, bool shmem_huge_force) { loff_t i_size; if (!S_ISREG(inode->i_mode)) return false; - if (shmem_huge == SHMEM_HUGE_DENY) - return false; if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) return false; + if (shmem_huge_force) + return true; if (shmem_huge == SHMEM_HUGE_FORCE) return true; + if (shmem_huge == SHMEM_HUGE_DENY) + return false; switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: @@ -670,8 +672,8 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY -bool shmem_is_huge(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index) +bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, + pgoff_t index, bool shmem_huge_force) { return false; } @@ -1058,7 +1060,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns, STATX_ATTR_NODUMP); generic_fillattr(&init_user_ns, inode, stat); - if (shmem_is_huge(NULL, inode, 0)) + if (shmem_is_huge(NULL, inode, 0, false)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { @@ -1900,7 +1902,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, return 0; } - if (!shmem_is_huge(vma, inode, index)) + if (!shmem_is_huge(vma, inode, index, false)) goto alloc_nohuge; huge_gfp = vma_thp_gfp_mask(vma); From b5895f4924f41630195420b28a1dbedf11522025 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:38 -0700 Subject: [PATCH 298/352] mm/khugepaged: attempt to map file/shmem-backed pte-mapped THPs by pmds The main benefit of THPs are that they can be mapped at the pmd level, increasing the likelihood of TLB hit and spending less cycles in page table walks. pte-mapped hugepages - that is - hugepage-aligned compound pages of order HPAGE_PMD_ORDER mapped by ptes - although being contiguous in physical memory, don't have this advantage. In fact, one could argue they are detrimental to system performance overall since they occupy a precious hugepage-aligned/sized region of physical memory that could otherwise be used more effectively. Additionally, pte-mapped hugepages can be the cheapest memory to collapse for khugepaged since no new hugepage allocation or copying of memory contents is necessary - we only need to update the mapping page tables. In the anonymous collapse path, we are able to collapse pte-mapped hugepages (albeit, perhaps suboptimally), but the file/shmem path makes no effort when compound pages (of any order) are encountered. Identify pte-mapped hugepages in the file/shmem collapse path. The final step of which makes a racy check of the value of the pmd to ensure it maps a pte table. This should be fine, since races that result in false-positive (i.e. attempt collapse even though we shouldn't) will fail later in collapse_pte_mapped_thp() once we actually lock mmap_lock and reinspect the pmd value. Races that result in false-negatives (i.e. where we decide to not attempt collapse, but should have) shouldn't be an issue, since in the worst case, we do nothing - which is what we've done up to this point. We make a similar check in retract_page_tables(). If we do think we've found a pte-mapped hugepgae in khugepaged context, attempt to update page tables mapping this hugepage. Note that these collapses still count towards the /sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed counter, and if the pte-mapped hugepage was also mapped into multiple process' address spaces, could be incremented for each page table update. Since we increment the counter when a pte-mapped hugepage is successfully added to the list of to-collapse pte-mapped THPs, it's possible that we never actually update the page table either. This is different from how file/shmem pages_collapsed accounting works today where only a successful page cache update is counted (it's also possible here that no page tables are actually changed). Though it incurs some slop, this is preferred to either not accounting for the event at all, or plumbing through data in struct mm_slot on whether to account for the collapse or not. Also note that work still needs to be done to support arbitrary compound pages, and that this should all be converted to using folios. [shy828301@gmail.com: Spelling mistake, update comment, and add Documentation] Link: https://lore.kernel.org/linux-mm/CAHbLzkpHwZxFzjfX9nxVoRhzup8WMjMfyL6Xiq8mZ9M-N3ombw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220907144521.3115321-3-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-3-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 9 ++- include/trace/events/huge_memory.h | 1 + mm/khugepaged.c | 69 +++++++++++++++++++--- 3 files changed, 71 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 8e3418ec4503e..8ee78ec232ebc 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -191,7 +191,14 @@ allocation failure to throttle the next allocation attempt:: /sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs -The khugepaged progress can be seen in the number of pages collapsed:: +The khugepaged progress can be seen in the number of pages collapsed (note +that this counter may not be an exact count of the number of pages +collapsed, since "collapsed" could mean multiple things: (1) A PTE mapping +being replaced by a PMD mapping, or (2) All 4K physical pages replaced by +one 2M hugepage. Each may happen independently, or together, depending on +the type of memory and the failures that occur. As such, this value should +be interpreted roughly as a sign of progress, and counters in /proc/vmstat +consulted for more accurate accounting):: /sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 55392bf30a034..fbbb25494d603 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -17,6 +17,7 @@ EM( SCAN_EXCEED_SHARED_PTE, "exceed_shared_pte") \ EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \ EM( SCAN_PTE_UFFD_WP, "pte_uffd_wp") \ + EM( SCAN_PTE_MAPPED_HUGEPAGE, "pte_mapped_hugepage") \ EM( SCAN_PAGE_RO, "no_writable_page") \ EM( SCAN_LACK_REFERENCED_PAGE, "lack_referenced_page") \ EM( SCAN_PAGE_NULL, "page_null") \ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b3ebe90a66d99..b1e3f83c4eb2d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -35,6 +35,7 @@ enum scan_result { SCAN_EXCEED_SHARED_PTE, SCAN_PTE_NON_PRESENT, SCAN_PTE_UFFD_WP, + SCAN_PTE_MAPPED_HUGEPAGE, SCAN_PAGE_RO, SCAN_LACK_REFERENCED_PAGE, SCAN_PAGE_NULL, @@ -1320,20 +1321,24 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) * Notify khugepaged that given addr of the mm is pte-mapped THP. Then * khugepaged should try to collapse the page table. */ -static void khugepaged_add_pte_mapped_thp(struct mm_struct *mm, +static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) { struct khugepaged_mm_slot *mm_slot; struct mm_slot *slot; + bool ret = false; VM_BUG_ON(addr & ~HPAGE_PMD_MASK); spin_lock(&khugepaged_mm_lock); slot = mm_slot_lookup(mm_slots_hash, mm); mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); - if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) + if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) { mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; + ret = true; + } spin_unlock(&khugepaged_mm_lock); + return ret; } static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma, @@ -1370,9 +1375,16 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) pte_t *start_pte, *pte; pmd_t *pmd; spinlock_t *ptl; - int count = 0; + int count = 0, result = SCAN_FAIL; int i; + mmap_assert_write_locked(mm); + + /* Fast check before locking page if not PMD mapping PTE table */ + result = find_pmd_or_thp_or_none(mm, haddr, &pmd); + if (result != SCAN_SUCCEED) + return; + if (!vma || !vma->vm_file || !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) return; @@ -1726,9 +1738,16 @@ static int collapse_file(struct mm_struct *mm, struct file *file, /* * If file was truncated then extended, or hole-punched, before * we locked the first page, then a THP might be there already. + * This will be discovered on the first iteration. */ if (PageTransCompound(page)) { - result = SCAN_PAGE_COMPOUND; + struct page *head = compound_head(page); + + result = compound_order(head) == HPAGE_PMD_ORDER && + head->index == start + /* Maybe PMD-mapped */ + ? SCAN_PTE_MAPPED_HUGEPAGE + : SCAN_PAGE_COMPOUND; goto out_unlock; } @@ -1962,11 +1981,23 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, } /* - * XXX: khugepaged should compact smaller compound pages + * TODO: khugepaged should compact smaller compound pages * into a PMD sized page */ if (PageTransCompound(page)) { - result = SCAN_PAGE_COMPOUND; + struct page *head = compound_head(page); + + result = compound_order(head) == HPAGE_PMD_ORDER && + head->index == start + /* Maybe PMD-mapped */ + ? SCAN_PTE_MAPPED_HUGEPAGE + : SCAN_PAGE_COMPOUND; + /* + * For SCAN_PTE_MAPPED_HUGEPAGE, further processing + * by the caller won't touch the page cache, and so + * it's safe to skip LRU and refcount checks before + * returning. + */ break; } @@ -2026,6 +2057,12 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) { } + +static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr) +{ + return false; +} #endif static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, @@ -2118,8 +2155,26 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, &mmap_locked, cc); } - if (*result == SCAN_SUCCEED) + switch (*result) { + case SCAN_PTE_MAPPED_HUGEPAGE: { + pmd_t *pmd; + + *result = find_pmd_or_thp_or_none(mm, + khugepaged_scan.address, + &pmd); + if (*result != SCAN_SUCCEED) + break; + if (!khugepaged_add_pte_mapped_thp(mm, + khugepaged_scan.address)) + break; + } fallthrough; + case SCAN_SUCCEED: ++khugepaged_pages_collapsed; + break; + default: + break; + } + /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; progress += HPAGE_PMD_NR; From 4776e845c29ab1e5986e9e7dfbcb6756aee23f06 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:39 -0700 Subject: [PATCH 299/352] mm/madvise: add file and shmem support to MADV_COLLAPSE Add support for MADV_COLLAPSE to collapse shmem-backed and file-backed memory into THPs (requires CONFIG_READ_ONLY_THP_FOR_FS=y). On success, the backing memory will be a hugepage. For the memory range and process provided, the page tables will synchronously have a huge pmd installed, mapping the THP. Other mappings of the file extent mapped by the memory range may be added to a set of entries that khugepaged will later process and attempt update their page tables to map the THP by a pmd. This functionality unlocks two important uses: (1) Immediately back executable text by THPs. Current support provided by CONFIG_READ_ONLY_THP_FOR_FS may take a long time on a large system which might impair services from serving at their full rated load after (re)starting. Tricks like mremap(2)'ing text onto anonymous memory to immediately realize iTLB performance prevents page sharing and demand paging, both of which increase steady state memory footprint. Now, we can have the best of both worlds: Peak upfront performance and lower RAM footprints. (2) userfaultfd-based live migration of virtual machines satisfy UFFD faults by fetching native-sized pages over the network (to avoid latency of transferring an entire hugepage). However, after guest memory has been fully copied to the new host, MADV_COLLAPSE can be used to immediately increase guest performance. Since khugepaged is single threaded, this change now introduces possibility of collapse contexts racing in file collapse path. There a important few places to consider: (1) hpage_collapse_scan_file(), when we xas_pause() and drop RCU. We could have the memory collapsed out from under us, but the next xas_for_each() iteration will correctly pick up the hugepage. The hugepage might not be up to date (insofar as copying of small page contents might not have completed - the page still may be locked), but regardless what small page index we were iterating over, we'll find the hugepage and identify it as a suitably aligned compound page of order HPAGE_PMD_ORDER. In khugepaged path, we locklessly check the value of the pmd, and only add it to deferred collapse array if we find pmd mapping pte table. This is fine, since other values that could have raced in right afterwards denote failure, or that the memory was successfully collapsed, so we don't need further processing. In madvise path, we'll take mmap_lock() in write to serialize against page table updates and will know what to do based on the true value of the pmd: recheck all ptes if we point to a pte table, directly install the pmd, if the pmd has been cleared, but memory not yet faulted, or nothing at all if we find a huge pmd. It's worth putting emphasis here on how we treat the none pmd here. If khugepaged has processed this mm's page tables already, it will have left the pmd cleared (ready for refault by the process). Depending on the VMA flags and sysfs settings, amount of RAM on the machine, and the current load, could be a relatively common occurrence - and as such is one we'd like to handle successfully in MADV_COLLAPSE. When we see the none pmd in collapse_pte_mapped_thp(), we've locked mmap_lock in write and checked (a) huepaged_vma_check() to see if the backing memory is appropriate still, along with VMA sizing and appropriate hugepage alignment within the file, and (b) we've found a hugepage head of order HPAGE_PMD_ORDER at the offset in the file mapped by our hugepage-aligned virtual address. Even though the common-case is likely race with khugepaged, given these checks (regardless how we got here - we could be operating on a completely different file than originally checked in hpage_collapse_scan_file() for all we know) it should be safe to directly make the pmd a huge pmd pointing to this hugepage. (2) collapse_file() is mostly serialized on the same file extent by lock sequence: | lock hupepage | lock mapping->i_pages | lock 1st page | unlock mapping->i_pages | | lock mapping->i_pages | page_ref_freeze(3) | xas_store(hugepage) | unlock mapping->i_pages | page_ref_unfreeze(1) | unlock 1st page V unlock hugepage Once a context (who already has their fresh hugepage locked) locks mapping->i_pages exclusively, it will hold said lock until it locks the first page, and it will hold that lock until the after the hugepage has been added to the page cache (and will unlock the hugepage after page table update, though that isn't important here). A racing context that loses the race for mapping->i_pages will then lose the race to locking the first page. Here - depending on how far the other racing context has gotten - we might find the new hugepage (in which case we'll exit cleanly when we check PageTransCompound()), or we'll find the "old" 1st small page (in which we'll exit cleanly when we discover unexpected refcount of 2 after isolate_lru_page()). This is assuming we are able to successfully lock the page we find - in shmem path, we could just fail the trylock and exit cleanly anyways. Failure path in collapse_file() is similar: once we hold lock on 1st small page, we are serialized against other collapse contexts. Before the 1st small page is unlocked, we add it back to the pagecache and unfreeze the refcount appropriately. Contexts who lost the race to the 1st small page will then find the same 1st small page with the correct refcount and will be able to proceed. [shy828301@gmail.com: Delete hugepage_vma_revalidate_anon(), remove check for multi-add in khugepaged_add_pte_mapped_thp()] Link: https://lore.kernel.org/linux-mm/CAHbLzkrtpM=ic7cYAHcqkubah5VTR8N5=k5RT8MTvv5rN1Y91w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220907144521.3115321-4-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-4-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/khugepaged.h | 13 +- include/trace/events/huge_memory.h | 1 + kernel/events/uprobes.c | 2 +- mm/khugepaged.c | 245 ++++++++++++++++++++++------- 4 files changed, 199 insertions(+), 62 deletions(-) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 384f034ae947f..70162d707caf0 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -16,11 +16,13 @@ extern void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags); extern void khugepaged_min_free_kbytes_update(void); #ifdef CONFIG_SHMEM -extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr); +extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd); #else -static inline void collapse_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr) +static inline int collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr, bool install_pmd) { + return 0; } #endif @@ -46,9 +48,10 @@ static inline void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { } -static inline void collapse_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr) +static inline int collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr, bool install_pmd) { + return 0; } static inline void khugepaged_min_free_kbytes_update(void) diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index fbbb25494d603..df33453b70fcf 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -11,6 +11,7 @@ EM( SCAN_FAIL, "failed") \ EM( SCAN_SUCCEED, "succeeded") \ EM( SCAN_PMD_NULL, "pmd_null") \ + EM( SCAN_PMD_NONE, "pmd_none") \ EM( SCAN_PMD_MAPPED, "page_pmd_mapped") \ EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \ EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e0a9b945e7bc0..d9e357b7e17c9 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -555,7 +555,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, /* try collapse pmd for compound page */ if (!ret && orig_page_huge) - collapse_pte_mapped_thp(mm, vaddr); + collapse_pte_mapped_thp(mm, vaddr, false); return ret; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b1e3f83c4eb2d..a272fc065dd72 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -29,6 +29,7 @@ enum scan_result { SCAN_FAIL, SCAN_SUCCEED, SCAN_PMD_NULL, + SCAN_PMD_NONE, SCAN_PMD_MAPPED, SCAN_EXCEED_NONE_PTE, SCAN_EXCEED_SWAP_PTE, @@ -821,6 +822,7 @@ static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node) */ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, + bool expect_anon, struct vm_area_struct **vmap, struct collapse_control *cc) { @@ -845,8 +847,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, * hugepage_vma_check may return true for qualified file * vmas. */ - if (!vma->anon_vma || !vma_is_anonymous(vma)) - return SCAN_VMA_CHECK; + if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))) + return SCAN_PAGE_ANON; return SCAN_SUCCEED; } @@ -866,8 +868,8 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, /* See comments in pmd_none_or_trans_huge_or_clear_bad() */ barrier(); #endif - if (!pmd_present(pmde)) - return SCAN_PMD_NULL; + if (pmd_none(pmde)) + return SCAN_PMD_NONE; if (pmd_trans_huge(pmde)) return SCAN_PMD_MAPPED; if (pmd_bad(pmde)) @@ -995,7 +997,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, goto out_nolock; mmap_read_lock(mm); - result = hugepage_vma_revalidate(mm, address, &vma, cc); + result = hugepage_vma_revalidate(mm, address, true, &vma, cc); if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; @@ -1026,7 +1028,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, * handled by the anon_vma lock + PG_lock. */ mmap_write_lock(mm); - result = hugepage_vma_revalidate(mm, address, &vma, cc); + result = hugepage_vma_revalidate(mm, address, true, &vma, cc); if (result != SCAN_SUCCEED) goto out_up_write; /* check if the pmd is still valid */ @@ -1320,6 +1322,26 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) /* * Notify khugepaged that given addr of the mm is pte-mapped THP. Then * khugepaged should try to collapse the page table. + * + * Note that following race exists: + * (1) khugepaged calls khugepaged_collapse_pte_mapped_thps() for mm_struct A, + * emptying the A's ->pte_mapped_thp[] array. + * (2) MADV_COLLAPSE collapses some file extent with target mm_struct B, and + * retract_page_tables() finds a VMA in mm_struct A mapping the same extent + * (at virtual address X) and adds an entry (for X) into mm_struct A's + * ->pte-mapped_thp[] array. + * (3) khugepaged calls khugepaged_collapse_scan_file() for mm_struct A at X, + * sees a pte-mapped THP (SCAN_PTE_MAPPED_HUGEPAGE) and adds an entry + * (for X) into mm_struct A's ->pte-mapped_thp[] array. + * Thus, it's possible the same address is added multiple times for the same + * mm_struct. Should this happen, we'll simply attempt + * collapse_pte_mapped_thp() multiple times for the same address, under the same + * exclusive mmap_lock, and assuming the first call is successful, subsequent + * attempts will return quickly (without grabbing any additional locks) when + * a huge pmd is found in find_pmd_or_thp_or_none(). Since this is a cheap + * check, and since this is a rare occurrence, the cost of preventing this + * "multiple-add" is thought to be more expensive than just handling it, should + * it occur. */ static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) @@ -1341,6 +1363,27 @@ static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, return ret; } +/* hpage must be locked, and mmap_lock must be held in write */ +static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmdp, struct page *hpage) +{ + struct vm_fault vmf = { + .vma = vma, + .address = addr, + .flags = 0, + .pmd = pmdp, + }; + + VM_BUG_ON(!PageTransHuge(hpage)); + mmap_assert_write_locked(vma->vm_mm); + + if (do_set_pmd(&vmf, hpage)) + return SCAN_FAIL; + + get_page(hpage); + return SCAN_SUCCEED; +} + static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { @@ -1362,12 +1405,14 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v * * @mm: process address space where collapse happens * @addr: THP collapse address + * @install_pmd: If a huge PMD should be installed * * This function checks whether all the PTEs in the PMD are pointing to the * right THP. If so, retract the page table so the THP can refault in with - * as pmd-mapped. + * as pmd-mapped. Possibly install a huge PMD mapping the THP. */ -void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) +int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd) { unsigned long haddr = addr & HPAGE_PMD_MASK; struct vm_area_struct *vma = vma_lookup(mm, haddr); @@ -1380,14 +1425,14 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) mmap_assert_write_locked(mm); - /* Fast check before locking page if not PMD mapping PTE table */ + /* Fast check before locking page if already PMD-mapped */ result = find_pmd_or_thp_or_none(mm, haddr, &pmd); - if (result != SCAN_SUCCEED) - return; + if (result == SCAN_PMD_MAPPED) + return result; if (!vma || !vma->vm_file || !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) - return; + return SCAN_VMA_CHECK; /* * If we are here, we've succeeded in replacing all the native pages @@ -1397,27 +1442,45 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) * analogously elide sysfs THP settings here. */ if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) - return; + return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ if (userfaultfd_wp(vma)) - return; + return SCAN_PTE_UFFD_WP; hpage = find_lock_page(vma->vm_file->f_mapping, linear_page_index(vma, haddr)); if (!hpage) - return; + return SCAN_PAGE_NULL; - if (!PageHead(hpage)) + if (!PageHead(hpage)) { + result = SCAN_FAIL; goto drop_hpage; + } - if (compound_order(hpage) != HPAGE_PMD_ORDER) + if (compound_order(hpage) != HPAGE_PMD_ORDER) { + result = SCAN_PAGE_COMPOUND; goto drop_hpage; + } if (find_pmd_or_thp_or_none(mm, haddr, &pmd) != SCAN_SUCCEED) + result = find_pmd_or_thp_or_none(mm, haddr, &pmd); + switch (result) { + case SCAN_SUCCEED: + break; + case SCAN_PMD_NONE: + /* + * In MADV_COLLAPSE path, possible race with khugepaged where + * all pte entries have been removed and pmd cleared. If so, + * skip all the pte checks and just update the pmd mapping. + */ + goto maybe_install_pmd; + default: goto drop_hpage; + } start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); + result = SCAN_FAIL; /* step 1: check all mapped PTEs are to the right huge page */ for (i = 0, addr = haddr, pte = start_pte; @@ -1429,8 +1492,10 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) continue; /* page swapped out, abort */ - if (!pte_present(*pte)) + if (!pte_present(*pte)) { + result = SCAN_PTE_NON_PRESENT; goto abort; + } page = vm_normal_page(vma, addr, *pte); if (WARN_ON_ONCE(page && is_zone_device_page(page))) @@ -1465,12 +1530,19 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count); } - /* step 4: collapse pmd */ + /* step 4: remove pte entries */ collapse_and_free_pmd(mm, vma, haddr, pmd); + +maybe_install_pmd: + /* step 5: install pmd entry */ + result = install_pmd + ? set_huge_pmd(vma, haddr, pmd, hpage) + : SCAN_SUCCEED; + drop_hpage: unlock_page(hpage); put_page(hpage); - return; + return result; abort: pte_unmap_unlock(start_pte, ptl); @@ -1493,22 +1565,29 @@ static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_sl goto out; for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++) - collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]); + collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i], false); out: mm_slot->nr_pte_mapped_thp = 0; mmap_write_unlock(mm); } -static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) +static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, + struct mm_struct *target_mm, + unsigned long target_addr, struct page *hpage, + struct collapse_control *cc) { struct vm_area_struct *vma; - struct mm_struct *mm; - unsigned long addr; - pmd_t *pmd; + int target_result = SCAN_FAIL; i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { + int result = SCAN_FAIL; + struct mm_struct *mm = NULL; + unsigned long addr = 0; + pmd_t *pmd; + bool is_target = false; + /* * Check vma->anon_vma to exclude MAP_PRIVATE mappings that * got written to. These VMAs are likely not worth investing @@ -1525,24 +1604,34 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * ptl. It has higher chance to recover THP for the VMA, but * has higher cost too. */ - if (vma->anon_vma) - continue; + if (vma->anon_vma) { + result = SCAN_PAGE_ANON; + goto next; + } addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - if (addr & ~HPAGE_PMD_MASK) - continue; - if (vma->vm_end < addr + HPAGE_PMD_SIZE) - continue; + if (addr & ~HPAGE_PMD_MASK || + vma->vm_end < addr + HPAGE_PMD_SIZE) { + result = SCAN_VMA_CHECK; + goto next; + } mm = vma->vm_mm; - if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) - continue; + is_target = mm == target_mm && addr == target_addr; + result = find_pmd_or_thp_or_none(mm, addr, &pmd); + if (result != SCAN_SUCCEED) + goto next; /* * We need exclusive mmap_lock to retract page table. * * We use trylock due to lock inversion: we need to acquire * mmap_lock while holding page lock. Fault path does it in * reverse order. Trylock is a way to avoid deadlock. + * + * Also, it's not MADV_COLLAPSE's job to collapse other + * mappings - let khugepaged take care of them later. */ - if (mmap_write_trylock(mm)) { + result = SCAN_PTE_MAPPED_HUGEPAGE; + if ((cc->is_khugepaged || is_target) && + mmap_write_trylock(mm)) { /* * When a vma is registered with uffd-wp, we can't * recycle the pmd pgtable because there can be pte @@ -1551,22 +1640,45 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * it'll always mapped in small page size for uffd-wp * registered ranges. */ - if (!hpage_collapse_test_exit(mm) && - !userfaultfd_wp(vma)) - collapse_and_free_pmd(mm, vma, addr, pmd); + if (hpage_collapse_test_exit(mm)) { + result = SCAN_ANY_PROCESS; + goto unlock_next; + } + if (userfaultfd_wp(vma)) { + result = SCAN_PTE_UFFD_WP; + goto unlock_next; + } + collapse_and_free_pmd(mm, vma, addr, pmd); + if (!cc->is_khugepaged && is_target) + result = set_huge_pmd(vma, addr, pmd, hpage); + else + result = SCAN_SUCCEED; + +unlock_next: mmap_write_unlock(mm); - } else { - /* Try again later */ + goto next; + } + /* + * Calling context will handle target mm/addr. Otherwise, let + * khugepaged try again later. + */ + if (!is_target) { khugepaged_add_pte_mapped_thp(mm, addr); + continue; } +next: + if (is_target) + target_result = result; } i_mmap_unlock_write(mapping); + return target_result; } /** * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. * * @mm: process address space where collapse happens + * @addr: virtual collapse start address * @file: file that collapse on * @start: collapse start address * @cc: collapse context and scratchpad @@ -1586,8 +1698,9 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * + restore gaps in the page cache; * + unlock and free huge page; */ -static int collapse_file(struct mm_struct *mm, struct file *file, - pgoff_t start, struct collapse_control *cc) +static int collapse_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) { struct address_space *mapping = file->f_mapping; struct page *hpage; @@ -1895,7 +2008,8 @@ static int collapse_file(struct mm_struct *mm, struct file *file, /* * Remove pte page tables, so we can re-fault the page as huge. */ - retract_page_tables(mapping, start); + result = retract_page_tables(mapping, start, mm, addr, hpage, + cc); unlock_page(hpage); hpage = NULL; } else { @@ -1951,8 +2065,9 @@ static int collapse_file(struct mm_struct *mm, struct file *file, return result; } -static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, - pgoff_t start, struct collapse_control *cc) +static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) { struct page *page = NULL; struct address_space *mapping = file->f_mapping; @@ -2040,7 +2155,7 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { - result = collapse_file(mm, file, start, cc); + result = collapse_file(mm, addr, file, start, cc); } } @@ -2048,8 +2163,9 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, return result; } #else -static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, - pgoff_t start, struct collapse_control *cc) +static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) { BUILD_BUG(); } @@ -2145,8 +2261,9 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, khugepaged_scan.address); mmap_read_unlock(mm); - *result = khugepaged_scan_file(mm, file, pgoff, - cc); + *result = hpage_collapse_scan_file(mm, + khugepaged_scan.address, + file, pgoff, cc); mmap_locked = false; fput(file); } else { @@ -2453,10 +2570,6 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, *prev = vma; - /* TODO: Support file/shmem */ - if (!vma->anon_vma || !vma_is_anonymous(vma)) - return -EINVAL; - if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) return -EINVAL; @@ -2479,7 +2592,8 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, cond_resched(); mmap_read_lock(mm); mmap_locked = true; - result = hugepage_vma_revalidate(mm, addr, &vma, cc); + result = hugepage_vma_revalidate(mm, addr, false, &vma, + cc); if (result != SCAN_SUCCEED) { last_fail = result; goto out_nolock; @@ -2489,16 +2603,35 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, } mmap_assert_locked(mm); memset(cc->node_load, 0, sizeof(cc->node_load)); - result = hpage_collapse_scan_pmd(mm, vma, addr, &mmap_locked, - cc); + if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { + struct file *file = get_file(vma->vm_file); + pgoff_t pgoff = linear_page_index(vma, addr); + + mmap_read_unlock(mm); + mmap_locked = false; + result = hpage_collapse_scan_file(mm, addr, file, pgoff, + cc); + fput(file); + } else { + result = hpage_collapse_scan_pmd(mm, vma, addr, + &mmap_locked, cc); + } if (!mmap_locked) *prev = NULL; /* Tell caller we dropped mmap_lock */ +handle_result: switch (result) { case SCAN_SUCCEED: case SCAN_PMD_MAPPED: ++thps; break; + case SCAN_PTE_MAPPED_HUGEPAGE: + BUG_ON(mmap_locked); + BUG_ON(*prev); + mmap_write_lock(mm); + result = collapse_pte_mapped_thp(mm, addr, true); + mmap_write_unlock(mm); + goto handle_result; /* Whitelisted set of results where continuing OK */ case SCAN_PMD_NULL: case SCAN_PTE_NON_PRESENT: From 4e2b2d4b652567e9af9df5e35e0b31e335b33cf8 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:40 -0700 Subject: [PATCH 300/352] mm/khugepaged: add tracepoint to hpage_collapse_scan_file() Add huge_memory:trace_mm_khugepaged_scan_file tracepoint to hpage_collapse_scan_file() analogously to hpage_collapse_scan_pmd(). While this change is targeted at debugging MADV_COLLAPSE pathway, the "mm_khugepaged" prefix is retained for symmetry with huge_memory:trace_mm_khugepaged_scan_pmd, which retains it's legacy name to prevent changing kernel ABI as much as possible. Link: https://lkml.kernel.org/r/20220907144521.3115321-5-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-5-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/trace/events/huge_memory.h | 34 ++++++++++++++++++++++++++++++ mm/khugepaged.c | 3 ++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index df33453b70fcf..935af49479173 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -169,5 +169,39 @@ TRACE_EVENT(mm_collapse_huge_page_swapin, __entry->ret) ); +TRACE_EVENT(mm_khugepaged_scan_file, + + TP_PROTO(struct mm_struct *mm, struct page *page, const char *filename, + int present, int swap, int result), + + TP_ARGS(mm, page, filename, present, swap, result), + + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __field(unsigned long, pfn) + __string(filename, filename) + __field(int, present) + __field(int, swap) + __field(int, result) + ), + + TP_fast_assign( + __entry->mm = mm; + __entry->pfn = page ? page_to_pfn(page) : -1; + __assign_str(filename, filename); + __entry->present = present; + __entry->swap = swap; + __entry->result = result; + ), + + TP_printk("mm=%p, scan_pfn=0x%lx, filename=%s, present=%d, swap=%d, result=%s", + __entry->mm, + __entry->pfn, + __get_str(filename), + __entry->present, + __entry->swap, + __print_symbolic(__entry->result, SCAN_STATUS)) +); + #endif /* __HUGE_MEMORY_H */ #include diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a272fc065dd72..6d2dfd96608a0 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2159,7 +2159,8 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, } } - /* TODO: tracepoints */ + trace_mm_khugepaged_scan_file(mm, page, file->f_path.dentry->d_iname, + present, swap, result); return result; } #else From 44afdcf5f9775c6b4884c85477994a662af64d5f Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:41 -0700 Subject: [PATCH 301/352] selftests/vm: dedup THP helpers These files: tools/testing/selftests/vm/vm_util.c tools/testing/selftests/vm/khugepaged.c Both contain logic to: 1) Determine hugepage size on current system 2) Read /proc/self/smaps to determine number of THPs at an address Refactor selftests/vm/khugepaged.c to use the vm_util common helpers and add it as a build dependency. Since selftests/vm/khugepaged.c is the largest user of check_huge(), change the signature of check_huge() to match selftests/vm/khugepaged.c's useage: take an expected number of hugepages, and return a bool indicating if the correct number of hugepages were found. Add a wrapper, check_huge_anon(), in anticipation of checking smaps for file and shmem hugepages. Update existing callsites to use the new pattern / function. Likewise, check_for_pattern() was duplicated, and it's a general enough helper to include in vm_util helpers as well. Link: https://lkml.kernel.org/r/20220907144521.3115321-6-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-6-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/khugepaged.c | 64 ++----------------- tools/testing/selftests/vm/soft-dirty.c | 2 +- .../selftests/vm/split_huge_page_test.c | 12 ++-- tools/testing/selftests/vm/vm_util.c | 26 +++++--- tools/testing/selftests/vm/vm_util.h | 3 +- 6 files changed, 32 insertions(+), 76 deletions(-) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 4ae879f70f4c3..c9c0996c122b4 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -95,6 +95,7 @@ TEST_FILES += va_128TBswitch.sh include ../lib.mk +$(OUTPUT)/khugepaged: vm_util.c $(OUTPUT)/madv_populate: vm_util.c $(OUTPUT)/soft-dirty: vm_util.c $(OUTPUT)/split_huge_page_test: vm_util.c diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index b55dc331af139..235a64b4458c3 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -12,6 +12,8 @@ #include #include +#include "vm_util.h" + #ifndef MADV_PAGEOUT #define MADV_PAGEOUT 21 #endif @@ -352,64 +354,12 @@ static void save_settings(void) signal(SIGQUIT, restore_settings); } -#define MAX_LINE_LENGTH 500 - -static bool check_for_pattern(FILE *fp, char *pattern, char *buf) -{ - while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) { - if (!strncmp(buf, pattern, strlen(pattern))) - return true; - } - return false; -} - static bool check_huge(void *addr, int nr_hpages) { - bool thp = false; - int ret; - FILE *fp; - char buffer[MAX_LINE_LENGTH]; - char addr_pattern[MAX_LINE_LENGTH]; - - ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", - (unsigned long) addr); - if (ret >= MAX_LINE_LENGTH) { - printf("%s: Pattern is too long\n", __func__); - exit(EXIT_FAILURE); - } - - - fp = fopen(PID_SMAPS, "r"); - if (!fp) { - printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); - exit(EXIT_FAILURE); - } - if (!check_for_pattern(fp, addr_pattern, buffer)) - goto err_out; - - ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "AnonHugePages:%10ld kB", - nr_hpages * (hpage_pmd_size >> 10)); - if (ret >= MAX_LINE_LENGTH) { - printf("%s: Pattern is too long\n", __func__); - exit(EXIT_FAILURE); - } - /* - * Fetch the AnonHugePages: in the same block and check whether it got - * the expected number of hugeepages next. - */ - if (!check_for_pattern(fp, "AnonHugePages:", buffer)) - goto err_out; - - if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) - goto err_out; - - thp = true; -err_out: - fclose(fp); - return thp; + return check_huge_anon(addr, nr_hpages, hpage_pmd_size); } - +#define MAX_LINE_LENGTH 500 static bool check_swap(void *addr, unsigned long size) { bool swap = false; @@ -431,7 +381,7 @@ static bool check_swap(void *addr, unsigned long size) printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); exit(EXIT_FAILURE); } - if (!check_for_pattern(fp, addr_pattern, buffer)) + if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) goto err_out; ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB", @@ -444,7 +394,7 @@ static bool check_swap(void *addr, unsigned long size) * Fetch the Swap: in the same block and check whether it got * the expected number of hugeepages next. */ - if (!check_for_pattern(fp, "Swap:", buffer)) + if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer))) goto err_out; if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) @@ -1066,7 +1016,7 @@ int main(int argc, const char **argv) setbuf(stdout, NULL); page_size = getpagesize(); - hpage_pmd_size = read_num("hpage_pmd_size"); + hpage_pmd_size = read_pmd_pagesize(); hpage_pmd_nr = hpage_pmd_size / page_size; default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; diff --git a/tools/testing/selftests/vm/soft-dirty.c b/tools/testing/selftests/vm/soft-dirty.c index e3a43f5d4fa2b..21d8830c5f243 100644 --- a/tools/testing/selftests/vm/soft-dirty.c +++ b/tools/testing/selftests/vm/soft-dirty.c @@ -91,7 +91,7 @@ static void test_hugepage(int pagemap_fd, int pagesize) for (i = 0; i < hpage_len; i++) map[i] = (char)i; - if (check_huge(map)) { + if (check_huge_anon(map, 1, hpage_len)) { ksft_test_result_pass("Test %s huge page allocation\n", __func__); clear_softdirty(); diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c index 6aa2b8253aeda..76e1c36dd9e57 100644 --- a/tools/testing/selftests/vm/split_huge_page_test.c +++ b/tools/testing/selftests/vm/split_huge_page_test.c @@ -92,7 +92,6 @@ void split_pmd_thp(void) { char *one_page; size_t len = 4 * pmd_pagesize; - uint64_t thp_size; size_t i; one_page = memalign(pmd_pagesize, len); @@ -107,8 +106,7 @@ void split_pmd_thp(void) for (i = 0; i < len; i++) one_page[i] = (char)i; - thp_size = check_huge(one_page); - if (!thp_size) { + if (!check_huge_anon(one_page, 1, pmd_pagesize)) { printf("No THP is allocated\n"); exit(EXIT_FAILURE); } @@ -124,9 +122,8 @@ void split_pmd_thp(void) } - thp_size = check_huge(one_page); - if (thp_size) { - printf("Still %ld kB AnonHugePages not split\n", thp_size); + if (check_huge_anon(one_page, 0, pmd_pagesize)) { + printf("Still AnonHugePages not split\n"); exit(EXIT_FAILURE); } @@ -172,8 +169,7 @@ void split_pte_mapped_thp(void) for (i = 0; i < len; i++) one_page[i] = (char)i; - thp_size = check_huge(one_page); - if (!thp_size) { + if (!check_huge_anon(one_page, 1, pmd_pagesize)) { printf("No THP is allocated\n"); exit(EXIT_FAILURE); } diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c index b58ab11a7a302..9dae51b8219f1 100644 --- a/tools/testing/selftests/vm/vm_util.c +++ b/tools/testing/selftests/vm/vm_util.c @@ -42,9 +42,9 @@ void clear_softdirty(void) ksft_exit_fail_msg("writing clear_refs failed\n"); } -static bool check_for_pattern(FILE *fp, const char *pattern, char *buf) +bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len) { - while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) { + while (fgets(buf, len, fp)) { if (!strncmp(buf, pattern, strlen(pattern))) return true; } @@ -72,9 +72,10 @@ uint64_t read_pmd_pagesize(void) return strtoul(buf, NULL, 10); } -uint64_t check_huge(void *addr) +bool __check_huge(void *addr, char *pattern, int nr_hpages, + uint64_t hpage_size) { - uint64_t thp = 0; + uint64_t thp = -1; int ret; FILE *fp; char buffer[MAX_LINE_LENGTH]; @@ -89,20 +90,27 @@ uint64_t check_huge(void *addr) if (!fp) ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, SMAP_FILE_PATH); - if (!check_for_pattern(fp, addr_pattern, buffer)) + if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) goto err_out; /* - * Fetch the AnonHugePages: in the same block and check the number of + * Fetch the pattern in the same block and check the number of * hugepages. */ - if (!check_for_pattern(fp, "AnonHugePages:", buffer)) + if (!check_for_pattern(fp, pattern, buffer, sizeof(buffer))) goto err_out; - if (sscanf(buffer, "AnonHugePages:%10ld kB", &thp) != 1) + snprintf(addr_pattern, MAX_LINE_LENGTH, "%s%%9ld kB", pattern); + + if (sscanf(buffer, addr_pattern, &thp) != 1) ksft_exit_fail_msg("Reading smap error\n"); err_out: fclose(fp); - return thp; + return thp == (nr_hpages * (hpage_size >> 10)); +} + +bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "AnonHugePages: ", nr_hpages, hpage_size); } diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h index 2e512bd57ae14..8434ea0c95cd7 100644 --- a/tools/testing/selftests/vm/vm_util.h +++ b/tools/testing/selftests/vm/vm_util.h @@ -5,5 +5,6 @@ uint64_t pagemap_get_entry(int fd, char *start); bool pagemap_is_softdirty(int fd, char *start); void clear_softdirty(void); +bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); uint64_t read_pmd_pagesize(void); -uint64_t check_huge(void *addr); +bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); From 10f42bd310261255e5399555e7f2f0cd25f82cf9 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:42 -0700 Subject: [PATCH 302/352] selftests/vm: modularize thp collapse memory operations Modularize operations to setup, cleanup, fault, and check for huge pages, for a given memory type. This allows reusing existing tests with additional memory types by defining new memory operations. Following patches will add file and shmem memory types. Link: https://lkml.kernel.org/r/20220907144521.3115321-7-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-7-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/khugepaged.c | 373 +++++++++++++----------- 1 file changed, 207 insertions(+), 166 deletions(-) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index 235a64b4458c3..06ea6f18980e2 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -29,8 +29,16 @@ static int hpage_pmd_nr; #define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" #define PID_SMAPS "/proc/self/smaps" +struct mem_ops { + void *(*setup_area)(int nr_hpages); + void (*cleanup_area)(void *p, unsigned long size); + void (*fault)(void *p, unsigned long start, unsigned long end); + bool (*check_huge)(void *addr, int nr_hpages); +}; + struct collapse_context { - void (*collapse)(const char *msg, char *p, int nr_hpages, bool expect); + void (*collapse)(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect); bool enforce_pte_scan_limits; }; @@ -354,11 +362,6 @@ static void save_settings(void) signal(SIGQUIT, restore_settings); } -static bool check_huge(void *addr, int nr_hpages) -{ - return check_huge_anon(addr, nr_hpages, hpage_pmd_size); -} - #define MAX_LINE_LENGTH 500 static bool check_swap(void *addr, unsigned long size) { @@ -452,18 +455,33 @@ static int madvise_collapse_retry(void *p, unsigned long size) * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with * validate_memory()'able contents. */ -static void *alloc_hpage(void) +static void *alloc_hpage(struct mem_ops *ops) { - void *p; + void *p = ops->setup_area(1); - p = alloc_mapping(1); + ops->fault(p, 0, hpage_pmd_size); + + /* + * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE. + * The latter is ineligible for collapse by MADV_COLLAPSE + * while the former might cause MADV_COLLAPSE to race with + * khugepaged on low-load system (like a test machine), which + * would cause MADV_COLLAPSE to fail with EAGAIN. + */ printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p, 1)) - success("OK"); - else - fail("Fail"); + if (madvise_collapse_retry(p, hpage_pmd_size)) { + perror("madvise(MADV_COLLAPSE)"); + exit(EXIT_FAILURE); + } + if (!ops->check_huge(p, 1)) { + perror("madvise(MADV_COLLAPSE)"); + exit(EXIT_FAILURE); + } + if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) { + perror("madvise(MADV_HUGEPAGE)"); + exit(EXIT_FAILURE); + } + success("OK"); return p; } @@ -480,18 +498,40 @@ static void validate_memory(int *p, unsigned long start, unsigned long end) } } -static void madvise_collapse(const char *msg, char *p, int nr_hpages, - bool expect) +static void *anon_setup_area(int nr_hpages) +{ + return alloc_mapping(nr_hpages); +} + +static void anon_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); +} + +static void anon_fault(void *p, unsigned long start, unsigned long end) +{ + fill_memory(p, start, end); +} + +static bool anon_check_huge(void *addr, int nr_hpages) +{ + return check_huge_anon(addr, nr_hpages, hpage_pmd_size); +} + +static struct mem_ops anon_ops = { + .setup_area = &anon_setup_area, + .cleanup_area = &anon_cleanup_area, + .fault = &anon_fault, + .check_huge = &anon_check_huge, +}; + +static void __madvise_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) { int ret; struct settings settings = *current_settings(); printf("%s...", msg); - /* Sanity check */ - if (!check_huge(p, 0)) { - printf("Unexpected huge page\n"); - exit(EXIT_FAILURE); - } /* * Prevent khugepaged interference and tests that MADV_COLLAPSE @@ -505,7 +545,7 @@ static void madvise_collapse(const char *msg, char *p, int nr_hpages, ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size); if (((bool)ret) == expect) fail("Fail: Bad return value"); - else if (check_huge(p, nr_hpages) != expect) + else if (!ops->check_huge(p, expect ? nr_hpages : 0)) fail("Fail: check_huge()"); else success("OK"); @@ -513,14 +553,26 @@ static void madvise_collapse(const char *msg, char *p, int nr_hpages, pop_settings(); } +static void madvise_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + /* Sanity check */ + if (!ops->check_huge(p, 0)) { + printf("Unexpected huge page\n"); + exit(EXIT_FAILURE); + } + __madvise_collapse(msg, p, nr_hpages, ops, expect); +} + #define TICK 500000 -static bool wait_for_scan(const char *msg, char *p, int nr_hpages) +static bool wait_for_scan(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops) { int full_scans; int timeout = 6; /* 3 seconds */ /* Sanity check */ - if (!check_huge(p, 0)) { + if (!ops->check_huge(p, 0)) { printf("Unexpected huge page\n"); exit(EXIT_FAILURE); } @@ -532,7 +584,7 @@ static bool wait_for_scan(const char *msg, char *p, int nr_hpages) printf("%s...", msg); while (timeout--) { - if (check_huge(p, nr_hpages)) + if (ops->check_huge(p, nr_hpages)) break; if (read_num("khugepaged/full_scans") >= full_scans) break; @@ -546,19 +598,20 @@ static bool wait_for_scan(const char *msg, char *p, int nr_hpages) } static void khugepaged_collapse(const char *msg, char *p, int nr_hpages, - bool expect) + struct mem_ops *ops, bool expect) { - if (wait_for_scan(msg, p, nr_hpages)) { + if (wait_for_scan(msg, p, nr_hpages, ops)) { if (expect) fail("Timeout"); else success("OK"); return; - } else if (check_huge(p, nr_hpages) == expect) { + } + + if (ops->check_huge(p, expect ? nr_hpages : 0)) success("OK"); - } else { + else fail("Fail"); - } } static void alloc_at_fault(void) @@ -572,7 +625,7 @@ static void alloc_at_fault(void) p = alloc_mapping(1); *p = 1; printf("Allocate huge page on fault..."); - if (check_huge(p, 1)) + if (check_huge_anon(p, 1, hpage_pmd_size)) success("OK"); else fail("Fail"); @@ -581,49 +634,48 @@ static void alloc_at_fault(void) madvise(p, page_size, MADV_DONTNEED); printf("Split huge PMD on MADV_DONTNEED..."); - if (check_huge(p, 0)) + if (check_huge_anon(p, 0, hpage_pmd_size)) success("OK"); else fail("Fail"); munmap(p, hpage_pmd_size); } -static void collapse_full(struct collapse_context *c) +static void collapse_full(struct collapse_context *c, struct mem_ops *ops) { void *p; int nr_hpages = 4; unsigned long size = nr_hpages * hpage_pmd_size; - p = alloc_mapping(nr_hpages); - fill_memory(p, 0, size); + p = ops->setup_area(nr_hpages); + ops->fault(p, 0, size); c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages, - true); + ops, true); validate_memory(p, 0, size); - munmap(p, size); + ops->cleanup_area(p, size); } -static void collapse_empty(struct collapse_context *c) +static void collapse_empty(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_mapping(1); - c->collapse("Do not collapse empty PTE table", p, 1, false); - munmap(p, hpage_pmd_size); + p = ops->setup_area(1); + c->collapse("Do not collapse empty PTE table", p, 1, ops, false); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_single_pte_entry(struct collapse_context *c) +static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_mapping(1); - fill_memory(p, 0, page_size); + p = ops->setup_area(1); + ops->fault(p, 0, page_size); c->collapse("Collapse PTE table with single PTE entry present", p, - 1, true); - validate_memory(p, 0, page_size); - munmap(p, hpage_pmd_size); + 1, ops, true); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_max_ptes_none(struct collapse_context *c) +static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops) { int max_ptes_none = hpage_pmd_nr / 2; struct settings settings = *current_settings(); @@ -632,30 +684,30 @@ static void collapse_max_ptes_none(struct collapse_context *c) settings.khugepaged.max_ptes_none = max_ptes_none; push_settings(&settings); - p = alloc_mapping(1); + p = ops->setup_area(1); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, - !c->enforce_pte_scan_limits); + ops, !c->enforce_pte_scan_limits); validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); if (c->enforce_pte_scan_limits) { - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); - c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops, true); validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); } - - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); pop_settings(); } -static void collapse_swapin_single_pte(struct collapse_context *c) +static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_mapping(1); - fill_memory(p, 0, hpage_pmd_size); + + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); printf("Swapout one page..."); if (madvise(p, page_size, MADV_PAGEOUT)) { @@ -669,20 +721,21 @@ static void collapse_swapin_single_pte(struct collapse_context *c) goto out; } - c->collapse("Collapse with swapping in single PTE entry", p, 1, true); + c->collapse("Collapse with swapping in single PTE entry", p, 1, ops, + true); validate_memory(p, 0, hpage_pmd_size); out: - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_max_ptes_swap(struct collapse_context *c) +static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops) { int max_ptes_swap = read_num("khugepaged/max_ptes_swap"); void *p; - p = alloc_mapping(1); + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); - fill_memory(p, 0, hpage_pmd_size); printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) { perror("madvise(MADV_PAGEOUT)"); @@ -695,12 +748,12 @@ static void collapse_max_ptes_swap(struct collapse_context *c) goto out; } - c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, + c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops, !c->enforce_pte_scan_limits); validate_memory(p, 0, hpage_pmd_size); if (c->enforce_pte_scan_limits) { - fill_memory(p, 0, hpage_pmd_size); + ops->fault(p, 0, hpage_pmd_size); printf("Swapout %d of %d pages...", max_ptes_swap, hpage_pmd_nr); if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { @@ -715,63 +768,65 @@ static void collapse_max_ptes_swap(struct collapse_context *c) } c->collapse("Collapse with max_ptes_swap pages swapped out", p, - 1, true); + 1, ops, true); validate_memory(p, 0, hpage_pmd_size); } out: - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_single_pte_entry_compound(struct collapse_context *c) +static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_hpage(); + p = alloc_hpage(ops); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); printf("Split huge page leaving single PTE mapping compound page..."); madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); - if (check_huge(p, 0)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); c->collapse("Collapse PTE table with single PTE mapping compound page", - p, 1, true); + p, 1, ops, true); validate_memory(p, 0, page_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_full_of_compound(struct collapse_context *c) +static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_hpage(); + p = alloc_hpage(ops); printf("Split huge page leaving single PTE page table full of compound pages..."); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - if (check_huge(p, 0)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); - c->collapse("Collapse PTE table full of compound pages", p, 1, true); + c->collapse("Collapse PTE table full of compound pages", p, 1, ops, + true); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_compound_extreme(struct collapse_context *c) +static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops) { void *p; int i; - p = alloc_mapping(1); + p = ops->setup_area(1); for (i = 0; i < hpage_pmd_nr; i++) { printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", i + 1, hpage_pmd_nr); madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(BASE_ADDR, 0, hpage_pmd_size); - if (!check_huge(BASE_ADDR, 1)) { + ops->fault(BASE_ADDR, 0, hpage_pmd_size); + if (!ops->check_huge(BASE_ADDR, 1)) { printf("Failed to allocate huge page\n"); exit(EXIT_FAILURE); } @@ -798,30 +853,30 @@ static void collapse_compound_extreme(struct collapse_context *c) } } - munmap(BASE_ADDR, hpage_pmd_size); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p, 0)) + ops->cleanup_area(BASE_ADDR, hpage_pmd_size); + ops->fault(p, 0, hpage_pmd_size); + if (!ops->check_huge(p, 1)) success("OK"); else fail("Fail"); c->collapse("Collapse PTE table full of different compound pages", p, 1, - true); + ops, true); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_fork(struct collapse_context *c) +static void collapse_fork(struct collapse_context *c, struct mem_ops *ops) { int wstatus; void *p; - p = alloc_mapping(1); + p = ops->setup_area(1); printf("Allocate small page..."); - fill_memory(p, 0, page_size); - if (check_huge(p, 0)) + ops->fault(p, 0, page_size); + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); @@ -832,17 +887,17 @@ static void collapse_fork(struct collapse_context *c) skip_settings_restore = true; exit_status = 0; - if (check_huge(p, 0)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); - fill_memory(p, page_size, 2 * page_size); + ops->fault(p, page_size, 2 * page_size); c->collapse("Collapse PTE table with single page shared with parent process", - p, 1, true); + p, 1, ops, true); validate_memory(p, 0, page_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); exit(exit_status); } @@ -850,27 +905,27 @@ static void collapse_fork(struct collapse_context *c) exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has small page..."); - if (check_huge(p, 0)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); validate_memory(p, 0, page_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_fork_compound(struct collapse_context *c) +static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops) { int wstatus; void *p; - p = alloc_hpage(); + p = alloc_hpage(ops); printf("Share huge page over fork()..."); if (!fork()) { /* Do not touch settings on child exit */ skip_settings_restore = true; exit_status = 0; - if (check_huge(p, 1)) + if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); @@ -878,20 +933,20 @@ static void collapse_fork_compound(struct collapse_context *c) printf("Split huge page PMD in child process..."); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - if (check_huge(p, 0)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); - fill_memory(p, 0, page_size); + ops->fault(p, 0, page_size); write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); c->collapse("Collapse PTE table full of compound pages in child", - p, 1, true); + p, 1, ops, true); write_num("khugepaged/max_ptes_shared", current_settings()->khugepaged.max_ptes_shared); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); exit(exit_status); } @@ -899,59 +954,59 @@ static void collapse_fork_compound(struct collapse_context *c) exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has huge page..."); - if (check_huge(p, 1)) + if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_max_ptes_shared(struct collapse_context *c) +static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops) { int max_ptes_shared = read_num("khugepaged/max_ptes_shared"); int wstatus; void *p; - p = alloc_hpage(); + p = alloc_hpage(ops); printf("Share huge page over fork()..."); if (!fork()) { /* Do not touch settings on child exit */ skip_settings_restore = true; exit_status = 0; - if (check_huge(p, 1)) + if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); printf("Trigger CoW on page %d of %d...", hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); - if (check_huge(p, 0)) + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); c->collapse("Maybe collapse with max_ptes_shared exceeded", p, - 1, !c->enforce_pte_scan_limits); + 1, ops, !c->enforce_pte_scan_limits); if (c->enforce_pte_scan_limits) { printf("Trigger CoW on page %d of %d...", hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size); - if (check_huge(p, 0)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); c->collapse("Collapse with max_ptes_shared PTEs shared", - p, 1, true); + p, 1, ops, true); } validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); exit(exit_status); } @@ -959,42 +1014,28 @@ static void collapse_max_ptes_shared(struct collapse_context *c) exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has huge page..."); - if (check_huge(p, 1)) + if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void madvise_collapse_existing_thps(void) +static void madvise_collapse_existing_thps(struct collapse_context *c, + struct mem_ops *ops) { void *p; - int err; - p = alloc_mapping(1); - fill_memory(p, 0, hpage_pmd_size); + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); + c->collapse("Collapse fully populated PTE table...", p, 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); - printf("Collapse fully populated PTE table..."); - /* - * Note that we don't set MADV_HUGEPAGE here, which - * also tests that VM_HUGEPAGE isn't required for - * MADV_COLLAPSE in "madvise" mode. - */ - err = madvise(p, hpage_pmd_size, MADV_COLLAPSE); - if (err == 0 && check_huge(p, 1)) { - success("OK"); - printf("Re-collapse PMD-mapped hugepage"); - err = madvise(p, hpage_pmd_size, MADV_COLLAPSE); - if (err == 0 && check_huge(p, 1)) - success("OK"); - else - fail("Fail"); - } else { - fail("Fail"); - } + /* c->collapse() will find a hugepage and complain - call directly. */ + __madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } int main(int argc, const char **argv) @@ -1034,37 +1075,37 @@ int main(int argc, const char **argv) c.collapse = &khugepaged_collapse; c.enforce_pte_scan_limits = true; - collapse_full(&c); - collapse_empty(&c); - collapse_single_pte_entry(&c); - collapse_max_ptes_none(&c); - collapse_swapin_single_pte(&c); - collapse_max_ptes_swap(&c); - collapse_single_pte_entry_compound(&c); - collapse_full_of_compound(&c); - collapse_compound_extreme(&c); - collapse_fork(&c); - collapse_fork_compound(&c); - collapse_max_ptes_shared(&c); + collapse_full(&c, &anon_ops); + collapse_empty(&c, &anon_ops); + collapse_single_pte_entry(&c, &anon_ops); + collapse_max_ptes_none(&c, &anon_ops); + collapse_swapin_single_pte(&c, &anon_ops); + collapse_max_ptes_swap(&c, &anon_ops); + collapse_single_pte_entry_compound(&c, &anon_ops); + collapse_full_of_compound(&c, &anon_ops); + collapse_compound_extreme(&c, &anon_ops); + collapse_fork(&c, &anon_ops); + collapse_fork_compound(&c, &anon_ops); + collapse_max_ptes_shared(&c, &anon_ops); } if (!strcmp(tests, "madvise") || !strcmp(tests, "all")) { printf("\n*** Testing context: madvise ***\n"); c.collapse = &madvise_collapse; c.enforce_pte_scan_limits = false; - collapse_full(&c); - collapse_empty(&c); - collapse_single_pte_entry(&c); - collapse_max_ptes_none(&c); - collapse_swapin_single_pte(&c); - collapse_max_ptes_swap(&c); - collapse_single_pte_entry_compound(&c); - collapse_full_of_compound(&c); - collapse_compound_extreme(&c); - collapse_fork(&c); - collapse_fork_compound(&c); - collapse_max_ptes_shared(&c); - madvise_collapse_existing_thps(); + collapse_full(&c, &anon_ops); + collapse_empty(&c, &anon_ops); + collapse_single_pte_entry(&c, &anon_ops); + collapse_max_ptes_none(&c, &anon_ops); + collapse_swapin_single_pte(&c, &anon_ops); + collapse_max_ptes_swap(&c, &anon_ops); + collapse_single_pte_entry_compound(&c, &anon_ops); + collapse_full_of_compound(&c, &anon_ops); + collapse_compound_extreme(&c, &anon_ops); + collapse_fork(&c, &anon_ops); + collapse_fork_compound(&c, &anon_ops); + collapse_max_ptes_shared(&c, &anon_ops); + madvise_collapse_existing_thps(&c, &anon_ops); } restore_settings(0); From e8ac6bbac5cf52fe792097434dfae96ba32e27a6 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:43 -0700 Subject: [PATCH 303/352] selftests/vm: add thp collapse file and tmpfs testing Add memory operations for file-backed and tmpfs memory. Call existing tests with these new memory operations to test collapse functionality of khugepaged and MADV_COLLAPSE on file-backed and tmpfs memory. Not all tests are reusable; for example, collapse_swapin_single_pte() which checks swap usage. Refactor test arguments. Usage is now: Usage: ./khugepaged [dir] : : : [all|khugepaged|madvise] : [all|anon|file] "file,all" mem_type requires [dir] argument "file,all" mem_type requires kernel built with CONFIG_READ_ONLY_THP_FOR_FS=y if [dir] is a (sub)directory of a tmpfs mount, tmpfs must be mounted with huge=madvise option for khugepaged tests to work Refactor calling tests to make it clear what collapse context / memory operations they support, but only invoke tests requested by user. Also log what test is being ran, and with what context / memory, to make test logs more human readable. A new test file is created and deleted for every test to ensure no pages remain in the page cache between tests (tests also may attempt to collapse different amount of memory). For file-backed memory where the file is stored on a block device, disable /sys/block//queue/read_ahead_kb so that pages don't find their way into the page cache without the tests faulting them in. Add file and shmem wrappers to vm_utils check for file and shmem hugepages in smaps. [zokeefe@google.com: fix "add thp collapse file and tmpfs testing" for tmpfs] Link: https://lkml.kernel.org/r/20220913212517.3163701-1-zokeefe@google.com Link: https://lkml.kernel.org/r/20220907144521.3115321-8-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-8-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/khugepaged.c | 475 +++++++++++++++++++++--- tools/testing/selftests/vm/vm_util.c | 10 + tools/testing/selftests/vm/vm_util.h | 2 + 3 files changed, 431 insertions(+), 56 deletions(-) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index 06ea6f18980e2..08de6141c2afe 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -1,7 +1,9 @@ #define _GNU_SOURCE +#include #include #include #include +#include #include #include #include @@ -11,12 +13,21 @@ #include #include +#include +#include +#include +#include + +#include "linux/magic.h" #include "vm_util.h" #ifndef MADV_PAGEOUT #define MADV_PAGEOUT 21 #endif +#ifndef MADV_POPULATE_READ +#define MADV_POPULATE_READ 22 +#endif #ifndef MADV_COLLAPSE #define MADV_COLLAPSE 25 #endif @@ -28,20 +39,47 @@ static int hpage_pmd_nr; #define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" #define PID_SMAPS "/proc/self/smaps" +#define TEST_FILE "collapse_test_file" + +#define MAX_LINE_LENGTH 500 + +enum vma_type { + VMA_ANON, + VMA_FILE, + VMA_SHMEM, +}; struct mem_ops { void *(*setup_area)(int nr_hpages); void (*cleanup_area)(void *p, unsigned long size); void (*fault)(void *p, unsigned long start, unsigned long end); bool (*check_huge)(void *addr, int nr_hpages); + const char *name; }; +static struct mem_ops *file_ops; +static struct mem_ops *anon_ops; + struct collapse_context { void (*collapse)(const char *msg, char *p, int nr_hpages, struct mem_ops *ops, bool expect); bool enforce_pte_scan_limits; + const char *name; +}; + +static struct collapse_context *khugepaged_context; +static struct collapse_context *madvise_context; + +struct file_info { + const char *dir; + char path[PATH_MAX]; + enum vma_type type; + int fd; + char dev_queue_read_ahead_path[PATH_MAX]; }; +static struct file_info finfo; + enum thp_enabled { THP_ALWAYS, THP_MADVISE, @@ -107,6 +145,7 @@ struct settings { enum shmem_enabled shmem_enabled; bool use_zero_page; struct khugepaged_settings khugepaged; + unsigned long read_ahead_kb; }; static struct settings saved_settings; @@ -125,6 +164,11 @@ static void fail(const char *msg) exit_status++; } +static void skip(const char *msg) +{ + printf(" \e[33m%s\e[0m\n", msg); +} + static int read_file(const char *path, char *buf, size_t buflen) { int fd; @@ -152,13 +196,19 @@ static int write_file(const char *path, const char *buf, size_t buflen) ssize_t numwritten; fd = open(path, O_WRONLY); - if (fd == -1) + if (fd == -1) { + printf("open(%s)\n", path); + exit(EXIT_FAILURE); return 0; + } numwritten = write(fd, buf, buflen - 1); close(fd); - if (numwritten < 1) + if (numwritten < 1) { + printf("write(%s)\n", buf); + exit(EXIT_FAILURE); return 0; + } return (unsigned int) numwritten; } @@ -225,20 +275,11 @@ static void write_string(const char *name, const char *val) } } -static const unsigned long read_num(const char *name) +static const unsigned long _read_num(const char *path) { - char path[PATH_MAX]; char buf[21]; - int ret; - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - - ret = read_file(path, buf, sizeof(buf)); - if (ret < 0) { + if (read_file(path, buf, sizeof(buf)) < 0) { perror("read_file(read_num)"); exit(EXIT_FAILURE); } @@ -246,10 +287,9 @@ static const unsigned long read_num(const char *name) return strtoul(buf, NULL, 10); } -static void write_num(const char *name, unsigned long num) +static const unsigned long read_num(const char *name) { char path[PATH_MAX]; - char buf[21]; int ret; ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); @@ -257,6 +297,12 @@ static void write_num(const char *name, unsigned long num) printf("%s: Pathname is too long\n", __func__); exit(EXIT_FAILURE); } + return _read_num(path); +} + +static void _write_num(const char *path, unsigned long num) +{ + char buf[21]; sprintf(buf, "%ld", num); if (!write_file(path, buf, strlen(buf) + 1)) { @@ -265,6 +311,19 @@ static void write_num(const char *name, unsigned long num) } } +static void write_num(const char *name, unsigned long num) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + _write_num(path, num); +} + static void write_settings(struct settings *settings) { struct khugepaged_settings *khugepaged = &settings->khugepaged; @@ -284,6 +343,10 @@ static void write_settings(struct settings *settings) write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); + + if (file_ops && finfo.type == VMA_FILE) + _write_num(finfo.dev_queue_read_ahead_path, + settings->read_ahead_kb); } #define MAX_SETTINGS_DEPTH 4 @@ -354,6 +417,10 @@ static void save_settings(void) .max_ptes_shared = read_num("khugepaged/max_ptes_shared"), .pages_to_scan = read_num("khugepaged/pages_to_scan"), }; + if (file_ops && finfo.type == VMA_FILE) + saved_settings.read_ahead_kb = + _read_num(finfo.dev_queue_read_ahead_path); + success("OK"); signal(SIGTERM, restore_settings); @@ -362,7 +429,90 @@ static void save_settings(void) signal(SIGQUIT, restore_settings); } -#define MAX_LINE_LENGTH 500 +static void get_finfo(const char *dir) +{ + struct stat path_stat; + struct statfs fs; + char buf[1 << 10]; + char path[PATH_MAX]; + char *str, *end; + + finfo.dir = dir; + stat(finfo.dir, &path_stat); + if (!S_ISDIR(path_stat.st_mode)) { + printf("%s: Not a directory (%s)\n", __func__, finfo.dir); + exit(EXIT_FAILURE); + } + if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE, + finfo.dir) >= sizeof(finfo.path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + if (statfs(finfo.dir, &fs)) { + perror("statfs()"); + exit(EXIT_FAILURE); + } + finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE; + if (finfo.type == VMA_SHMEM) + return; + + /* Find owning device's queue/read_ahead_kb control */ + if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent", + major(path_stat.st_dev), minor(path_stat.st_dev)) + >= sizeof(path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + if (read_file(path, buf, sizeof(buf)) < 0) { + perror("read_file(read_num)"); + exit(EXIT_FAILURE); + } + if (strstr(buf, "DEVTYPE=disk")) { + /* Found it */ + if (snprintf(finfo.dev_queue_read_ahead_path, + sizeof(finfo.dev_queue_read_ahead_path), + "/sys/dev/block/%d:%d/queue/read_ahead_kb", + major(path_stat.st_dev), minor(path_stat.st_dev)) + >= sizeof(finfo.dev_queue_read_ahead_path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return; + } + if (!strstr(buf, "DEVTYPE=partition")) { + printf("%s: Unknown device type: %s\n", __func__, path); + exit(EXIT_FAILURE); + } + /* + * Partition of block device - need to find actual device. + * Using naming convention that devnameN is partition of + * device devname. + */ + str = strstr(buf, "DEVNAME="); + if (!str) { + printf("%s: Could not read: %s", __func__, path); + exit(EXIT_FAILURE); + } + str += 8; + end = str; + while (*end) { + if (isdigit(*end)) { + *end = '\0'; + if (snprintf(finfo.dev_queue_read_ahead_path, + sizeof(finfo.dev_queue_read_ahead_path), + "/sys/block/%s/queue/read_ahead_kb", + str) >= sizeof(finfo.dev_queue_read_ahead_path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return; + } + ++end; + } + printf("%s: Could not read: %s\n", __func__, path); + exit(EXIT_FAILURE); +} + static bool check_swap(void *addr, unsigned long size) { bool swap = false; @@ -518,11 +668,91 @@ static bool anon_check_huge(void *addr, int nr_hpages) return check_huge_anon(addr, nr_hpages, hpage_pmd_size); } -static struct mem_ops anon_ops = { +static void *file_setup_area(int nr_hpages) +{ + int fd; + void *p; + unsigned long size; + + unlink(finfo.path); /* Cleanup from previous failed tests */ + printf("Creating %s for collapse%s...", finfo.path, + finfo.type == VMA_SHMEM ? " (tmpfs)" : ""); + fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL, + 777); + if (fd < 0) { + perror("open()"); + exit(EXIT_FAILURE); + } + + size = nr_hpages * hpage_pmd_size; + p = alloc_mapping(nr_hpages); + fill_memory(p, 0, size); + write(fd, p, size); + close(fd); + munmap(p, size); + success("OK"); + + printf("Opening %s read only for collapse...", finfo.path); + finfo.fd = open(finfo.path, O_RDONLY, 777); + if (finfo.fd < 0) { + perror("open()"); + exit(EXIT_FAILURE); + } + p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC, + MAP_PRIVATE, finfo.fd, 0); + if (p == MAP_FAILED || p != BASE_ADDR) { + perror("mmap()"); + exit(EXIT_FAILURE); + } + + /* Drop page cache */ + write_file("/proc/sys/vm/drop_caches", "3", 2); + success("OK"); + return p; +} + +static void file_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); + close(finfo.fd); + unlink(finfo.path); +} + +static void file_fault(void *p, unsigned long start, unsigned long end) +{ + if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) { + perror("madvise(MADV_POPULATE_READ"); + exit(EXIT_FAILURE); + } +} + +static bool file_check_huge(void *addr, int nr_hpages) +{ + switch (finfo.type) { + case VMA_FILE: + return check_huge_file(addr, nr_hpages, hpage_pmd_size); + case VMA_SHMEM: + return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); + default: + exit(EXIT_FAILURE); + return false; + } +} + +static struct mem_ops __anon_ops = { .setup_area = &anon_setup_area, .cleanup_area = &anon_cleanup_area, .fault = &anon_fault, .check_huge = &anon_check_huge, + .name = "anon", +}; + +static struct mem_ops __file_ops = { + .setup_area = &file_setup_area, + .cleanup_area = &file_cleanup_area, + .fault = &file_fault, + .check_huge = &file_check_huge, + .name = "file", }; static void __madvise_collapse(const char *msg, char *p, int nr_hpages, @@ -538,6 +768,7 @@ static void __madvise_collapse(const char *msg, char *p, int nr_hpages, * ignores /sys/kernel/mm/transparent_hugepage/enabled */ settings.thp_enabled = THP_NEVER; + settings.shmem_enabled = SHMEM_NEVER; push_settings(&settings); /* Clear VM_NOHUGEPAGE */ @@ -608,12 +839,37 @@ static void khugepaged_collapse(const char *msg, char *p, int nr_hpages, return; } + /* + * For file and shmem memory, khugepaged only retracts pte entries after + * putting the new hugepage in the page cache. The hugepage must be + * subsequently refaulted to install the pmd mapping for the mm. + */ + if (ops != &__anon_ops) + ops->fault(p, 0, nr_hpages * hpage_pmd_size); + if (ops->check_huge(p, expect ? nr_hpages : 0)) success("OK"); else fail("Fail"); } +static struct collapse_context __khugepaged_context = { + .collapse = &khugepaged_collapse, + .enforce_pte_scan_limits = true, + .name = "khugepaged", +}; + +static struct collapse_context __madvise_context = { + .collapse = &madvise_collapse, + .enforce_pte_scan_limits = false, + .name = "madvise", +}; + +static bool is_tmpfs(struct mem_ops *ops) +{ + return ops == &__file_ops && finfo.type == VMA_SHMEM; +} + static void alloc_at_fault(void) { struct settings settings = *current_settings(); @@ -686,6 +942,13 @@ static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *o p = ops->setup_area(1); + if (is_tmpfs(ops)) { + /* shmem pages always in the page cache */ + printf("tmpfs..."); + skip("Skip"); + goto skip; + } + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, ops, !c->enforce_pte_scan_limits); @@ -698,6 +961,7 @@ static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *o validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); } +skip: ops->cleanup_area(p, hpage_pmd_size); pop_settings(); } @@ -781,6 +1045,13 @@ static void collapse_single_pte_entry_compound(struct collapse_context *c, struc p = alloc_hpage(ops); + if (is_tmpfs(ops)) { + /* MADV_DONTNEED won't evict tmpfs pages */ + printf("tmpfs..."); + skip("Skip"); + goto skip; + } + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); printf("Split huge page leaving single PTE mapping compound page..."); madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); @@ -792,6 +1063,7 @@ static void collapse_single_pte_entry_compound(struct collapse_context *c, struc c->collapse("Collapse PTE table with single PTE mapping compound page", p, 1, ops, true); validate_memory(p, 0, page_size); +skip: ops->cleanup_area(p, hpage_pmd_size); } @@ -1038,9 +1310,70 @@ static void madvise_collapse_existing_thps(struct collapse_context *c, ops->cleanup_area(p, hpage_pmd_size); } +static void usage(void) +{ + fprintf(stderr, "\nUsage: ./khugepaged [dir]\n\n"); + fprintf(stderr, "\t\t: :\n"); + fprintf(stderr, "\t\t: [all|khugepaged|madvise]\n"); + fprintf(stderr, "\t\t: [all|anon|file]\n"); + fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n"); + fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n"); + fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); + fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n"); + fprintf(stderr, "\tmounted with huge=madvise option for khugepaged tests to work\n"); + exit(1); +} + +static void parse_test_type(int argc, const char **argv) +{ + char *buf; + const char *token; + + if (argc == 1) { + /* Backwards compatibility */ + khugepaged_context = &__khugepaged_context; + madvise_context = &__madvise_context; + anon_ops = &__anon_ops; + return; + } + + buf = strdup(argv[1]); + token = strsep(&buf, ":"); + + if (!strcmp(token, "all")) { + khugepaged_context = &__khugepaged_context; + madvise_context = &__madvise_context; + } else if (!strcmp(token, "khugepaged")) { + khugepaged_context = &__khugepaged_context; + } else if (!strcmp(token, "madvise")) { + madvise_context = &__madvise_context; + } else { + usage(); + } + + if (!buf) + usage(); + + if (!strcmp(buf, "all")) { + file_ops = &__file_ops; + anon_ops = &__anon_ops; + } else if (!strcmp(buf, "anon")) { + anon_ops = &__anon_ops; + } else if (!strcmp(buf, "file")) { + file_ops = &__file_ops; + } else { + usage(); + } + + if (!file_ops) + return; + + if (argc != 3) + usage(); +} + int main(int argc, const char **argv) { - struct collapse_context c; struct settings default_settings = { .thp_enabled = THP_MADVISE, .thp_defrag = THP_DEFRAG_ALWAYS, @@ -1051,8 +1384,20 @@ int main(int argc, const char **argv) .alloc_sleep_millisecs = 10, .scan_sleep_millisecs = 10, }, + /* + * When testing file-backed memory, the collapse path + * looks at how many pages are found in the page cache, not + * what pages are mapped. Disable read ahead optimization so + * pages don't find their way into the page cache unless + * we mem_ops->fault() them in. + */ + .read_ahead_kb = 0, }; - const char *tests = argc == 1 ? "all" : argv[1]; + + parse_test_type(argc, argv); + + if (file_ops) + get_finfo(argv[2]); setbuf(stdout, NULL); @@ -1070,43 +1415,61 @@ int main(int argc, const char **argv) alloc_at_fault(); - if (!strcmp(tests, "khugepaged") || !strcmp(tests, "all")) { - printf("\n*** Testing context: khugepaged ***\n"); - c.collapse = &khugepaged_collapse; - c.enforce_pte_scan_limits = true; - - collapse_full(&c, &anon_ops); - collapse_empty(&c, &anon_ops); - collapse_single_pte_entry(&c, &anon_ops); - collapse_max_ptes_none(&c, &anon_ops); - collapse_swapin_single_pte(&c, &anon_ops); - collapse_max_ptes_swap(&c, &anon_ops); - collapse_single_pte_entry_compound(&c, &anon_ops); - collapse_full_of_compound(&c, &anon_ops); - collapse_compound_extreme(&c, &anon_ops); - collapse_fork(&c, &anon_ops); - collapse_fork_compound(&c, &anon_ops); - collapse_max_ptes_shared(&c, &anon_ops); - } - if (!strcmp(tests, "madvise") || !strcmp(tests, "all")) { - printf("\n*** Testing context: madvise ***\n"); - c.collapse = &madvise_collapse; - c.enforce_pte_scan_limits = false; - - collapse_full(&c, &anon_ops); - collapse_empty(&c, &anon_ops); - collapse_single_pte_entry(&c, &anon_ops); - collapse_max_ptes_none(&c, &anon_ops); - collapse_swapin_single_pte(&c, &anon_ops); - collapse_max_ptes_swap(&c, &anon_ops); - collapse_single_pte_entry_compound(&c, &anon_ops); - collapse_full_of_compound(&c, &anon_ops); - collapse_compound_extreme(&c, &anon_ops); - collapse_fork(&c, &anon_ops); - collapse_fork_compound(&c, &anon_ops); - collapse_max_ptes_shared(&c, &anon_ops); - madvise_collapse_existing_thps(&c, &anon_ops); - } +#define TEST(t, c, o) do { \ + if (c && o) { \ + printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \ + t(c, o); \ + } \ + } while (0) + + TEST(collapse_full, khugepaged_context, anon_ops); + TEST(collapse_full, khugepaged_context, file_ops); + TEST(collapse_full, madvise_context, anon_ops); + TEST(collapse_full, madvise_context, file_ops); + + TEST(collapse_empty, khugepaged_context, anon_ops); + TEST(collapse_empty, madvise_context, anon_ops); + + TEST(collapse_single_pte_entry, khugepaged_context, anon_ops); + TEST(collapse_single_pte_entry, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry, madvise_context, anon_ops); + TEST(collapse_single_pte_entry, madvise_context, file_ops); + + TEST(collapse_max_ptes_none, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_none, khugepaged_context, file_ops); + TEST(collapse_max_ptes_none, madvise_context, anon_ops); + TEST(collapse_max_ptes_none, madvise_context, file_ops); + + TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops); + TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops); + TEST(collapse_single_pte_entry_compound, madvise_context, file_ops); + + TEST(collapse_full_of_compound, khugepaged_context, anon_ops); + TEST(collapse_full_of_compound, khugepaged_context, file_ops); + TEST(collapse_full_of_compound, madvise_context, anon_ops); + TEST(collapse_full_of_compound, madvise_context, file_ops); + + TEST(collapse_compound_extreme, khugepaged_context, anon_ops); + TEST(collapse_compound_extreme, madvise_context, anon_ops); + + TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops); + TEST(collapse_swapin_single_pte, madvise_context, anon_ops); + + TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_swap, madvise_context, anon_ops); + + TEST(collapse_fork, khugepaged_context, anon_ops); + TEST(collapse_fork, madvise_context, anon_ops); + + TEST(collapse_fork_compound, khugepaged_context, anon_ops); + TEST(collapse_fork_compound, madvise_context, anon_ops); + + TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_shared, madvise_context, anon_ops); + + TEST(madvise_collapse_existing_thps, madvise_context, anon_ops); + TEST(madvise_collapse_existing_thps, madvise_context, file_ops); restore_settings(0); } diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c index 9dae51b8219f1..f11f8adda5218 100644 --- a/tools/testing/selftests/vm/vm_util.c +++ b/tools/testing/selftests/vm/vm_util.c @@ -114,3 +114,13 @@ bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size) { return __check_huge(addr, "AnonHugePages: ", nr_hpages, hpage_size); } + +bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "FilePmdMapped:", nr_hpages, hpage_size); +} + +bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "ShmemPmdMapped:", nr_hpages, hpage_size); +} diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h index 8434ea0c95cd7..5c35de454e08f 100644 --- a/tools/testing/selftests/vm/vm_util.h +++ b/tools/testing/selftests/vm/vm_util.h @@ -8,3 +8,5 @@ void clear_softdirty(void); bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); uint64_t read_pmd_pagesize(void); bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); +bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); +bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size); From 0dc8c34caec279976222512cf73130941f1215d0 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:44 -0700 Subject: [PATCH 304/352] selftests/vm: add thp collapse shmem testing Add memory operations for shmem (memfd) memory, and reuse existing tests with the new memory operations. Shmem tests can be called with "shmem" mem_type, and shmem tests are ran with "all" mem_type as well. Link: https://lkml.kernel.org/r/20220907144521.3115321-9-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-9-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/khugepaged.c | 57 ++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index 08de6141c2afe..eabbd2710096c 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -59,6 +59,7 @@ struct mem_ops { static struct mem_ops *file_ops; static struct mem_ops *anon_ops; +static struct mem_ops *shmem_ops; struct collapse_context { void (*collapse)(const char *msg, char *p, int nr_hpages, @@ -739,6 +740,40 @@ static bool file_check_huge(void *addr, int nr_hpages) } } +static void *shmem_setup_area(int nr_hpages) +{ + void *p; + unsigned long size = nr_hpages * hpage_pmd_size; + + finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0); + if (finfo.fd < 0) { + perror("memfd_create()"); + exit(EXIT_FAILURE); + } + if (ftruncate(finfo.fd, size)) { + perror("ftruncate()"); + exit(EXIT_FAILURE); + } + p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd, + 0); + if (p != BASE_ADDR) { + perror("mmap()"); + exit(EXIT_FAILURE); + } + return p; +} + +static void shmem_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); + close(finfo.fd); +} + +static bool shmem_check_huge(void *addr, int nr_hpages) +{ + return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); +} + static struct mem_ops __anon_ops = { .setup_area = &anon_setup_area, .cleanup_area = &anon_cleanup_area, @@ -755,6 +790,14 @@ static struct mem_ops __file_ops = { .name = "file", }; +static struct mem_ops __shmem_ops = { + .setup_area = &shmem_setup_area, + .cleanup_area = &shmem_cleanup_area, + .fault = &anon_fault, + .check_huge = &shmem_check_huge, + .name = "shmem", +}; + static void __madvise_collapse(const char *msg, char *p, int nr_hpages, struct mem_ops *ops, bool expect) { @@ -1315,7 +1358,7 @@ static void usage(void) fprintf(stderr, "\nUsage: ./khugepaged [dir]\n\n"); fprintf(stderr, "\t\t: :\n"); fprintf(stderr, "\t\t: [all|khugepaged|madvise]\n"); - fprintf(stderr, "\t\t: [all|anon|file]\n"); + fprintf(stderr, "\t\t: [all|anon|file|shmem]\n"); fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n"); fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n"); fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); @@ -1357,10 +1400,13 @@ static void parse_test_type(int argc, const char **argv) if (!strcmp(buf, "all")) { file_ops = &__file_ops; anon_ops = &__anon_ops; + shmem_ops = &__shmem_ops; } else if (!strcmp(buf, "anon")) { anon_ops = &__anon_ops; } else if (!strcmp(buf, "file")) { file_ops = &__file_ops; + } else if (!strcmp(buf, "shmem")) { + shmem_ops = &__shmem_ops; } else { usage(); } @@ -1377,7 +1423,7 @@ int main(int argc, const char **argv) struct settings default_settings = { .thp_enabled = THP_MADVISE, .thp_defrag = THP_DEFRAG_ALWAYS, - .shmem_enabled = SHMEM_NEVER, + .shmem_enabled = SHMEM_ADVISE, .use_zero_page = 0, .khugepaged = { .defrag = 1, @@ -1424,16 +1470,20 @@ int main(int argc, const char **argv) TEST(collapse_full, khugepaged_context, anon_ops); TEST(collapse_full, khugepaged_context, file_ops); + TEST(collapse_full, khugepaged_context, shmem_ops); TEST(collapse_full, madvise_context, anon_ops); TEST(collapse_full, madvise_context, file_ops); + TEST(collapse_full, madvise_context, shmem_ops); TEST(collapse_empty, khugepaged_context, anon_ops); TEST(collapse_empty, madvise_context, anon_ops); TEST(collapse_single_pte_entry, khugepaged_context, anon_ops); TEST(collapse_single_pte_entry, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops); TEST(collapse_single_pte_entry, madvise_context, anon_ops); TEST(collapse_single_pte_entry, madvise_context, file_ops); + TEST(collapse_single_pte_entry, madvise_context, shmem_ops); TEST(collapse_max_ptes_none, khugepaged_context, anon_ops); TEST(collapse_max_ptes_none, khugepaged_context, file_ops); @@ -1447,8 +1497,10 @@ int main(int argc, const char **argv) TEST(collapse_full_of_compound, khugepaged_context, anon_ops); TEST(collapse_full_of_compound, khugepaged_context, file_ops); + TEST(collapse_full_of_compound, khugepaged_context, shmem_ops); TEST(collapse_full_of_compound, madvise_context, anon_ops); TEST(collapse_full_of_compound, madvise_context, file_ops); + TEST(collapse_full_of_compound, madvise_context, shmem_ops); TEST(collapse_compound_extreme, khugepaged_context, anon_ops); TEST(collapse_compound_extreme, madvise_context, anon_ops); @@ -1470,6 +1522,7 @@ int main(int argc, const char **argv) TEST(madvise_collapse_existing_thps, madvise_context, anon_ops); TEST(madvise_collapse_existing_thps, madvise_context, file_ops); + TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops); restore_settings(0); } From f8d4699d365ee2d6b7772129551970f6c1b9f89a Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:45 -0700 Subject: [PATCH 305/352] selftests/vm: add file/shmem MADV_COLLAPSE selftest for cleared pmd This test tests that MADV_COLLAPSE acting on file/shmem memory for which (1) the file extent mapping by the memory is already a huge page in the page cache, and (2) the pmd mapping this memory in the target process is none. In practice, (1)+(2) is the state left over after khugepaged has successfully collapsed file/shmem memory for a target VMA, but the memory has not yet been refaulted. So, this test in-effect tests MADV_COLLAPSE racing with khugepaged to collapse the memory first. Link: https://lkml.kernel.org/r/20220907144521.3115321-10-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-10-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/khugepaged.c | 30 +++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index eabbd2710096c..64126c8cd5612 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -1353,6 +1353,33 @@ static void madvise_collapse_existing_thps(struct collapse_context *c, ops->cleanup_area(p, hpage_pmd_size); } +/* + * Test race with khugepaged where page tables have been retracted and + * pmd cleared. + */ +static void madvise_retracted_page_tables(struct collapse_context *c, + struct mem_ops *ops) +{ + void *p; + int nr_hpages = 1; + unsigned long size = nr_hpages * hpage_pmd_size; + + p = ops->setup_area(nr_hpages); + ops->fault(p, 0, size); + + /* Let khugepaged collapse and leave pmd cleared */ + if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages, + ops)) { + fail("Timeout"); + return; + } + success("OK"); + c->collapse("Install huge PMD from page cache", p, nr_hpages, ops, + true); + validate_memory(p, 0, size); + ops->cleanup_area(p, size); +} + static void usage(void) { fprintf(stderr, "\nUsage: ./khugepaged [dir]\n\n"); @@ -1524,5 +1551,8 @@ int main(int argc, const char **argv) TEST(madvise_collapse_existing_thps, madvise_context, file_ops); TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops); + TEST(madvise_retracted_page_tables, madvise_context, file_ops); + TEST(madvise_retracted_page_tables, madvise_context, shmem_ops); + restore_settings(0); } From 522f997d8378f28a17d61452baa7d3808baceae8 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:46 -0700 Subject: [PATCH 306/352] selftests/vm: add selftest for MADV_COLLAPSE of uffd-minor memory Add :collapse mod to userfaultfd selftest. Currently this mod is only valid for "shmem" test type, but could be used for other test types. When provided, memory allocated by ->allocate_area() will be hugepage-aligned enforced to be hugepage-sized. userfaultf_minor_test, after the UFFD-registered mapping has been populated by UUFD minor fault handler, attempt to MADV_COLLAPSE the UFFD-registered mapping to collapse the memory into a pmd-mapped THP. This test is meant to be a functional test of what occurs during UFFD-driven live migration of VMs backed by huge tmpfs where, after a hugepage-sized region has been successfully migrated (in native page-sized chunks, to avoid latency of fetched a hugepage over the network), we want to reclaim previous VM performance by remapping it at the PMD level. Link: https://lkml.kernel.org/r/20220907144521.3115321-11-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-11-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/userfaultfd.c | 171 ++++++++++++++++++----- 2 files changed, 134 insertions(+), 38 deletions(-) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index c9c0996c122b4..c687533374e6b 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -99,6 +99,7 @@ $(OUTPUT)/khugepaged: vm_util.c $(OUTPUT)/madv_populate: vm_util.c $(OUTPUT)/soft-dirty: vm_util.c $(OUTPUT)/split_huge_page_test: vm_util.c +$(OUTPUT)/userfaultfd: vm_util.c ifeq ($(MACHINE),x86_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 7be709d9eed07..74babdbc02e56 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -61,10 +61,11 @@ #include #include "../kselftest.h" +#include "vm_util.h" #ifdef __NR_userfaultfd -static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; +static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; #define BOUNCE_RANDOM (1<<0) #define BOUNCE_RACINGFAULTS (1<<1) @@ -79,6 +80,8 @@ static int test_type; #define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) +#define BASE_PMD_ADDR ((void *)(1UL << 30)) + /* test using /dev/userfaultfd, instead of userfaultfd(2) */ static bool test_dev_userfaultfd; @@ -97,9 +100,10 @@ static int huge_fd; static unsigned long long *count_verify; static int uffd = -1; static int uffd_flags, finished, *pipefd; -static char *area_src, *area_src_alias, *area_dst, *area_dst_alias; +static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; static char *zeropage; pthread_attr_t attr; +static bool test_collapse; /* Userfaultfd test statistics */ struct uffd_stats { @@ -127,6 +131,8 @@ struct uffd_stats { #define swap(a, b) \ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) +#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1))) + const char *examples = "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" "./userfaultfd anon 100 99999\n\n" @@ -152,6 +158,8 @@ static void usage(void) "Supported mods:\n"); fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n"); fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n"); + fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR\n" + "memory\n"); fprintf(stderr, "\nExample test mod usage:\n"); fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n"); fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n"); @@ -229,12 +237,10 @@ static void anon_release_pages(char *rel_area) err("madvise(MADV_DONTNEED) failed"); } -static void anon_allocate_area(void **alloc_area) +static void anon_allocate_area(void **alloc_area, bool is_src) { *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (*alloc_area == MAP_FAILED) - err("mmap of anonymous memory failed"); } static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) @@ -252,7 +258,7 @@ static void hugetlb_release_pages(char *rel_area) } } -static void hugetlb_allocate_area(void **alloc_area) +static void hugetlb_allocate_area(void **alloc_area, bool is_src) { void *area_alias = NULL; char **alloc_area_alias; @@ -262,7 +268,7 @@ static void hugetlb_allocate_area(void **alloc_area) nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | - (*alloc_area == area_src ? 0 : MAP_NORESERVE), + (is_src ? 0 : MAP_NORESERVE), -1, 0); else @@ -270,9 +276,9 @@ static void hugetlb_allocate_area(void **alloc_area) nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_SHARED | - (*alloc_area == area_src ? 0 : MAP_NORESERVE), + (is_src ? 0 : MAP_NORESERVE), huge_fd, - *alloc_area == area_src ? 0 : nr_pages * page_size); + is_src ? 0 : nr_pages * page_size); if (*alloc_area == MAP_FAILED) err("mmap of hugetlbfs file failed"); @@ -282,12 +288,12 @@ static void hugetlb_allocate_area(void **alloc_area) PROT_READ | PROT_WRITE, MAP_SHARED, huge_fd, - *alloc_area == area_src ? 0 : nr_pages * page_size); + is_src ? 0 : nr_pages * page_size); if (area_alias == MAP_FAILED) err("mmap of hugetlb file alias failed"); } - if (*alloc_area == area_src) { + if (is_src) { alloc_area_alias = &area_src_alias; } else { alloc_area_alias = &area_dst_alias; @@ -310,21 +316,36 @@ static void shmem_release_pages(char *rel_area) err("madvise(MADV_REMOVE) failed"); } -static void shmem_allocate_area(void **alloc_area) +static void shmem_allocate_area(void **alloc_area, bool is_src) { void *area_alias = NULL; - bool is_src = alloc_area == (void **)&area_src; - unsigned long offset = is_src ? 0 : nr_pages * page_size; + size_t bytes = nr_pages * page_size; + unsigned long offset = is_src ? 0 : bytes; + char *p = NULL, *p_alias = NULL; + + if (test_collapse) { + p = BASE_PMD_ADDR; + if (!is_src) + /* src map + alias + interleaved hpages */ + p += 2 * (bytes + hpage_size); + p_alias = p; + p_alias += bytes; + p_alias += hpage_size; /* Prevent src/dst VMA merge */ + } - *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_SHARED, shm_fd, offset); + *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + shm_fd, offset); if (*alloc_area == MAP_FAILED) err("mmap of memfd failed"); + if (test_collapse && *alloc_area != p) + err("mmap of memfd failed at %p", p); - area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_SHARED, shm_fd, offset); + area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + shm_fd, offset); if (area_alias == MAP_FAILED) err("mmap of memfd alias failed"); + if (test_collapse && area_alias != p_alias) + err("mmap of anonymous memory failed at %p", p_alias); if (is_src) area_src_alias = area_alias; @@ -337,28 +358,39 @@ static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) *start = (unsigned long)area_dst_alias + offset; } +static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) +{ + if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size)) + err("Did not find expected %d number of hugepages", + expect_nr_hpages); +} + struct uffd_test_ops { - void (*allocate_area)(void **alloc_area); + void (*allocate_area)(void **alloc_area, bool is_src); void (*release_pages)(char *rel_area); void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); + void (*check_pmd_mapping)(void *p, int expect_nr_hpages); }; static struct uffd_test_ops anon_uffd_test_ops = { .allocate_area = anon_allocate_area, .release_pages = anon_release_pages, .alias_mapping = noop_alias_mapping, + .check_pmd_mapping = NULL, }; static struct uffd_test_ops shmem_uffd_test_ops = { .allocate_area = shmem_allocate_area, .release_pages = shmem_release_pages, .alias_mapping = shmem_alias_mapping, + .check_pmd_mapping = shmem_check_pmd_mapping, }; static struct uffd_test_ops hugetlb_uffd_test_ops = { .allocate_area = hugetlb_allocate_area, .release_pages = hugetlb_release_pages, .alias_mapping = hugetlb_alias_mapping, + .check_pmd_mapping = NULL, }; static struct uffd_test_ops *uffd_test_ops; @@ -478,6 +510,7 @@ static void uffd_test_ctx_clear(void) munmap_area((void **)&area_src_alias); munmap_area((void **)&area_dst); munmap_area((void **)&area_dst_alias); + munmap_area((void **)&area_remap); } static void uffd_test_ctx_init(uint64_t features) @@ -486,8 +519,8 @@ static void uffd_test_ctx_init(uint64_t features) uffd_test_ctx_clear(); - uffd_test_ops->allocate_area((void **)&area_src); - uffd_test_ops->allocate_area((void **)&area_dst); + uffd_test_ops->allocate_area((void **)&area_src, true); + uffd_test_ops->allocate_area((void **)&area_dst, false); userfaultfd_open(&features); @@ -804,6 +837,7 @@ static void *uffd_poll_thread(void *arg) err("remove failure"); break; case UFFD_EVENT_REMAP: + area_remap = area_dst; /* save for later unmap */ area_dst = (char *)(unsigned long)msg.arg.remap.to; break; } @@ -1256,13 +1290,30 @@ static int userfaultfd_sig_test(void) return userfaults != 0; } +void check_memory_contents(char *p) +{ + unsigned long i; + uint8_t expected_byte; + void *expected_page; + + if (posix_memalign(&expected_page, page_size, page_size)) + err("out of memory"); + + for (i = 0; i < nr_pages; ++i) { + expected_byte = ~((uint8_t)(i % ((uint8_t)-1))); + memset(expected_page, expected_byte, page_size); + if (my_bcmp(expected_page, p + (i * page_size), page_size)) + err("unexpected page contents after minor fault"); + } + + free(expected_page); +} + static int userfaultfd_minor_test(void) { - struct uffdio_register uffdio_register; unsigned long p; + struct uffdio_register uffdio_register; pthread_t uffd_mon; - uint8_t expected_byte; - void *expected_page; char c; struct uffd_stats stats = { 0 }; @@ -1301,17 +1352,7 @@ static int userfaultfd_minor_test(void) * fault. uffd_poll_thread will resolve the fault by bit-flipping the * page's contents, and then issuing a CONTINUE ioctl. */ - - if (posix_memalign(&expected_page, page_size, page_size)) - err("out of memory"); - - for (p = 0; p < nr_pages; ++p) { - expected_byte = ~((uint8_t)(p % ((uint8_t)-1))); - memset(expected_page, expected_byte, page_size); - if (my_bcmp(expected_page, area_dst_alias + (p * page_size), - page_size)) - err("unexpected page contents after minor fault"); - } + check_memory_contents(area_dst_alias); if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) err("pipe write"); @@ -1320,6 +1361,23 @@ static int userfaultfd_minor_test(void) uffd_stats_report(&stats, 1); + if (test_collapse) { + printf("testing collapse of uffd memory into PMD-mapped THPs:"); + if (madvise(area_dst_alias, nr_pages * page_size, + MADV_COLLAPSE)) + err("madvise(MADV_COLLAPSE)"); + + uffd_test_ops->check_pmd_mapping(area_dst, + nr_pages * page_size / + hpage_size); + /* + * This won't cause uffd-fault - it purely just makes sure there + * was no corruption. + */ + check_memory_contents(area_dst_alias); + printf(" done.\n"); + } + return stats.missing_faults != 0 || stats.minor_faults != nr_pages; } @@ -1656,6 +1714,8 @@ static void parse_test_type_arg(const char *raw_type) test_dev_userfaultfd = true; else if (!strcmp(token, "syscall")) test_dev_userfaultfd = false; + else if (!strcmp(token, "collapse")) + test_collapse = true; else err("unrecognized test mod '%s'", token); } @@ -1663,8 +1723,11 @@ static void parse_test_type_arg(const char *raw_type) if (!test_type) err("failed to parse test type argument: '%s'", raw_type); + if (test_collapse && test_type != TEST_SHMEM) + err("Unsupported test: %s", raw_type); + if (test_type == TEST_HUGETLB) - page_size = default_huge_page_size(); + page_size = hpage_size; else page_size = sysconf(_SC_PAGE_SIZE); @@ -1702,6 +1765,8 @@ static void sigalrm(int sig) int main(int argc, char **argv) { + size_t bytes; + if (argc < 4) usage(); @@ -1709,11 +1774,41 @@ int main(int argc, char **argv) err("failed to arm SIGALRM"); alarm(ALARM_INTERVAL_SECS); + hpage_size = default_huge_page_size(); parse_test_type_arg(argv[1]); + bytes = atol(argv[2]) * 1024 * 1024; + + if (test_collapse && bytes & (hpage_size - 1)) + err("MiB must be multiple of %lu if :collapse mod set", + hpage_size >> 20); nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size / - nr_cpus; + + if (test_collapse) { + /* nr_cpus must divide (bytes / page_size), otherwise, + * area allocations of (nr_pages * paze_size) won't be a + * multiple of hpage_size, even if bytes is a multiple of + * hpage_size. + * + * This means that nr_cpus must divide (N * (2 << (H-P)) + * where: + * bytes = hpage_size * N + * hpage_size = 2 << H + * page_size = 2 << P + * + * And we want to chose nr_cpus to be the largest value + * satisfying this constraint, not larger than the number + * of online CPUs. Unfortunately, prime factorization of + * N and nr_cpus may be arbitrary, so have to search for it. + * Instead, just use the highest power of 2 dividing both + * nr_cpus and (bytes / page_size). + */ + int x = factor_of_2(nr_cpus); + int y = factor_of_2(bytes / page_size); + + nr_cpus = x < y ? x : y; + } + nr_pages_per_cpu = bytes / page_size / nr_cpus; if (!nr_pages_per_cpu) { _err("invalid MiB"); usage(); From 80824086a50e5de5147b472529745b31a641cd82 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Thu, 22 Sep 2022 19:09:35 +0800 Subject: [PATCH 307/352] mm: remove unused inline functions from include/linux/mm_inline.h Remove the following unused inline functions from mm_inline.h: 1. All uses of add_page_to_lru_list_tail() have been removed since commit 7a3dbfe8a52b ("mm/swap: convert lru_deactivate_file to a folio_batch"), and it can be replaced by lruvec_add_folio_tail(). 2. All uses of __clear_page_lru_flags() have been removed since commit 188e8caee968 ("mm/swap: convert __page_cache_release() to use a folio"), and it can be replaced by __folio_clear_lru_flags(). They are useless, so remove them. Link: https://lkml.kernel.org/r/20220922110935.1495099-1-cuigaosheng1@huawei.com Signed-off-by: Gaosheng Cui Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 4949eda9a9a2a..e8ed225d8f7ca 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -76,11 +76,6 @@ static __always_inline void __folio_clear_lru_flags(struct folio *folio) __folio_clear_unevictable(folio); } -static __always_inline void __clear_page_lru_flags(struct page *page) -{ - __folio_clear_lru_flags(page_folio(page)); -} - /** * folio_lru_list - Which LRU list should a folio be on? * @folio: The folio to test. @@ -348,12 +343,6 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) list_add_tail(&folio->lru, &lruvec->lists[lru]); } -static __always_inline void add_page_to_lru_list_tail(struct page *page, - struct lruvec *lruvec) -{ - lruvec_add_folio_tail(lruvec, page_folio(page)); -} - static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { From c85cf22427a1738ea7ab941e687b15a326748326 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Thu, 22 Sep 2022 10:19:29 +0800 Subject: [PATCH 308/352] mm/hugetlb: add available_huge_pages() func In hugetlb.c there are several places which compare the values of 'h->free_huge_pages' and 'h->resv_huge_pages', it looks a bit messy, so add a new available_huge_pages() function to do these. Link: https://lkml.kernel.org/r/20220922021929.98961-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Signed-off-by: Andrew Morton --- mm/hugetlb.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 12370be17fc8f..60e077ce6ca73 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1191,6 +1191,11 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, return NULL; } +static unsigned long available_huge_pages(struct hstate *h) +{ + return h->free_huge_pages - h->resv_huge_pages; +} + static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve, @@ -1207,12 +1212,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, * have no page reserves. This check ensures that reservations are * not "stolen". The child may still get SIGKILLed */ - if (!vma_has_reserves(vma, chg) && - h->free_huge_pages - h->resv_huge_pages == 0) + if (!vma_has_reserves(vma, chg) && !available_huge_pages(h)) goto err; /* If reserves cannot be used, ensure enough pages are in the pool */ - if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) + if (avoid_reserve && !available_huge_pages(h)) goto err; gfp_mask = htlb_alloc_mask(h); @@ -2124,7 +2128,7 @@ int dissolve_free_huge_page(struct page *page) if (!page_count(page)) { struct page *head = compound_head(page); struct hstate *h = page_hstate(head); - if (h->free_huge_pages - h->resv_huge_pages == 0) + if (!available_huge_pages(h)) goto out; /* @@ -2311,7 +2315,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { spin_lock_irq(&hugetlb_lock); - if (h->free_huge_pages - h->resv_huge_pages > 0) { + if (available_huge_pages(h)) { struct page *page; page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); From c8baf61d65de3ca0980f577e69b416d2ac31a56a Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 20 Sep 2022 09:22:05 +0800 Subject: [PATCH 309/352] mm/secretmem: remove reduntant return value The return value @ret is always 0, so remove it and return 0 directly. Link: https://lkml.kernel.org/r/20220920012205.246217-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Signed-off-by: Andrew Morton --- mm/secretmem.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/secretmem.c b/mm/secretmem.c index 6a44efb673b2c..04c3ac9448a18 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -278,10 +278,8 @@ static struct file_system_type secretmem_fs = { static int __init secretmem_init(void) { - int ret = 0; - if (!secretmem_enable) - return ret; + return 0; secretmem_mnt = kern_mount(&secretmem_fs); if (IS_ERR(secretmem_mnt)) @@ -290,6 +288,6 @@ static int __init secretmem_init(void) /* prevent secretmem mappings from ever getting PROT_EXEC */ secretmem_mnt->mnt_flags |= MNT_NOEXEC; - return ret; + return 0; } fs_initcall(secretmem_init); From 03575d19aef493f12b106a42a6de6cc3dd58f499 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Sep 2022 09:57:01 -0400 Subject: [PATCH 310/352] mm: memcontrol: don't allocate cgroup swap arrays when memcg is disabled Patch series "memcg swap fix & cleanups". This patch (of 4): Since commit 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control"), the cgroup swap arrays are used to track memory ownership at the time of swap readahead and swapoff, even if swap space *accounting* has been turned off by the user via swapaccount=0 (which sets cgroup_memory_noswap). However, the patch was overzealous: by simply dropping the cgroup_memory_noswap conditionals in the swapon, swapoff and uncharge path, it caused the cgroup arrays being allocated even when the memory controller as a whole is disabled. This is a waste of that memory. Restore mem_cgroup_disabled() checks, implied previously by cgroup_memory_noswap, in the swapon, swapoff, and swap_entry_free callbacks. Link: https://lkml.kernel.org/r/20220926135704.400818-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20220926135704.400818-2-hannes@cmpxchg.org Fixes: 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control") Signed-off-by: Johannes Weiner Reported-by: Hugh Dickins Reviewed-by: Shakeel Butt Acked-by: Hugh Dickins Acked-by: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 3 +++ mm/swap_cgroup.c | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6b74bbdc26596..9e3c010ca676c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7459,6 +7459,9 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) struct mem_cgroup *memcg; unsigned short id; + if (mem_cgroup_disabled()) + return; + id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index 5a9442979a185..db6c4a26cf593 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -170,6 +170,9 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) unsigned long length; struct swap_cgroup_ctrl *ctrl; + if (mem_cgroup_disabled()) + return 0; + length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); array = vcalloc(length, sizeof(void *)); @@ -204,6 +207,9 @@ void swap_cgroup_swapoff(int type) unsigned long i, length; struct swap_cgroup_ctrl *ctrl; + if (mem_cgroup_disabled()) + return; + mutex_lock(&swap_cgroup_mutex); ctrl = &swap_cgroup_ctrl[type]; map = ctrl->map; From 7ada5b2d2767d1106378d5a1b0be931a6dde229b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Sep 2022 09:57:02 -0400 Subject: [PATCH 311/352] mm: memcontrol: deprecate swapaccounting=0 mode The swapaccounting= commandline option already does very little today. To close a trivial containment failure case, the swap ownership tracking part of the swap controller has recently become mandatory (see commit 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control") for details), which makes up the majority of the work during swapout, swapin, and the swap slot map. The only thing left under this flag is the page_counter operations and the visibility of the swap control files in the first place, which are rather meager savings. There also aren't many scenarios, if any, where controlling the memory of a cgroup while allowing it unlimited access to a global swap space is a workable resource isolation strategy. On the other hand, there have been several bugs and confusion around the many possible swap controller states (cgroup1 vs cgroup2 behavior, memory accounting without swap accounting, memcg runtime disabled). This puts the maintenance overhead of retaining the toggle above its practical benefits. Deprecate it. Link: https://lkml.kernel.org/r/20220926135704.400818-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Shakeel Butt Reviewed-by: Shakeel Butt Cc: Hugh Dickins Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- .../admin-guide/kernel-parameters.txt | 6 --- mm/memcontrol.c | 50 ++++--------------- 2 files changed, 10 insertions(+), 46 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3b95f65bafe27..99a13f2be2ef6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6036,12 +6036,6 @@ This parameter controls use of the Protected Execution Facility on pSeries. - swapaccount= [KNL] - Format: [0|1] - Enable accounting of swap in memory resource - controller if no parameter or 1 is given or disable - it if 0 is given (See Documentation/admin-guide/cgroup-v1/memory.rst) - swiotlb= [ARM,IA-64,PPC,MIPS,X86] Format: { [,] | force | noforce } -- Number of I/O TLB slabs diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9e3c010ca676c..4be1b48b96596 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -88,22 +88,6 @@ static bool cgroup_memory_nosocket __ro_after_init; /* Kernel memory accounting disabled? */ static bool cgroup_memory_nokmem __ro_after_init; -/* Whether the swap controller is active */ -#ifdef CONFIG_MEMCG_SWAP -static bool cgroup_memory_noswap __initdata; - -static DEFINE_STATIC_KEY_FALSE(memcg_swap_enabled_key); -static inline bool memcg_swap_enabled(void) -{ - return static_branch_likely(&memcg_swap_enabled_key); -} -#else -static inline bool memcg_swap_enabled(void) -{ - return false; -} -#endif - #ifdef CONFIG_CGROUP_WRITEBACK static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif @@ -111,7 +95,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { - return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg_swap_enabled(); + return !cgroup_subsys_on_dfl(memory_cgrp_subsys); } #define THRESHOLDS_EVENTS_TARGET 128 @@ -7379,7 +7363,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); - if (memcg_swap_enabled() && memcg != swap_memcg) { + if (memcg != swap_memcg) { if (!mem_cgroup_is_root(swap_memcg)) page_counter_charge(&swap_memcg->memsw, nr_entries); page_counter_uncharge(&memcg->memsw, nr_entries); @@ -7431,7 +7415,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) memcg = mem_cgroup_id_get_online(memcg); - if (memcg_swap_enabled() && !mem_cgroup_is_root(memcg) && + if (!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); @@ -7466,7 +7450,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { - if (memcg_swap_enabled() && !mem_cgroup_is_root(memcg)) { + if (!mem_cgroup_is_root(memcg)) { if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) page_counter_uncharge(&memcg->swap, nr_pages); else @@ -7482,7 +7466,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { long nr_swap_pages = get_nr_swap_pages(); - if (!memcg_swap_enabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (mem_cgroup_disabled() || do_memsw_account()) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, @@ -7499,7 +7483,7 @@ bool mem_cgroup_swap_full(struct folio *folio) if (vm_swap_full()) return true; - if (!memcg_swap_enabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (do_memsw_account()) return false; memcg = folio_memcg(folio); @@ -7519,10 +7503,9 @@ bool mem_cgroup_swap_full(struct folio *folio) static int __init setup_swap_account(char *s) { - bool res; - - if (!kstrtobool(s, &res)) - cgroup_memory_noswap = !res; + pr_warn_once("The swapaccount= commandline option is deprecated. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); return 1; } __setup("swapaccount=", setup_swap_account); @@ -7791,24 +7774,11 @@ static struct cftype zswap_files[] = { }; #endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */ -/* - * If mem_cgroup_swap_init() is implemented as a subsys_initcall() - * instead of a core_initcall(), this could mean cgroup_memory_noswap still - * remains set to false even when memcg is disabled via "cgroup_disable=memory" - * boot parameter. This may result in premature OOPS inside - * mem_cgroup_get_nr_swap_pages() function in corner cases. - */ static int __init mem_cgroup_swap_init(void) { - /* No memory control -> no swap control */ if (mem_cgroup_disabled()) - cgroup_memory_noswap = true; - - if (cgroup_memory_noswap) return 0; - static_branch_enable(&memcg_swap_enabled_key); - WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) @@ -7816,6 +7786,6 @@ static int __init mem_cgroup_swap_init(void) #endif return 0; } -core_initcall(mem_cgroup_swap_init); +subsys_initcall(mem_cgroup_swap_init); #endif /* CONFIG_MEMCG_SWAP */ From 892704714f365dc4385b50241e76fafbb391d46c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Sep 2022 09:57:03 -0400 Subject: [PATCH 312/352] mm: memcontrol: use do_memsw_account() in a few more places It's slightly more descriptive and consistent with other places that distinguish cgroup1's combined memory+swap accounting scheme from cgroup2's dedicated swap accounting. Link: https://lkml.kernel.org/r/20220926135704.400818-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Hugh Dickins Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4be1b48b96596..76bb0a18a2f3d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1667,17 +1667,17 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { unsigned long max = READ_ONCE(memcg->memory.max); - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) { - if (mem_cgroup_swappiness(memcg)) - max += min(READ_ONCE(memcg->swap.max), - (unsigned long)total_swap_pages); - } else { /* v1 */ + if (do_memsw_account()) { if (mem_cgroup_swappiness(memcg)) { /* Calculate swap excess capacity from memsw limit */ unsigned long swap = READ_ONCE(memcg->memsw.max) - max; max += min(swap, (unsigned long)total_swap_pages); } + } else { + if (mem_cgroup_swappiness(memcg)) + max += min(READ_ONCE(memcg->swap.max), + (unsigned long)total_swap_pages); } return max; } @@ -7334,7 +7334,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) if (mem_cgroup_disabled()) return; - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (!do_memsw_account()) return; memcg = folio_memcg(folio); @@ -7399,7 +7399,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) struct mem_cgroup *memcg; unsigned short oldid; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (do_memsw_account()) return 0; memcg = folio_memcg(folio); @@ -7451,10 +7451,10 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) memcg = mem_cgroup_from_id(id); if (memcg) { if (!mem_cgroup_is_root(memcg)) { - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->swap, nr_pages); - else + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); + else + page_counter_uncharge(&memcg->swap, nr_pages); } mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); mem_cgroup_id_put_many(memcg, nr_pages); From 17f02f21d6e1cc0d9abe14659761c471fdae73ea Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Sep 2022 09:57:04 -0400 Subject: [PATCH 313/352] mm: memcontrol: drop dead CONFIG_MEMCG_SWAP config symbol Since 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control"), CONFIG_MEMCG_SWAP hasn't been a user-visible config option anymore, it just means CONFIG_MEMCG && CONFIG_SWAP. Update the sites accordingly and drop the symbol. [ While touching the docs, remove two references to CONFIG_MEMCG_KMEM, which hasn't been a user-visible symbol for over half a decade. ] Link: https://lkml.kernel.org/r/20220926135704.400818-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Hugh Dickins Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v1/memory.rst | 4 +--- arch/mips/configs/db1xxx_defconfig | 1 - arch/mips/configs/generic_defconfig | 1 - arch/powerpc/configs/powernv_defconfig | 1 - arch/powerpc/configs/pseries_defconfig | 1 - arch/sh/configs/sdk7786_defconfig | 1 - arch/sh/configs/urquell_defconfig | 1 - include/linux/swap.h | 2 +- include/linux/swap_cgroup.h | 4 ++-- init/Kconfig | 5 ----- mm/Makefile | 4 +++- mm/memcontrol.c | 6 +++--- tools/testing/selftests/cgroup/config | 1 - 13 files changed, 10 insertions(+), 22 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 2cc502a75ef64..5b86245450bdc 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -299,7 +299,7 @@ Per-node-per-memcgroup LRU (cgroup's private LRU) is guarded by lruvec->lru_lock; PG_lru bit of page->flags is cleared before isolating a page from its LRU under lruvec->lru_lock. -2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) +2.7 Kernel Memory Extension ----------------------------------------------- With the Kernel memory extension, the Memory Controller is able to limit @@ -386,8 +386,6 @@ U != 0, K >= U: a. Enable CONFIG_CGROUPS b. Enable CONFIG_MEMCG -c. Enable CONFIG_MEMCG_SWAP (to use swap extension) -d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) 3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) ------------------------------------------------------------------- diff --git a/arch/mips/configs/db1xxx_defconfig b/arch/mips/configs/db1xxx_defconfig index b8bd663009969..83cbdecb27e6a 100644 --- a/arch/mips/configs/db1xxx_defconfig +++ b/arch/mips/configs/db1xxx_defconfig @@ -9,7 +9,6 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=16 CONFIG_CGROUPS=y CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y CONFIG_BLK_CGROUP=y CONFIG_CGROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y diff --git a/arch/mips/configs/generic_defconfig b/arch/mips/configs/generic_defconfig index 714169e411cf0..48e4e251779b6 100644 --- a/arch/mips/configs/generic_defconfig +++ b/arch/mips/configs/generic_defconfig @@ -3,7 +3,6 @@ CONFIG_NO_HZ_IDLE=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y CONFIG_BLK_CGROUP=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index 49f49c2639350..4acca52634048 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -17,7 +17,6 @@ CONFIG_LOG_CPU_MAX_BUF_SHIFT=13 CONFIG_NUMA_BALANCING=y CONFIG_CGROUPS=y CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y CONFIG_CGROUP_SCHED=y CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index b571d084c148b..fead14ebb1fce 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -16,7 +16,6 @@ CONFIG_LOG_CPU_MAX_BUF_SHIFT=13 CONFIG_NUMA_BALANCING=y CONFIG_CGROUPS=y CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y CONFIG_CGROUP_SCHED=y CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig index a8662b6927ec7..97b7356639ed8 100644 --- a/arch/sh/configs/sdk7786_defconfig +++ b/arch/sh/configs/sdk7786_defconfig @@ -16,7 +16,6 @@ CONFIG_CPUSETS=y # CONFIG_PROC_PID_CPUSET is not set CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_MEMCG=y -CONFIG_CGROUP_MEMCG_SWAP=y CONFIG_CGROUP_SCHED=y CONFIG_RT_GROUP_SCHED=y CONFIG_BLK_CGROUP=y diff --git a/arch/sh/configs/urquell_defconfig b/arch/sh/configs/urquell_defconfig index cb2f56468fe02..be478f3148f2d 100644 --- a/arch/sh/configs/urquell_defconfig +++ b/arch/sh/configs/urquell_defconfig @@ -14,7 +14,6 @@ CONFIG_CPUSETS=y # CONFIG_PROC_PID_CPUSET is not set CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_MEMCG=y -CONFIG_CGROUP_MEMCG_SWAP=y CONFIG_CGROUP_SCHED=y CONFIG_RT_GROUP_SCHED=y CONFIG_BLK_DEV_INITRD=y diff --git a/include/linux/swap.h b/include/linux/swap.h index fc8d98660326f..a18cf4b7c724c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -666,7 +666,7 @@ static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) cgroup_throttle_swaprate(&folio->page, gfp); } -#ifdef CONFIG_MEMCG_SWAP +#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry); int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); static inline int mem_cgroup_try_charge_swap(struct folio *folio, diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h index a12dd1c3966c9..ae73a87775b3a 100644 --- a/include/linux/swap_cgroup.h +++ b/include/linux/swap_cgroup.h @@ -4,7 +4,7 @@ #include -#ifdef CONFIG_MEMCG_SWAP +#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new); @@ -40,6 +40,6 @@ static inline void swap_cgroup_swapoff(int type) return; } -#endif /* CONFIG_MEMCG_SWAP */ +#endif #endif /* __LINUX_SWAP_CGROUP_H */ diff --git a/init/Kconfig b/init/Kconfig index 532362fcfe31f..7d86cf6b3012d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -958,11 +958,6 @@ config MEMCG help Provides control over the memory footprint of tasks in a cgroup. -config MEMCG_SWAP - bool - depends on MEMCG && SWAP - default y - config MEMCG_KMEM bool depends on MEMCG && !SLOB diff --git a/mm/Makefile b/mm/Makefile index cc23b00525848..8e105e5b3e293 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -98,7 +98,9 @@ obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o -obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o +ifdef CONFIG_SWAP +obj-$(CONFIG_MEMCG) += swap_cgroup.o +endif obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_GUP_TEST) += gup_test.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 76bb0a18a2f3d..61e05fc281fb9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3423,7 +3423,7 @@ void split_page_memcg(struct page *head, unsigned int nr) css_get_many(&memcg->css, nr - 1); } -#ifdef CONFIG_MEMCG_SWAP +#ifdef CONFIG_SWAP /** * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. * @entry: swap entry to be moved @@ -7296,7 +7296,7 @@ static int __init mem_cgroup_init(void) } subsys_initcall(mem_cgroup_init); -#ifdef CONFIG_MEMCG_SWAP +#ifdef CONFIG_SWAP static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) { while (!refcount_inc_not_zero(&memcg->id.ref)) { @@ -7788,4 +7788,4 @@ static int __init mem_cgroup_swap_init(void) } subsys_initcall(mem_cgroup_swap_init); -#endif /* CONFIG_MEMCG_SWAP */ +#endif /* CONFIG_SWAP */ diff --git a/tools/testing/selftests/cgroup/config b/tools/testing/selftests/cgroup/config index 84fe884fad867..97d549ee894fa 100644 --- a/tools/testing/selftests/cgroup/config +++ b/tools/testing/selftests/cgroup/config @@ -4,5 +4,4 @@ CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y CONFIG_MEMCG=y CONFIG_MEMCG_KMEM=y -CONFIG_MEMCG_SWAP=y CONFIG_PAGE_COUNTER=y From 546f08cb8c466b8500acf9eb4a7ab7fb8a7a2ac3 Mon Sep 17 00:00:00 2001 From: Lin Yujun Date: Wed, 14 Sep 2022 11:37:55 +0800 Subject: [PATCH 314/352] ACPI: HMAT: release platform device in case of platform_device_add_data() failure The platform device is not released when platform_device_add_data() fails. And platform_device_put() performs one more pointer check than put_device() to check for errors in the 'pdev' pointer. Use platform_device_put() to release platform device in platform_device_add()/platform_device_add_data()/ platform_device_add_resources() error case. Link: https://lkml.kernel.org/r/20220914033755.99924-1-linyujun809@huawei.com Fixes: c01044cc8191 ("ACPI: HMAT: refactor hmat_register_target_device to hmem_register_device") Signed-off-by: Lin Yujun Cc: Dan Williams Cc: Dave Jiang Cc: Joao Martins Cc: Vishal Verma Signed-off-by: Andrew Morton --- drivers/dax/hmem/device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c index cb6401c9e9a4f..f87ae005431a5 100644 --- a/drivers/dax/hmem/device.c +++ b/drivers/dax/hmem/device.c @@ -47,7 +47,7 @@ void hmem_register_device(int target_nid, struct resource *r) rc = platform_device_add_data(pdev, &info, sizeof(info)); if (rc < 0) { pr_err("hmem memregion_info allocation failure for %pr\n", &res); - goto out_pdev; + goto out_resource; } rc = platform_device_add_resources(pdev, &res, 1); @@ -65,7 +65,7 @@ void hmem_register_device(int target_nid, struct resource *r) return; out_resource: - put_device(&pdev->dev); + platform_device_put(pdev); out_pdev: memregion_free(id); } From 016f66a9251cc3fa83f00dfd920728242bb1ad2c Mon Sep 17 00:00:00 2001 From: Xiaoke Wang Date: Fri, 4 Mar 2022 17:12:15 +0800 Subject: [PATCH 315/352] lib/test_meminit: add checks for the allocation functions alloc_pages(), kmalloc() and vmalloc() are all memory allocation functions which can return NULL when some internal memory failures happen. So it is better to check the return of them to catch the failure in time for better test them. Link: https://lkml.kernel.org/r/tencent_D44A49FFB420EDCCBFB9221C8D14DFE12908@qq.com Signed-off-by: Xiaoke Wang Reviewed-by: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- lib/test_meminit.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/lib/test_meminit.c b/lib/test_meminit.c index c95db11a69064..60e1984c060fa 100644 --- a/lib/test_meminit.c +++ b/lib/test_meminit.c @@ -67,17 +67,24 @@ static int __init do_alloc_pages_order(int order, int *total_failures) size_t size = PAGE_SIZE << order; page = alloc_pages(GFP_KERNEL, order); + if (!page) + goto err; buf = page_address(page); fill_with_garbage(buf, size); __free_pages(page, order); page = alloc_pages(GFP_KERNEL, order); + if (!page) + goto err; buf = page_address(page); if (count_nonzero_bytes(buf, size)) (*total_failures)++; fill_with_garbage(buf, size); __free_pages(page, order); return 1; +err: + (*total_failures)++; + return 1; } /* Test the page allocator by calling alloc_pages with different orders. */ @@ -100,15 +107,22 @@ static int __init do_kmalloc_size(size_t size, int *total_failures) void *buf; buf = kmalloc(size, GFP_KERNEL); + if (!buf) + goto err; fill_with_garbage(buf, size); kfree(buf); buf = kmalloc(size, GFP_KERNEL); + if (!buf) + goto err; if (count_nonzero_bytes(buf, size)) (*total_failures)++; fill_with_garbage(buf, size); kfree(buf); return 1; +err: + (*total_failures)++; + return 1; } /* Test vmalloc() with given parameters. */ @@ -117,15 +131,22 @@ static int __init do_vmalloc_size(size_t size, int *total_failures) void *buf; buf = vmalloc(size); + if (!buf) + goto err; fill_with_garbage(buf, size); vfree(buf); buf = vmalloc(size); + if (!buf) + goto err; if (count_nonzero_bytes(buf, size)) (*total_failures)++; fill_with_garbage(buf, size); vfree(buf); return 1; +err: + (*total_failures)++; + return 1; } /* Test kmalloc()/vmalloc() by allocating objects of different sizes. */ From a3ce7684ea097e9964cfb7d267eff021bd19fe91 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 26 Sep 2022 20:08:47 +0200 Subject: [PATCH 316/352] kasan: fix array-bounds warnings in tests GCC's -Warray-bounds option detects out-of-bounds accesses to statically-sized allocations in krealloc out-of-bounds tests. Use OPTIMIZER_HIDE_VAR to suppress the warning. Also change kmalloc_memmove_invalid_size to use OPTIMIZER_HIDE_VAR instead of a volatile variable. Link: https://lkml.kernel.org/r/e94399242d32e00bba6fd0d9ec4c897f188128e8.1664215688.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reported-by: kernel test robot Reviewed-by: Kees Cook Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- mm/kasan/kasan_test.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index f25692def7813..57e4c72aa8bd2 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -295,6 +295,9 @@ static void krealloc_more_oob_helper(struct kunit *test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + /* Suppress -Warray-bounds warnings. */ + OPTIMIZER_HIDE_VAR(ptr2); + /* All offsets up to size2 must be accessible. */ ptr2[size1 - 1] = 'x'; ptr2[size1] = 'x'; @@ -327,6 +330,9 @@ static void krealloc_less_oob_helper(struct kunit *test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + /* Suppress -Warray-bounds warnings. */ + OPTIMIZER_HIDE_VAR(ptr2); + /* Must be accessible for all modes. */ ptr2[size2 - 1] = 'x'; @@ -540,13 +546,14 @@ static void kmalloc_memmove_invalid_size(struct kunit *test) { char *ptr; size_t size = 64; - volatile size_t invalid_size = size; + size_t invalid_size = size; ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); memset((char *)ptr, 0, 64); OPTIMIZER_HIDE_VAR(ptr); + OPTIMIZER_HIDE_VAR(invalid_size); KUNIT_EXPECT_KASAN_FAIL(test, memmove((char *)ptr, (char *)ptr + 4, invalid_size)); kfree(ptr); From dd406180a11325c4554a016530eb26c9534179b4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 2 Aug 2022 12:28:11 -0400 Subject: [PATCH 317/352] mm: vmscan: fix extreme overreclaim and swap floods During proactive reclaim, we sometimes observe severe overreclaim, with several thousand times more pages reclaimed than requested. This trace was obtained from shrink_lruvec() during such an instance: prio:0 anon_cost:1141521 file_cost:7767 nr_reclaimed:4387406 nr_to_reclaim:1047 (or_factor:4190) nr=[7161123 345 578 1111] While he reclaimer requested 4M, vmscan reclaimed close to 16G, most of it by swapping. These requests take over a minute, during which the write() to memory.reclaim is unkillably stuck inside the kernel. Digging into the source, this is caused by the proportional reclaim bailout logic. This code tries to resolve a fundamental conflict: to reclaim roughly what was requested, while also aging all LRUs fairly and in accordance to their size, swappiness, refault rates etc. The way it attempts fairness is that once the reclaim goal has been reached, it stops scanning the LRUs with the smaller remaining scan targets, and adjusts the remainder of the bigger LRUs according to how much of the smaller LRUs was scanned. It then finishes scanning that remainder regardless of the reclaim goal. This works fine if priority levels are low and the LRU lists are comparable in size. However, in this instance, the cgroup that is targeted by proactive reclaim has almost no files left - they've already been squeezed out by proactive reclaim earlier - and the remaining anon pages are hot. Anon rotations cause the priority level to drop to 0, which results in reclaim targeting all of anon (a lot) and all of file (almost nothing). By the time reclaim decides to bail, it has scanned most or all of the file target, and therefor must also scan most or all of the enormous anon target. This target is thousands of times larger than the reclaim goal, thus causing the overreclaim. The bailout code hasn't changed in years, why is this failing now? The most likely explanations are two other recent changes in anon reclaim: 1. Before the series starting with commit 5df741963d52 ("mm: fix LRU balancing effect of new transparent huge pages"), the VM was overall relatively reluctant to swap at all, even if swap was configured. This means the LRU balancing code didn't come into play as often as it does now, and mostly in high pressure situations where pronounced swap activity wouldn't be as surprising. 2. For historic reasons, shrink_lruvec() loops on the scan targets of all LRU lists except the active anon one, meaning it would bail if the only remaining pages to scan were active anon - even if there were a lot of them. Before the series starting with commit ccc5dc67340c ("mm/vmscan: make active/inactive ratio as 1:1 for anon lru"), most anon pages would live on the active LRU; the inactive one would contain only a handful of preselected reclaim candidates. After the series, anon gets aged similarly to file, and the inactive list is the default for new anon pages as well, making it often the much bigger list. As a result, the VM is now more likely to actually finish large anon targets than before. Change the code such that only one SWAP_CLUSTER_MAX-sized nudge toward the larger LRU lists is made before bailing out on a met reclaim goal. This fixes the extreme overreclaim problem. Fairness is more subtle and harder to evaluate. No obvious misbehavior was observed on the test workload, in any case. Conceptually, fairness should primarily be a cumulative effect from regular, lower priority scans. Once the VM is in trouble and needs to escalate scan targets to make forward progress, fairness needs to take a backseat. This is also acknowledged by the myriad exceptions in get_scan_count(). This patch makes fairness decrease gradually, as it keeps fairness work static over increasing priority levels with growing scan targets. This should make more sense - although we may have to re-visit the exact values. Link: https://lkml.kernel.org/r/20220802162811.39216-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Hugh Dickins Cc: Joonsoo Kim Cc: Signed-off-by: Andrew Morton --- mm/vmscan.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index c5a4bff11da69..a8fd6300fa7e7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5847,8 +5847,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) enum lru_list lru; unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; + bool proportional_reclaim; struct blk_plug plug; - bool scan_adjusted; if (lru_gen_enabled()) { lru_gen_shrink_lruvec(lruvec, sc); @@ -5871,8 +5871,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) * abort proportional reclaim if either the file or anon lru has already * dropped to zero at the first pass. */ - scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() && - sc->priority == DEF_PRIORITY); + proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() && + sc->priority == DEF_PRIORITY); blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || @@ -5892,7 +5892,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) cond_resched(); - if (nr_reclaimed < nr_to_reclaim || scan_adjusted) + if (nr_reclaimed < nr_to_reclaim || proportional_reclaim) continue; /* @@ -5943,8 +5943,6 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) nr_scanned = targets[lru] - nr[lru]; nr[lru] = targets[lru] * (100 - percentage) / 100; nr[lru] -= min(nr[lru], nr_scanned); - - scan_adjusted = true; } blk_finish_plug(&plug); sc->nr_reclaimed += nr_reclaimed; From 42dd3a50c4489eda8cfffb014d80700e2efbc2a3 Mon Sep 17 00:00:00 2001 From: Gang He Date: Mon, 26 Sep 2022 19:59:25 -0700 Subject: [PATCH 318/352] ocfs2: reflink deadlock when clone file to the same directory simultaneously Running reflink from multiple nodes simultaneously to clone a file to the same directory probably triggers a deadlock issue. For example, there is a three node ocfs2 cluster, each node mounts the ocfs2 file system to /mnt/shared, and run the reflink command from each node repeatedly, like reflink "/mnt/shared/test" \ "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`" then, reflink command process will be hung on each node, and you can't list this file system directory. The problematic reflink command process is blocked at one node, task:reflink state:D stack: 0 pid: 1283 ppid: 4154 Call Trace: __schedule+0x2fd/0x750 schedule+0x2f/0xa0 schedule_timeout+0x1cc/0x310 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0e3e000 wait_for_completion+0xba/0x140 ? wake_up_q+0xa0/0xa0 __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2] ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2] ocfs2_reflink+0x436/0x4c0 [ocfs2] ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_ioctl+0x25e/0x670 [ocfs2] do_vfs_ioctl+0xa0/0x680 ksys_ioctl+0x70/0x80 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0x1e0 The other reflink command processes are blocked at other nodes, task:reflink state:D stack: 0 pid:29759 ppid: 4088 Call Trace: __schedule+0x2fd/0x750 schedule+0x2f/0xa0 schedule_timeout+0x1cc/0x310 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0b19000 wait_for_completion+0xba/0x140 ? wake_up_q+0xa0/0xa0 __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2] ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_mv_orphaned_inode_to_new+0x87/0x7e0 [ocfs2] ocfs2_reflink+0x335/0x4c0 [ocfs2] ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_ioctl+0x25e/0x670 [ocfs2] do_vfs_ioctl+0xa0/0x680 ksys_ioctl+0x70/0x80 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0x1e0 or task:reflink state:D stack: 0 pid:18465 ppid: 4156 Call Trace: __schedule+0x302/0x940 ? usleep_range+0x80/0x80 schedule+0x46/0xb0 schedule_timeout+0xff/0x140 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0c3b000 __wait_for_common+0xb9/0x170 __ocfs2_cluster_lock.constprop.0+0x1d6/0x860 [ocfs2] ? ocfs2_wait_for_recovery+0x49/0xd0 [ocfs2] ? ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2] ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2] ocfs2_inode_lock_tracker+0xf2/0x2b0 [ocfs2] ? dput+0x32/0x2f0 ocfs2_permission+0x45/0xe0 [ocfs2] inode_permission+0xcc/0x170 link_path_walk.part.0.constprop.0+0x2a2/0x380 ? path_init+0x2c1/0x3f0 path_parentat+0x3c/0x90 filename_parentat+0xc1/0x1d0 ? filename_lookup+0x138/0x1c0 filename_create+0x43/0x160 ocfs2_reflink_ioctl+0xe6/0x380 [ocfs2] ocfs2_ioctl+0x1ea/0x2c0 [ocfs2] ? do_sys_openat2+0x81/0x150 __x64_sys_ioctl+0x82/0xb0 do_syscall_64+0x61/0xb0 The deadlock is caused by multiple acquiring the destination directory inode dlm lock in ocfs2_reflink function, we should acquire this directory inode dlm lock at the beginning, and hold this dlm lock until end of the function. Link: https://lkml.kernel.org/r/20210729110230.18983-1-ghe@suse.com Signed-off-by: Gang He Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/namei.c | 32 +++++++++++++------------------- fs/ocfs2/namei.h | 2 ++ fs/ocfs2/refcounttree.c | 15 +++++++++++---- fs/ocfs2/xattr.c | 12 +----------- fs/ocfs2/xattr.h | 1 + 5 files changed, 28 insertions(+), 34 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 961d1cf54388e..adc8b81e59bd3 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2490,6 +2490,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir, } int ocfs2_create_inode_in_orphan(struct inode *dir, + struct buffer_head **dir_bh, int mode, struct inode **new_inode) { @@ -2598,13 +2599,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, brelse(new_di_bh); - if (!status) - *new_inode = inode; - ocfs2_free_dir_lookup_result(&orphan_insert); - ocfs2_inode_unlock(dir, 1); - brelse(parent_di_bh); + if (!status) { + *new_inode = inode; + *dir_bh = parent_di_bh; + } else { + ocfs2_inode_unlock(dir, 1); + brelse(parent_di_bh); + } + return status; } @@ -2761,11 +2765,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, } int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, struct dentry *dentry) { int status = 0; - struct buffer_head *parent_di_bh = NULL; handle_t *handle = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_dinode *dir_di, *di; @@ -2779,14 +2783,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, (unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_inode_lock(dir, &parent_di_bh, 1); - if (status < 0) { - if (status != -ENOENT) - mlog_errno(status); - return status; - } - - dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data; + dir_di = (struct ocfs2_dinode *) dir_bh->b_data; if (!dir_di->i_links_count) { /* can't make a file in a deleted directory. */ status = -ENOENT; @@ -2799,7 +2796,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto leave; /* get a spot inside the dir. */ - status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh, + status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh, dentry->d_name.name, dentry->d_name.len, &lookup); if (status < 0) { @@ -2863,7 +2860,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, ocfs2_journal_dirty(handle, di_bh); status = ocfs2_add_entry(handle, dentry, inode, - OCFS2_I(inode)->ip_blkno, parent_di_bh, + OCFS2_I(inode)->ip_blkno, dir_bh, &lookup); if (status < 0) { mlog_errno(status); @@ -2887,10 +2884,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, iput(orphan_dir_inode); leave: - ocfs2_inode_unlock(dir, 1); - brelse(di_bh); - brelse(parent_di_bh); brelse(orphan_dir_bh); ocfs2_free_dir_lookup_result(&lookup); diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index 9cc891eb874e0..03a2c526e2c1b 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, struct buffer_head *orphan_dir_bh, bool dio); int ocfs2_create_inode_in_orphan(struct inode *dir, + struct buffer_head **dir_bh, int mode, struct inode **new_inode); int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *di_bh, int update_isize, loff_t end); int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct buffer_head *dir_bh, struct inode *new_inode, struct dentry *new_dentry); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 1358981e80a36..b2ebaec86ddca 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4222,7 +4222,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, { int error, had_lock; struct inode *inode = d_inode(old_dentry); - struct buffer_head *old_bh = NULL; + struct buffer_head *old_bh = NULL, *dir_bh = NULL; struct inode *new_orphan_inode = NULL; struct ocfs2_lock_holder oh; @@ -4230,7 +4230,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, return -EOPNOTSUPP; - error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, + error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode, &new_orphan_inode); if (error) { mlog_errno(error); @@ -4276,13 +4276,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { - error = ocfs2_init_security_and_acl(dir, new_orphan_inode, + error = ocfs2_init_security_and_acl(dir, dir_bh, + new_orphan_inode, &new_dentry->d_name); if (error) mlog_errno(error); } if (!error) { - error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, + error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh, + new_orphan_inode, new_dentry); if (error) mlog_errno(error); @@ -4300,6 +4302,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, iput(new_orphan_inode); } + if (dir_bh) { + ocfs2_inode_unlock(dir, 1); + brelse(dir_bh); + } + return error; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 95d0611c5fc7d..3f23e3a5018ce 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, /* * Initialize security and acl for a already created inode. * Used for reflink a non-preserve-security file. - * - * It uses common api like ocfs2_xattr_set, so the caller - * must not hold any lock expect i_rwsem. */ int ocfs2_init_security_and_acl(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, const struct qstr *qstr) { int ret = 0; - struct buffer_head *dir_bh = NULL; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (ret) { @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir, goto leave; } - ret = ocfs2_inode_lock(dir, &dir_bh, 0); - if (ret) { - mlog_errno(ret); - goto leave; - } ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); if (ret) mlog_errno(ret); - ocfs2_inode_unlock(dir, 0); - brelse(dir_bh); leave: return ret; } diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 00308b57f64f1..b27fd8ba00196 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, struct buffer_head *new_bh, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, const struct qstr *qstr); #endif /* OCFS2_XATTR_H */ From 1b2855130cbd4e8b90f22cb55e5e6ed4d1ca9540 Mon Sep 17 00:00:00 2001 From: Wangyan Date: Mon, 26 Sep 2022 19:59:25 -0700 Subject: [PATCH 319/352] ocfs2: clear links count in ocfs2_mknod() if an error occurs In this condition, the inode can not be wiped when error happened. ocfs2_mkdir() ->ocfs2_mknod() ->ocfs2_mknod_locked() ->__ocfs2_mknod_locked() ->ocfs2_set_links_count() // i_links_count is 2 -> ... // an error accrue, goto roll_back or leave. ->ocfs2_commit_trans() ->iput(inode) ->evict() ->ocfs2_evict_inode() ->ocfs2_delete_inode() ->ocfs2_inode_lock() ->ocfs2_inode_lock_update() ->ocfs2_refresh_inode() ->set_nlink(); // inode->i_nlink is 2 now. /* if wipe is 0, it will goto bail_unlock_inode */ ->ocfs2_query_inode_wipe() ->if (inode->i_nlink) return; // wipe is 0. /* inode can not be wiped */ ->ocfs2_wipe_inode() So, we need clear links before the transaction committed. Link: http://lkml.kernel.org/r/d8147c41-fb2b-bdf7-b660-1f3c8448c33f@huawei.com Signed-off-by: Yan Wang Reviewed-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/namei.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index adc8b81e59bd3..5e2b04dec2e6a 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -454,8 +454,12 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns, leave: if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && new_fe_bh != NULL) + ocfs2_set_links_count((struct ocfs2_dinode *) + new_fe_bh->b_data, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) @@ -599,6 +603,8 @@ static int __ocfs2_mknod_locked(struct inode *dir, leave: if (status < 0) { if (*new_fe_bh) { + if (fe) + ocfs2_set_links_count(fe, 0); brelse(*new_fe_bh); *new_fe_bh = NULL; } @@ -2028,8 +2034,12 @@ static int ocfs2_symlink(struct user_namespace *mnt_userns, ocfs2_clusters_to_bytes(osb->sb, 1)); if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && new_fe_bh != NULL) + ocfs2_set_links_count((struct ocfs2_dinode *) + new_fe_bh->b_data, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) From 43baeae1b8be30755f193fd2c359398858fada09 Mon Sep 17 00:00:00 2001 From: Wangyan Date: Mon, 26 Sep 2022 19:59:25 -0700 Subject: [PATCH 320/352] ocfs2: fix ocfs2 corrupt when iputting an inode In this condition, it will cause an bug on error. ocfs2_mkdir() ->ocfs2_mknod() ->ocfs2_mknod_locked() ->__ocfs2_mknod_locked() //Assume inode->i_generation is genN. ->inode->i_generation = osb->s_next_generation++; // The inode lockres has been initialized. ->ocfs2_populate_inode() ->ocfs2_create_new_inode_locks() ->An error happened, returned value is non-zero // free the start_bit x in bg_blkno ->ocfs2_free_suballoc_bits() ->... /* Another process execute mkdir success in this place, and it occupied the start_bit x in bg_blkno which has been freed before. Its inode->i_generation is genN + 1 */ ->iput(inode) ->evict() ->ocfs2_evict_inode() ->ocfs2_delete_inode() ->ocfs2_inode_lock() ->ocfs2_inode_lock_update() /* Bug on here, genN != genN + 1 */ ->mlog_bug_on_msg(inode->i_generation != le32_to_cpu(fe->i_generation)) So, we need not to reclaim the inode when the inode->ip_inode_lockres has been initialized. It will be freed in iput(). Link: http://lkml.kernel.org/r/ef080ca3-5d74-e276-17a1-d9e7c7e662c9@huawei.com Fixes: b1529a41f777 ("ocfs2: should reclaim the inode if '__ocfs2_mknod_locked' returns an error") Signed-off-by: Yan Wang Reviewed-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 5e2b04dec2e6a..b98e1531dfcb5 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -641,7 +641,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, parent_fe_bh, handle, inode_ac, fe_blkno, suballoc_loc, suballoc_bit); - if (status < 0) { + if (status < 0 && !(OCFS2_I(inode)->ip_inode_lockres.l_flags & + OCFS2_LOCK_INITIALIZED)) { u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit); int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode, inode_ac->ac_bh, suballoc_bit, bg_blkno, 1); From 842ac8e86cb44a539768ead8ad8eb48ca3e71e1e Mon Sep 17 00:00:00 2001 From: Andrew Halaney Date: Mon, 26 Sep 2022 19:59:25 -0700 Subject: [PATCH 321/352] init/main.c: silence some -Wunused-parameter warnings There are a bunch of callbacks with unused arguments, go ahead and silence those so "make KCFLAGS=-W init/main.o" is a little quieter. Here's a little sample: init/main.c:182:43: warning: unused parameter 'str' [-Wunused-parameter] static int __init set_reset_devices(char *str) Link: https://lkml.kernel.org/r/20210519162341.1275452-1-ahalaney@redhat.com Signed-off-by: Andrew Halaney Cc: Rasmus Villemoes Signed-off-by: Andrew Morton --- init/main.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/init/main.c b/init/main.c index a45f9eca40af0..24f9badf78b6d 100644 --- a/init/main.c +++ b/init/main.c @@ -182,7 +182,7 @@ EXPORT_SYMBOL_GPL(static_key_initialized); unsigned int reset_devices; EXPORT_SYMBOL(reset_devices); -static int __init set_reset_devices(char *str) +static int __init set_reset_devices(char *str __always_unused) { reset_devices = 1; return 1; @@ -232,13 +232,13 @@ static bool __init obsolete_checksetup(char *line) unsigned long loops_per_jiffy = (1<<12); EXPORT_SYMBOL(loops_per_jiffy); -static int __init debug_kernel(char *str) +static int __init debug_kernel(char *str __always_unused) { console_loglevel = CONSOLE_LOGLEVEL_DEBUG; return 0; } -static int __init quiet_kernel(char *str) +static int __init quiet_kernel(char *str __always_unused) { console_loglevel = CONSOLE_LOGLEVEL_QUIET; return 0; @@ -475,7 +475,7 @@ static void __init setup_boot_config(void) get_boot_config_from_initrd(NULL); } -static int __init warn_bootconfig(char *str) +static int __init warn_bootconfig(char *str __always_unused) { pr_warn("WARNING: 'bootconfig' found on the kernel command line but CONFIG_BOOT_CONFIG is not set.\n"); return 0; @@ -504,7 +504,8 @@ static void __init repair_env_string(char *param, char *val) /* Anything after -- gets handed straight to init. */ static int __init set_init_arg(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { unsigned int i; @@ -529,7 +530,8 @@ static int __init set_init_arg(char *param, char *val, * unused parameters (modprobe will find them in /proc/cmdline). */ static int __init unknown_bootoption(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { size_t len = strlen(param); @@ -729,7 +731,8 @@ noinline void __ref rest_init(void) /* Check for early params. */ static int __init do_early_param(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { const struct obs_kernel_param *p; @@ -1348,8 +1351,10 @@ static const char *initcall_level_names[] __initdata = { "late", }; -static int __init ignore_unknown_bootoption(char *param, char *val, - const char *unused, void *arg) +static int __init ignore_unknown_bootoption(char *param __always_unused, + char *val __always_unused, + const char *unused __always_unused, + void *arg __always_unused) { return 0; } @@ -1500,7 +1505,7 @@ void __weak free_initmem(void) free_initmem_default(POISON_FREE_INITMEM); } -static int __ref kernel_init(void *unused) +static int __ref kernel_init(void *unused __always_unused) { int ret; From 78e26af38ee499fc53d2597a5e581a62e46a2c7d Mon Sep 17 00:00:00 2001 From: wuchi Date: Sat, 11 Jun 2022 21:06:34 +0800 Subject: [PATCH 322/352] lib/debugobjects: fix stat count and optimize debug_objects_mem_init. 1. Var debug_objects_allocated tracks valid kmem_cache_alloc calls, so track it in debug_objects_replace_static_objects. Do similar things in object_cpu_offline. 2. In debug_objects_mem_init, there is no need to call function cpuhp_setup_state_nocalls when debug_objects_enabled = 0 (out of memory). Link: https://lkml.kernel.org/r/20220611130634.99741-1-wuchi.zero@gmail.com Signed-off-by: wuchi Cc: Thomas Gleixner Cc: Christoph Hellwig Cc: Kees Cook Cc: Waiman Long Signed-off-by: Andrew Morton --- lib/debugobjects.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 337d797a71416..6f8e5dd1dcd0c 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -437,6 +437,7 @@ static int object_cpu_offline(unsigned int cpu) struct debug_percpu_free *percpu_pool; struct hlist_node *tmp; struct debug_obj *obj; + unsigned long flags; /* Remote access is safe as the CPU is dead already */ percpu_pool = per_cpu_ptr(&percpu_obj_pool, cpu); @@ -444,6 +445,12 @@ static int object_cpu_offline(unsigned int cpu) hlist_del(&obj->node); kmem_cache_free(obj_cache, obj); } + + raw_spin_lock_irqsave(&pool_lock, flags); + obj_pool_used -= percpu_pool->obj_free; + debug_objects_freed += percpu_pool->obj_free; + raw_spin_unlock_irqrestore(&pool_lock, flags); + percpu_pool->obj_free = 0; return 0; @@ -1318,6 +1325,8 @@ static int __init debug_objects_replace_static_objects(void) hlist_add_head(&obj->node, &objects); } + debug_objects_allocated += i; + /* * debug_objects_mem_init() is now called early that only one CPU is up * and interrupts have been disabled, so it is safe to replace the @@ -1386,6 +1395,7 @@ void __init debug_objects_mem_init(void) debug_objects_enabled = 0; kmem_cache_destroy(obj_cache); pr_warn("out of memory.\n"); + return; } else debug_objects_selftest(); From 3918f174df51161af1948c4101f639125d8ac37d Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 17 Aug 2022 08:03:29 +0000 Subject: [PATCH 323/352] fault-injection: allow stacktrace filter for x86-64 This patchset allow fault injection to run on x86_64 and makes stacktrace filter work as expected. With this, we can test a device driver module with fault injection more easily. This patch (of 4): FAULT_INJECTION_STACKTRACE_FILTER option was apparently disallowed on x86_64 because of problems with the stack unwinder: commit 6d690dcac92a84f98fd774862628ff871b713660 Author: Akinobu Mita Date: Sat May 12 10:36:53 2007 -0700 fault injection: disable stacktrace filter for x86-64 However, there is no problems whatsoever with this today. Let's allow it again. Link: https://lkml.kernel.org/r/20220817080332.1052710-1-weiyongjun1@huawei.com Link: https://lkml.kernel.org/r/20220817080332.1052710-2-weiyongjun1@huawei.com Signed-off-by: Wei Yongjun Cc: Akinobu Mita Cc: Nathan Chancellor Cc: Peter Zijlstra Cc: Kees Cook Cc: Nick Desaulniers Cc: Josh Poimboeuf Cc: Dan Williams Cc: Miguel Ojeda Cc: Isabella Basso Cc: Vlastimil Babka Cc: Rasmus Villemoes Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index bcbe60d6c80c1..e687336562a24 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1944,7 +1944,6 @@ config FAIL_SUNRPC config FAULT_INJECTION_STACKTRACE_FILTER bool "stacktrace filter for fault-injection capabilities" depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT - depends on !X86_64 select STACKTRACE depends on FRAME_POINTER || MIPS || PPC || S390 || MICROBLAZE || ARM || ARC || X86 help From df76fa47b1b5103af6ed8c1b8b7944d6753651d4 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 17 Aug 2022 08:03:30 +0000 Subject: [PATCH 324/352] fault-injection: skip stacktrace filtering by default If FAULT_INJECTION_STACKTRACE_FILTER is enabled, the depth is default to 32. This means fail_stacktrace() will iter each entry's stacktrace, even if filter is not configured. This patch changes to quick return from fail_stacktrace() if stacktrace filter is not set. Link: https://lkml.kernel.org/r/20220817080332.1052710-3-weiyongjun1@huawei.com Signed-off-by: Wei Yongjun Cc: Akinobu Mita Cc: Dan Williams Cc: Isabella Basso Cc: Josh Poimboeuf Cc: Kees Cook Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/fault-inject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 423784d9c058e..515fc5aaf0323 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -74,7 +74,7 @@ static bool fail_stacktrace(struct fault_attr *attr) int n, nr_entries; bool found = (attr->require_start == 0 && attr->require_end == ULONG_MAX); - if (depth == 0) + if (depth == 0 || (found && !attr->reject_start && !attr->reject_end)) return found; nr_entries = stack_trace_save(entries, depth, 1); From 9dca55cf1ea186a58bfffc9e30300a433bd5c8b9 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 17 Aug 2022 08:03:31 +0000 Subject: [PATCH 325/352] fault-injection: make some stack filter attrs more readable Attributes of stack filter are show as unsigned decimal, such as 'require-start', 'require-end'. This patch change to show them as unsigned hexadecimal for more readable. Before: $ echo 0xffffffffc0257000 > /sys/kernel/debug/failslab/require-start $ cat /sys/kernel/debug/failslab/require-start 18446744072638263296 After: $ echo 0xffffffffc0257000 > /sys/kernel/debug/failslab/require-start $ cat /sys/kernel/debug/failslab/require-start 0xffffffffc0257000 Link: https://lkml.kernel.org/r/20220817080332.1052710-4-weiyongjun1@huawei.com Signed-off-by: Wei Yongjun Cc: Akinobu Mita Cc: Dan Williams Cc: Isabella Basso Cc: Josh Poimboeuf Cc: Kees Cook Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/fault-inject.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 515fc5aaf0323..deca05e7c9b30 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -179,6 +179,14 @@ static void debugfs_create_ul(const char *name, umode_t mode, #ifdef CONFIG_FAULT_INJECTION_STACKTRACE_FILTER +DEFINE_SIMPLE_ATTRIBUTE(fops_xl, debugfs_ul_get, debugfs_ul_set, "0x%llx\n"); + +static void debugfs_create_xl(const char *name, umode_t mode, + struct dentry *parent, unsigned long *value) +{ + debugfs_create_file(name, mode, parent, value, &fops_xl); +} + static int debugfs_stacktrace_depth_set(void *data, u64 val) { *(unsigned long *)data = @@ -223,10 +231,10 @@ struct dentry *fault_create_debugfs_attr(const char *name, #ifdef CONFIG_FAULT_INJECTION_STACKTRACE_FILTER debugfs_create_stacktrace_depth("stacktrace-depth", mode, dir, &attr->stacktrace_depth); - debugfs_create_ul("require-start", mode, dir, &attr->require_start); - debugfs_create_ul("require-end", mode, dir, &attr->require_end); - debugfs_create_ul("reject-start", mode, dir, &attr->reject_start); - debugfs_create_ul("reject-end", mode, dir, &attr->reject_end); + debugfs_create_xl("require-start", mode, dir, &attr->require_start); + debugfs_create_xl("require-end", mode, dir, &attr->require_end); + debugfs_create_xl("reject-start", mode, dir, &attr->reject_start); + debugfs_create_xl("reject-end", mode, dir, &attr->reject_end); #endif /* CONFIG_FAULT_INJECTION_STACKTRACE_FILTER */ attr->dname = dget(dir); From f5c0386f47bd73c05ebd8d3d5d3099676e9d4866 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 17 Aug 2022 08:03:32 +0000 Subject: [PATCH 326/352] fault-injection: make stacktrace filter works as expected stacktrace filter is checked after others, such as fail-nth, interval and probability. This make it doesn't work well as expected. Fix to running stacktrace filter before other filters. It will speed up fault inject testing for driver modules. Link: https://lkml.kernel.org/r/20220817080332.1052710-5-weiyongjun1@huawei.com Signed-off-by: Wei Yongjun Cc: Akinobu Mita Cc: Dan Williams Cc: Isabella Basso Cc: Josh Poimboeuf Cc: Kees Cook Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/fault-inject.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/lib/fault-inject.c b/lib/fault-inject.c index deca05e7c9b30..9dd1dd1d26105 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -105,10 +105,16 @@ static inline bool fail_stacktrace(struct fault_attr *attr) bool should_fail(struct fault_attr *attr, ssize_t size) { + bool stack_checked = false; + if (in_task()) { unsigned int fail_nth = READ_ONCE(current->fail_nth); if (fail_nth) { + if (!fail_stacktrace(attr)) + return false; + + stack_checked = true; fail_nth--; WRITE_ONCE(current->fail_nth, fail_nth); if (!fail_nth) @@ -128,6 +134,9 @@ bool should_fail(struct fault_attr *attr, ssize_t size) if (atomic_read(&attr->times) == 0) return false; + if (!stack_checked && !fail_stacktrace(attr)) + return false; + if (atomic_read(&attr->space) > size) { atomic_sub(size, &attr->space); return false; @@ -142,9 +151,6 @@ bool should_fail(struct fault_attr *attr, ssize_t size) if (attr->probability <= prandom_u32() % 100) return false; - if (!fail_stacktrace(attr)) - return false; - fail: fail_dump(attr); From 6da70287dc67726c305c90dbccc9b5ef64d6fd51 Mon Sep 17 00:00:00 2001 From: Dmitrii Bundin Date: Mon, 15 Aug 2022 04:33:17 +0300 Subject: [PATCH 327/352] kbuild: add debug level and macro defs options Add config options to control debug info level and producing of macro definitions for GCC/Clang. Option DEBUG_INFO_LEVEL is responsible for controlling debug info level. Before GCC 11 and Clang 12 -gsplit-dwarf implicitly uses -g2. To provide a way to override the setting with, e.g. -g1, DEBUG_INFO_LEVEL is set independently from DEBUG_INFO_SPLIT. Option DEBUG_MACRO_DEFINITIONS is responsible for controlling inclusion of macro definitions. Since Clang uses -fdebug-macro to control if macro definitions are produced which is different from GCC, provides a compiler-specific way of handling macro inclusion. The option is handled after DEBUG_INFO_LEVEL since -g3 -g2 implies -g2, but -g2 -g3 implies -g3 and GCC uses -g3 to produce macro definitions. Link: https://lkml.kernel.org/r/20220815013317.26121-1-dmitrii.bundin.a@gmail.com Signed-off-by: Dmitrii Bundin Cc: Dan Williams Cc: Eric Dumazet Cc: Fangrui Song Cc: Isabella Basso Cc: Josh Poimboeuf Cc: Kees Cook Cc: Masahiro Yamada Cc: Michal Marek Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Rasmus Villemoes Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 20 ++++++++++++++++++++ scripts/Makefile.debug | 12 ++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e687336562a24..a6155053612e7 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -304,6 +304,26 @@ config DEBUG_INFO_REDUCED DEBUG_INFO build and compile times are reduced too. Only works with newer gcc versions. +config DEBUG_INFO_LEVEL + int "Debug info level" + range 1 3 + default "2" + help + Sets the level of how much debug information to generate (-glevel). + Level 1 produces minimal debug information without including information + about local variables. Level 3 includes extra information like macro + definitions. Setting up level 3 will require significantly more disk + space and increase built time. + +config DEBUG_MACRO_DEFINITIONS + bool "Add macro definitions to debug info" + default n + help + Generates macro definitions to provide a way to expand macros right + in the debugging session. This information can be used with macro expand, + info macro in gdb. This option is equivalent to setting -g3 in GCC and + -fdebug-macro in Clang. + config DEBUG_INFO_COMPRESSED bool "Compressed debugging information" depends on $(cc-option,-gz=zlib) diff --git a/scripts/Makefile.debug b/scripts/Makefile.debug index 9f39b0130551f..29cd04234e75e 100644 --- a/scripts/Makefile.debug +++ b/scripts/Makefile.debug @@ -2,8 +2,6 @@ DEBUG_CFLAGS := ifdef CONFIG_DEBUG_INFO_SPLIT DEBUG_CFLAGS += -gsplit-dwarf -else -DEBUG_CFLAGS += -g endif ifndef CONFIG_AS_IS_LLVM @@ -16,6 +14,16 @@ dwarf-version-$(CONFIG_DEBUG_INFO_DWARF5) := 5 DEBUG_CFLAGS += -gdwarf-$(dwarf-version-y) endif +DEBUG_CFLAGS += -g$(CONFIG_DEBUG_INFO_LEVEL) +ifdef CONFIG_DEBUG_MACRO_DEFINITIONS +ifdef CONFIG_CC_IS_GCC +DEBUG_CFLAGS += -g3 +endif +ifdef CONFIG_CC_IS_CLANG +DEBUG_CFLAGS += -fdebug-macro +endif +endif + ifdef CONFIG_DEBUG_INFO_REDUCED DEBUG_CFLAGS += -fno-var-tracking ifdef CONFIG_CC_IS_GCC From 941cd32443e0ed463936a5a67e49ea58a9a3e41e Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sat, 3 Sep 2022 00:59:36 +0100 Subject: [PATCH 328/352] ocfs2: replace zero-length arrays with DECLARE_FLEX_ARRAY() helper Zero-length arrays are deprecated and we are moving towards adopting C99 flexible-array members, instead. So, replace zero-length array declarations in a couple of structures and unions with the new DECLARE_FLEX_ARRAY() helper macro. This helper allows for a flexible-array member in a union and as only member in a structure. Also, this addresses multiple warnings reported when building with Clang-15 and -Wzero-length-array. Lastly, this will also help memcpy (in a coming hardening update) execute proper bounds-checking on variable length object i_symlink at fs/ocfs2/namei.c:1973: fs/ocfs2/namei.c: 1973 memcpy((char *) fe->id2.i_symlink, symname, l); Link: https://github.com/KSPP/linux/issues/21 Link: https://github.com/KSPP/linux/issues/193 Link: https://github.com/KSPP/linux/issues/197 Link: https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html Link: https://lkml.kernel.org/r/YxKY6O2hmdwNh8r8@work Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/ocfs2_fs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 638d875eccc7d..7aebdbf5cc0a5 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -527,7 +527,7 @@ struct ocfs2_extent_block * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty. */ struct ocfs2_slot_map { -/*00*/ __le16 sm_slots[0]; +/*00*/ DECLARE_FLEX_ARRAY(__le16, sm_slots); /* * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255, * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize. @@ -548,7 +548,7 @@ struct ocfs2_extended_slot { * i_size. */ struct ocfs2_slot_map_extended { -/*00*/ struct ocfs2_extended_slot se_slots[0]; +/*00*/ DECLARE_FLEX_ARRAY(struct ocfs2_extended_slot, se_slots); /* * Actual size is i_size of the slot_map system file. It should * match s_max_slots * sizeof(struct ocfs2_extended_slot) @@ -727,7 +727,7 @@ struct ocfs2_dinode { struct ocfs2_extent_list i_list; struct ocfs2_truncate_log i_dealloc; struct ocfs2_inline_data i_data; - __u8 i_symlink[0]; + DECLARE_FLEX_ARRAY(__u8, i_symlink); } id2; /* Actual on-disk size is one block */ }; @@ -892,7 +892,7 @@ struct ocfs2_group_desc /*30*/ struct ocfs2_block_check bg_check; /* Error checking */ __le64 bg_reserved2; /*40*/ union { - __u8 bg_bitmap[0]; + DECLARE_FLEX_ARRAY(__u8, bg_bitmap); struct { /* * Block groups may be discontiguous when From 0950831145a4f58905cf86d2cd3f6a2870022d38 Mon Sep 17 00:00:00 2001 From: Oleksandr Natalenko Date: Sat, 3 Sep 2022 08:43:30 +0200 Subject: [PATCH 329/352] core_pattern: add CPU specifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Statistically, in a large deployment regular segfaults may indicate a CPU issue. Currently, it is not possible to find out what CPU the segfault happened on. There are at least two attempts to improve segfault logging with this regard, but they do not help in case the logs rotate. Hence, lets make sure it is possible to permanently record a CPU the task ran on using a new core_pattern specifier. Link: https://lkml.kernel.org/r/20220903064330.20772-1-oleksandr@redhat.com Signed-off-by: Oleksandr Natalenko Suggested-by: Renaud Métrich Reviewed-by: Oleg Nesterov Cc: Alexander Viro Cc: "Eric W . Biederman" Cc: Grzegorz Halat Cc: "Guilherme G. Piccoli" Cc: "Huang, Ying" Cc: Jason A. Donenfeld Cc: Joel Savitz Cc: Jonathan Corbet Cc: Kees Cook Cc: Laurent Dufour Cc: Luis Chamberlain Cc: Rob Herring Cc: Stephen Kitt Cc: Will Deacon Cc: Xiaoming Ni Signed-off-by: Andrew Morton --- Documentation/admin-guide/sysctl/kernel.rst | 1 + fs/coredump.c | 5 +++++ include/linux/coredump.h | 1 + 3 files changed, 7 insertions(+) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index bbaa851946956..0dde37d7fce25 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -174,6 +174,7 @@ core_pattern %f executable filename %E executable path %c maximum size of core file by resource limit RLIMIT_CORE + %C CPU the task ran on % both are dropped ======== ========================================== diff --git a/fs/coredump.c b/fs/coredump.c index 9f4aae2021093..a5827c5349619 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -325,6 +325,10 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, err = cn_printf(cn, "%lu", rlimit(RLIMIT_CORE)); break; + /* CPU the task ran on */ + case 'C': + err = cn_printf(cn, "%d", cprm->cpu); + break; default: break; } @@ -535,6 +539,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) */ .mm_flags = mm->flags, .vma_meta = NULL, + .cpu = raw_smp_processor_id(), }; audit_core_dumps(siginfo->si_signo); diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 08a1d3e7e46d0..191dcf5af6cb9 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -22,6 +22,7 @@ struct coredump_params { struct file *file; unsigned long limit; unsigned long mm_flags; + int cpu; loff_t written; loff_t pos; loff_t to_skip; From 30da00eb2a3c68f59cbd07dcaef8dd43f4063b0c Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Mon, 5 Sep 2022 14:16:56 +0800 Subject: [PATCH 330/352] fs/ocfs2/suballoc.h: fix spelling typo in comment Fix spelling typo in comment. Link: https://lkml.kernel.org/r/20220905061656.1829179-1-13667453960@163.com Signed-off-by: Jiangshan Yi Reported-by: k2ci Acked-by: Joseph Qi Signed-off-by: Andrew Morton --- fs/ocfs2/suballoc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 5805a03d100ba..9c74eace3adc1 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -106,7 +106,7 @@ int ocfs2_claim_clusters(handle_t *handle, u32 *cluster_start, u32 *num_clusters); /* - * Use this variant of ocfs2_claim_clusters to specify a maxiumum + * Use this variant of ocfs2_claim_clusters to specify a maximum * number of clusters smaller than the allocation reserved. */ int __ocfs2_claim_clusters(handle_t *handle, From 116993970b50b20b0e8e8b31e6c80f0a163ee945 Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Mon, 5 Sep 2022 10:10:34 +0800 Subject: [PATCH 331/352] init.h: fix spelling typo in comment Fix spelling typo in comment. Link: https://lkml.kernel.org/r/20220905021034.947701-1-13667453960@163.com Signed-off-by: Jiangshan Yi Reported-by: k2ci Signed-off-by: Andrew Morton --- include/linux/init.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/init.h b/include/linux/init.h index baf0b29a7010a..3254766ebbf23 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -134,7 +134,7 @@ static inline initcall_t initcall_from_entry(initcall_entry_t *entry) extern initcall_entry_t __con_initcall_start[], __con_initcall_end[]; -/* Used for contructor calls. */ +/* Used for constructor calls. */ typedef void (*ctor_fn_t)(void); struct file_system_type; From ca9b4719a3d53b7b302dc9549f7e8393c1e66d32 Mon Sep 17 00:00:00 2001 From: Jingyu Wang Date: Fri, 9 Sep 2022 02:54:52 +0800 Subject: [PATCH 332/352] ipc: mqueue: remove unnecessary conditionals iput() already handles null and non-null parameters, so there is no need to use if(). Link: https://lkml.kernel.org/r/20220908185452.76590-1-jingyuwang_vip@163.com Signed-off-by: Jingyu Wang Acked-by: Roman Gushchin Signed-off-by: Andrew Morton --- ipc/mqueue.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index f98de32aeea17..9834104a5a31d 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -986,8 +986,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) out_unlock: inode_unlock(d_inode(mnt->mnt_root)); - if (inode) - iput(inode); + iput(inode); mnt_drop_write(mnt); out_name: putname(name); From d123f3fdb8f416e23586fa1f1a29cbeed4e6124f Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Fri, 9 Sep 2022 17:07:55 -0300 Subject: [PATCH 333/352] firmware: google: test spinlock on panic path to avoid lockups Currently the gsmi driver registers a panic notifier as well as reboot and die notifiers. The callbacks registered are called in atomic and very limited context - for instance, panic disables preemption and local IRQs, also all secondary CPUs (not executing the panic path) are shutdown. With that said, taking a spinlock in this scenario is a dangerous invitation for lockup scenarios. So, fix that by checking if the spinlock is free to acquire in the panic notifier callback - if not, bail-out and avoid a potential hang. Link: https://lkml.kernel.org/r/20220909200755.189679-1-gpiccoli@igalia.com Fixes: 74c5b31c6618 ("driver: Google EFI SMI") Signed-off-by: Guilherme G. Piccoli Reviewed-by: Evan Green Cc: Ard Biesheuvel Cc: David Gow Cc: Greg Kroah-Hartman Cc: Julius Werner Cc: Petr Mladek Signed-off-by: Andrew Morton --- drivers/firmware/google/gsmi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/firmware/google/gsmi.c b/drivers/firmware/google/gsmi.c index adaa492c3d2df..4e2575dfeb908 100644 --- a/drivers/firmware/google/gsmi.c +++ b/drivers/firmware/google/gsmi.c @@ -681,6 +681,15 @@ static struct notifier_block gsmi_die_notifier = { static int gsmi_panic_callback(struct notifier_block *nb, unsigned long reason, void *arg) { + + /* + * Panic callbacks are executed with all other CPUs stopped, + * so we must not attempt to spin waiting for gsmi_dev.lock + * to be released. + */ + if (spin_is_locked(&gsmi_dev.lock)) + return NOTIFY_DONE; + gsmi_shutdown_reason(GSMI_SHUTDOWN_PANIC); return NOTIFY_DONE; } From 46aac981a582910851605e34d0ea37377f4fc6d1 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 9 Sep 2022 13:57:41 -0700 Subject: [PATCH 334/352] fs> uninline inode_maybe_inc_iversion() It has many callsites and is large. text data bss dec hex filename 91796 15984 512 108292 1a704 mm/shmem.o-before 91180 15984 512 107676 1a49c mm/shmem.o-after Acked-by: Jeff Layton Cc: Chuck Lever Cc: Alexander Viro Cc: Hugh Dickins Signed-off-by: Andrew Morton --- fs/libfs.c | 46 ++++++++++++++++++++++++++++++++++++++++ include/linux/iversion.h | 46 +--------------------------------------- 2 files changed, 47 insertions(+), 45 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 31b0ddf01c31d..682d56345a1cf 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include /* sync_mapping_buffers */ #include @@ -1520,3 +1521,48 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry) #endif } EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops); + +/** + * inode_maybe_inc_iversion - increments i_version + * @inode: inode with the i_version that should be updated + * @force: increment the counter even if it's not necessary? + * + * Every time the inode is modified, the i_version field must be seen to have + * changed by any observer. + * + * If "force" is set or the QUERIED flag is set, then ensure that we increment + * the value, and clear the queried flag. + * + * In the common case where neither is set, then we can return "false" without + * updating i_version. + * + * If this function returns false, and no other metadata has changed, then we + * can avoid logging the metadata. + */ +bool inode_maybe_inc_iversion(struct inode *inode, bool force) +{ + u64 cur, new; + + /* + * The i_version field is not strictly ordered with any other inode + * information, but the legacy inode_inc_iversion code used a spinlock + * to serialize increments. + * + * Here, we add full memory barriers to ensure that any de-facto + * ordering with other info is preserved. + * + * This barrier pairs with the barrier in inode_query_iversion() + */ + smp_mb(); + cur = inode_peek_iversion_raw(inode); + do { + /* If flag is clear then we needn't do anything */ + if (!force && !(cur & I_VERSION_QUERIED)) + return false; + + /* Since lowest bit is flag, add 2 to avoid it */ + new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; + } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); + return true; +} +EXPORT_SYMBOL(inode_maybe_inc_iversion); diff --git a/include/linux/iversion.h b/include/linux/iversion.h index eb5a158101693..e27bd4f55d840 100644 --- a/include/linux/iversion.h +++ b/include/linux/iversion.h @@ -172,51 +172,7 @@ inode_set_iversion_queried(struct inode *inode, u64 val) I_VERSION_QUERIED); } -/** - * inode_maybe_inc_iversion - increments i_version - * @inode: inode with the i_version that should be updated - * @force: increment the counter even if it's not necessary? - * - * Every time the inode is modified, the i_version field must be seen to have - * changed by any observer. - * - * If "force" is set or the QUERIED flag is set, then ensure that we increment - * the value, and clear the queried flag. - * - * In the common case where neither is set, then we can return "false" without - * updating i_version. - * - * If this function returns false, and no other metadata has changed, then we - * can avoid logging the metadata. - */ -static inline bool -inode_maybe_inc_iversion(struct inode *inode, bool force) -{ - u64 cur, new; - - /* - * The i_version field is not strictly ordered with any other inode - * information, but the legacy inode_inc_iversion code used a spinlock - * to serialize increments. - * - * Here, we add full memory barriers to ensure that any de-facto - * ordering with other info is preserved. - * - * This barrier pairs with the barrier in inode_query_iversion() - */ - smp_mb(); - cur = inode_peek_iversion_raw(inode); - do { - /* If flag is clear then we needn't do anything */ - if (!force && !(cur & I_VERSION_QUERIED)) - return false; - - /* Since lowest bit is flag, add 2 to avoid it */ - new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; - } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); - return true; -} - +bool inode_maybe_inc_iversion(struct inode *inode, bool force); /** * inode_inc_iversion - forcibly increment i_version From 3c07bfce92a500f9cb9eab40e3f5d99728f9991e Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 9 Sep 2022 14:25:29 +0200 Subject: [PATCH 335/352] proc: make config PROC_CHILDREN depend on PROC_FS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 2e13ba54a268 ("fs, proc: introduce CONFIG_PROC_CHILDREN") introduces the config PROC_CHILDREN to configure kernels to provide the /proc//task//children file. When one deselects PROC_FS for kernel builds without /proc/, the config PROC_CHILDREN has no effect anymore, but is still visible in menuconfig. Add the dependency on PROC_FS to make the PROC_CHILDREN option disappear for kernel builds without /proc/. Link: https://lkml.kernel.org/r/20220909122529.1941-1-lukas.bulwahn@gmail.com Fixes: 2e13ba54a268 ("fs, proc: introduce CONFIG_PROC_CHILDREN") Signed-off-by: Lukas Bulwahn Cc: Iago López Galeiras Cc: Alexey Dobriyan Signed-off-by: Andrew Morton --- fs/proc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index c930001056f95..32b1116ae137c 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -92,6 +92,7 @@ config PROC_PAGE_MONITOR config PROC_CHILDREN bool "Include /proc//task//children file" + depends on PROC_FS default n help Provides a fast way to retrieve first level children pids of a task. See From 60584ad8e4d2507dc903c7ad15c2a42f70e3d599 Mon Sep 17 00:00:00 2001 From: wuchi Date: Fri, 9 Sep 2022 18:10:25 +0800 Subject: [PATCH 336/352] relay: use kvcalloc to alloc page array in relay_alloc_page_array kvcalloc() is safer because it will check the integer overflows, and using it will simple the logic of allocation size. Link: https://lkml.kernel.org/r/20220909101025.82955-1-wuchi.zero@gmail.com Signed-off-by: wuchi Cc: Christoph Hellwig Cc: Jens Axboe Signed-off-by: Andrew Morton --- kernel/relay.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/relay.c b/kernel/relay.c index 6a611e779e958..d7edc934c56d5 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -60,10 +60,7 @@ static const struct vm_operations_struct relay_file_mmap_ops = { */ static struct page **relay_alloc_page_array(unsigned int n_pages) { - const size_t pa_size = n_pages * sizeof(struct page *); - if (pa_size > PAGE_SIZE) - return vzalloc(pa_size); - return kzalloc(pa_size, GFP_KERNEL); + return kvcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); } /* From 1e44a566100dea889e5a03bcf2f238050dc221b2 Mon Sep 17 00:00:00 2001 From: wangjianli Date: Thu, 8 Sep 2022 21:00:36 +0800 Subject: [PATCH 337/352] fs/ocfs2: fix repeated words in comments Delete the redundant word 'to'. Link: https://lkml.kernel.org/r/20220908130036.31149-1-wangjianli@cdjrlc.com Signed-off-by: wangjianli Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/refcounttree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index b2ebaec86ddca..365025ab2726c 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2614,7 +2614,7 @@ static inline unsigned int ocfs2_cow_align_length(struct super_block *sb, } /* - * Calculate out the start and number of virtual clusters we need to to CoW. + * Calculate out the start and number of virtual clusters we need to CoW. * * cpos is vitual start cluster position we want to do CoW in a * file and write_len is the cluster length. From e4a722848b90898f7100414074292d555c9a25ff Mon Sep 17 00:00:00 2001 From: Xu Panda Date: Mon, 12 Sep 2022 07:15:57 +0000 Subject: [PATCH 338/352] fork: remove duplicate included header files linux/sched/mm.h is included more than once. Link: https://lkml.kernel.org/r/20220912071556.16811-1-xu.panda@zte.com.cn Signed-off-by: Xu Panda Reported-by: Zeal Robot Cc: Andy Lutomirski Cc: Christian Brauner (Microsoft) Cc: "Eric W . Biederman" Cc: Fenghua Yu Cc: Liam Howlett Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index 90c85b17bf698..3b8784e3ce97d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -97,7 +97,6 @@ #include #include #include -#include #include #include From 23ae6a822cb44235cca2551c22263708554a6439 Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Wed, 14 Sep 2022 03:25:37 +0800 Subject: [PATCH 339/352] percpu: Add percpu_counter_add_local and percpu_counter_sub_local Patch series "/msg: mitigate the lock contention in ipc/msg", v6. Here are two patches to mitigate the lock contention in ipc/msg. The 1st patch is to add the new interface percpu_counter_add_local and percpu_counter_sub_local. The batch size in percpu_counter_add_batch should be very large in heavy writing and rare reading case. Add the "_local" version, and mostly it will do local adding, reduce the global updating and mitigate lock contention in writing. The 2nd patch is to use percpu_counter instead of atomic update in ipc/msg. The msg_bytes and msg_hdrs atomic counters are frequently updated when IPC msg queue is in heavy use, causing heavy cache bounce and overhead. Change them to percpu_counter greatly improve the performance. Since there is one percpu struct per namespace, additional memory cost is minimal. Reading of the count done in msgctl call, which is infrequent. So the need to sum up the counts in each CPU is infrequent. This patch (of 2): The batch size in percpu_counter_add_batch should be very large in heavy writing and rare reading case. Add the "_local" version, and mostly it will do local adding, reduce the global updating and mitigate lock contention in writing. Link: https://lkml.kernel.org/r/20220913192538.3023708-1-jiebin.sun@intel.com Link: https://lkml.kernel.org/r/20220913192538.3023708-2-jiebin.sun@intel.com Signed-off-by: Jiebin Sun Reviewed-by: Tim Chen Cc: Alexander Mikhalitsyn Cc: Alexey Gladkov Cc: Christoph Lameter Cc: Dennis Zhou Cc: "Eric W . Biederman" Cc: Manfred Spraul Cc: Shakeel Butt Cc: Tejun Heo Cc: Vasily Averin Cc: Davidlohr Bueso Signed-off-by: Andrew Morton --- include/linux/percpu_counter.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 01861eebed79d..8ed5fba6d156f 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -15,6 +15,9 @@ #include #include +/* percpu_counter batch for local add or sub */ +#define PERCPU_COUNTER_LOCAL_BATCH INT_MAX + #ifdef CONFIG_SMP struct percpu_counter { @@ -56,6 +59,22 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) percpu_counter_add_batch(fbc, amount, percpu_counter_batch); } +/* + * With percpu_counter_add_local() and percpu_counter_sub_local(), counts + * are accumulated in local per cpu counter and not in fbc->count until + * local count overflows PERCPU_COUNTER_LOCAL_BATCH. This makes counter + * write efficient. + * But percpu_counter_sum(), instead of percpu_counter_read(), needs to be + * used to add up the counts from each CPU to account for all the local + * counts. So percpu_counter_add_local() and percpu_counter_sub_local() + * should be used when a counter is updated frequently and read rarely. + */ +static inline void +percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add_batch(fbc, amount, PERCPU_COUNTER_LOCAL_BATCH); +} + static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) { s64 ret = __percpu_counter_sum(fbc); @@ -138,6 +157,13 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount) preempt_enable(); } +/* non-SMP percpu_counter_add_local is the same with percpu_counter_add */ +static inline void +percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add(fbc, amount); +} + static inline void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { @@ -193,4 +219,10 @@ static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount) percpu_counter_add(fbc, -amount); } +static inline void +percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add_local(fbc, -amount); +} + #endif /* _LINUX_PERCPU_COUNTER_H */ From 509d162dbafe2d03ed751e033c4df665e340151c Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Wed, 14 Sep 2022 03:25:38 +0800 Subject: [PATCH 340/352] ipc/msg: mitigate the lock contention with percpu counter The msg_bytes and msg_hdrs atomic counters are frequently updated when IPC msg queue is in heavy use, causing heavy cache bounce and overhead. Change them to percpu_counter greatly improve the performance. Since there is one percpu struct per namespace, additional memory cost is minimal. Reading of the count done in msgctl call, which is infrequent. So the need to sum up the counts in each CPU is infrequent. Apply the patch and test the pts/stress-ng-1.4.0 -- system v message passing (160 threads). Score gain: 3.99x CPU: ICX 8380 x 2 sockets Core number: 40 x 2 physical cores Benchmark: pts/stress-ng-1.4.0 -- system v message passing (160 threads) Link: https://lkml.kernel.org/r/20220913192538.3023708-3-jiebin.sun@intel.com Signed-off-by: Jiebin Sun Reviewed-by: Tim Chen Cc: Alexander Mikhalitsyn Cc: Alexey Gladkov Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Dennis Zhou Cc: "Eric W . Biederman" Cc: Manfred Spraul Cc: Shakeel Butt Cc: Tejun Heo Cc: Vasily Averin Signed-off-by: Andrew Morton --- include/linux/ipc_namespace.h | 5 ++-- ipc/msg.c | 44 ++++++++++++++++++++++++----------- ipc/namespace.c | 5 +++- ipc/util.h | 4 ++-- 4 files changed, 39 insertions(+), 19 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e3e8c8662b490..e8240cf2611ad 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -11,6 +11,7 @@ #include #include #include +#include struct user_namespace; @@ -36,8 +37,8 @@ struct ipc_namespace { unsigned int msg_ctlmax; unsigned int msg_ctlmnb; unsigned int msg_ctlmni; - atomic_t msg_bytes; - atomic_t msg_hdrs; + struct percpu_counter percpu_msg_bytes; + struct percpu_counter percpu_msg_hdrs; size_t shm_ctlmax; size_t shm_ctlall; diff --git a/ipc/msg.c b/ipc/msg.c index a0d05775af2c5..f2bb4c193ecf3 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -285,10 +286,10 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) rcu_read_unlock(); list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) { - atomic_dec(&ns->msg_hdrs); + percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); free_msg(msg); } - atomic_sub(msq->q_cbytes, &ns->msg_bytes); + percpu_counter_sub_local(&ns->percpu_msg_bytes, msq->q_cbytes); ipc_update_pid(&msq->q_lspid, NULL); ipc_update_pid(&msq->q_lrpid, NULL); ipc_rcu_putref(&msq->q_perm, msg_rcu_free); @@ -495,17 +496,18 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid, msginfo->msgssz = MSGSSZ; msginfo->msgseg = MSGSEG; down_read(&msg_ids(ns).rwsem); - if (cmd == MSG_INFO) { + if (cmd == MSG_INFO) msginfo->msgpool = msg_ids(ns).in_use; - msginfo->msgmap = atomic_read(&ns->msg_hdrs); - msginfo->msgtql = atomic_read(&ns->msg_bytes); + max_idx = ipc_get_maxidx(&msg_ids(ns)); + up_read(&msg_ids(ns).rwsem); + if (cmd == MSG_INFO) { + msginfo->msgmap = percpu_counter_sum(&ns->percpu_msg_hdrs); + msginfo->msgtql = percpu_counter_sum(&ns->percpu_msg_bytes); } else { msginfo->msgmap = MSGMAP; msginfo->msgpool = MSGPOOL; msginfo->msgtql = MSGTQL; } - max_idx = ipc_get_maxidx(&msg_ids(ns)); - up_read(&msg_ids(ns).rwsem); return (max_idx < 0) ? 0 : max_idx; } @@ -935,8 +937,8 @@ static long do_msgsnd(int msqid, long mtype, void __user *mtext, list_add_tail(&msg->m_list, &msq->q_messages); msq->q_cbytes += msgsz; msq->q_qnum++; - atomic_add(msgsz, &ns->msg_bytes); - atomic_inc(&ns->msg_hdrs); + percpu_counter_add_local(&ns->percpu_msg_bytes, msgsz); + percpu_counter_add_local(&ns->percpu_msg_hdrs, 1); } err = 0; @@ -1159,8 +1161,8 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in msq->q_rtime = ktime_get_real_seconds(); ipc_update_pid(&msq->q_lrpid, task_tgid(current)); msq->q_cbytes -= msg->m_ts; - atomic_sub(msg->m_ts, &ns->msg_bytes); - atomic_dec(&ns->msg_hdrs); + percpu_counter_sub_local(&ns->percpu_msg_bytes, msg->m_ts); + percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); ss_wakeup(msq, &wake_q, false); goto out_unlock0; @@ -1297,20 +1299,34 @@ COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp, } #endif -void msg_init_ns(struct ipc_namespace *ns) +int msg_init_ns(struct ipc_namespace *ns) { + int ret; + ns->msg_ctlmax = MSGMAX; ns->msg_ctlmnb = MSGMNB; ns->msg_ctlmni = MSGMNI; - atomic_set(&ns->msg_bytes, 0); - atomic_set(&ns->msg_hdrs, 0); + ret = percpu_counter_init(&ns->percpu_msg_bytes, 0, GFP_KERNEL); + if (ret) + goto fail_msg_bytes; + ret = percpu_counter_init(&ns->percpu_msg_hdrs, 0, GFP_KERNEL); + if (ret) + goto fail_msg_hdrs; ipc_init_ids(&ns->ids[IPC_MSG_IDS]); + return 0; + + fail_msg_hdrs: + percpu_counter_destroy(&ns->percpu_msg_bytes); + fail_msg_bytes: + return ret; } #ifdef CONFIG_IPC_NS void msg_exit_ns(struct ipc_namespace *ns) { + percpu_counter_destroy(&ns->percpu_msg_bytes); + percpu_counter_destroy(&ns->percpu_msg_hdrs); free_ipcs(ns, &msg_ids(ns), freeque); idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht); diff --git a/ipc/namespace.c b/ipc/namespace.c index e1fcaedba4fae..8316ea5857333 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -66,8 +66,11 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (!setup_ipc_sysctls(ns)) goto fail_mq; + err = msg_init_ns(ns); + if (err) + goto fail_put; + sem_init_ns(ns); - msg_init_ns(ns); shm_init_ns(ns); return ns; diff --git a/ipc/util.h b/ipc/util.h index 2dd7ce0416d8e..1b0086c6346f2 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -64,7 +64,7 @@ static inline void mq_put_mnt(struct ipc_namespace *ns) { } #ifdef CONFIG_SYSVIPC void sem_init_ns(struct ipc_namespace *ns); -void msg_init_ns(struct ipc_namespace *ns); +int msg_init_ns(struct ipc_namespace *ns); void shm_init_ns(struct ipc_namespace *ns); void sem_exit_ns(struct ipc_namespace *ns); @@ -72,7 +72,7 @@ void msg_exit_ns(struct ipc_namespace *ns); void shm_exit_ns(struct ipc_namespace *ns); #else static inline void sem_init_ns(struct ipc_namespace *ns) { } -static inline void msg_init_ns(struct ipc_namespace *ns) { } +static inline int msg_init_ns(struct ipc_namespace *ns) { return 0;} static inline void shm_init_ns(struct ipc_namespace *ns) { } static inline void sem_exit_ns(struct ipc_namespace *ns) { } From 8adf62812d7ecfc71d96388dbffb9d84792401fc Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 13 Sep 2022 14:45:19 -0700 Subject: [PATCH 341/352] ipc-msg-mitigate-the-lock-contention-with-percpu-counter-checkpatch-fixes WARNING: labels should not be indented #158: FILE: ipc/msg.c:1319: + fail_msg_hdrs: WARNING: labels should not be indented #160: FILE: ipc/msg.c:1321: + fail_msg_bytes: ERROR: space required after that ';' (ctx:VxV) #203: FILE: ipc/util.h:75: +static inline int msg_init_ns(struct ipc_namespace *ns) { return 0;} ^ total: 1 errors, 2 warnings, 144 lines checked NOTE: For some of the reported defects, checkpatch may be able to mechanically convert to the typical style using --fix or --fix-inplace. ./patches/ipc-msg-mitigate-the-lock-contention-with-percpu-counter.patch has style problems, please review. NOTE: If any of the errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. Please run checkpatch prior to sending patches Cc: Alexander Mikhalitsyn Cc: Alexey Gladkov Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: Dennis Zhou Cc: "Eric W . Biederman" Cc: Jiebin Sun Cc: Manfred Spraul Cc: Shakeel Butt Cc: Tejun Heo Cc: Tim Chen Cc: Vasily Averin Signed-off-by: Andrew Morton --- ipc/msg.c | 8 ++++---- ipc/util.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index f2bb4c193ecf3..9fb58de557fba 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -1316,10 +1316,10 @@ int msg_init_ns(struct ipc_namespace *ns) ipc_init_ids(&ns->ids[IPC_MSG_IDS]); return 0; - fail_msg_hdrs: - percpu_counter_destroy(&ns->percpu_msg_bytes); - fail_msg_bytes: - return ret; +fail_msg_hdrs: + percpu_counter_destroy(&ns->percpu_msg_bytes); +fail_msg_bytes: + return ret; } #ifdef CONFIG_IPC_NS diff --git a/ipc/util.h b/ipc/util.h index 1b0086c6346f2..b2906e3665394 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -72,7 +72,7 @@ void msg_exit_ns(struct ipc_namespace *ns); void shm_exit_ns(struct ipc_namespace *ns); #else static inline void sem_init_ns(struct ipc_namespace *ns) { } -static inline int msg_init_ns(struct ipc_namespace *ns) { return 0;} +static inline int msg_init_ns(struct ipc_namespace *ns) { return 0; } static inline void shm_init_ns(struct ipc_namespace *ns) { } static inline void sem_exit_ns(struct ipc_namespace *ns) { } From 5dcabc54a52554f8014323ba3a8da14cb434d3d0 Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Tue, 20 Sep 2022 23:08:09 +0800 Subject: [PATCH 342/352] ipc/msg: avoid negative value by overflow in msginfo The 32-bit value in msginfo struct could be negative if we get it from signed 64-bit. Clamping it to INT_MAX helps to avoid the negative value by overflow. Link: https://lkml.kernel.org/r/20220920150809.4014944-1-jiebin.sun@intel.com Signed-off-by: Jiebin Sun Reviewed-by: Manfred Spraul Reviewed-by: Tim Chen Signed-off-by: Andrew Morton --- ipc/msg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index 9fb58de557fba..6aedde33a22d6 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -501,8 +501,8 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid, max_idx = ipc_get_maxidx(&msg_ids(ns)); up_read(&msg_ids(ns).rwsem); if (cmd == MSG_INFO) { - msginfo->msgmap = percpu_counter_sum(&ns->percpu_msg_hdrs); - msginfo->msgtql = percpu_counter_sum(&ns->percpu_msg_bytes); + msginfo->msgmap = min(percpu_counter_sum(&ns->percpu_msg_hdrs), INT_MAX); + msginfo->msgtql = min(percpu_counter_sum(&ns->percpu_msg_bytes), INT_MAX); } else { msginfo->msgmap = MSGMAP; msginfo->msgpool = MSGPOOL; From 2772cda98ced82c033b587755f15a1b34bf2cf96 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 21 Sep 2022 16:54:33 -0700 Subject: [PATCH 343/352] a In file included from ./include/linux/kernel.h:26, from ./arch/x86/include/asm/percpu.h:27, from ./arch/x86/include/asm/preempt.h:6, from ./include/linux/preempt.h:78, from ./include/linux/spinlock.h:55, from ./include/linux/ipc.h:5, from ./include/uapi/linux/msg.h:5, from ./include/linux/msg.h:6, from ipc/msg.c:27: ipc/msg.c: In function 'msgctl_info': ./include/linux/minmax.h:20:35: error: comparison of distinct pointer types lacks a cast [-Werror] 20 | (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) | ^~ ./include/linux/minmax.h:26:18: note: in expansion of macro '__typecheck' 26 | (__typecheck(x, y) && __no_side_effects(x, y)) | ^~~~~~~~~~~ ./include/linux/minmax.h:36:31: note: in expansion of macro '__safe_cmp' 36 | __builtin_choose_expr(__safe_cmp(x, y), \ | ^~~~~~~~~~ ./include/linux/minmax.h:45:25: note: in expansion of macro '__careful_cmp' 45 | #define min(x, y) __careful_cmp(x, y, <) | ^~~~~~~~~~~~~ ipc/msg.c:504:35: note: in expansion of macro 'min' 504 | msginfo->msgmap = min(percpu_counter_sum(&ns->percpu_msg_hdrs), INT_MAX); | ^~~ ./include/linux/minmax.h:20:35: error: comparison of distinct pointer types lacks a cast [-Werror] 20 | (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1))) | ^~ ./include/linux/minmax.h:26:18: note: in expansion of macro '__typecheck' 26 | (__typecheck(x, y) && __no_side_effects(x, y)) | ^~~~~~~~~~~ ./include/linux/minmax.h:36:31: note: in expansion of macro '__safe_cmp' 36 | __builtin_choose_expr(__safe_cmp(x, y), \ | ^~~~~~~~~~ ./include/linux/minmax.h:45:25: note: in expansion of macro '__careful_cmp' 45 | #define min(x, y) __careful_cmp(x, y, <) | ^~~~~~~~~~~~~ ipc/msg.c:505:35: note: in expansion of macro 'min' 505 | msginfo->msgtql = min(percpu_counter_sum(&ns->percpu_msg_bytes), INT_MAX); | ^~~ Signed-off-by: Andrew Morton --- ipc/msg.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index 6aedde33a22d6..e4e0990e08f75 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -501,8 +501,12 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid, max_idx = ipc_get_maxidx(&msg_ids(ns)); up_read(&msg_ids(ns).rwsem); if (cmd == MSG_INFO) { - msginfo->msgmap = min(percpu_counter_sum(&ns->percpu_msg_hdrs), INT_MAX); - msginfo->msgtql = min(percpu_counter_sum(&ns->percpu_msg_bytes), INT_MAX); + msginfo->msgmap = min_t(int, + percpu_counter_sum(&ns->percpu_msg_hdrs), + INT_MAX); + msginfo->msgtql = min_t(int, + percpu_counter_sum(&ns->percpu_msg_bytes), + INT_MAX); } else { msginfo->msgmap = MSGMAP; msginfo->msgpool = MSGPOOL; From 756b8640c689f4a08e7fa332422fdbabfa105bd7 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 20 Sep 2022 02:24:16 +0900 Subject: [PATCH 344/352] libfs: add DEFINE_SIMPLE_ATTRIBUTE_SIGNED for signed value Patch series "fix error when writing negative value to simple attribute files". The simple attribute files do not accept a negative value since the commit 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()"), but some attribute files want to accept a negative value. This patch (of 3): The simple attribute files do not accept a negative value since the commit 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()"), so we have to use a 64-bit value to write a negative value. This adds DEFINE_SIMPLE_ATTRIBUTE_SIGNED for a signed value. Link: https://lkml.kernel.org/r/20220919172418.45257-1-akinobu.mita@gmail.com Link: https://lkml.kernel.org/r/20220919172418.45257-2-akinobu.mita@gmail.com Fixes: 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()") Signed-off-by: Akinobu Mita Reported-by: Zhao Gongyi Reviewed-by: David Hildenbrand Reviewed-by: Greg Kroah-Hartman Cc: Alexander Viro Cc: Jonathan Corbet Cc: Oscar Salvador Cc: Rafael J. Wysocki Cc: Shuah Khan Cc: Wei Yongjun Cc: Yicong Yang Signed-off-by: Andrew Morton --- fs/libfs.c | 22 +++++++++++++++++++--- include/linux/fs.h | 12 ++++++++++-- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 682d56345a1cf..aada4e7c87132 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -995,8 +995,8 @@ ssize_t simple_attr_read(struct file *file, char __user *buf, EXPORT_SYMBOL_GPL(simple_attr_read); /* interpret the buffer as a number to call the set function with */ -ssize_t simple_attr_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) +static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf, + size_t len, loff_t *ppos, bool is_signed) { struct simple_attr *attr; unsigned long long val; @@ -1017,7 +1017,10 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, goto out; attr->set_buf[size] = '\0'; - ret = kstrtoull(attr->set_buf, 0, &val); + if (is_signed) + ret = kstrtoll(attr->set_buf, 0, &val); + else + ret = kstrtoull(attr->set_buf, 0, &val); if (ret) goto out; ret = attr->set(attr->data, val); @@ -1027,8 +1030,21 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, mutex_unlock(&attr->mutex); return ret; } + +ssize_t simple_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return simple_attr_write_xsigned(file, buf, len, ppos, false); +} EXPORT_SYMBOL_GPL(simple_attr_write); +ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return simple_attr_write_xsigned(file, buf, len, ppos, true); +} +EXPORT_SYMBOL_GPL(simple_attr_write_signed); + /** * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation * @sb: filesystem to do the file handle conversion on diff --git a/include/linux/fs.h b/include/linux/fs.h index 9eced4cc286ee..c791388189223 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3471,7 +3471,7 @@ void simple_transaction_set(struct file *file, size_t n); * All attributes contain a text representation of a numeric value * that are accessed with the get() and set() functions. */ -#define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ +#define DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed) \ static int __fops ## _open(struct inode *inode, struct file *file) \ { \ __simple_attr_check_format(__fmt, 0ull); \ @@ -3482,10 +3482,16 @@ static const struct file_operations __fops = { \ .open = __fops ## _open, \ .release = simple_attr_release, \ .read = simple_attr_read, \ - .write = simple_attr_write, \ + .write = (__is_signed) ? simple_attr_write_signed : simple_attr_write, \ .llseek = generic_file_llseek, \ } +#define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ + DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false) + +#define DEFINE_SIMPLE_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt) \ + DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true) + static inline __printf(1, 2) void __simple_attr_check_format(const char *fmt, ...) { @@ -3500,6 +3506,8 @@ ssize_t simple_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos); ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); +ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos); struct ctl_table; int __init list_bdev_fs_names(char *buf, size_t size); From 7f8cb45b40118d90c54f681628da28b14119913d Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 20 Sep 2022 02:24:17 +0900 Subject: [PATCH 345/352] lib/notifier-error-inject: fix error when writing -errno to debugfs file The simple attribute files do not accept a negative value since the commit 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()"). This restores the previous behaviour by using newly introduced DEFINE_SIMPLE_ATTRIBUTE_SIGNED instead of DEFINE_SIMPLE_ATTRIBUTE. Link: https://lkml.kernel.org/r/20220919172418.45257-3-akinobu.mita@gmail.com Fixes: 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()") Signed-off-by: Akinobu Mita Reported-by: Zhao Gongyi Reviewed-by: David Hildenbrand Reviewed-by: Greg Kroah-Hartman Cc: Alexander Viro Cc: Jonathan Corbet Cc: Oscar Salvador Cc: Rafael J. Wysocki Cc: Shuah Khan Cc: Wei Yongjun Cc: Yicong Yang Signed-off-by: Andrew Morton --- lib/notifier-error-inject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/notifier-error-inject.c b/lib/notifier-error-inject.c index 21016b32d3131..2b24ea6c94979 100644 --- a/lib/notifier-error-inject.c +++ b/lib/notifier-error-inject.c @@ -15,7 +15,7 @@ static int debugfs_errno_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_errno, debugfs_errno_get, debugfs_errno_set, +DEFINE_SIMPLE_ATTRIBUTE_SIGNED(fops_errno, debugfs_errno_get, debugfs_errno_set, "%lld\n"); static struct dentry *debugfs_create_errno(const char *name, umode_t mode, From e28b020aa9d5f374bd0bac71b99c0b452a199905 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 20 Sep 2022 02:24:18 +0900 Subject: [PATCH 346/352] debugfs: fix error when writing negative value to atomic_t debugfs file The simple attribute files do not accept a negative value since the commit 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()"), so we have to use a 64-bit value to write a negative value for a debugfs file created by debugfs_create_atomic_t(). This restores the previous behaviour by introducing DEFINE_DEBUGFS_ATTRIBUTE_SIGNED for a signed value. Link: https://lkml.kernel.org/r/20220919172418.45257-4-akinobu.mita@gmail.com Fixes: 488dac0c9237 ("libfs: fix error cast of negative value in simple_attr_write()") Signed-off-by: Akinobu Mita Reported-by: Zhao Gongyi Reviewed-by: David Hildenbrand Reviewed-by: Greg Kroah-Hartman Cc: Alexander Viro Cc: Jonathan Corbet Cc: Oscar Salvador Cc: Rafael J. Wysocki Cc: Shuah Khan Cc: Wei Yongjun Cc: Yicong Yang Signed-off-by: Andrew Morton --- .../fault-injection/fault-injection.rst | 10 +++---- fs/debugfs/file.c | 28 +++++++++++++++---- include/linux/debugfs.h | 19 +++++++++++-- 3 files changed, 43 insertions(+), 14 deletions(-) diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst index 17779a2772e51..5f6454b9dbd4d 100644 --- a/Documentation/fault-injection/fault-injection.rst +++ b/Documentation/fault-injection/fault-injection.rst @@ -83,9 +83,7 @@ configuration of fault-injection capabilities. - /sys/kernel/debug/fail*/times: specifies how many times failures may happen at most. A value of -1 - means "no limit". Note, though, that this file only accepts unsigned - values. So, if you want to specify -1, you better use 'printf' instead - of 'echo', e.g.: $ printf %#x -1 > times + means "no limit". - /sys/kernel/debug/fail*/space: @@ -284,7 +282,7 @@ Application Examples echo Y > /sys/kernel/debug/$FAILTYPE/task-filter echo 10 > /sys/kernel/debug/$FAILTYPE/probability echo 100 > /sys/kernel/debug/$FAILTYPE/interval - printf %#x -1 > /sys/kernel/debug/$FAILTYPE/times + echo -1 > /sys/kernel/debug/$FAILTYPE/times echo 0 > /sys/kernel/debug/$FAILTYPE/space echo 2 > /sys/kernel/debug/$FAILTYPE/verbose echo Y > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait @@ -338,7 +336,7 @@ Application Examples echo N > /sys/kernel/debug/$FAILTYPE/task-filter echo 10 > /sys/kernel/debug/$FAILTYPE/probability echo 100 > /sys/kernel/debug/$FAILTYPE/interval - printf %#x -1 > /sys/kernel/debug/$FAILTYPE/times + echo -1 > /sys/kernel/debug/$FAILTYPE/times echo 0 > /sys/kernel/debug/$FAILTYPE/space echo 2 > /sys/kernel/debug/$FAILTYPE/verbose echo Y > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait @@ -369,7 +367,7 @@ Application Examples echo N > /sys/kernel/debug/$FAILTYPE/task-filter echo 100 > /sys/kernel/debug/$FAILTYPE/probability echo 0 > /sys/kernel/debug/$FAILTYPE/interval - printf %#x -1 > /sys/kernel/debug/$FAILTYPE/times + echo -1 > /sys/kernel/debug/$FAILTYPE/times echo 0 > /sys/kernel/debug/$FAILTYPE/space echo 1 > /sys/kernel/debug/$FAILTYPE/verbose diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 950c63fa4d0b2..38930d9b0bb73 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -378,8 +378,8 @@ ssize_t debugfs_attr_read(struct file *file, char __user *buf, } EXPORT_SYMBOL_GPL(debugfs_attr_read); -ssize_t debugfs_attr_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) +static ssize_t debugfs_attr_write_xsigned(struct file *file, const char __user *buf, + size_t len, loff_t *ppos, bool is_signed) { struct dentry *dentry = F_DENTRY(file); ssize_t ret; @@ -387,12 +387,28 @@ ssize_t debugfs_attr_write(struct file *file, const char __user *buf, ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; - ret = simple_attr_write(file, buf, len, ppos); + if (is_signed) + ret = simple_attr_write_signed(file, buf, len, ppos); + else + ret = simple_attr_write(file, buf, len, ppos); debugfs_file_put(dentry); return ret; } + +ssize_t debugfs_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return debugfs_attr_write_xsigned(file, buf, len, ppos, false); +} EXPORT_SYMBOL_GPL(debugfs_attr_write); +ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return debugfs_attr_write_xsigned(file, buf, len, ppos, true); +} +EXPORT_SYMBOL_GPL(debugfs_attr_write_signed); + static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode, struct dentry *parent, void *value, const struct file_operations *fops, @@ -738,11 +754,11 @@ static int debugfs_atomic_t_get(void *data, u64 *val) *val = atomic_read((atomic_t *)data); return 0; } -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t, debugfs_atomic_t_get, debugfs_atomic_t_set, "%lld\n"); -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n"); -DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, +DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); /** diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index c869f1e73d755..6db34e9c50ec9 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -45,7 +45,7 @@ struct debugfs_u32_array { extern struct dentry *arch_debugfs_dir; -#define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt) \ +#define DEFINE_DEBUGFS_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed) \ static int __fops ## _open(struct inode *inode, struct file *file) \ { \ __simple_attr_check_format(__fmt, 0ull); \ @@ -56,10 +56,16 @@ static const struct file_operations __fops = { \ .open = __fops ## _open, \ .release = simple_attr_release, \ .read = debugfs_attr_read, \ - .write = debugfs_attr_write, \ + .write = (__is_signed) ? debugfs_attr_write_signed : debugfs_attr_write, \ .llseek = no_llseek, \ } +#define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt) \ + DEFINE_DEBUGFS_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false) + +#define DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt) \ + DEFINE_DEBUGFS_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true) + typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *); #if defined(CONFIG_DEBUG_FS) @@ -100,6 +106,8 @@ ssize_t debugfs_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos); ssize_t debugfs_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); +ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf, + size_t len, loff_t *ppos); struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *new_dir, const char *new_name); @@ -248,6 +256,13 @@ static inline ssize_t debugfs_attr_write(struct file *file, return -ENODEV; } +static inline ssize_t debugfs_attr_write_signed(struct file *file, + const char __user *buf, + size_t len, loff_t *ppos) +{ + return -ENODEV; +} + static inline struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *new_dir, char *new_name) { From 2fe614af589075d4bc6899b040a2eaa88cecd5e7 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Mon, 19 Sep 2022 09:44:06 +0800 Subject: [PATCH 347/352] usr/gen_init_cpio.c: remove unnecessary -1 values from int file The file variable is assigned first, it does not need to be initialized. Link: https://lkml.kernel.org/r/20220919014406.3242-1-zeming@nfschina.com Signed-off-by: Li zeming Cc: Li zeming Cc: Masahiro Yamada Cc: Nicolas Schier Signed-off-by: Andrew Morton --- usr/gen_init_cpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usr/gen_init_cpio.c b/usr/gen_init_cpio.c index dc838e26a5b9a..ee01e40e8bc65 100644 --- a/usr/gen_init_cpio.c +++ b/usr/gen_init_cpio.c @@ -326,7 +326,7 @@ static int cpio_mkfile(const char *name, const char *location, char s[256]; struct stat buf; unsigned long size; - int file = -1; + int file; int retval; int rc = -1; int namesize; From 3b0e73b2b82e95089aa5ab008f4cf5363f3c7d05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Wed, 14 Sep 2022 12:02:55 +0200 Subject: [PATCH 348/352] checkpatch: warn for non-standard fixes tag style MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a warning for fixes tags that does not follow community conventions. Link: https://lkml.kernel.org/r/20220914100255.1048460-1-niklas.soderlund@corigine.com Signed-off-by: Niklas Söderlund Reviewed-by: Simon Horman Reviewed-by: Louis Peens Reviewed-by: Philippe Schenker Acked-by: Dwaipayan Ray Reviewed-by: Lukas Bulwahn Acked-by: Lukas Bulwahn Acked-by: Joe Perches Signed-off-by: Andrew Morton --- Documentation/dev-tools/checkpatch.rst | 7 ++++ scripts/checkpatch.pl | 44 ++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/Documentation/dev-tools/checkpatch.rst b/Documentation/dev-tools/checkpatch.rst index b52452bc29636..c3389c6f38381 100644 --- a/Documentation/dev-tools/checkpatch.rst +++ b/Documentation/dev-tools/checkpatch.rst @@ -612,6 +612,13 @@ Commit message See: https://www.kernel.org/doc/html/latest/process/submitting-patches.html#describe-your-changes + **BAD_FIXES_TAG** + The Fixes: tag is malformed or does not follow the community conventions. + This can occur if the tag have been split into multiple lines (e.g., when + pasted in an email program with word wrapping enabled). + + See: https://www.kernel.org/doc/html/latest/process/submitting-patches.html#describe-your-changes + Comparison style ---------------- diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 18effbe1fe908..e8e0542f29f02 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3146,6 +3146,50 @@ sub process { } } +# Check Fixes: styles is correct + if (!$in_header_lines && + $line =~ /^\s*fixes:?\s*(?:commit\s*)?[0-9a-f]{5,}\b/i) { + my $orig_commit = ""; + my $id = "0123456789ab"; + my $title = "commit title"; + my $tag_case = 1; + my $tag_space = 1; + my $id_length = 1; + my $id_case = 1; + my $title_has_quotes = 0; + + if ($line =~ /(\s*fixes:?)\s+([0-9a-f]{5,})\s+($balanced_parens)/i) { + my $tag = $1; + $orig_commit = $2; + $title = $3; + + $tag_case = 0 if $tag eq "Fixes:"; + $tag_space = 0 if ($line =~ /^fixes:? [0-9a-f]{5,} ($balanced_parens)/i); + + $id_length = 0 if ($orig_commit =~ /^[0-9a-f]{12}$/i); + $id_case = 0 if ($orig_commit !~ /[A-F]/); + + # Always strip leading/trailing parens then double quotes if existing + $title = substr($title, 1, -1); + if ($title =~ /^".*"$/) { + $title = substr($title, 1, -1); + $title_has_quotes = 1; + } + } + + my ($cid, $ctitle) = git_commit_info($orig_commit, $id, + $title); + + if ($ctitle ne $title || $tag_case || $tag_space || + $id_length || $id_case || !$title_has_quotes) { + if (WARN("BAD_FIXES_TAG", + "Please use correct Fixes: style 'Fixes: <12 chars of sha1> (\"\")' - ie: 'Fixes: $cid (\"$ctitle\")'\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] = "Fixes: $cid (\"$ctitle\")"; + } + } + } + # Check email subject for common tools that don't need to be mentioned if ($in_header_lines && $line =~ /^Subject:.*\b(?:checkpatch|sparse|smatch)\b[^:]/i) { From 29ff8d7471c45a75a436682eb82090ef621086ad Mon Sep 17 00:00:00 2001 From: Minghao Chi <chi.minghao@zte.com.cn> Date: Wed, 21 Sep 2022 12:48:02 +0900 Subject: [PATCH 349/352] nilfs2: delete unnecessary checks before brelse() Patch series "nilfs2 minor amendments". This patch (of 2): The brelse() inline function tests whether its argument is NULL and then returns immediately. Thus remove the tests which are not needed around the shown calls. Link: https://lkml.kernel.org/r/20220921034803.2476-1-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/20220819081700.96279-1-chi.minghao@zte.com.cn Link: https://lkml.kernel.org/r/20220921034803.2476-2-konishi.ryusuke@gmail.com Reported-by: Zeal Robot <zealci@zte.com.cn> Signed-off-by: Minghao Chi <chi.minghao@zte.com.cn> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Cc: ye xingchen <ye.xingchen@zte.com.cn> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/nilfs2/btree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 9f4d9432d38a1..b9d15c3df3cc1 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -1668,8 +1668,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key) maxkey = nilfs_btree_node_get_key(node, nchildren - 1); nextmaxkey = (nchildren > 1) ? nilfs_btree_node_get_key(node, nchildren - 2) : 0; - if (bh != NULL) - brelse(bh); + brelse(bh); return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW); } @@ -1717,8 +1716,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *btree, ptrs[i] = le64_to_cpu(dptrs[i]); } - if (bh != NULL) - brelse(bh); + brelse(bh); return nitems; } From e87d0692e8004863eecee13ad7352e4dfb140322 Mon Sep 17 00:00:00 2001 From: ye xingchen <ye.xingchen@zte.com.cn> Date: Wed, 21 Sep 2022 12:48:03 +0900 Subject: [PATCH 350/352] nilfs2: Remove the unneeded result variable Return the value nilfs_segctor_sync() directly instead of storing it in another redundant variable. Link: https://lkml.kernel.org/r/20220831033403.302184-1-ye.xingchen@zte.com.cn Link: https://lkml.kernel.org/r/20220921034803.2476-3-konishi.ryusuke@gmail.com Reported-by: Zeal Robot <zealci@zte.com.cn> Signed-off-by: ye xingchen <ye.xingchen@zte.com.cn> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com> Cc: Minghao Chi <chi.minghao@zte.com.cn> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/nilfs2/segment.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 0afe0832c7547..9abae2c9120ed 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2235,7 +2235,6 @@ int nilfs_construct_segment(struct super_block *sb) struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_sc_info *sci = nilfs->ns_writer; struct nilfs_transaction_info *ti; - int err; if (!sci) return -EROFS; @@ -2243,8 +2242,7 @@ int nilfs_construct_segment(struct super_block *sb) /* A call inside transactions causes a deadlock. */ BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC); - err = nilfs_segctor_sync(sci); - return err; + return nilfs_segctor_sync(sci); } /** From 8928c99cb2674aeffe3acf53c8e9925f9da87911 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan <adobriyan@gmail.com> Date: Tue, 20 Sep 2022 20:35:23 +0300 Subject: [PATCH 351/352] proc: mark more files as permanent Mark /proc/devices /proc/kpagecount /proc/kpageflags /proc/kpagecgroup /proc/loadavg /proc/meminfo /proc/softirqs /proc/uptime /proc/version as permanent /proc entries, saving alloc/free and some list/spinlock ops per use. These files are never removed by the kernel so it is OK to mark them. Link: https://lkml.kernel.org/r/Yyn527DzDMa+r0Yj@localhost.localdomain Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/proc/devices.c | 6 +++++- fs/proc/internal.h | 5 +++++ fs/proc/loadavg.c | 6 +++++- fs/proc/meminfo.c | 5 ++++- fs/proc/page.c | 3 +++ fs/proc/softirqs.c | 6 +++++- fs/proc/uptime.c | 6 +++++- fs/proc/version.c | 6 +++++- 8 files changed, 37 insertions(+), 6 deletions(-) diff --git a/fs/proc/devices.c b/fs/proc/devices.c index 837971e741097..fe7bfcb7d0494 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -4,6 +4,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/blkdev.h> +#include "internal.h" static int devinfo_show(struct seq_file *f, void *v) { @@ -54,7 +55,10 @@ static const struct seq_operations devinfo_ops = { static int __init proc_devices_init(void) { - proc_create_seq("devices", 0, NULL, &devinfo_ops); + struct proc_dir_entry *pde; + + pde = proc_create_seq("devices", 0, NULL, &devinfo_ops); + pde_make_permanent(pde); return 0; } fs_initcall(proc_devices_init); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 06a80f78433d8..af277184b8073 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -79,6 +79,11 @@ static inline bool pde_is_permanent(const struct proc_dir_entry *pde) return pde->flags & PROC_ENTRY_PERMANENT; } +static inline void pde_make_permanent(struct proc_dir_entry *pde) +{ + pde->flags |= PROC_ENTRY_PERMANENT; +} + extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index f32878d9a39f3..817981e57223e 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -9,6 +9,7 @@ #include <linux/seq_file.h> #include <linux/seqlock.h> #include <linux/time.h> +#include "internal.h" static int loadavg_proc_show(struct seq_file *m, void *v) { @@ -27,7 +28,10 @@ static int loadavg_proc_show(struct seq_file *m, void *v) static int __init proc_loadavg_init(void) { - proc_create_single("loadavg", 0, NULL, loadavg_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("loadavg", 0, NULL, loadavg_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_loadavg_init); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 6e89f0e2fd20f..70e5294052d52 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -162,7 +162,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) static int __init proc_meminfo_init(void) { - proc_create_single("meminfo", 0, NULL, meminfo_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("meminfo", 0, NULL, meminfo_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_meminfo_init); diff --git a/fs/proc/page.c b/fs/proc/page.c index a2873a617ae86..f2273b164535b 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -91,6 +91,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, } static const struct proc_ops kpagecount_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpagecount_read, }; @@ -268,6 +269,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, } static const struct proc_ops kpageflags_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpageflags_read, }; @@ -322,6 +324,7 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf, } static const struct proc_ops kpagecgroup_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpagecgroup_read, }; diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index 12901dcf57e2b..f4616083faef3 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -3,6 +3,7 @@ #include <linux/kernel_stat.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include "internal.h" /* * /proc/softirqs ... display the number of softirqs @@ -27,7 +28,10 @@ static int show_softirqs(struct seq_file *p, void *v) static int __init proc_softirqs_init(void) { - proc_create_single("softirqs", 0, NULL, show_softirqs); + struct proc_dir_entry *pde; + + pde = proc_create_single("softirqs", 0, NULL, show_softirqs); + pde_make_permanent(pde); return 0; } fs_initcall(proc_softirqs_init); diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index deb99bc9b7e6b..b5343d209381a 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -7,6 +7,7 @@ #include <linux/time.h> #include <linux/time_namespace.h> #include <linux/kernel_stat.h> +#include "internal.h" static int uptime_proc_show(struct seq_file *m, void *v) { @@ -39,7 +40,10 @@ static int uptime_proc_show(struct seq_file *m, void *v) static int __init proc_uptime_init(void) { - proc_create_single("uptime", 0, NULL, uptime_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("uptime", 0, NULL, uptime_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_uptime_init); diff --git a/fs/proc/version.c b/fs/proc/version.c index b449f186577f8..02e3c3cd4a9af 100644 --- a/fs/proc/version.c +++ b/fs/proc/version.c @@ -5,6 +5,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/utsname.h> +#include "internal.h" static int version_proc_show(struct seq_file *m, void *v) { @@ -17,7 +18,10 @@ static int version_proc_show(struct seq_file *m, void *v) static int __init proc_version_init(void) { - proc_create_single("version", 0, NULL, version_proc_show); + struct proc_dir_entry *pde; + + pde = proc_create_single("version", 0, NULL, version_proc_show); + pde_make_permanent(pde); return 0; } fs_initcall(proc_version_init); From 344f44d098da8f9694db36316b7a42d874796df7 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Date: Wed, 21 Sep 2022 00:59:27 +0900 Subject: [PATCH 352/352] fs/ntfs3: fix negative shift size in true_sectors_per_clst() syzbot is reporting shift-out-of-bounds in true_sectors_per_clst() [1], for commit a3b774342fa752a5 ("fs/ntfs3: validate BOOT sectors_per_clusters") did not address that (0 - boot->sectors_per_clusters) < 0 because "u8" was chosen for type of boot->sectors_per_clusters because 0x80 needs to be positive in order to support 64K clusters. Use "s8" cast in order to make sure that (0 - (s8) boot->sectors_per_clusters) > 0. Link: https://syzkaller.appspot.com/bug?extid=1631f09646bc214d2e76 [1] Link: https://lkml.kernel.org/r/4b37f037-3b10-b4e4-0644-73441c8fa0af@I-love.SAKURA.ne.jp Fixes: a3b774342fa752a5 ("fs/ntfs3: validate BOOT sectors_per_clusters") Reported-by: syzbot <syzbot+1631f09646bc214d2e76@syzkaller.appspotmail.com> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Tested-by: syzbot <syzbot+1631f09646bc214d2e76@syzkaller.appspotmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- fs/ntfs3/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 47012c9bf505e..c7ffd21fb255d 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -672,7 +672,7 @@ static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot) if (boot->sectors_per_clusters <= 0x80) return boot->sectors_per_clusters; if (boot->sectors_per_clusters >= 0xf4) /* limit shift to 2MB max */ - return 1U << (0 - boot->sectors_per_clusters); + return 1U << (0 - (s8) boot->sectors_per_clusters); return -EINVAL; }